In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import seaborn as sns
import math
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
from implementations import *
from pathlib import Path
import zipfile

my_file = Path("../data/train.csv")
if not my_file.is_file():
    with zipfile.ZipFile('../data/train.csv.zip', 'r') as zip_ref:
        zip_ref.extractall('../data')

DATA_TRAIN_PATH = '../data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Exploratory data analysis and feature processing
As a first step, we obtain the number of events and features.

In [3]:
events = pd.read_csv(DATA_TRAIN_PATH)
n_events = events.shape[0]
n_features = events.shape[1] - 2
print('Number of events:', n_events)
print('Number of features:', n_features)
events.head()

Number of events: 250000
Number of features: 30


Unnamed: 0,Id,Prediction,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,...,PRI_met_phi,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt
0,100000,s,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,...,-0.277,258.733,2,67.435,2.15,0.444,46.062,1.24,-2.475,113.497
1,100001,b,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,...,-1.916,164.546,1,46.226,0.725,1.158,-999.0,-999.0,-999.0,46.226
2,100002,b,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,...,-2.186,260.414,1,44.251,2.053,-2.028,-999.0,-999.0,-999.0,44.251
3,100003,b,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,...,0.06,86.062,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0
4,100004,b,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,...,-0.871,53.131,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0


We now obtain the percentage of events for each prediction.

In [4]:
events['Prediction'].value_counts()/n_events * 100

b    65.7332
s    34.2668
Name: Prediction, dtype: float64

Now we know that we only have two possible predictions (b or s). This is why we can think about this problem as a **Binary Classification** in which **Y** can take two values $Y \in {b, s}$ where b and s are the class labels.

In [5]:
events.describe()

Unnamed: 0,Id,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_met_phi,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt
count,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,...,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0
mean,224999.5,-49.023079,49.239819,81.181982,57.895962,-708.420675,-601.237051,-709.356603,2.3731,18.917332,...,-0.010119,209.797178,0.979176,-348.329567,-399.254314,-399.259788,-692.381204,-709.121609,-709.118631,73.064591
std,72168.927986,406.345647,35.344886,40.828691,63.655682,454.480565,657.972302,453.019877,0.782911,22.273494,...,1.812223,126.499506,0.977426,532.962789,489.338286,489.333883,479.875496,453.384624,453.389017,98.015662
min,100000.0,-999.0,0.0,6.329,0.0,-999.0,-999.0,-999.0,0.208,0.0,...,-3.142,13.678,0.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0
25%,162499.75,78.10075,19.241,59.38875,14.06875,-999.0,-999.0,-999.0,1.81,2.841,...,-1.575,123.0175,0.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0
50%,224999.5,105.012,46.524,73.752,38.4675,-999.0,-999.0,-999.0,2.4915,12.3155,...,-0.024,179.739,1.0,38.96,-1.872,-2.093,-999.0,-999.0,-999.0,40.5125
75%,287499.25,130.60625,73.598,92.259,79.169,0.49,83.446,-4.593,2.961,27.591,...,1.561,263.37925,2.0,75.349,0.433,0.503,33.703,-2.457,-2.275,109.93375
max,349999.0,1192.026,690.075,1349.351,2834.999,8.503,4974.979,16.69,5.684,2834.999,...,3.142,2003.976,3.0,1120.573,4.499,3.141,721.456,4.5,3.142,1633.433


### Dealing with missig values
We can now check the number of missing values per feature.

In [6]:
columns = []
for columnName in events.columns[2:]:
    column = events[columnName].to_numpy()
    n = column[np.where(column == -999.0)].shape[0]
    if n > 0:
        columns.append(columnName)
        print('{columnName} is missing {n} values.'.format(columnName=columnName, n=n))

DER_mass_MMC is missing 38114 values.
DER_deltaeta_jet_jet is missing 177457 values.
DER_mass_jet_jet is missing 177457 values.
DER_prodeta_jet_jet is missing 177457 values.
DER_lep_eta_centrality is missing 177457 values.
PRI_jet_leading_pt is missing 99913 values.
PRI_jet_leading_eta is missing 99913 values.
PRI_jet_leading_phi is missing 99913 values.
PRI_jet_subleading_pt is missing 177457 values.
PRI_jet_subleading_eta is missing 177457 values.
PRI_jet_subleading_phi is missing 177457 values.


According to the documentation, the value for the mass is -999.0 when the topology of the event was too far from the expected one. We can see that there are 38114 missing values for this feature (DER_mass_MMC).  

Regarding the other missing values, the missing values depend on the number of jets of the event (PRI_jet_num):
- If it is 0, a specific set S of features presents missing values.
- If it is 1, only a specific subset $S' \subset S$ of the features presents missing values.
- If it is either 2 or 3, there are no missing values.

To replace this values, we are going to use the median of the rest of the values for each feature. Other option will be to use the mean, but we will stick with the first option as it is more robust when we have outliers.

In [7]:
for columnName in columns:
    values_positions = np.where(events[columnName] > -999)
    median = np.median(events[[columnName]].iloc[values_positions])
    print('The median for the {columnName} is {median}.'.format(columnName=columnName, median=median))

The median for the DER_mass_MMC is 112.406.
The median for the DER_deltaeta_jet_jet is 2.107.
The median for the DER_mass_jet_jet is 225.885.
The median for the DER_prodeta_jet_jet is -0.244.
The median for the DER_lep_eta_centrality is 0.454.
The median for the PRI_jet_leading_pt is 65.561.
The median for the PRI_jet_leading_eta is 0.0.
The median for the PRI_jet_leading_phi is -0.033.
The median for the PRI_jet_subleading_pt is 47.902.
The median for the PRI_jet_subleading_eta is -0.01.
The median for the PRI_jet_subleading_phi is -0.002.


In [8]:
for columnName in columns:
    nan_positions = np.where(events[columnName] == -999)
    for i in nan_positions:
        events[columnName][i] = median
events.head()

Unnamed: 0,Id,Prediction,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,...,PRI_met_phi,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt
0,100000,s,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,...,-0.277,258.733,2,67.435,2.15,0.444,46.062,1.24,-2.475,113.497
1,100001,b,160.937,68.768,103.235,48.146,-0.002,-0.002,-0.002,3.473,...,-1.916,164.546,1,46.226,0.725,1.158,-0.002,-0.002,-0.002,46.226
2,100002,b,-0.002,162.172,125.953,35.635,-0.002,-0.002,-0.002,3.148,...,-2.186,260.414,1,44.251,2.053,-2.028,-0.002,-0.002,-0.002,44.251
3,100003,b,143.905,81.417,80.943,0.414,-0.002,-0.002,-0.002,3.31,...,0.06,86.062,0,-0.002,-0.002,-0.002,-0.002,-0.002,-0.002,0.0
4,100004,b,175.864,16.915,134.805,16.405,-0.002,-0.002,-0.002,3.891,...,-0.871,53.131,0,-0.002,-0.002,-0.002,-0.002,-0.002,-0.002,0.0


In [9]:
events.describe()

Unnamed: 0,Id,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_met_phi,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt
count,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,...,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0
mean,224999.5,103.28016,49.239819,81.181982,57.895962,0.696077,107.879702,-0.239851,2.3731,18.917332,...,-0.010119,209.797178,0.979176,50.921982,-0.002765,-0.008239,16.735549,-0.004857,-0.001879,73.064591
std,72168.927986,68.566562,35.344886,40.828691,63.655682,1.439739,272.699511,1.966312,0.782911,22.273494,...,1.812223,126.499506,0.977426,62.734025,1.382702,1.405057,31.339638,1.094455,0.978743,98.015662
min,100000.0,-0.002,0.0,6.329,0.0,-0.002,-0.002,-18.066,0.208,0.0,...,-3.142,13.678,0.0,-0.002,-4.499,-3.142,-0.002,-4.5,-3.142,0.0
25%,162499.75,78.10075,19.241,59.38875,14.06875,-0.002,-0.002,-0.002,1.81,2.841,...,-1.575,123.0175,0.0,-0.002,-0.433,-0.556,-0.002,-0.002,-0.002,0.0
50%,224999.5,105.012,46.524,73.752,38.4675,-0.002,-0.002,-0.002,2.4915,12.3155,...,-0.024,179.739,1.0,38.96,-0.002,-0.002,-0.002,-0.002,-0.002,40.5125
75%,287499.25,130.60625,73.598,92.259,79.169,0.49,83.446,-0.002,2.961,27.591,...,1.561,263.37925,2.0,75.349,0.433,0.503,33.703,-0.002,-0.002,109.93375
max,349999.0,1192.026,690.075,1349.351,2834.999,8.503,4974.979,16.69,5.684,2834.999,...,3.142,2003.976,3.0,1120.573,4.499,3.141,721.456,4.5,3.142,1633.433


### Relationship between the features
We now check if there are any obvious **relationships between the features**.

In [10]:
corr = events.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Id,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,DER_sum_pt,DER_pt_ratio_lep_tau,DER_met_phi_centrality,DER_lep_eta_centrality,PRI_tau_pt,PRI_tau_eta,PRI_tau_phi,PRI_lep_pt,PRI_lep_eta,PRI_lep_phi,PRI_met,PRI_met_phi,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt
Id,1.0,0.0011,-0.006059,-0.001851,0.002073,0.001267,0.000959,-0.002511,-0.001349,-0.000581,0.001427,-0.001209,0.003393,0.001202,0.002829,0.001573,-0.000467,6.1e-05,0.001567,-0.001934,0.000345,-0.001141,0.002312,0.000175,0.002425,0.002319,0.001674,-6.5e-05,-0.001465,0.001958,0.001024
DER_mass_MMC,0.0011,1.0,-0.171895,0.658955,0.143551,0.087398,0.07319,-0.02775,0.454144,0.03796,0.210713,0.034265,0.244082,0.076395,0.256038,0.003288,-0.00593,0.282277,0.008688,-0.002509,-0.087873,0.004723,0.199829,0.152053,0.136551,0.001577,0.002178,0.093436,0.001303,-0.002432,0.126654
DER_mass_transverse_met_lep,-0.006059,-0.171895,1.0,0.190109,-0.249116,-0.181832,-0.166375,0.100248,0.043251,0.017758,-0.146837,0.349504,-0.419757,-0.166062,-0.145464,-0.002109,0.001132,0.310648,-0.006777,0.00034,0.183716,-0.015925,-0.167811,-0.210537,-0.232283,0.000157,0.006188,-0.158188,0.001713,0.003761,-0.210009
DER_mass_vis,-0.001851,0.658955,0.190109,1.0,-0.062562,-0.038798,-0.044463,0.02437,0.579712,-0.000702,0.088685,0.09749,-0.090846,-0.044088,0.290011,0.002127,-0.003624,0.405482,0.002196,-0.002018,-0.08733,-0.001467,0.0533,-0.02686,-0.055404,0.002082,0.004561,-0.042269,0.001364,-0.000395,-0.052902
DER_pt_h,0.002073,0.143551,-0.249116,-0.062562,1.0,0.38144,0.418998,-0.115577,-0.539379,0.310501,0.832733,0.089187,0.539356,0.372279,0.407421,0.001665,0.005248,0.360939,0.008354,-0.002923,0.679585,0.008585,0.782547,0.623401,0.874578,0.002042,-0.001244,0.564093,-0.002015,-0.004903,0.808616
DER_deltaeta_jet_jet,0.001267,0.087398,-0.181832,-0.038798,0.38144,1.0,0.877451,-0.678572,-0.216071,0.136083,0.468766,-0.007274,0.310177,0.829791,0.150843,0.002172,0.001393,0.090695,0.004432,0.000129,0.204095,0.002889,0.415091,0.628915,0.409217,-0.001994,-0.000823,0.601025,-0.006253,0.000951,0.498463
DER_mass_jet_jet,0.000959,0.07319,-0.166375,-0.044463,0.418998,0.877451,1.0,-0.705885,-0.235595,0.147097,0.524928,-0.002108,0.279774,0.700467,0.179802,0.00055,0.002518,0.116189,0.003903,-0.001554,0.260303,0.003741,0.459242,0.521999,0.476641,-0.001293,0.000166,0.618477,-0.004929,-0.001597,0.552401
DER_prodeta_jet_jet,-0.002511,-0.02775,0.100248,0.02437,-0.115577,-0.678572,-0.705885,1.0,0.062268,0.026707,-0.118669,0.034634,-0.119182,-0.526277,-0.063927,0.001217,0.000138,-0.005008,0.000179,0.000112,-0.050691,0.000424,-0.089553,-0.133343,-0.121373,-0.002092,-0.001956,-0.15648,-0.00171,0.002058,-0.124342
DER_deltar_tau_lep,-0.001349,0.454144,0.043251,0.579712,-0.539379,-0.216071,-0.235595,0.062268,1.0,-0.148081,-0.432603,0.047046,-0.205441,-0.215263,-0.202035,0.003632,-0.011229,-0.069957,0.000699,-0.000776,-0.402345,-0.00157,-0.407002,-0.347904,-0.480736,1.7e-05,0.00635,-0.318732,0.002262,0.003854,-0.448737
DER_pt_tot,-0.000581,0.03796,0.017758,-0.000702,0.310501,0.136083,0.147097,0.026707,-0.148081,1.0,0.38116,0.039193,0.178448,0.126257,0.095754,0.003596,0.001452,0.109617,0.007987,-0.004249,0.269739,0.002515,0.448925,0.360409,0.268569,-0.005394,0.002968,0.35321,-0.000581,-0.00341,0.403382


Having a look at the table below, we find that there are some features that have a correlation bigger than 0.9. We can considere this values as an obvious relationship between the features and so we will remove then.

In [11]:
# Calculate the absolute of all the correlations
corr = corr.abs()

# Select upper triangle of correlation matrix
mask = np.triu(np.ones(corr.shape), k=1)
upperTriangle = corr.where(mask.astype(bool))

# Find index of feature columns with correlation greater than 0.9
features_to_drop = []
for column in upperTriangle.columns:
    if any(upperTriangle[column] > 0.9):
        features_to_drop.append(column)

# Drop selected features
events = events.drop(events[features_to_drop], axis=1)
print('Columns removed: ')
features_to_drop

Columns removed: 


['PRI_met_sumet', 'PRI_jet_leading_pt', 'PRI_jet_all_pt']

In [12]:
events.head()

Unnamed: 0,Id,Prediction,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,...,PRI_lep_eta,PRI_lep_phi,PRI_met,PRI_met_phi,PRI_jet_num,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi
0,100000,s,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,...,2.273,-2.414,16.824,-0.277,2,2.15,0.444,46.062,1.24,-2.475
1,100001,b,160.937,68.768,103.235,48.146,-0.002,-0.002,-0.002,3.473,...,0.501,0.103,44.704,-1.916,1,0.725,1.158,-0.002,-0.002,-0.002
2,100002,b,-0.002,162.172,125.953,35.635,-0.002,-0.002,-0.002,3.148,...,-0.953,1.052,54.283,-2.186,1,2.053,-2.028,-0.002,-0.002,-0.002
3,100003,b,143.905,81.417,80.943,0.414,-0.002,-0.002,-0.002,3.31,...,-0.522,-3.1,31.082,0.06,0,-0.002,-0.002,-0.002,-0.002,-0.002
4,100004,b,175.864,16.915,134.805,16.405,-0.002,-0.002,-0.002,3.891,...,0.798,1.569,2.723,-0.871,0,-0.002,-0.002,-0.002,-0.002,-0.002


In [13]:
events.describe()

Unnamed: 0,Id,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_lep_eta,PRI_lep_phi,PRI_met,PRI_met_phi,PRI_jet_num,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi
count,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,...,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0
mean,224999.5,103.28016,49.239819,81.181982,57.895962,0.696077,107.879702,-0.239851,2.3731,18.917332,...,-0.019507,0.043543,41.717235,-0.010119,0.979176,-0.002765,-0.008239,16.735549,-0.004857,-0.001879
std,72168.927986,68.566562,35.344886,40.828691,63.655682,1.439739,272.699511,1.966312,0.782911,22.273494,...,1.264982,1.816611,32.894693,1.812223,0.977426,1.382702,1.405057,31.339638,1.094455,0.978743
min,100000.0,-0.002,0.0,6.329,0.0,-0.002,-0.002,-18.066,0.208,0.0,...,-2.505,-3.142,0.109,-3.142,0.0,-4.499,-3.142,-0.002,-4.5,-3.142
25%,162499.75,78.10075,19.241,59.38875,14.06875,-0.002,-0.002,-0.002,1.81,2.841,...,-1.014,-1.522,21.398,-1.575,0.0,-0.433,-0.556,-0.002,-0.002,-0.002
50%,224999.5,105.012,46.524,73.752,38.4675,-0.002,-0.002,-0.002,2.4915,12.3155,...,-0.045,0.086,34.802,-0.024,1.0,-0.002,-0.002,-0.002,-0.002,-0.002
75%,287499.25,130.60625,73.598,92.259,79.169,0.49,83.446,-0.002,2.961,27.591,...,0.959,1.618,51.895,1.561,2.0,0.433,0.503,33.703,-0.002,-0.002
max,349999.0,1192.026,690.075,1349.351,2834.999,8.503,4974.979,16.69,5.684,2834.999,...,2.503,3.142,2842.617,3.142,3.0,4.499,3.141,721.456,4.5,3.142


## Generate predictions and save ouput in csv format for submission:

Obtein weights by appying the different models.

- **Least squares**

In [14]:
w_LS, loss_LS = least_squares(y, tX)
print('Loss with Least Squares: ', loss_LS)

Loss with Least Squares:  0.3396868094770702


- **Stochastic Gradient descent**

In [15]:
initial_w = w_LS
max_iters = 50
gamma = 1e-20

w_SGD, loss_SGD = least_squares_SGD(y, tX, initial_w, max_iters, gamma)
print("Loss with Stochastic Gradient descent", loss_SGD)

Loss with Stochastic Gradient descent 0.01577939179745203


- **Logistic regression**

In [16]:
initial_w = w_LS
max_iters = 100
gamma = 1e-30

w_LR, loss_LR = logistic_regression(y, tX, initial_w, max_iters, gamma)
print("Loss with Logistic regression", loss_LR)

Loss with Logistic regression 63635.49587432659


Open the test dataset.

In [17]:
my_file = Path("../data/test.csv")
if not my_file.is_file():
    with zipfile.ZipFile('../data/test.csv.zip', 'r') as zip_ref:
        zip_ref.extractall('../data')

DATA_TEST_PATH = '../data/test.csv'
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

events_test = pd.read_csv(DATA_TRAIN_PATH)

Generate submission csv file.

In [18]:
OUTPUT_PATH = '../data/submission.csv'
# Using weights of SGD
weights = w_SGD
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)