# EPFL Machine Learning Higgs

## Loading and preprocessing

In [24]:
%load_ext autoreload
%autoreload 2

import csv
import os

import numpy as np

from helpers import load_data, one_hot_encode, standardize
from implementations import least_squares_GD, ridge_regression

In [22]:
# Folder holding the challenge CSV files, relative to this notebook
data_directory = '../data'
train_dataset_path, public_test_dataset_path = (
    os.path.join(data_directory, csv_name) for csv_name in ('train.csv', 'test.csv')
)

# load_data returns a 4-tuple. Judging by how it is unpacked here the order is
# (ids, labels, feature names, feature matrix) -- TODO confirm against helpers.load_data.
# The train ids are discarded, as are the labels and feature names of the test file.
_, Y_train_public, feature_names, X_train_public = load_data(train_dataset_path)
ids_test_public, _, _, X_test_public = load_data(public_test_dataset_path)

In [25]:
# Missing measurements show up as -999 in the feature matrices.
# Naive strategy for now: overwrite every such entry with 0.
# The entries are floats, so the sentinel is matched within a small tolerance.
EPSILON = 1E-4
MISSING_VALUE = -999

mask_train = np.abs(X_train_public - MISSING_VALUE) <= EPSILON
mask_test = np.abs(X_test_public - MISSING_VALUE) <= EPSILON

# First line reports the train set, second line the test set
for missing_mask in (mask_train, mask_test):
    print('Proportion of missing values:', np.sum(missing_mask) / missing_mask.size)

# NOTE(review): both arrays are modified in place from here on, so re-running this
# cell without reloading the data will presumably report 0.0 missing values.
X_train_public[mask_train] = 0
X_test_public[mask_test] = 0

# Standardize every column except the discrete "PRI_jet_num".
# The mean and standard deviation come from the public train set only and are reused
# for the test set (open question: is estimating the population statistics this way okay?).
# Alternative to try: scale to the 0-1 range, since some values stay quite large.
# ! standardize works in place; only the statistics are returned !
continuous_column_idxs = np.flatnonzero(feature_names != "PRI_jet_num")
column_means, column_stds = standardize(X_train_public, continuous_column_idxs)
_, _ = standardize(X_test_public, continuous_column_idxs, column_means, column_stds)

Proportion of missing values: 0.0
Proportion of missing values: 0.0


In [59]:
# We will need to add interaction terms to deal with co-linearity somewhere (like x1*x2 instead of just x1 and x2)


In [26]:
# Note to self: standardize before one-hot encoding, otherwise the column indices of the
# continuous features would have to be recomputed after the encoding.

# "PRI_jet_num" is the only discrete-valued feature in this dataset, so it is the only
# column that gets one-hot encoded.
discrete_column_idxs = np.where(feature_names == "PRI_jet_num")[0]

# Both calls receive the original feature names; `feature_names` itself is only
# overwritten by the second call.
X_train_public, train_feature_names = one_hot_encode(X_train_public, discrete_column_idxs, feature_names)
X_test_public, feature_names = one_hot_encode(X_test_public, discrete_column_idxs, feature_names)

# The two sets are encoded independently, so verify that they ended up with the same
# columns instead of assuming it: a mismatch would silently misalign the model weights.
assert np.array_equal(train_feature_names, feature_names), \
    'train and test sets produced different feature names after one-hot encoding'
assert X_train_public.shape[1] == X_test_public.shape[1] == len(feature_names), \
    'number of feature columns does not match the number of feature names'

In [27]:
# Binary classification: a single 0/1 column is enough, so the labels are not one-hot encoded.
# Samples labelled `positive_sample` become 1, all others become 0.
positive_sample = 'b'
negative_sample = 's'
is_positive = Y_train_public == positive_sample
Y_train_public = np.expand_dims(is_positive.astype(np.int32), axis=1)

# ! The classes are imbalanced (more positive than negative samples);
# weighting the negative samples more heavily might be worth trying !
positive_count = np.sum(Y_train_public)
print('Number of positive samples:', positive_count)
print('Number of negative samples:', len(Y_train_public) - positive_count)

Number of positive samples: 164333
Number of negative samples: 85667


## Training and analysis

After preprocessing, the features of the public train and test datasets are stored in "X_train_public" and "X_test_public". The labels are binary values stored in "Y_train_public", and the feature column names are stored in "feature_names".

In [28]:
# Sanity check of the preprocessed data: feature names, then the train/test feature
# shapes on one line, then the label shape.
# TODO: k-fold cross validation to create train/validation subsets is still to be added.
print(feature_names)
print(*(features.shape for features in (X_train_public, X_test_public)))
print(Y_train_public.shape)

['DER_mass_MMC' 'DER_mass_transverse_met_lep' 'DER_mass_vis' 'DER_pt_h'
 'DER_deltaeta_jet_jet' 'DER_mass_jet_jet' 'DER_prodeta_jet_jet'
 'DER_deltar_tau_lep' 'DER_pt_tot' 'DER_sum_pt' 'DER_pt_ratio_lep_tau'
 'DER_met_phi_centrality' 'DER_lep_eta_centrality' 'PRI_tau_pt'
 'PRI_tau_eta' 'PRI_tau_phi' 'PRI_lep_pt' 'PRI_lep_eta' 'PRI_lep_phi'
 'PRI_met' 'PRI_met_phi' 'PRI_met_sumet' 'PRI_jet_leading_pt'
 'PRI_jet_leading_eta' 'PRI_jet_leading_phi' 'PRI_jet_subleading_pt'
 'PRI_jet_subleading_eta' 'PRI_jet_subleading_phi' 'PRI_jet_all_pt'
 'PRI_jet_num_0' 'PRI_jet_num_1' 'PRI_jet_num_2' 'PRI_jet_num_3']
(250000, 33) (568238, 33)
(250000, 1)


In [57]:
# Fit a linear model on the full public training set in two ways and compare the
# resulting training losses.

# Gradient-descent settings. 500 is the iteration count (the progress bar runs 500 steps);
# 0.1 is presumably the step size -- TODO confirm against implementations.least_squares_GD.
GD_MAX_ITERS = 500
GD_GAMMA = 0.1

# Start from an all-zero weight column vector, one weight per feature column
initial_w = np.zeros(shape=(X_train_public.shape[1], 1))
w, loss = least_squares_GD(Y_train_public, X_train_public, initial_w, GD_MAX_ITERS, GD_GAMMA)

# Reference solution; with lambda_=0 this presumably reduces to plain least squares
w_optim, loss_optim = ridge_regression(Y_train_public, X_train_public, lambda_=0)

# ridge_regression hands its loss back as a 1x1 array while the GD loss is a scalar;
# squeeze both so the two numbers are printed in the same form.
print('GD loss:', float(np.squeeze(loss)))
print('"Optimal" loss:', float(np.squeeze(loss_optim)))

100%|██████████| 500/500 [00:09<00:00, 50.59it/s]


GD loss: 0.08500413180707353
"Optimal" loss: [[0.08444842]]


In [None]:
# Now we can evaluate the training

# ! We can actually also do some hyperparameter tuning (using ROC curves to determine the right cut-off probability) !

## Inference

In [None]:
# Do predictions on the public test dataset


In [76]:
# Placeholder until real model predictions are wired in: one boolean per test id,
# True meaning the positive class and False the negative class.
# The values come from a seeded generator so that re-running the notebook
# writes the same submission file every time.
PLACEHOLDER_SEED = 0
placeholder_rng = np.random.default_rng(PLACEHOLDER_SEED)
predictions = placeholder_rng.integers(2, size=ids_test_public.shape).astype(bool)

# The submission is written next to the input data as a two-column CSV
submission_file_name = 'submission_0.csv'
submission_path = os.path.join(data_directory, submission_file_name)
with open(submission_path, mode='w', newline='', encoding='utf-8') as submission_file:
    writer = csv.writer(submission_file, delimiter=',')
    writer.writerow(['Id', 'Prediction'])
    # Map each boolean back to its class label and pair it with the matching id
    writer.writerows(
        (sample_id, positive_sample if predicted_positive else negative_sample)
        for sample_id, predicted_positive in zip(ids_test_public, predictions)
    )
