## Imports and global variables

In [15]:
import numpy as np

import sys
sys.path.append("./src/")

In [16]:
from implementations import *
from helpers import *

%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
DATA_FOLDER = './data/'
TRAIN_FILE = './train.csv'
TEST_FILE = './test.csv'

NAN_VALUE = -999.0
INTEGER_COLUMN = 22 # 24 in raw csv file, but 23 when id and prediction column are removed

# For debug purpose only
SUB_SAMPLE = False

# Import dataset

In [18]:
# Load data
x_tr, y_tr = load_data(DATA_FOLDER + TRAIN_FILE, sub_sample=SUB_SAMPLE)
x_te, y_te = load_data(DATA_FOLDER + TEST_FILE, sub_sample=SUB_SAMPLE)

print("x_tr shape : {}, y_tr shape : {}".format(x_tr.shape, y_tr.shape))
print("x_te shape : {}, y_te shape : {}".format(x_te.shape, y_te.shape))

# Define missing values as NAN
x_tr[x_tr == NAN_VALUE] = np.nan
x_te[x_te == NAN_VALUE] = np.nan

x_tr shape : (250000, 30), y_tr shape : (250000, 1)
x_te shape : (568238, 30), y_te shape : (568238, 1)


# Data exploration

Features with too much NAN values

In [19]:
MAX_NAN_RATIO = 0.5

nb_nan = np.count_nonzero(np.isnan(x_tr), axis=0)
nan_ratio = nb_nan / x_tr.shape[1]


print("Nb Columns with > {:.2f} nan ratio : {}".format(MAX_NAN_RATIO, np.count_nonzero(nan_ratio >= MAX_NAN_RATIO)))

col_names = []
with open(DATA_FOLDER + TRAIN_FILE) as dataset:
    col_names = dataset.readline().split(',')
    nan_col_names = [col_name for col_idx, col_name in enumerate(col_names[2:]) if nan_ratio[col_idx]]

    print("Columns with > {:.2f} nan ratio :".format(MAX_NAN_RATIO))
    print(nan_col_names)


Nb Columns with > 0.50 nan ratio : 11
Columns with > 0.50 nan ratio :
['DER_mass_MMC', 'DER_deltaeta_jet_jet', 'DER_mass_jet_jet', 'DER_prodeta_jet_jet', 'DER_lep_eta_centrality', 'PRI_jet_leading_pt', 'PRI_jet_leading_eta', 'PRI_jet_leading_phi', 'PRI_jet_subleading_pt', 'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi']


# Feature engineering / Data cleaning

Basic transformations
- Remove features which contains too much NAN values, because they don't contain ennough information
- Replace labels chars by integers
- Standardize training df
- Replace missing values

In [20]:
# Remove columns with too much NAN
x_tr = x_tr[:, nan_ratio <= MAX_NAN_RATIO]
x_te = x_te[:, nan_ratio <= MAX_NAN_RATIO]

print("x_tr shape : {}".format(x_tr.shape))
print("x_te shape : {}".format(x_te.shape))

# Standardize before replacing missing values
x_tr, mean_x, std_x = standardize_training(x_tr, missing_values=True)
x_te = standardize_test(x_te, mean_x, std_x)

# Replace missing data by the mean
x_tr = replace_nan_by_means(x_tr, mean_data=mean_x)
x_te = replace_nan_by_means(x_te, mean_data=mean_x)

assert(x_tr[np.isnan(x_tr)].shape[0] == 0)
assert(x_te[np.isnan(x_te)].shape[0] == 0)

print("x_tr range :{} {}".format(np.nanmin(x_tr), np.nanmax(x_tr)))
print("x_te range :{} {}".format(np.nanmin(x_te), np.nanmax(x_te)))

x_tr shape : (250000, 19)
x_te shape : (568238, 19)
x_tr range :-2.7654533479598604 126.4322216845244
x_te range :-2.7284120321456804 36.86455391749545


# Model fitting and predictions

In [21]:
# Add offset term to x
xt_tr = add_offset(x_tr)
xt_te = add_offset(x_te)

# Fit a model
w, loss_tr = least_squares(y_tr, x_tr)
loss_te = compute_mse(y_te, x_te, w)

print("Training loss : {}".format(loss_tr))
print("Test loss : {}".format(loss_te))

Training loss : 0.14683885668675498
Test loss : 0.5242436378887273
