# Predictions
This is used to run the whole pipeline of preprocessing and make predictions.  
It doesn't contain any graphs or analysis tool in order to speed up a full run of the notebook.  

## Imports and global variables

In [17]:
import math
import numpy as np
import matplotlib.pyplot as plt
import sys
import csv

sys.path.append("../")
sys.path.append("../src/")
# import self-defined modules
from src.helpers import *
from src.implementations import *
from src.cross_validation import *

%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
DATA_FOLDER = "../data/"
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"
NAN_VALUE = -999.0
INTEGER_COLUMN = (
    22  # 24 in raw csv file, but 23 when id and prediction column are removed
)

# For debug purpose only
SUB_SAMPLE = False

# Import dataset

In [19]:
# Load data
y_tr, x_tr, _ = load_csv_data(DATA_FOLDER + TRAIN_FILE, sub_sample=SUB_SAMPLE)
y_te, x_te, ids_tests = load_csv_data(DATA_FOLDER + TEST_FILE, sub_sample=SUB_SAMPLE)
print("x_tr shape : {}, y_tr shape : {}".format(x_tr.shape, y_tr.shape))
print("x_te shape : {}, y_te shape : {}".format(x_te.shape, y_te.shape))

# Define missing values as NAN
x_tr[x_tr == NAN_VALUE] = np.nan
x_te[x_te == NAN_VALUE] = np.nan

# Get columns names
col_names = []
with open(DATA_FOLDER + TRAIN_FILE) as dataset:
    col_names = dataset.readline().split(",")

x_tr shape : (250000, 30), y_tr shape : (250000,)
x_te shape : (568238, 30), y_te shape : (568238,)


Initial range of values

In [20]:
print("x_tr range :{} {}".format(np.nanmin(x_tr), np.nanmax(x_tr)))
print("x_te range :{} {}".format(np.nanmin(x_te), np.nanmax(x_te)))

x_tr range :-18.066 4974.979
x_te range :-19.012 4794.827


## Pre-processing

In [21]:
# before pre-processing
print("Before pre-processing:")
print("x_tr shape : {}".format(x_tr.shape))
print("x_te shape : {}".format(x_te.shape))
print("x_tr range :{} {}".format(np.nanmin(x_tr), np.nanmax(x_tr)))
print("x_te range :{} {}".format(np.nanmin(x_te), np.nanmax(x_te)))

Before pre-processing:
x_tr shape : (250000, 30)
x_te shape : (568238, 30)
x_tr range :-18.066 4974.979
x_te range :-19.012 4794.827


In [22]:
# apply log transformation
cols_to_log_transform = ["DER_pt_h", "DER_pt_tot", "PRI_met", "PRI_met_sumet"]
cols_idx = [get_col_idx(col, col_names) for col in cols_to_log_transform]

x_tr, x_te = log_transform(x_tr, x_te, cols_idx)

# Remove columns with too much NAN
x_tr = remove_nan_columns(x_tr, 1)
x_te = remove_nan_columns(x_te, 1)

# Replace missing data by the mean
mean_x = np.nanvar(x_tr, axis=0)
x_tr = replace_nan_by_means(x_tr, mean_data=mean_x)
x_te = replace_nan_by_means(x_te, mean_data=mean_x)


assert x_tr[np.isnan(x_tr)].shape[0] == 0
assert x_te[np.isnan(x_te)].shape[0] == 0

# Standardize after replacing missing values
IDs_degrees = np.array([10, 13, 15])
x_tr = transform(x_tr, IDs_degrees)
x_te = transform(x_te, IDs_degrees)

In [23]:
# plot features after pre-processing
print("After pre-processing:")
print("x_tr shape : {}".format(x_tr.shape))
print("x_te shape : {}".format(x_te.shape))
print("x_tr range :{} {}".format(np.min(x_tr), np.max(x_tr)))
print("x_te range :{} {}".format(np.min(x_te), np.max(x_te)))

After pre-processing:
x_tr shape : (250000, 22)
x_te shape : (568238, 22)
x_tr range :-2.0760373982511515 3.0
x_te range :-2.0734828021024287 3.0


## Model fitting

In [24]:
# Fit a model
w, loss_tr = least_squares(y_tr, x_tr)

print("Training loss : {}".format(loss_tr))

Training loss : 0.363579193312102


## Cross-validation

In [25]:
import pandas as pd  # TODO

# test colin
# w_init, _ = least_squares(y_tr, xt_tr)
# w_init = w_init*0.0
# lambdas=[1e-1, 1e-2, 1e-3, 1e-4, 1e-5]
results, best_res = run_cross_validation(
    y_tr,
    x_tr,
    3,
    is_regression=True,
    lambdas=[0, 0.5, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
    degrees=[1, 2, 3, 4],
)
# run_cross_validation(y_tr, xt_tr, 5, is_regression=False, lambdas=[0.0], gammas=[1e-3], initial_w=w_init, degree=0, max_iters=10000)

In [26]:
pd.DataFrame(results)

Unnamed: 0,lambda,gamma,degree,tr,te
0,0.0,0.0,1,"{'acc': 0.7320109280437123, 'f1': 0.5551148898...","{'acc': 0.7318909275637102, 'f1': 0.5548677990..."
1,0.0,0.0,2,"{'acc': 0.7753611014444058, 'f1': 0.6506382345...","{'acc': 0.7755271021084084, 'f1': 0.6508277956..."
2,0.0,0.0,3,"{'acc': 0.7911331645326581, 'f1': 0.6781310514...","{'acc': 0.7909551638206552, 'f1': 0.6778600827..."
3,0.0,0.0,4,"{'acc': 0.7966011864047456, 'f1': 0.6871745936...","{'acc': 0.7964951859807439, 'f1': 0.6869276863..."
4,0.5,0.0,1,"{'acc': 0.7063868255473023, 'f1': 0.4413643703...","{'acc': 0.7062348249392997, 'f1': 0.4408187846..."
5,0.5,0.0,2,"{'acc': 0.7312149248596994, 'f1': 0.4724424382...","{'acc': 0.7311509246036985, 'f1': 0.4723906394..."
6,0.5,0.0,3,"{'acc': 0.740938963755855, 'f1': 0.51030474268...","{'acc': 0.7408909635638542, 'f1': 0.5100450718..."
7,0.5,0.0,4,"{'acc': 0.755443021772087, 'f1': 0.56870966101...","{'acc': 0.7553790215160862, 'f1': 0.5685758636..."
8,0.1,0.0,1,"{'acc': 0.7211768847075387, 'f1': 0.5080202725...","{'acc': 0.7210948843795375, 'f1': 0.5078234895..."
9,0.1,0.0,2,"{'acc': 0.7686590746362986, 'f1': 0.6145906460...","{'acc': 0.7684910739642957, 'f1': 0.6143284455..."


In [27]:
best_res

{'lambda': 0,
 'gamma': 0.0,
 'degree': 4,
 'acc': 0.7964951859807439,
 'f1': 0.6869276863454647}

In [28]:
pd.DataFrame(best_res, index=["best_res"])

Unnamed: 0,lambda,gamma,degree,acc,f1
best_res,0,0.0,4,0.796495,0.686928


# Run one CV for each PRI_jet_num model

In [30]:
# Run multiple CV, on for each PRI_jet_num
id = {}
y_pred = {}
best_results = []
results = []

lambdas = [0.5, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5]
gammas = [0.5e-7, 0.5e-07, 0.5e-06, 0.5e-6]
degrees = [1, 2, 3, 4]

# run all CV
for i in range(4):
    print(f"===== Jet_num : {i} =====")
    y_r, x_r = get_split_by_jet_data(
        y_tr, x_tr, i, jet_column=17
    )  # 17 because no offset
    # 0, 1e-1, 1e-2, 1e-3
    res, best_res = run_cross_validation(
        y_r, x_r, 3, is_regression=True, lambdas=lambdas, degrees=degrees
    )

    best_results.append(best_res)
    results.append(res)

===== Jet_num ===== 0
===== Jet_num ===== 1
===== Jet_num ===== 2
===== Jet_num ===== 3


In [31]:
for i, best_res in enumerate(best_results):
    print(i, best_res)
    print("----------------")

0 {'lambda': 1e-05, 'gamma': 0.0, 'degree': 4, 'acc': 0.8411602209944752, 'f1': 0.6510049057783657}
----------------
1 {'lambda': 0.0001, 'gamma': 0.0, 'degree': 4, 'acc': 0.780834107087589, 'f1': 0.6750973244087315}
----------------
2 {'lambda': 0.0001, 'gamma': 0.0, 'degree': 4, 'acc': 0.7811389666329225, 'f1': 0.7943995052104169}
----------------
3 {'lambda': 0.001, 'gamma': 0.0, 'degree': 4, 'acc': 0.7861848041869699, 'f1': 0.5794656944225623}
----------------


## Predictions

### Predictions with regression and threshold

In [32]:
print(y_tr.shape)
print(y_tr[y_tr == 1].shape, y_te.shape)
print(x_tr.shape)

(250000,)
(85667,) (568238,)
(250000, 22)


### Predictions with log_reg

In [33]:
# # logistic_regression(y, tx, initial_w, max_iters, gamma)
# # Fit the model
# for i in range(1, 5):

#     print("build poly {}", i)
#     xtr = x_tr.copy()
#     xte = x_te.copy()

#     xtr = build_poly(xtr, 3)
#     xte = build_poly(xte, 3)

#     assert(xtr[np.isnan(xtr)].shape[0] == 0)
#     assert(xte[np.isnan(xte)].shape[0] == 0)

#     #
#     # =====
#     #
#     print(xtr.shape)
#     id = {}
#     y_pred = {}
#     # lambdas = [0,0,-2000,0]
#     lambdas = [13-3,1e-3,1e-3,1e-3]
#     gammas = [0.5e-7,0.5e-07,0.5e-06,0.5e-6]
#     for i in range(0,4):
#         y_r,x_r = get_split_by_jet_data(y_tr,xtr,i)
#         y_e,x_e = get_split_by_jet_data(ids_tests,xte,i)
#         w_init = np.linalg.lstsq(x_r.T @ x_r, x_r.T @ y_r,rcond=None)[0]
#         print(x_r.shape)
#         w, loss = reg_logistic_regression(y_r, x_r, lambdas[i], w_init, 1000, gammas[i])
#         print(loss)
#         y_pred[i] = predict_log(w,x_e)
#         id[i] = y_e

#     y_predict = np.concatenate((y_pred[0],y_pred[1],y_pred[2],y_pred[3]),axis=0)
#     ids_test = np.concatenate((id[0],id[1],id[2],id[3]),axis=0)

#     print(len(y_predict[y_predict == 1]))
#     print(len(y_predict[y_predict == -1]))
#     print(len(y_predict))

#     #10 1e-6 = 0.269576
#     #5  1e-6 = 0.045 - 0.3248 - 0.5362 - 0.2657
#     #change lambdas  and gammas for each jet

# Prediction ridge regb

In [34]:
id = {}
y_pred_tr = {}
y_pred = {}
# lambdas = [0,0,-2000,0]
lambdas = [1e-5, 1e-4, 1e-4, 1e-3]
gammas = [0.5e-7, 0.5e-07, 0.5e-06, 0.5e-6]
print(best_results[0]["degree"])
degrees = [
    best_results[0]["degree"],
    best_results[1]["degree"],
    best_results[2]["degree"],
    best_results[3]["degree"],
]

for i in range(0, 4):
    xtr = build_poly(x_tr.copy(), degrees[i])
    xte = build_poly(x_te.copy(), degrees[i])

    y_r, x_r = get_split_by_jet_data(y_tr, xtr, i)
    y_e, x_e = get_split_by_jet_data(ids_tests, xte, i)

    # w, loss = reg_logistic_regression(y_r, x_r, lambdas[i], w_init, 1000, gammas[i])
    w, loss = ridge_regression(y_r, x_r, lambda_=lambdas[i])
    y_pred_tr[i] = predict_reg(w, x_r)
    y_pred[i] = predict_reg(w, x_e)
    id[i] = y_e

    print(i, "accuracy", accuracy(y_r, y_pred_tr[i]))
    print(i, "f1", f1_score(y_r, y_pred_tr[i]))


y_predict_tr = np.concatenate(
    (y_pred_tr[0], y_pred_tr[1], y_pred_tr[2], y_pred_tr[3]), axis=0
)

y_predict = np.concatenate((y_pred[0], y_pred[1], y_pred[2], y_pred[3]), axis=0)
ids_test = np.concatenate((id[0], id[1], id[2], id[3]), axis=0)

print(len(y_predict[y_predict == 1]))
print(len(y_predict[y_predict == -1]))
print(len(y_predict))

print("=====================================")
print("Global scores")
print("accuracy", accuracy(y_tr, y_predict_tr))
print("f1", f1_score(y_tr, y_predict_tr))


create_csv_submission(ids_test, y_predict, "../results/pred.csv")

4
0 accuracy 0.8414920981253691
0 f1 0.6515281536735098
1 accuracy 0.7813886309708037
1 f1 0.6755846442377617
2 accuracy 0.7827864784930229
2 f1 0.7958738271558881
3 accuracy 0.7885760693015701
3 f1 0.5842058562555457
175343
392895
568238
Global scores
accuracy 0.560528
f1 0.32422192151556156


In [35]:
# Run on the train set
x_pred = build_poly(x_tr, 3)
w, loss = ridge_regression(y_tr, x_pred, lambda_=1e-3)
y_pred = predict_reg(w, x_pred)

print("Training accuracy : {}".format(accuracy(y_tr, y_pred)))
print("Training f1 : {}".format(f1_score(y_tr, y_pred)))
print(loss)

# Run on the test set
x_pred = build_poly(x_te, 3)
y_pred = predict_reg(w, x_pred)

y_test, input_test, ids_test = load_csv_data(DATA_FOLDER + TEST_FILE, False)

create_csv_submission(ids_test, y_pred, "../results/pred.csv")

Training accuracy : 0.791008
Training f1 : 0.6773540164015414
0.30253050445628277


In [36]:
# w_init, _ = least_squares(y_tr, xt_tr)

# gamma_min = 2e-5
# gamma_max = 35e-5
# lambda_min = 0.0
# lambda_max = 2000
# lambdas = np.linspace(lambda_min, lambda_max, num_intervals_l)
# gammas = np.linspace(gamma_min, gamma_max, num_intervals_g)

# mean_loss_tr, mean_loss_te = run_cross_validation(y_tr, xt_tr, 2, w_init, 5, 2e-5,35e-5,0,2000,10,10)
# print("Mean training mse: {}".format(mean_loss_tr))
# print("Mean test mse: {}".format(mean_loss_te))

# 2000  1e-05   = 0.69310
# 722   1e-4    = 0.69307
# 444   15e-05  = 0.69285
# 222   25e-05  = 0.69265
# 222   35e-05  = 0.69242

### Balance dataset

In [37]:
# nb_s = len(y_tr[y_tr == 1])
# nb_b = len(y_tr) - nb_s
# print("Signals: {} ({}%)".format(nb_s, 100 * nb_s / len(x_tr)))
# print("Backgrounds: {} ({}%)".format(nb_b, 100 * nb_b / len(x_tr)))
# print("Ratio signal / background: {}".format(nb_s / nb_b))

# x_tr_ds, y_tr_ds = balance_dataset(x_tr, y_tr) TODO