# Predictions
This is used to run the whole pipeline of preprocessing and make predictions.  
It doesn't contain any graphs or analysis tool in order to speed up a full run of the notebook.  

## Imports and global variables

In [1]:
import math
import numpy as np
import matplotlib.pyplot as plt
import sys
import csv

# import self-defined modules
sys.path.append("../")
sys.path.append("../src/")

from src.helpers import *
from src.implementations import *
from src.cross_validation import *

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
DATA_FOLDER = "../data/"
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"

RESULT_FOLDER = "./results/"
RESULT_FILE = "predictions.csv"

NAN_VALUE = -999.0
INTEGER_COLUMN = (
    22  # 24 in raw csv file, but 23 when id and prediction column are removed
)

# For debug purpose only
SUB_SAMPLE = False

# Import dataset

In [3]:
# Load data
y_tr, x_tr, _ = load_csv_data(DATA_FOLDER + TRAIN_FILE, sub_sample=SUB_SAMPLE)
y_te, x_te, ids_tests = load_csv_data(DATA_FOLDER + TEST_FILE, sub_sample=SUB_SAMPLE)
print("x_tr shape : {}, y_tr shape : {}".format(x_tr.shape, y_tr.shape))
print("x_te shape : {}, y_te shape : {}".format(x_te.shape, y_te.shape))

# Define missing values as NAN
x_tr[x_tr == NAN_VALUE] = np.nan
x_te[x_te == NAN_VALUE] = np.nan

# Get columns names
col_names = []
with open(DATA_FOLDER + TRAIN_FILE) as dataset:
    col_names = dataset.readline().split(",")

x_tr shape : (250000, 30), y_tr shape : (250000,)
x_te shape : (568238, 30), y_te shape : (568238,)


Initial range of values

In [4]:
print("x_tr range :{} {}".format(np.nanmin(x_tr), np.nanmax(x_tr)))
print("x_te range :{} {}".format(np.nanmin(x_te), np.nanmax(x_te)))

x_tr range :-18.066 4974.979
x_te range :-19.012 4794.827


## Pre-processing

In [5]:
# before pre-processing
print("Before pre-processing:")
print("x_tr shape : {}".format(x_tr.shape))
print("x_te shape : {}".format(x_te.shape))
print("x_tr range :{} {}".format(np.nanmin(x_tr), np.nanmax(x_tr)))
print("x_te range :{} {}".format(np.nanmin(x_te), np.nanmax(x_te)))

Before pre-processing:
x_tr shape : (250000, 30)
x_te shape : (568238, 30)
x_tr range :-18.066 4974.979
x_te range :-19.012 4794.827


In [6]:
# apply log transformation
cols_to_log_transform = ["DER_pt_h", "DER_pt_tot", "PRI_met", "PRI_met_sumet"]
cols_idx = [get_col_idx(col, col_names) for col in cols_to_log_transform]

x_tr, x_te = log_transform(x_tr, x_te, cols_idx)

# Remove columns with too much NAN
x_tr = remove_nan_columns(x_tr, 0.3)
x_te = remove_nan_columns(x_te, 0.3)

print(x_tr.shape)

# Replace missing data by the mean
mean_x = np.nanvar(x_tr, axis=0)
x_tr = replace_nan_by_means(x_tr, mean_data=mean_x)
x_te = replace_nan_by_means(x_te, mean_data=mean_x)

assert x_tr[np.isnan(x_tr)].shape[0] == 0
assert x_te[np.isnan(x_te)].shape[0] == 0

# Standardize after replacing missing values
IDs_degrees = np.array([10, 13, 15])
x_tr = transform(x_tr, IDs_degrees)
x_te = transform(x_te, IDs_degrees)

(250000, 20)


In [7]:
# plot features after pre-processing
print("After pre-processing:")
print("x_tr shape : {}".format(x_tr.shape))
print("x_te shape : {}".format(x_te.shape))
print("x_tr range :{} {}".format(np.min(x_tr), np.max(x_tr)))
print("x_te range :{} {}".format(np.min(x_te), np.max(x_te)))

After pre-processing:
x_tr shape : (250000, 23)
x_te shape : (568238, 23)
x_tr range :-2.0760373982511515 6.712827536317546
x_te range :-2.0734828021024287 6.719471419278016


## Cross-validation

In [8]:
results, best_res = run_cross_validation(
    y_tr,
    x_tr,
    3,
    is_regression=True,
    lambdas=[5e-1, 5e-2, 5e-3, 5e-4, 5e-5],
    degrees=[4, 5, 6, 7, 8],
)
# run_cross_validation(y_tr, xt_tr, 5, is_regression=False, lambdas=[0.0], gammas=[1e-3], initial_w=w_init, degree=0, max_iters=10000)

In [9]:
best_res

{'lambda': 5e-05,
 'gamma': 0.0,
 'degree': 8,
 'acc': 0.8146472585890344,
 'f1': 0.7171217309743866}

# Run one CV for each PRI_jet_num model

In [None]:
# Run multiple CV, on for each PRI_jet_num
id = {}
y_pred = {}
best_results = []
results = []

lambdas = [0.5, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5]
gammas = [0.5e-7, 0.5e-07, 0.5e-06, 0.5e-6]
degrees = [1, 2, 3, 4]

# run all CV
for i in range(4):
    print(f"===== Jet_num : {i} =====")
    y_r, x_r = get_split_by_jet_data(
        y_tr, x_tr, i, jet_column=17
    )  # 17 because no offset
    # 0, 1e-1, 1e-2, 1e-3
    res, best_res = run_cross_validation(
        y_r, x_r, 3, is_regression=True, lambdas=lambdas, degrees=degrees
    )

    best_results.append(best_res)
    results.append(res)

In [None]:
for i, best_res in enumerate(best_results):
    print(i, best_res)
    print("----------------")

## Predictions

### Predictions with regression and threshold

In [None]:
print(y_tr.shape)
print(y_tr[y_tr == 1].shape, y_te.shape)
print(x_tr.shape)

### Predictions with log_reg

In [None]:
# # logistic_regression(y, tx, initial_w, max_iters, gamma)
# # Fit the model
# for i in range(1, 5):

#     print("build poly {}", i)
#     xtr = x_tr.copy()
#     xte = x_te.copy()

#     xtr = build_poly(xtr, 3)
#     xte = build_poly(xte, 3)

#     assert(xtr[np.isnan(xtr)].shape[0] == 0)
#     assert(xte[np.isnan(xte)].shape[0] == 0)

#     #
#     # =====
#     #
#     print(xtr.shape)
#     id = {}
#     y_pred = {}
#     # lambdas = [0,0,-2000,0]
#     lambdas = [13-3,1e-3,1e-3,1e-3]
#     gammas = [0.5e-7,0.5e-07,0.5e-06,0.5e-6]
#     for i in range(0,4):
#         y_r,x_r = get_split_by_jet_data(y_tr,xtr,i)
#         y_e,x_e = get_split_by_jet_data(ids_tests,xte,i)
#         w_init = np.linalg.lstsq(x_r.T @ x_r, x_r.T @ y_r,rcond=None)[0]
#         print(x_r.shape)
#         w, loss = reg_logistic_regression(y_r, x_r, lambdas[i], w_init, 1000, gammas[i])
#         print(loss)
#         y_pred[i] = predict_log(w,x_e)
#         id[i] = y_e

#     y_predict = np.concatenate((y_pred[0],y_pred[1],y_pred[2],y_pred[3]),axis=0)
#     ids_test = np.concatenate((id[0],id[1],id[2],id[3]),axis=0)

#     print(len(y_predict[y_predict == 1]))
#     print(len(y_predict[y_predict == -1]))
#     print(len(y_predict))

#     #10 1e-6 = 0.269576
#     #5  1e-6 = 0.045 - 0.3248 - 0.5362 - 0.2657
#     #change lambdas  and gammas for each jet

# Prediction ridge regb

In [None]:
id = {}
y_pred_tr = {}
y_pred = {}
# lambdas = [0,0,-2000,0]
lambdas = [1e-5, 1e-4, 1e-4, 1e-3]
gammas = [0.5e-7, 0.5e-07, 0.5e-06, 0.5e-6]
print(best_results[0]["degree"])
degrees = [
    best_results[0]["degree"],
    best_results[1]["degree"],
    best_results[2]["degree"],
    best_results[3]["degree"],
]

for i in range(0, 4):
    xtr = build_poly(x_tr.copy(), degrees[i])
    xte = build_poly(x_te.copy(), degrees[i])

    y_r, x_r = get_split_by_jet_data(y_tr, xtr, i)
    y_e, x_e = get_split_by_jet_data(ids_tests, xte, i)

    # w, loss = reg_logistic_regression(y_r, x_r, lambdas[i], w_init, 1000, gammas[i])
    w, loss = ridge_regression(y_r, x_r, lambda_=lambdas[i])
    y_pred_tr[i] = predict_reg(w, x_r)
    y_pred[i] = predict_reg(w, x_e)
    id[i] = y_e

    print(i, "accuracy", accuracy(y_r, y_pred_tr[i]))
    print(i, "f1", f1_score(y_r, y_pred_tr[i]))


y_predict_tr = np.concatenate(
    (y_pred_tr[0], y_pred_tr[1], y_pred_tr[2], y_pred_tr[3]), axis=0
)

y_predict = np.concatenate((y_pred[0], y_pred[1], y_pred[2], y_pred[3]), axis=0)
ids_test = np.concatenate((id[0], id[1], id[2], id[3]), axis=0)

print(len(y_predict[y_predict == 1]))
print(len(y_predict[y_predict == -1]))
print(len(y_predict))

print("=====================================")
print("Global scores")
print("accuracy", accuracy(y_tr, y_predict_tr))
print("f1", f1_score(y_tr, y_predict_tr))


create_csv_submission(ids_test, y_predict, RESULT_FOLDER + RESULT_FILE)

In [None]:
# Run on the train set
x_pred = build_poly(x_tr, 3)
w, loss = ridge_regression(y_tr, x_pred, lambda_=1e-3)
y_pred = predict_reg(w, x_pred)

print("Training accuracy : {}".format(accuracy(y_tr, y_pred)))
print("Training f1 : {}".format(f1_score(y_tr, y_pred)))
print(loss)

# Run on the test set
x_pred = build_poly(x_te, 3)
y_pred = predict_reg(w, x_pred)

y_test, input_test, ids_test = load_csv_data(DATA_FOLDER + TEST_FILE, False)

create_csv_submission(ids_test, y_pred, RESULT_FOLDER + RESULT_FILE)

In [None]:
# w_init, _ = least_squares(y_tr, xt_tr)

# gamma_min = 2e-5
# gamma_max = 35e-5
# lambda_min = 0.0
# lambda_max = 2000
# lambdas = np.linspace(lambda_min, lambda_max, num_intervals_l)
# gammas = np.linspace(gamma_min, gamma_max, num_intervals_g)

# mean_loss_tr, mean_loss_te = run_cross_validation(y_tr, xt_tr, 2, w_init, 5, 2e-5,35e-5,0,2000,10,10)
# print("Mean training mse: {}".format(mean_loss_tr))
# print("Mean test mse: {}".format(mean_loss_te))

# 2000  1e-05   = 0.69310
# 722   1e-4    = 0.69307
# 444   15e-05  = 0.69285
# 222   25e-05  = 0.69265
# 222   35e-05  = 0.69242