# Predictions
This is used to run the whole pipeline of preprocessing and make predictions.  
It doesn't contain any graphs or analysis tool in order to speed up a full run of the notebook.  

## Imports and global variables

In [1]:
import math
import numpy as np
import matplotlib.pyplot as plt
import sys
import csv
import sys

# import self-defined modules
sys.path.append('../')
from implementations import *
from helpers import *

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
DATA_FOLDER = "../data/"
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"
NAN_VALUE = -999.0
INTEGER_COLUMN = (
    22  # 24 in raw csv file, but 23 when id and prediction column are removed
)

# For debug purpose only
SUB_SAMPLE = True

# Import dataset

In [3]:
# Load data
y_tr,x_tr,_ = load_csv_data(
    DATA_FOLDER + TRAIN_FILE, sub_sample=SUB_SAMPLE
) 
y_te, x_te,ids_tests = load_csv_data(DATA_FOLDER + TEST_FILE, sub_sample=SUB_SAMPLE)
print("x_tr shape : {}, y_tr shape : {}".format(x_tr.shape, y_tr.shape))
print("x_te shape : {}, y_te shape : {}".format(x_te.shape, y_te.shape))

# Define missing values as NAN
x_tr[x_tr == NAN_VALUE] = np.nan
x_te[x_te == NAN_VALUE] = np.nan

# Get columns names
col_names = []
with open(DATA_FOLDER + TRAIN_FILE) as dataset:
    col_names = dataset.readline().split(",")

x_tr shape : (5000, 30), y_tr shape : (5000,)
x_te shape : (11365, 30), y_te shape : (11365,)


Initial range of values

In [4]:
print("x_tr range :{} {}".format(np.nanmin(x_tr), np.nanmax(x_tr)))
print("x_te range :{} {}".format(np.nanmin(x_te), np.nanmax(x_te)))

x_tr range :-14.127 3056.908
x_te range :-16.408 3145.674


## Pre-processing

In [5]:
# before pre-processing
print("Before pre-processing:")
print("x_tr shape : {}".format(x_tr.shape))
print("x_te shape : {}".format(x_te.shape))
print("x_tr range :{} {}".format(np.nanmin(x_tr), np.nanmax(x_tr)))
print("x_te range :{} {}".format(np.nanmin(x_te), np.nanmax(x_te)))

Before pre-processing:
x_tr shape : (5000, 30)
x_te shape : (11365, 30)
x_tr range :-14.127 3056.908
x_te range :-16.408 3145.674


In [6]:
# apply log transformation
cols_to_log_transform = ['DER_pt_h', 'DER_pt_tot', 'PRI_met', 'PRI_met_sumet']
cols_idx = [get_col_idx(col, col_names) for col in cols_to_log_transform]

x_tr, x_te = log_transform(x_tr, x_te, cols_idx)

# Remove columns with too much NAN
x_tr = remove_nan_columns(x_tr, 0.5)
x_te = remove_nan_columns(x_te, 0.5)

# Replace missing data by the mean
# mean_x = np.nanmean(x_tr, axis=0)
mean_x = np.nanvar(x_tr, axis=0)
x_tr = replace_nan_by_means(x_tr, mean_data=mean_x)
x_te = replace_nan_by_means(x_te, mean_data=mean_x)

# build poly
# x_tr = build_poly(x_tr, 3)
# x_te = build_poly(x_te, 3)

assert(x_tr[np.isnan(x_tr)].shape[0] == 0)
assert(x_te[np.isnan(x_te)].shape[0] == 0)

# Standardize after replacing missing values
IDs_degrees = np.array([10,13,15])
x_tr = transform(x_tr,IDs_degrees)
x_te = transform(x_te,IDs_degrees)

In [7]:
# plot features after pre-processing
print("After pre-processing:")
print("x_tr shape : {}".format(x_tr.shape))
print("x_te shape : {}".format(x_te.shape))
print("x_tr range :{} {}".format(np.min(x_tr), np.max(x_tr)))
print("x_te range :{} {}".format(np.min(x_te), np.max(x_te)))

After pre-processing:
x_tr shape : (5000, 60)
x_te shape : (11365, 60)
x_tr range :-2.053142668759995 3.0
x_te range :-2.0568148842470846 3.0


## Analyse test set :

In [9]:
# ---

## Model fitting

In [10]:
# Use downsampling

# Add offset term to x
xt_tr = add_offset(x_tr)
xt_te = add_offset(x_te)

# Fit a model
w, loss_tr = least_squares(y_tr, x_tr)

print("Training loss : {}".format(loss_tr))

Training loss : 0.3185717696346312


# Model fitting and Predictions

In [11]:
# w_init = np.zeros((xt_tr.shape[1], 1))
# logistic_regression_penalized_gradient_descent_demo(y_tr, xt_tr, w_init, 10000, 0.0005, 0.5)

## Cross-validation

In [12]:
from implementations import *
from helpers import *

# w_init, _ = least_squares(y_tr, xt_tr)
# mean_loss_tr, mean_loss_te = run_cross_validation(y_tr, xt_tr, 2, w_init, 5, 2e-5,35e-5,0,2000,10,10)
# print("Mean training mse: {}".format(mean_loss_tr))
# print("Mean test mse: {}".format(mean_loss_te))

# 2000  1e-05   = 0.69310
# 722   1e-4    = 0.69307
# 444   15e-05  = 0.69285
# 222   25e-05  = 0.69265
# 222   35e-05  = 0.69242

## Predictions

### Predictions with regression and threshold

In [13]:
# Make predictions from model's weight and set treshold for signal & background
# y_predict = x_te@w
# prediction_threshold = y_te.mean()

In [14]:
# prediction_threshold = 0
# print(y_predict.shape)
# for i in range(len(y_predict)) :
#     if y_predict[i] <= prediction_threshold :
#         y_predict[i] = 1
#     else :
#         y_predict[i] = -1
# print(y_predict.shape)
# y_test, input_test, ids_test = load_csv_data('./data/test.csv',False)
# create_csv_submission(ids_test,y_predict,"prediction test least square + feature engineering")

In [15]:
print(y_tr.shape)
print(y_tr[y_tr==1].shape, y_te.shape)
print(x_tr.shape)

(5000,)
(1643,) (11365,)
(5000, 60)


### Predictions with log_reg

In [16]:
# logistic_regression(y, tx, initial_w, max_iters, gamma)
# Fit the model
for i in range(1, 5):

    print("build poly {}", i)
    xtr = x_tr.copy()
    xte = x_te.copy()

    xtr = build_poly(xtr, 3)
    xte = build_poly(xte, 3)

    assert(xtr[np.isnan(xtr)].shape[0] == 0)
    assert(xte[np.isnan(xte)].shape[0] == 0)

    # Standardize after replacing missing values
    IDs_degrees = np.array([10,13,15])
    xtr = transform(xtr,IDs_degrees)
    xte = transform(xte,IDs_degrees)

    xt_tr = add_offset(xtr)
    xt_te = add_offset(xte)

    #
    # =====
    #

    id = {}
    y_pred = {}
    # lambdas = [0,0,-2000,0]
    lambdas = [0,0,0,0]
    gammas = [0.5e-7,0.5e-07,0.5e-06,0.5e-6]
    for i in range(0,4):
        y_r,x_r = get_split_by_jet_data(y_tr,xt_tr,i)
        y_e,x_e = get_split_by_jet_data(ids_tests,xt_te,i)
        w_init = np.linalg.lstsq(x_r.T @ x_r, x_r.T @ y_r,rcond=None)[0]
        w, loss = reg_logistic_regression(y_r, x_r, lambdas[i], w_init, 1000, gammas[i])
        print(loss)
        y_pred[i] = predict(w,x_e)
        id[i] = y_e

    y_predict = np.concatenate((y_pred[0],y_pred[1],y_pred[2],y_pred[3]),axis=0)
    ids_test = np.concatenate((id[0],id[1],id[2],id[3]),axis=0)

    print(len(y_predict[y_predict == 1]))
    print(len(y_predict[y_predict == -1]))
    print(len(y_predict))

    #10 1e-6 = 0.269576
    #5  1e-6 = 0.045 - 0.3248 - 0.5362 - 0.2657
    #change lambdas  and gammas for each jet



build poly {} 1
-0.1202816180764591
0.14184742408367937


  return 1.0 / (1.0 + np.exp(-t))


0.31424986654596604
-0.11985578073404303
5681
5684
11365
build poly {} 2
-0.1202816180764591
0.14184742408367937
0.31424986654596604
-0.11985578073404303
5681
5684
11365
build poly {} 3
-0.1202816180764591
0.14184742408367937
0.31424986654596604
-0.11985578073404303
5681
5684
11365
build poly {} 4
-0.1202816180764591
0.14184742408367937
0.31424986654596604
-0.11985578073404303
5681
5684
11365


In [17]:
#y_test, input_test, ids_test = load_csv_data("./data/test.csv", False)
# create_csv_submission(
#     ids_test, y_predict, "prediction test least square + feature engineering"
# )

### Balance dataset

In [None]:
# nb_s = len(y_tr[y_tr == 1])
# nb_b = len(y_tr) - nb_s
# print("Signals: {} ({}%)".format(nb_s, 100 * nb_s / len(x_tr)))
# print("Backgrounds: {} ({}%)".format(nb_b, 100 * nb_b / len(x_tr)))
# print("Ratio signal / background: {}".format(nb_s / nb_b))

# # x_tr_ds, y_tr_ds = balance_dataset(x_tr, y_tr) TODO