In [291]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import random
from datetime import datetime
from helpers import *
from costs import *
from gradient_descent import *
from features_engineering import *
from cross_validation import *
from pre_processing import *
from group_by import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Loading the dataset

In [292]:
# Load the full training file (sub_sample=False); the result is unpacked as
# labels `y`, feature matrix `x` and row identifiers `ids`.
y, x, ids = load_csv_data(data_path="datas/train.csv", sub_sample=False)

In [293]:
# Load the test file used for the submission, with the same loader as the training set.
# NOTE(review): `submission_y` and `submission_ids` are rebound to empty lists by the
# model-building cell further down, so this cell has to be re-run before
# `group_by_jetnum_NaN` is called on the submission data again.
submission_y, submission_x, submission_ids = load_csv_data(data_path="datas/test.csv", sub_sample=False)

# Create sub-datasets
- Group by numjet column (categorical data : (0, 1, 2, 3))
- Group by the NaN columns

We end up with 8 datasets: one per numjet value, each of them split in two according to which NaN columns were removed.

In [294]:
# Split the submission set into sub-datasets.
# Each returned dict is indexed first by jet number and then by the tuple of
# removed column indices (this is how the model-building loop reads them).
sub_jet_num_x_dict, sub_jet_num_y_dict, sub_jet_num_ids_dict = group_by_jetnum_NaN(submission_x, submission_y, submission_ids)

0 (227458, 30) (227458,) (227458,)
1 (175338, 30) (175338,) (175338,)
2 (114648, 30) (114648,) (114648,)
3 (50794, 30) (50794,) (50794,)
num_jet: 0
(0, 4, 5, 6, 12, 23, 24, 25, 26, 27, 28) (59263, 19) (59263, 1) (59263, 1)
(4, 5, 6, 12, 23, 24, 25, 26, 27, 28) (168195, 20) (168195, 1) (168195, 1)
	Remove col : 
	 (0, 4, 5, 6, 12, 23, 24, 25, 26, 27, 28) 17 [ 0.  0.  0. ...,  0.  0.  0.]
	 (0, 4, 5, 6, 12, 23, 24, 25, 26, 27, 28) 18 [ 0.  0.  0. ...,  0.  0.  0.]
	 (4, 5, 6, 12, 23, 24, 25, 26, 27, 28) 18 [ 0.  0.  0. ...,  0.  0.  0.]
	 (4, 5, 6, 12, 23, 24, 25, 26, 27, 28) 19 [ 0.  0.  0. ...,  0.  0.  0.]
num_jet: 1
(4, 5, 6, 12, 26, 27, 28) (158095, 23) (158095, 1) (158095, 1)
(0, 4, 5, 6, 12, 26, 27, 28) (17243, 22) (17243, 1) (17243, 1)
	Remove col : 
	 (4, 5, 6, 12, 26, 27, 28) 18 [ 1.  1.  1. ...,  1.  1.  1.]
	 (0, 4, 5, 6, 12, 26, 27, 28) 17 [ 1.  1.  1. ...,  1.  1.  1.]
num_jet: 2
() (107905, 30) (107905, 1) (107905, 1)
(0,) (6743, 29) (6743, 1) (6743, 1)
	Remove col : 
	 ()

In [295]:
# Split the training set into sub-datasets.
# Each returned dict is indexed first by jet number and then by the tuple of
# removed column indices (this is how get_data_numjet reads them).
jet_num_x_dict, jet_num_y_dict, jet_num_ids_dict = group_by_jetnum_NaN(x, y, ids)

0 (99913, 30) (99913,) (99913,)
1 (77544, 30) (77544,) (77544,)
2 (50379, 30) (50379,) (50379,)
3 (22164, 30) (22164,) (22164,)
num_jet: 0
(4, 5, 6, 12, 23, 24, 25, 26, 27, 28) (73790, 20) (73790, 1) (73790, 1)
(0, 4, 5, 6, 12, 23, 24, 25, 26, 27, 28) (26123, 19) (26123, 1) (26123, 1)
	Remove col : 
	 (4, 5, 6, 12, 23, 24, 25, 26, 27, 28) 18 [ 0.  0.  0. ...,  0.  0.  0.]
	 (4, 5, 6, 12, 23, 24, 25, 26, 27, 28) 19 [ 0.  0.  0. ...,  0.  0.  0.]
	 (0, 4, 5, 6, 12, 23, 24, 25, 26, 27, 28) 17 [ 0.  0.  0. ...,  0.  0.  0.]
	 (0, 4, 5, 6, 12, 23, 24, 25, 26, 27, 28) 18 [ 0.  0.  0. ...,  0.  0.  0.]
num_jet: 1
(4, 5, 6, 12, 26, 27, 28) (69982, 23) (69982, 1) (69982, 1)
(0, 4, 5, 6, 12, 26, 27, 28) (7562, 22) (7562, 1) (7562, 1)
	Remove col : 
	 (4, 5, 6, 12, 26, 27, 28) 18 [ 1.  1.  1. ...,  1.  1.  1.]
	 (0, 4, 5, 6, 12, 26, 27, 28) 17 [ 1.  1.  1. ...,  1.  1.  1.]
num_jet: 2
() (47427, 30) (47427, 1) (47427, 1)
(0,) (2952, 29) (2952, 1) (2952, 1)
	Remove col : 
	 () 22 [ 2.  2.  2. ...,

# Build the best model for each sub-dataset

In [304]:
def get_false(x, y, w, predict_threshold):
    """Return the share of wrong predictions that were predicted as -1.

    Args:
        x: feature matrix passed to `predict_labels`.
        y: true labels, one per row of `x`.
        w: model weights passed to `predict_labels`.
        predict_threshold: decision threshold passed to `predict_labels`.

    Returns:
        count(wrong predictions equal to -1) / count(wrong predictions) as a
        float, or 0.0 when every prediction is correct (the original code
        raised ZeroDivisionError in that case).
    """
    # Flatten both sides so that (N,) and (N, 1) inputs are compared element by
    # element instead of being broadcast against each other.
    pred_y = np.asarray(predict_labels(w, x, predict_threshold)).ravel()
    true_y = np.asarray(y).ravel()

    # Mask of the samples that were classified wrongly
    wrong = pred_y != true_y
    false_count = int(np.count_nonzero(wrong))
    if false_count == 0:
        # Nothing was misclassified, so there is no ratio to compute
        return 0.0

    # Among the wrong predictions, count the ones predicted as negative
    count_negatif = int(np.count_nonzero(wrong & (pred_y == -1)))

    # Fraction of the wrong predictions that are due to a negative prediction
    return count_negatif / false_count



def get_data_numjet(jet_num_x_dict, jet_num_y_dict, jet_num_ids_dict, numjet, index):
    """Return (x, y, ids) of one sub-dataset.

    The sub-dataset is selected by its `numjet` entry and by the position
    `index` of its removed-columns key in the iteration order of
    `jet_num_x_dict[numjet]`.
    """
    # Keys of the group (the removed-columns identifiers), in dict order
    group_keys = [*jet_num_x_dict[numjet]]
    selected_key = group_keys[index]
    # Read the same (numjet, key) entry from each of the three dictionaries
    return tuple(
        source[numjet][selected_key]
        for source in (jet_num_x_dict, jet_num_y_dict, jet_num_ids_dict)
    )

def build_features(x, numjet, index):
    """
    Build the feature matrix of one sub-dataset (selected by numjet and index).
    The recipe used for each sub-dataset was chosen by trial and error to improve the loss.
    1. Normalize data
    2. Build polynomial / combination features
    """
    # numjet 0 / index 1 is the only group using hand-written transforms
    if numjet == 0 and index == 1:
        x_normalized = normalize(x)
        return np.concatenate(
            (
                x_normalized,
                np.tanh(x_normalized),
                np.sqrt(np.abs(x_normalized)),
                np.power(x_normalized, 2),
                np.power(np.tanh(x_normalized), 2),
                np.power(np.log(np.abs(x_normalized)), 2),
            ),
            axis=1,
        )

    # (polynomial degree, highest combination level, last argument of build_combinations_lvl)
    # NOTE(review): the last value is presumably the number of columns that get
    # combined (the logs show 28 = C(8, 2) products for 8) - confirm in features_engineering.
    recipes = {
        (0, 0): (3, 7, 8),
        (1, 1): (3, 5, 10),
        (2, 1): (2, 5, 10),
        (3, 0): (5, 6, 8),
        (3, 1): (6, 4, 10),
    }
    # Every group without a dedicated recipe uses the default one
    degree, highest_level, combination_arg = recipes.get((numjet, index), (3, 4, 10))

    features = build_polynomial(normalize(x), degree)
    # Combination levels are applied one after the other, starting at level 2
    for level in range(2, highest_level + 1):
        features = build_combinations_lvl(features, level, combination_arg)

    return features

def build_best_model(x_, y_, numjet, index):
    """
    Fit the model of one sub-dataset with its tuned parameters.

    Returns the ridge regression weights, the prediction threshold that was
    used and the train / test accuracies given by k-fold cross validation.
    """
    # Number of folds of the cross validation
    n_folds = 5

    # Tuned (lambda_, predict_threshold) for each (numjet, index) sub-dataset
    tuned_parameters = {
        (0, 0): (4.52035365636e-07, 0),
        (0, 1): (1e-08, 0),
        (1, 1): (0.137382379588, 0),
        (2, 1): (0.0188739182214, 0),
        (3, 0): (7.27895384398e-05, 0),
        (3, 1): (0.5, -0.1),
    }
    # Sub-datasets without tuned values use the default lambda and threshold 0
    lambda_, predict_threshold = tuned_parameters.get((numjet, index), (0.000001, 0))

    # Feature matrix of this sub-dataset
    polynomial_x = build_features(x_, numjet, index)

    # Train / test accuracy estimated with k-fold cross validation
    accuracy_train_k, accuracy_test_k = k_fold_cross_validation(
        y_, polynomial_x, n_folds, lambda_, predict_threshold
    )
    # Weights fitted on the whole sub-dataset (the returned loss is not used)
    w, _ = ridge_regression(y_, polynomial_x, lambda_)

    print("\t Predicted -1 but was 1 :", get_false(polynomial_x, y_, w, predict_threshold))

    return w, predict_threshold, accuracy_train_k, accuracy_test_k

    

In [305]:
"""
Initialize variables to submit data, this includes the id.
It is important as the data will be separated depending on its features and category
"""
# Total number of training samples seen so far
count = 0

# Accuracies summed over the sub-datasets, weighted by their number of samples
accuracy_train = 0
accuracy_test = 0

# Ids / predicted labels of the submission set
# (these two names held the loaded submission data before this cell)
submission_ids = []
submission_y = []

# Ids / predicted labels of the training set
result_y = []
result_ids = []

# One model per PRI_num_jet category and per NaN pattern
for numjet in range(0, 4):
    for index in range(0, 2):
        # Training data of this sub-dataset
        x_, y_, ids_ = get_data_numjet(jet_num_x_dict, jet_num_y_dict, jet_num_ids_dict, numjet, index)

        # Fit the model and get its cross-validated accuracies
        w, predict_threshold, accuracy_train_k, accuracy_test_k = build_best_model(x_, y_, numjet, index)

        # Weight the accuracies by the size of the sub-dataset
        number_of_el = len(y_)
        count += number_of_el
        accuracy_train += accuracy_train_k * number_of_el
        accuracy_test += accuracy_test_k * number_of_el

        print(numjet, index, "Train Accuracy: " + str(accuracy_train_k))
        print(numjet, index, "Test Accuracy: " + str(accuracy_test_k))

        # Key shared by the training and the submission dictionaries
        removed_col_key = list(jet_num_x_dict[numjet])[index]

        # Predict local: x_ / ids_ are the entries stored under that key
        sub_ids2 = ids_
        sub_x2 = build_features(x_, numjet, index)
        pred_y2 = predict_labels(w, sub_x2, predict_threshold)
        result_ids.extend(sub_ids2)
        result_y.extend(pred_y2[position] for position in range(len(sub_ids2)))

        # Predict submission: same key and same feature recipe as the training data
        sub_ids = sub_jet_num_ids_dict[numjet][removed_col_key]
        sub_x = build_features(sub_jet_num_x_dict[numjet][removed_col_key], numjet, index)
        pred_y = predict_labels(w, sub_x, predict_threshold)
        submission_ids.extend(sub_ids)
        submission_y.extend(pred_y[position] for position in range(len(sub_ids)))

print("Count:", count)
print("Train Accuracy: " + str(accuracy_train / count))
print("Test Accuracy: " + str(accuracy_test / count))

2017-10-25 11:01:56.578356 combinations 2 : 0 / 28
2017-10-25 11:01:59.112191 combinations 3 : 0 / 56
2017-10-25 11:02:05.180187 combinations 3 : 50 / 56
2017-10-25 11:02:05.781188 combinations 4 : 0 / 70
2017-10-25 11:02:11.693183 combinations 4 : 50 / 70
2017-10-25 11:02:14.519223 combinations 5 : 0 / 56
2017-10-25 11:02:23.057418 combinations 5 : 50 / 56
2017-10-25 11:02:24.276936 combinations 6 : 0 / 28
2017-10-25 11:02:28.881935 combinations 7 : 0 / 8
	 Predicted -1 but was 1 : 0.634313005143277
0 0 Train Accuracy: 0.815696571351
0 0 Test Accuracy: 0.814337986177
2017-10-25 11:02:50.782114 combinations 2 : 0 / 28
2017-10-25 11:02:53.140114 combinations 3 : 0 / 56
2017-10-25 11:02:57.876115 combinations 3 : 50 / 56
2017-10-25 11:02:58.506076 combinations 4 : 0 / 70
2017-10-25 11:03:04.528078 combinations 4 : 50 / 70
2017-10-25 11:03:08.123117 combinations 5 : 0 / 56
2017-10-25 11:03:16.048074 combinations 5 : 50 / 56
2017-10-25 11:03:16.904114 combinations 6 : 0 / 28
2017-10-25 11:

2017-10-25 11:15:30.541090 combinations 3 : 50 / 120
2017-10-25 11:15:30.820087 combinations 3 : 100 / 120
2017-10-25 11:15:30.944623 combinations 4 : 0 / 210
2017-10-25 11:15:31.268090 combinations 4 : 50 / 210
2017-10-25 11:15:31.613088 combinations 4 : 100 / 210
2017-10-25 11:15:32.057627 combinations 4 : 150 / 210
2017-10-25 11:15:32.468589 combinations 4 : 200 / 210
2017-10-25 11:15:32.576092 combinations 5 : 0 / 252
2017-10-25 11:15:33.176593 combinations 5 : 50 / 252
2017-10-25 11:15:33.818243 combinations 5 : 100 / 252
2017-10-25 11:15:34.535238 combinations 5 : 150 / 252
2017-10-25 11:15:35.345246 combinations 5 : 200 / 252
2017-10-25 11:15:36.056238 combinations 5 : 250 / 252
2017-10-25 11:15:36.242743 combinations 2 : 0 / 45
2017-10-25 11:15:36.663241 combinations 3 : 0 / 120
2017-10-25 11:15:37.156738 combinations 3 : 50 / 120
2017-10-25 11:15:37.840861 combinations 3 : 100 / 120
2017-10-25 11:15:38.093364 combinations 4 : 0 / 210
2017-10-25 11:15:39.231868 combinations 4 :

In [306]:
# Get total accuracy in our train set
def get_accuracy_ids(result_y, result_ids, y, ids):
    """Return the fraction of predictions that match the true labels.

    Both (id, label) collections are sorted by id before being compared, so
    the predictions may be given in any order.

    Args:
        result_y: predicted labels.
        result_ids: ids of the predicted labels, in the same order as `result_y`.
        y: true labels.
        ids: ids of the true labels, in the same order as `y`.

    Returns:
        Number of matching labels divided by `len(y)`, as a float.

    Raises:
        ValueError: if the number of predictions differs from the number of labels.
    """
    # Sort the true labels by id
    stacked = np.column_stack((ids, y))
    stacked = stacked[stacked[:, 0].argsort()]
    # Sort the predicted labels by id
    stacked_pred = np.column_stack((result_ids, result_y))
    stacked_pred = stacked_pred[stacked_pred[:, 0].argsort()]

    print(len(stacked_pred), len(stacked))
    if len(stacked_pred) != len(stacked):
        raise ValueError("predictions and labels do not have the same length")

    # Counting the matches directly (instead of looking up `True` in a dict of
    # unique values) also works when no prediction is correct.
    matches = stacked[:, 1] == stacked_pred[:, 1]
    return np.count_nonzero(matches) / len(y)

# Accuracy of the predictions collected for the training set by the model-building loop
get_accuracy_ids(result_y, result_ids, y, ids)

250000 250000


0.84059200000000001

In [307]:
# Write the submission csv file, with its rows ordered by id
submission_stacked = np.column_stack((submission_ids, submission_y))
submission_stacked = submission_stacked[np.argsort(submission_stacked[:, 0])]
create_csv_submission(
    submission_stacked[:, 0],
    submission_stacked[:, 1],
    "datas/submission.csv",
)
print('Submission file created !')

Submission file created !


# Find the best model for a specific classifier

In [297]:
# Number of folds given to k_fold_cross_validation in the tuning cells below
k = 5

# Manually choose the categories
# (here numjet=0 and index=1; change the last two arguments to tune another sub-dataset)
x_, y_, ids_ = get_data_numjet(jet_num_x_dict, jet_num_y_dict, jet_num_ids_dict, 0, 1)

def build_polynomial2(x, max_degree):
    """Extend `x` with tanh, ln|x|, sqrt|x| and power features.

    The columns of the result are, in this order: x, tanh(x), ln|x|, sqrt|x|,
    followed for each degree d from 2 to `max_degree` by x^d, tanh(x)^d and
    (ln|x|)^d.
    """
    tanh_x = np.tanh(x)
    log_abs_x = np.log(np.abs(x))

    # First order features
    column_blocks = [x, tanh_x, log_abs_x, np.sqrt(np.abs(x))]

    # Powers of x, tanh(x) and ln|x| for every degree up to max_degree
    for degree in range(2, max_degree + 1):
        column_blocks.extend(
            (
                np.power(x, degree),
                np.power(tanh_x, degree),
                np.power(log_abs_x, degree),
            )
        )

    return np.concatenate(column_blocks, axis=1)

# Build the feature matrix of the selected sub-dataset:
# x, tanh(x), sqrt|x|, x^2, tanh(x)^2 and (ln|x|)^2 of the normalized data
x_numjet0_index1 = normalize(x_)
polynomial_x = np.concatenate(
    (
        x_numjet0_index1,
        np.tanh(x_numjet0_index1),
        np.sqrt(np.abs(x_numjet0_index1)),
        np.power(x_numjet0_index1, 2),
        np.power(np.tanh(x_numjet0_index1), 2),
        np.power(np.log(np.abs(x_numjet0_index1)), 2),
    ),
    axis=1,
)

# Threshold used while searching for the best lambda
predict_threshold = -0.00

# 4.52035365636e-07 0.813863667164

In [298]:
# Tune one specific classifier: try every lambda of a logarithmic grid and
# keep the one with the highest cross-validated test accuracy
lambdas = np.logspace(-8, 0, 30)
best_accuracy, best_lambda = 0, 0
for lambda_ in lambdas:
    accuracy_train_k, accuracy_test_k = k_fold_cross_validation(y_, polynomial_x, k, lambda_, predict_threshold)
    print("Lambdas:", lambda_, "Train:", accuracy_train_k, " Test:", accuracy_test_k)

    # Only a strictly better test accuracy replaces the current best
    if best_accuracy < accuracy_test_k:
        best_accuracy, best_lambda = accuracy_test_k, lambda_

# Fit on the whole sub-dataset with the best lambda and show the error split
w, loss = ridge_regression(y_, polynomial_x, best_lambda)
print("\t Predicted -1 but was 1 :", get_false(polynomial_x, y_, w, predict_threshold))

print("BEST:", best_lambda, best_accuracy)
# The threshold search below reads lambda_
lambda_ = best_lambda


Lambdas: 1e-08 Train: 0.95047856049  Test: 0.949961715161
Lambdas: 1.88739182214e-08 Train: 0.95047856049  Test: 0.949961715161
Lambdas: 3.56224789026e-08 Train: 0.95047856049  Test: 0.949961715161
Lambdas: 6.7233575365e-08 Train: 0.95047856049  Test: 0.949961715161
Lambdas: 1.26896100317e-07 Train: 0.95047856049  Test: 0.949961715161
Lambdas: 2.39502661999e-07 Train: 0.95047856049  Test: 0.949961715161
Lambdas: 4.52035365636e-07 Train: 0.95047856049  Test: 0.949961715161
Lambdas: 8.53167852417e-07 Train: 0.95047856049  Test: 0.949961715161
Lambdas: 1.61026202756e-06 Train: 0.9504881317  Test: 0.949961715161
Lambdas: 3.03919538231e-06 Train: 0.9504881317  Test: 0.949961715161
Lambdas: 5.73615251045e-06 Train: 0.9504881317  Test: 0.949961715161
Lambdas: 1.08263673387e-05 Train: 0.9504881317  Test: 0.949923430322
Lambdas: 2.04335971786e-05 Train: 0.95047856049  Test: 0.949923430322
Lambdas: 3.85662042116e-05 Train: 0.950449846861  Test: 0.949885145482
Lambdas: 7.27895384398e-05 Train: 0.

In [299]:
# Find best threshold: try every threshold of a linear grid and keep the one
# with the highest cross-validated test accuracy
threshs = np.linspace(0, -0.4, num=100)
best_accuracy = 0
best_thresh = 0
for thresh in threshs:
    accuracy_train_k, accuracy_test_k = k_fold_cross_validation(y_, polynomial_x, k, lambda_, thresh)

    if accuracy_test_k > best_accuracy:
        best_accuracy = accuracy_test_k
        best_thresh = thresh
    print("Thresh:", thresh, "Train:", accuracy_train_k, " Test:", accuracy_test_k)

# Keep the best threshold, not the last one of the grid: the loop used to leave
# predict_threshold at -0.4, so the diagnostic below (and any cell run
# afterwards) silently used the wrong value.
predict_threshold = best_thresh

w, loss = ridge_regression(y_, polynomial_x, best_lambda)

print("\t Predicted -1 but was 1 :", get_false(polynomial_x, y_, w, predict_threshold))

print("BEST:", best_thresh, best_accuracy)

Thresh: 0.0 Train: 0.95047856049  Test: 0.949961715161
Thresh: -0.0040404040404 Train: 0.950545558959  Test: 0.949961715161
Thresh: -0.00808080808081 Train: 0.950574272588  Test: 0.950076569678
Thresh: -0.0121212121212 Train: 0.950574272588  Test: 0.950076569678
Thresh: -0.0161616161616 Train: 0.950612557427  Test: 0.950114854518
Thresh: -0.020202020202 Train: 0.950641271057  Test: 0.950153139357
Thresh: -0.0242424242424 Train: 0.950689127106  Test: 0.950191424196
Thresh: -0.0282828282828 Train: 0.950593415008  Test: 0.950153139357
Thresh: -0.0323232323232 Train: 0.950593415008  Test: 0.950267993874
Thresh: -0.0363636363636 Train: 0.950679555896  Test: 0.950191424196
Thresh: -0.040404040404 Train: 0.950727411945  Test: 0.950229709035
Thresh: -0.0444444444444 Train: 0.950727411945  Test: 0.950191424196
Thresh: -0.0484848484848 Train: 0.950765696784  Test: 0.950191424196
Thresh: -0.0525252525253 Train: 0.950775267994  Test: 0.950191424196
Thresh: -0.0565656565657 Train: 0.950784839204  T