In [None]:
import numpy as np
import random
from datetime import datetime
from helpers import *
from costs import *
from gradient_descent import *
from stochastic_gradient_descent import *
from features_engineering import *
from cross_validation import *
from pre_processing import *
from group_by import *

# Seed Python's built-in `random` module so runs are repeatable.
# NOTE(review): this does not seed NumPy's generator, which this notebook also
# uses (e.g. np.random.rand) — confirm whether np.random.seed is needed too.
random.seed(10)

In [None]:
# Load the full training file (sub_sample=False); the three values returned by
# the helper are bound as `y`, `x` and `ids`.
y, x, ids = load_csv_data(data_path="datas/train.csv", sub_sample=False)

In [None]:
# Load the full test file the same way; these `submission_*` names are rebound
# by later cells, so this cell must run before the grouping cell that reads them.
submission_y, submission_x, submission_ids = load_csv_data(data_path="datas/test.csv", sub_sample=False)

In [None]:
# Group the test data with group_by_jetnum_NaN. The three results are used
# below as nested mappings indexed as dict[numjet][key].
sub_jet_num_x_dict, sub_jet_num_y_dict, sub_jet_num_ids_dict = group_by_jetnum_NaN(submission_x, submission_y, submission_ids)

In [None]:
# Group the training data with group_by_jetnum_NaN. The three results are used
# below as nested mappings indexed as dict[numjet][key].
jet_num_x_dict, jet_num_y_dict, jet_num_ids_dict = group_by_jetnum_NaN(x, y, ids)

In [None]:
# Idea: the data may be unbalanced.
# Check which samples are misclassified.

# Need to test on two groups of data.


In [None]:
def get_false(x, y, w):
    """Return the share of misclassified samples that were predicted as -1.

    Labels are predicted with ``predict_labels(w, x)`` and compared
    pairwise with ``y``.

    Args:
        x: features passed to ``predict_labels``.
        y: true labels, in the same order as the predictions.
        w: model weights passed to ``predict_labels``.

    Returns:
        Among the wrong predictions, the fraction whose predicted label is -1.
        Returns 0.0 when no sample is misclassified (this case used to raise
        ``ZeroDivisionError``).
    """
    pred_y = predict_labels(w, x)
    false_count = 0
    count_negatif = 0
    for pred_yi, yi in zip(pred_y, y):
        if pred_yi != yi:
            false_count += 1
            if pred_yi == -1:
                count_negatif += 1

    # A perfect prediction has no errors to break down; avoid dividing by zero.
    if false_count == 0:
        return 0.0
    return count_negatif / false_count



def get_data_numjet(jet_num_x_dict, jet_num_y_dict, jet_num_ids_dict, numjet, index):
    """Fetch the (x, y, ids) triple of one sub-group inside a ``numjet`` group.

    The sub-group is selected by position: ``index`` picks the ``index``-th key
    of ``jet_num_x_dict[numjet]`` (in iteration order), and that same key is
    then looked up in all three dictionaries.
    """
    selected_key = list(jet_num_x_dict[numjet])[index]
    return (
        jet_num_x_dict[numjet][selected_key],
        jet_num_y_dict[numjet][selected_key],
        jet_num_ids_dict[numjet][selected_key],
    )

def build_features(x):
    """Run the feature pipeline on ``x`` and return the expanded features.

    The steps are applied in order: ``normalize``, then ``build_polynomial``
    with argument 6, then ``build_combinations_lvl`` with arguments 2 and 10.
    """
    normalized = normalize(x)
    with_powers = build_polynomial(normalized, 6)
    return build_combinations_lvl(with_powers, 2, 10)


# Running totals used to report size-weighted accuracies at the end.
count = 0
accuracy_train = 0
accuracy_test = 0

# Predictions for the submission file. NOTE: this rebinds the `submission_ids`
# and `submission_y` names that were loaded from the test CSV in an earlier cell.
submission_ids = []
submission_y = []

# Predictions on the local (training) data, consumed by get_accuracy_ids.
result_y = []
result_ids = []

# Fit one ridge model per (numjet, index) group. Only index 0 of each numjet
# group is visited because of range(0, 1).
for numjet in range(0, 3):
    for index in range(0, 1):
        x_, y_, ids_ = get_data_numjet(jet_num_x_dict, jet_num_y_dict, jet_num_ids_dict, numjet, index)
        
        polynomial_x = build_features(x_)
        train_x, train_y, test_x, test_y = separate_set(polynomial_x, y_)

        # NOTE(review): the lambda search receives the test split as well; if it
        # selects lambda on that split, the test accuracy below is optimistic.
        lambda_ = find_best_ridge_lambda(train_y, train_x, test_x, test_y)
        w, loss = ridge_regression(train_y, train_x, lambda_)
        
        # Both accuracies are weighted by the full group size len(y_), not by
        # the size of the respective split; dividing by `count` at the end
        # turns the sums into weighted averages.
        number_of_el = len(y_)
        accuracy_train += get_accuracy(train_x, train_y, w) * number_of_el
        accuracy_test += get_accuracy(test_x, test_y, w) * number_of_el
        
        print("\t Predicted -1 but was 1 :", get_false(test_x, test_y, w))
        
        count += number_of_el
        
        # Predict local: rebuild the features of the whole group (same data as
        # x_ above) and store the predictions together with their ids.
        removed_col_key = list(jet_num_x_dict[numjet])[index]
        sub_x2 = jet_num_x_dict[numjet][removed_col_key]
        sub_ids2 = jet_num_ids_dict[numjet][removed_col_key]

        sub_x2 = build_features(sub_x2)
        pred_y2 = predict_labels(w, sub_x2)
        
        for sub_index, sub_id in enumerate(sub_ids2):
            result_ids.append(sub_id)
            result_y.append(pred_y2[sub_index])
        

        
        # Predict submission: the key is taken from the *training* dict and then
        # used to index the submission dicts.
        # NOTE(review): this assumes both groupings expose the same keys — confirm.
        removed_col_key = list(jet_num_x_dict[numjet])[index]
        sub_x = sub_jet_num_x_dict[numjet][removed_col_key]
        sub_ids = sub_jet_num_ids_dict[numjet][removed_col_key]

        # The submission group goes through build_features on its own.
        sub_x = build_features(sub_x)
        pred_y = predict_labels(w, sub_x)
        
        for sub_index, sub_id in enumerate(sub_ids):
            submission_ids.append(sub_id)
            submission_y.append(pred_y[sub_index])
        
# Summary over all visited groups.
print("Count:", count)
print("Train Accuracy: " + str(accuracy_train / count))
print("Test Accuracy: " + str(accuracy_test / count))

In [None]:
# Sanity check: how many submission ids have been collected.
len(submission_ids) 

In [None]:
# Reload the test file and print the difference between proportion_of_NaN
# on the test features (`x2`) and on the training features (`x`).
_, x2, _ = load_csv_data(data_path="datas/test.csv", sub_sample=False)
print(proportion_of_NaN(x2) - proportion_of_NaN(x))

In [None]:
# Display proportion_of_NaN for the training features.
proportion_of_NaN(x)

In [None]:
# Gradient descent starting from random weights.
# The initial weight vector needs one entry per column of the matrix that is
# actually trained on (`train_x`); sizing it from the raw `x` breaks whenever
# `train_x` holds engineered features with a different column count.
w_init = np.random.rand(train_x.shape[1])
w, loss = least_squares_GD(train_y, train_x, w_init, max_iters=100, gamma=0.3)

In [None]:
# Stochastic variant; reuses `w_init` from the gradient-descent cell and
# overwrites the global `w` and `loss`.
w, loss = least_squares_SGD(train_y, train_x, w_init, 100, gamma=0.01)

In [None]:
# Display get_accuracy on the current test split for whichever `w` was fitted last.
get_accuracy(test_x, test_y, w)

In [None]:
# Rebuild the submission features from the raw test file.
# NOTE(review): this rebinds `submission_ids`, replacing any list of ids that
# an earlier cell collected under the same name.
_, submission_x, submission_ids = load_csv_data(data_path="datas/test.csv", sub_sample=False)
submission_x = preprocess_data(submission_x[:, :15])  # Removed all the primary

# build_polynomial with argument 6, then build_combinations_lvl for levels 2 to 8.
submission_x = build_polynomial(submission_x, 6)
for combination_level in range(2, 9):
    submission_x = build_combinations_lvl(submission_x, combination_level)

pred_y = predict_labels(w, submission_x)

In [None]:
# Write the submission file from `submission_ids` / `submission_y`.
# NOTE(review): the `pred_y` computed in the preceding cell is not what gets
# written here. If that cell was run, `submission_ids` holds the ids reloaded
# from the file rather than the ones collected alongside `submission_y` —
# confirm the two sequences still line up.
create_csv_submission(submission_ids, submission_y, "datas/submission.csv")
print('Done !')


In [None]:
import itertools
import numpy as np
import matplotlib.pyplot as plt

# https://stackoverflow.com/a/7941594/4810319
def main():
    """Show a scatterplot matrix of the first seven columns of the global ``x``.

    Rows 0-299 are plotted; rows 0-199 of column 7 are printed beforehand.
    """
    np.random.seed(1977)
    n_vars, n_points = 5, 100
    # Leftover demo data from the original snippet: it is replaced right below,
    # but the seed and the draw are kept so the global NumPy RNG state is unchanged.
    data = 10 * np.random.random((n_vars, n_points))
    data = x[0:300, 0:7].T
    print(x[0:200, 7])

    feature_names = [
        'DER_mass_MMC',
        'DER_mass_transverse_met_lep',
        'DER_mass_vis',
        'DER_pt_h',
        'DER_deltaeta_jet_jet',
        'DER_mass_jet_jet',
        'DER_prodeta_jet_jet',
    ]
    plot_style = dict(linestyle='none', marker='o', color='black', mfc='none')
    fig = scatterplot_matrix(data, feature_names, **plot_style)
    fig.suptitle('Simple Scatterplot Matrix')
    plt.show()

def scatterplot_matrix(data, names, **kwargs):
    """Plots a scatterplot matrix of subplots.  Each row of "data" is plotted
    against other rows, resulting in a nrows by nrows grid of subplots with the
    diagonal subplots labeled with "names".  Additional keyword arguments are
    passed on to matplotlib's "plot" command. Returns the matplotlib figure
    object containing the subplot grid.

    Args:
        data: 2-D array; one row per variable, one column per observation.
        names: labels for the diagonal subplots, one per row of ``data``.
        **kwargs: forwarded unchanged to every ``Axes.plot`` call.

    Returns:
        The matplotlib figure holding the grid.
    """
    numvars, numdata = data.shape
    # squeeze=False keeps `axes` a 2-D array even when there is one variable.
    fig, axes = plt.subplots(nrows=numvars, ncols=numvars, figsize=(8, 8),
                             squeeze=False)
    fig.subplots_adjust(hspace=0.05, wspace=0.05)

    # Grid positions are taken from the array indices instead of the
    # Axes.is_first_col()/is_last_row() helpers, which no longer exist in
    # current Matplotlib releases.
    last = numvars - 1
    for (row, col), ax in np.ndenumerate(axes):
        # Hide all ticks and labels
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)

        # Set up ticks only on one side for the "edge" subplots...
        if col == 0:
            ax.yaxis.set_ticks_position('left')
        if col == last:
            ax.yaxis.set_ticks_position('right')
        if row == 0:
            ax.xaxis.set_ticks_position('top')
        if row == last:
            ax.xaxis.set_ticks_position('bottom')

    # Plot the data: each off-diagonal pair is drawn in both orientations.
    for i, j in zip(*np.triu_indices_from(axes, k=1)):
        for first, second in [(i, j), (j, i)]:
            axes[first, second].plot(data[first], data[second], **kwargs)

    # Label the diagonal subplots...
    for i, label in enumerate(names):
        axes[i, i].annotate(label, (0.5, 0.5), xycoords='axes fraction',
                            ha='center', va='center')

    # Turn on the proper x or y axes ticks, alternating between the last and
    # first row/column so neighbouring tick labels do not collide.
    for i, j in zip(range(numvars), itertools.cycle((-1, 0))):
        axes[j, i].xaxis.set_visible(True)
        axes[i, j].yaxis.set_visible(True)

    return fig

# Render the scatterplot matrix defined above.
main()

In [None]:
# Feature engineering (Add more features)
# Currently a pass-through: the raw features are used unchanged because every
# expansion step below is disabled.
polynomial_x = x
# polynomial_x = build_polynomial(polynomial_x, 6)
# polynomial_x = build_combinations_lvl(polynomial_x, 2)
# polynomial_x = build_combinations_lvl(polynomial_x, 3)
# polynomial_x = build_combinations_lvl(polynomial_x, 4)
# polynomial_x = build_combinations_lvl(polynomial_x, 5)
# polynomial_x = build_combinations_lvl(polynomial_x, 6)
# polynomial_x = build_combinations_lvl(polynomial_x, 7)
# polynomial_x = build_combinations_lvl(polynomial_x, 8)

# Rebinds the global train/test split that the training cells read.
train_x, train_y, test_x, test_y = separate_set(polynomial_x, y)

# Display the shape of the feature matrix.
polynomial_x.shape

In [None]:
def get_accuracy_ids(result_y, result_ids, y, ids):
    """Return the accuracy of predictions after aligning both sides by id.

    The true pairs ``(ids, y)`` and predicted pairs ``(result_ids, result_y)``
    are each sorted by id, then labels are compared row by row.

    Args:
        result_y: predicted labels.
        result_ids: ids matching ``result_y`` element for element.
        y: true labels.
        ids: ids matching ``y`` element for element.

    Returns:
        Number of rows whose labels agree, divided by ``len(y)``. Returns 0.0
        when no label agrees (this case used to raise ``KeyError``).

    Raises:
        ValueError: if the two sides do not contain the same number of rows,
            in which case a row-by-row comparison is meaningless.
    """
    stacked = np.column_stack((ids, y))
    stacked = stacked[stacked[:, 0].argsort()]
    stacked_pred = np.column_stack((result_ids, result_y))
    stacked_pred = stacked_pred[stacked_pred[:, 0].argsort()]

    print(len(stacked_pred), len(stacked))
    if stacked.shape != stacked_pred.shape:
        raise ValueError(
            "cannot compare %d predictions with %d labels"
            % (len(stacked_pred), len(stacked))
        )

    # Count matching labels directly instead of looking up a True key in a
    # dict of unique values, which fails when there is no match at all.
    matches = np.count_nonzero(stacked[:, 1] == stacked_pred[:, 1])
    return matches / len(y)

# Accuracy of the locally collected predictions against the training labels.
get_accuracy_ids(result_y, result_ids, y, ids)

In [None]:
# Pair each submission id with its prediction (column 0 = id, column 1 = label)
# and sort the rows by id.
submission_stacked = np.column_stack((submission_ids, submission_y))
submission_stacked = submission_stacked[submission_stacked[:,0].argsort()]


In [None]:
# Write the id-sorted submission (ids from column 0, labels from column 1).
create_csv_submission(submission_stacked[:,0], submission_stacked[:,1], "datas/submission.csv")
print('Done !')

In [None]:
def build_k_indices(y, k_fold, seed=1):
    """Split shuffled row indices of ``y`` into ``k_fold`` equally sized folds.

    The global NumPy RNG is seeded with ``seed`` before the row indices are
    permuted. Each fold holds ``int(len(y) / k_fold)`` indices, so leftover
    rows are not assigned to any fold.

    Returns:
        An array with one row of indices per fold.
    """
    n_rows = y.shape[0]
    fold_size = int(n_rows / k_fold)
    np.random.seed(seed)
    shuffled = np.random.permutation(n_rows)

    folds = []
    for fold in range(k_fold):
        start = fold * fold_size
        folds.append(shuffled[start:start + fold_size])
    return np.array(folds)

def k_fold_cross_validation(y, x, k, lambda_):
    """Run k-fold cross-validation of ridge regression.

    Each of the ``k`` folds from ``build_k_indices`` is used once as the test
    set while the remaining folds form the training set.

    Args:
        y: labels, indexable by an array of row indices.
        x: feature matrix whose rows match ``y``.
        k: number of folds.
        lambda_: regularisation strength passed to ``ridge_regression``.

    Returns:
        Tuple ``(mean train accuracy, mean test accuracy)`` over the folds.
    """
    k_indices = build_k_indices(y, k)
    accuracy_tr = []
    accuracy_te = []

    # The loop index must not be overwritten: doing so made every iteration
    # test on fold 0, which reduces the procedure to a single repeated split.
    for i in range(0, k):
        # i-th fold is the test set, all other folds form the training set
        # (kept in fold order).
        test_idx = k_indices[i]
        train_idx = np.delete(k_indices, i, axis=0).ravel()

        x_test = x[test_idx]
        y_test = y[test_idx]
        x_train = x[train_idx]
        y_train = y[train_idx]

        # ridge regression:
        w, loss = ridge_regression(y_train, x_train, lambda_)

        # accuracy on the training folds and on the held-out fold
        accuracy_tr.append(get_accuracy(x_train, y_train, w))
        accuracy_te.append(get_accuracy(x_test, y_test, w))

    return np.mean(accuracy_tr), np.mean(accuracy_te)

In [None]:
# Cross-validate ridge regression on the raw features with 4 folds.
k = 4
lambda_ = 0.1
k_fold_cross_validation(y, x, k, lambda_)