In [None]:
import numpy as np
import random
from helpers import *
from costs import *
from gradient_descent import *
from stochastic_gradient_descent import *

# Seed both generators: the cells below draw from NumPy's global RNG
# (np.random.shuffle, np.random.rand), which random.seed() does not affect.
random.seed(10)
np.random.seed(10)

In [None]:
# Load the training CSV; the result is unpacked as (y, x, ids).
y, x, ids = load_csv_data(data_path="datas/train.csv", sub_sample=False)
# Keep only the first 13 feature columns.
# NOTE(review): presumably these are the derived (DER_*) features and the
# dropped columns are the primary ones -- confirm against the CSV header.
x = x[:, :13] # Removed all the primary

In [None]:
def remove_NaN(x, missing_value=-999):
    """Drop every column of ``x`` that contains the missing-value sentinel.

    Args:
        x: 2-D NumPy array of shape (n_samples, n_features).
        missing_value: Value that marks a missing entry. Defaults to -999,
            the sentinel this notebook checks for.

    Returns:
        A new array containing only the columns in which no entry equals
        ``missing_value``. The number of rows is unchanged.

    Side effects:
        Prints how many columns were removed.
    """
    # One vectorised comparison instead of a Python loop over every cell.
    has_missing = np.any(x == missing_value, axis=0)

    # Boolean-mask indexing returns a copy and also works when no column
    # has to be removed.
    x = x[:, ~has_missing]

    print("Cleaned " + str(int(has_missing.sum())) + " columns")

    return x


In [None]:
def proportion_of_NaN(x, n_features=30, missing_value=-999):
    """Return, per column, the fraction of rows holding the missing-value sentinel.

    Args:
        x: 2-D NumPy array of shape (n_samples, n_cols).
        n_features: Minimum length of the result. Defaults to 30, the length
            that used to be hard-coded; the zero padding lets results for
            arrays with different column counts be subtracted from each
            other, as the notebook does.
        missing_value: Value that marks a missing entry (default -999).

    Returns:
        1-D float array of length ``max(n_features, n_cols)``. Entry ``i`` is
        the share of rows of ``x`` whose column ``i`` equals
        ``missing_value``; positions past the last column of ``x`` are 0.
    """
    n_rows, n_cols = x.shape

    # Grow past n_features when x is wider, instead of failing with an
    # IndexError as the fixed-size buffer did.
    counts = np.zeros(max(n_features, n_cols))
    counts[:n_cols] = np.count_nonzero(x == missing_value, axis=0)

    return counts / n_rows

In [None]:
# Load the test CSV and keep only its feature matrix (the other two returned
# values are discarded).
_, x2, _ = load_csv_data(data_path="datas/test.csv", sub_sample=False)
# Per-column difference between the share of -999 entries in the test
# features (x2) and in x.
print(proportion_of_NaN(x2) - proportion_of_NaN(x))

In [None]:
# Display the per-column share of -999 entries in x.
proportion_of_NaN(x)

In [None]:
def normalize(x):
    """Standardise every column of ``x``: subtract its mean, divide by its spread.

    The column-wise standard deviation is offset by 1e-10 so that a constant
    column is divided by a tiny number rather than by zero.
    """
    column_mean = x.mean(axis=0)
    column_spread = x.std(axis=0) + 0.0000000001
    centered = x - column_mean
    return centered / column_spread


def preprocess_data(x):
    """Clean and standardise a feature matrix.

    First removes the columns flagged by ``remove_NaN``, then passes the
    remaining columns through ``normalize``.
    """
    cleaned = remove_NaN(x)
    return normalize(cleaned)
# Overwrite x with its preprocessed version (columns containing -999 removed,
# remaining columns standardised). The raw features are no longer available
# under this name afterwards.
x = preprocess_data(x)
# Display the resulting (rows, columns) shape.
x.shape

In [None]:
def separate_set(x, y, train_ratio=0.95):
    """Shuffle the samples and split them into a training and a held-out part.

    Args:
        x: 2-D feature array of shape (n_samples, n_features).
        y: 1-D label array of length n_samples.
        train_ratio: Fraction of the shuffled samples placed in the training
            part. Defaults to 0.95, the split that used to be hard-coded.

    Returns:
        Tuple ``(train_x, train_y, test_x, test_y)``. The first
        ``int(n_samples * train_ratio)`` shuffled rows form the training
        part, the rest the held-out part.

    Raises:
        ValueError: If ``train_ratio`` is outside [0, 1].

    Note:
        The shuffle uses NumPy's global random generator, so the split is
        only reproducible if ``np.random.seed`` was called beforehand.
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError("train_ratio must be between 0 and 1, got " + str(train_ratio))

    # Glue the labels on as column 0 so that a single shuffle keeps every
    # feature row paired with its label.
    x_and_y = np.concatenate((y.reshape((y.shape[0], 1)), x), axis=1)
    np.random.shuffle(x_and_y)  # shuffles rows in place

    last_train_index = int(x_and_y.shape[0] * train_ratio)

    train_set = x_and_y[:last_train_index, :]
    test_set = x_and_y[last_train_index:, :]

    # Column 0 holds the labels, the remaining columns the features.
    return train_set[:, 1:], train_set[:, 0], test_set[:, 1:], test_set[:, 0]

# Shuffle and split the preprocessed features and labels into a training part
# and a held-out part.
train_x, train_y, test_x, test_y = separate_set(x, y)

# Sanity check: print the shape of each of the four pieces.
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
print(test_y.shape)

In [None]:
# Random starting weights, one per column of x, drawn uniformly from [0, 1).
w_init = np.random.rand(x.shape[1])
# Fit on the training part with least_squares_GD (100 iterations, gamma=0.3).
# NOTE(review): presumably gamma is the gradient-descent step size -- confirm
# in gradient_descent.py.
w, loss = least_squares_GD(train_y, train_x, w_init, max_iters=100, gamma=0.3)

In [None]:
# Fit with least_squares_SGD from the same w_init; this overwrites the w and
# loss produced by the previous cell.
# NOTE(review): the positional 100 is presumably the iteration count -- confirm
# against the signature in stochastic_gradient_descent.py.
w, loss = least_squares_SGD(train_y, train_x, w_init, 100, gamma=0.01)

In [None]:
# Display the accuracy of the most recently fitted weights w on the held-out part.
get_accuracy(test_x, test_y, w)

In [None]:
# Build the submission features from the test CSV: first 13 columns,
# preprocessing, then the engineered columns.
_, submission_x, submission_ids = load_csv_data(data_path="datas/test.csv", sub_sample=False)
submission_x = preprocess_data(submission_x[:, :13])  # Removed all the primary

submission_x = build_polynomial(submission_x, 6)
# Append the column products of level 2, 3, ..., 8, one level at a time.
for lvl in range(2, 9):
    submission_x = build_combinations_lvl(submission_x, lvl)

# Predict the submission labels with the current weights w.
pred_y = predict_labels(w, submission_x)

In [None]:
# Write the submission ids and predicted labels to datas/submission.csv.
create_csv_submission(submission_ids, pred_y, "datas/submission.csv")
print('Done !')


In [None]:
import itertools
import numpy as np
import matplotlib.pyplot as plt

# Adapted from https://stackoverflow.com/a/7941594/4810319
def main():
    """Show a scatterplot matrix of the first 7 columns of the global ``x``.

    Uses the first 300 rows of ``x``, one variable per row after transposing,
    and displays the figure with ``plt.show()``.
    """
    # NOTE(review): this reseeds NumPy's *global* generator, so it also
    # affects every random draw made by cells executed after this one.
    np.random.seed(1977)
    # NOTE(review): the random `data` built from these two values is
    # overwritten on the very next assignment and never plotted; it still
    # consumes random numbers from the global generator.
    numvars, numdata = 5, 100
    data = 10 * np.random.random((numvars, numdata))
    # Data actually plotted: first 300 rows, first 7 columns of x.
    data = x[0:300, 0:7].T
    # Debug output: first 200 values of column index 7 of x.
    print(x[0:200, 7])
    # NOTE(review): the labels assume columns 0-6 of x are these features in
    # this order -- confirm, since x may have had columns removed upstream.
    fig = scatterplot_matrix(data, ['DER_mass_MMC', 'DER_mass_transverse_met_lep', 'DER_mass_vis', 'DER_pt_h', 'DER_deltaeta_jet_jet', 'DER_mass_jet_jet', 'DER_prodeta_jet_jet'],
            linestyle='none', marker='o', color='black', mfc='none')
    fig.suptitle('Simple Scatterplot Matrix')
    plt.show()

def scatterplot_matrix(data, names, **kwargs):
    """Plot a scatterplot matrix of subplots.

    Each row of ``data`` is plotted against the other rows, resulting in an
    nrows-by-nrows grid of subplots with the diagonal subplots labelled with
    ``names``.

    Args:
        data: 2-D array of shape (numvars, numdata), one variable per row.
        names: Labels for the diagonal subplots, one per row of ``data``.
        **kwargs: Passed on to matplotlib's ``plot`` command.

    Returns:
        The matplotlib figure object containing the subplot grid.
    """
    numvars, _numdata = data.shape
    # squeeze=False keeps `axes` a 2-D array even when there is one variable.
    fig, axes = plt.subplots(nrows=numvars, ncols=numvars, figsize=(8,8),
                             squeeze=False)
    fig.subplots_adjust(hspace=0.05, wspace=0.05)

    last = numvars - 1
    for (row, col), ax in np.ndenumerate(axes):
        # Hide all ticks and labels
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)

        # Set up ticks only on one side for the "edge" subplots. The grid
        # position is taken from the array index because Axes.is_first_col()
        # and its siblings were deprecated in Matplotlib 3.4 and later removed.
        if col == 0:
            ax.yaxis.set_ticks_position('left')
        if col == last:
            ax.yaxis.set_ticks_position('right')
        if row == 0:
            ax.xaxis.set_ticks_position('top')
        if row == last:
            ax.xaxis.set_ticks_position('bottom')

    # Plot the data: every off-diagonal pair, in both orientations.
    for i, j in zip(*np.triu_indices_from(axes, k=1)):
        for r, c in [(i, j), (j, i)]:
            axes[r, c].plot(data[r], data[c], **kwargs)

    # Label the diagonal subplots...
    for i, label in enumerate(names):
        axes[i, i].annotate(label, (0.5, 0.5), xycoords='axes fraction',
                ha='center', va='center')

    # Turn on the proper x or y axes ticks, alternating between the last and
    # the first row/column so that neighbouring tick labels do not collide.
    for i, j in zip(range(numvars), itertools.cycle((-1, 0))):
        axes[j, i].xaxis.set_visible(True)
        axes[i, j].yaxis.set_visible(True)

    return fig

# Draw and show the scatterplot matrix.
main()

In [None]:
def least_squares(y, tx):
    """Fit linear least squares through the normal equations.

    Args:
        y: Target vector with one entry per row of ``tx``.
        tx: 2-D feature matrix of shape (n_samples, n_features).

    Returns:
        Tuple ``(w, loss)``: ``w`` solves ``(tx^T tx) w = tx^T y`` and
        ``loss`` is ``compute_loss(y, tx, w)``.

    Raises:
        numpy.linalg.LinAlgError: If ``tx^T tx`` is singular.

    Side effects:
        Prints the rank of ``tx^T tx``.
    """
    gram = tx.T.dot(tx)
    print("Rank: " + str(np.linalg.matrix_rank(gram)))
    # Solve the linear system directly instead of forming the explicit
    # inverse: same solution, cheaper and numerically more accurate.
    w = np.linalg.solve(gram, tx.T.dot(y))
    return w, compute_loss(y, tx, w)
    
# Fit least squares on the training part and report the results.
w, loss = least_squares(train_y, train_x)
print("Loss: " + str(loss))
# First accuracy line: training part. Second accuracy line: held-out part.
print("Accuracy: " + str(get_accuracy(train_x, train_y, w)))
print("Accuracy: " + str(get_accuracy(test_x, test_y, w)))

In [None]:
def ridge_regression(y, tx, lambda_):
    """Fit ridge regression through the regularised normal equations.

    Args:
        y: Target vector with one entry per row of ``tx``.
        tx: 2-D feature matrix of shape (n_samples, n_features).
        lambda_: Regularisation strength; it is scaled by ``2 * len(y)``
            before being added to the diagonal of ``tx^T tx``.

    Returns:
        Tuple ``(w, loss)``: ``w`` solves
        ``(tx^T tx + 2 * len(y) * lambda_ * I) w = tx^T y`` and ``loss`` is
        ``compute_loss(y, tx, w)`` (``lambda_`` is not passed to it).

    Raises:
        numpy.linalg.LinAlgError: If the regularised matrix is singular.
    """
    gram = tx.T.dot(tx)
    lambda_prime = 2 * len(y) * lambda_
    regularised = gram + lambda_prime * np.identity(len(gram))
    # Solve the linear system directly instead of forming the explicit
    # inverse: same solution, cheaper and numerically more accurate.
    w = np.linalg.solve(regularised, tx.T.dot(y))
    return w, compute_loss(y, tx, w)
    
def find_best_ridge_lambda(train_y, train_x, test_x, test_y, step=0.0001, max_lambda=0.001):
    """Grid-search the ridge penalty that maximises accuracy on a held-out set.

    Candidates are ``0, step, 2 * step, ...`` up to, but excluding,
    ``max_lambda``. For each one, ``ridge_regression`` is fitted on the
    training data and scored with ``get_accuracy`` on the held-out data.

    Args:
        train_y: Training targets.
        train_x: Training features.
        test_x: Held-out features used for scoring.
        test_y: Held-out targets used for scoring.
        step: Spacing of the candidate grid (default 0.0001, the value that
            used to be hard-coded).
        max_lambda: Exclusive upper end of the grid (default 0.001, the value
            that used to be hard-coded).

    Returns:
        The first candidate that reached the highest accuracy, or 0.0 if no
        candidate scored above 0.

    Raises:
        ValueError: If ``step`` is not positive.

    Side effects:
        Prints ``lambda_ accuracy`` every time a new best is found.
    """
    if step <= 0:
        raise ValueError("step must be positive, got " + str(step))

    best_accuracy = 0
    best_lambda = 0.0

    # round() guards against a ratio such as 2.9999999999999996 being
    # truncated to one step too few.
    n_candidates = int(round(max_lambda / step))
    for i in range(n_candidates):
        # Computed from the index rather than accumulated with +=, so the
        # floating-point error does not grow with the number of steps.
        lambda_ = i * step
        w, _ = ridge_regression(train_y, train_x, lambda_)

        accuracy = get_accuracy(test_x, test_y, w)
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_lambda = lambda_
            print(lambda_, accuracy)

    return best_lambda
    
# Pick the ridge penalty by grid search, then refit with it and report.
# NOTE(review): the penalty is selected on (test_x, test_y) and the final
# accuracy is reported on that same held-out part, so the second accuracy
# line is not an unbiased estimate -- consider a separate validation split.
lambda_ = find_best_ridge_lambda(train_y, train_x, test_x, test_y)
# lambda_ = 0.03
w, loss = ridge_regression(train_y, train_x, lambda_)
print("Lambda:" + str(lambda_))
print("Loss: " + str(loss))
# First accuracy line: training part. Second accuracy line: held-out part.
print("Accuracy: " + str(get_accuracy(train_x, train_y, w)))
print("Accuracy: " + str(get_accuracy(test_x, test_y, w)))
    


In [None]:
def build_polynomial(x, max_degree):
    """Expand ``x`` with tanh, log-magnitude and power features.

    The output columns are, in order: ``x``, ``tanh(x)``, ``log(|x|)``, and
    then for every degree ``d`` from 2 to ``max_degree`` (inclusive) the
    element-wise powers ``x**d``, ``tanh(x)**d`` and ``log(|x|)**d``.

    Args:
        x: 2-D feature array of shape (n_samples, n_features).
        max_degree: Highest power to add; values below 2 add no power columns.

    Returns:
        2-D array with ``3 * n_features * max(max_degree, 1)`` columns.

    Note:
        ``log(|x|)`` is ``-inf`` wherever ``x`` is exactly 0, and those
        entries carry over into the power columns.
    """
    # Compute the three base transforms once instead of on every iteration.
    bases = (x, np.tanh(x), np.log(np.abs(x)))

    blocks = list(bases)
    for degree in range(2, max_degree + 1):
        blocks.extend(np.power(base, degree) for base in bases)

    # A single concatenate avoids re-copying the growing matrix per block.
    return np.concatenate(blocks, axis=1)

In [None]:
import numpy as np
import itertools as it

def build_combinations(x, n_columns=19):
    """Append the pairwise products of the first ``n_columns`` columns of ``x``.

    Args:
        x: 2-D feature array with at least ``n_columns`` columns.
        n_columns: Number of leading columns to combine. Defaults to 19, the
            value that used to be hard-coded.

    Returns:
        ``x`` followed by one column ``x[:, i] * x[:, j]`` for every pair
        ``i < j`` among the first ``n_columns`` columns, in the order
        produced by ``itertools.combinations``. When there is no pair
        (``n_columns < 2``), ``x`` itself is returned.

    Raises:
        IndexError: If ``x`` has fewer than ``n_columns`` columns.
    """
    pairs = list(it.combinations(range(n_columns), 2))
    if not pairs:
        return x

    products = [x[:, col1] * x[:, col2] for col1, col2 in pairs]

    # Stack the new columns once rather than re-copying the whole matrix for
    # every added column.
    return np.concatenate((x, np.stack(products, axis=1)), axis=1)

In [None]:
def build_combinations_lvl(x, lvl, n_columns=8):
    """Append the products of every ``lvl``-sized subset of the leading columns.

    Args:
        x: 2-D feature array with at least ``n_columns`` columns.
        lvl: Number of columns multiplied together in each new feature.
        n_columns: Number of leading columns to choose from. Defaults to 8,
            the value that used to be hard-coded.

    Returns:
        ``x`` followed by one product column per combination of ``lvl``
        distinct columns among the first ``n_columns``, in the order produced
        by ``itertools.combinations``. When there is no combination
        (``lvl > n_columns``), ``x`` itself is returned.

    Raises:
        ValueError: If ``lvl`` is smaller than 1.
        IndexError: If ``x`` has fewer than ``n_columns`` columns.
    """
    if lvl < 1:
        raise ValueError("lvl must be at least 1, got " + str(lvl))

    new_columns = []
    for cols in it.combinations(range(n_columns), lvl):
        # `1 * column` makes a fresh array, so the in-place products below
        # never modify x.
        product = 1 * x[:, cols[0]]
        for col in cols[1:]:
            product *= x[:, col]
        new_columns.append(product)

    if not new_columns:
        return x

    # Stack the new columns once rather than re-copying the whole matrix for
    # every added column.
    return np.concatenate((x, np.stack(new_columns, axis=1)), axis=1)

In [None]:
# Feature engineering (add more features)
# Start from the tanh / log / power expansion of x up to degree 6 ...
polynomial_x = build_polynomial(x, 6)
# ... then append the column products of level 2, 3, ..., 8, one level at a time.
for lvl in range(2, 9):
    polynomial_x = build_combinations_lvl(polynomial_x, lvl)

# Re-split using the engineered features; this replaces the earlier
# train/test arrays.
train_x, train_y, test_x, test_y = separate_set(polynomial_x, y)

polynomial_x.shape