In [89]:
import numpy as np
import random
from helpers import *
from costs import *
from gradient_descent import *
from stochastic_gradient_descent import *

# Seed both generators: the shuffling in separate_set and the random initial
# weights draw from np.random, which random.seed() alone does not affect.
random.seed(10)
np.random.seed(10)

In [90]:
# Read the training CSV; the helper's three return values are unpacked as
# labels `y`, feature matrix `x` and `ids` (sub_sample=False presumably loads
# every row -- confirm in helpers.load_csv_data).
y, x, ids = load_csv_data(data_path="datas/train.csv", sub_sample=False)


In [91]:
# Set to 0 every column containing -999
def remove_NaN(x):
    """Zero out every column of ``x`` that contains the value -999.

    A column is overwritten entirely (all rows set to 0) as soon as a single
    entry in it equals -999.

    Args:
        x: 2-D numpy array, one sample per row and one feature per column.

    Returns:
        The same array object ``x``; the affected columns are modified in
        place, so the caller's array changes as well.
    """
    # One vectorised pass instead of a Python loop over every element.
    columns_with_NaN = (np.asarray(x) == -999).any(axis=0)

    # Skip the assignment when nothing matched so x is returned untouched.
    if columns_with_NaN.any():
        x[:, columns_with_NaN] = 0

    return x
        
# Zero the columns that contain -999. remove_NaN writes into `x` in place and
# returns the same array, so the original values are not kept anywhere.
x = remove_NaN(x)


In [92]:
def normalize(x, eps=1e-10):
    """Standardise each column of ``x`` to zero mean and unit variance.

    Args:
        x: numpy array; statistics are computed along axis 0 (per column).
        eps: small constant added to the standard deviation so that constant
            columns (std == 0) yield 0 instead of a division by zero. The
            default matches the value previously hard-coded here.

    Returns:
        A new array ``(x - mean) / (std + eps)``; ``x`` itself is not modified.
    """
    return (x - x.mean(axis=0)) / (x.std(axis=0) + eps)
# Replace `x` by its column-wise standardised version, then display it as the
# cell output.
x = normalize(x)
x

array([[ 0.        ,  0.06833197,  0.40768027, ...,  0.        ,
         0.        ,  0.4125105 ],
       [ 0.        ,  0.55250482,  0.54013641, ...,  0.        ,
         0.        , -0.27381996],
       [ 0.        ,  3.19515553,  1.09655998, ...,  0.        ,
         0.        , -0.29396985],
       ..., 
       [ 0.        ,  0.31931645, -0.13086367, ...,  0.        ,
         0.        , -0.31701723],
       [ 0.        , -0.84532397, -0.30297338, ...,  0.        ,
         0.        , -0.74543941],
       [ 0.        ,  0.66533608, -0.25352276, ...,  0.        ,
         0.        , -0.74543941]])

In [93]:
def separate_set(x, y, ratio=0.8):
    """Shuffle the samples and split them into a training and a test part.

    Labels and features are glued together before shuffling so that every row
    of ``x`` stays paired with its entry of ``y``. The shuffle uses the global
    ``np.random`` state, so seed it beforehand for a reproducible split.

    Args:
        x: 2-D feature array with one sample per row.
        y: labels, one per row of ``x``.
        ratio: fraction of the rows that goes into the training part
            (default 0.8, the value previously hard-coded here).

    Returns:
        Tuple ``(train_x, train_y, test_x, test_y)``.

    Raises:
        ValueError: if ``ratio`` is not within [0, 1].
    """
    if not 0 <= ratio <= 1:
        raise ValueError("ratio must be between 0 and 1")

    # Column 0 holds the label, the remaining columns hold the features.
    x_and_y = np.concatenate((y.reshape((y.shape[0], 1)), x), axis=1)
    np.random.shuffle(x_and_y)

    last_train_index = int(x_and_y.shape[0] * ratio)

    train_set = x_and_y[:last_train_index, :]
    test_set = x_and_y[last_train_index:, :]

    return train_set[:, 1:], train_set[:, 0], test_set[:, 1:], test_set[:, 0]

# Split the preprocessed data into a training part and a held-out part.
train_x, train_y, test_x, test_y = separate_set(x, y)

# Sanity check: print the shape of each of the four returned arrays.
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
print(test_y.shape)

(200000, 30)
(200000,)
(50000, 30)
(50000,)


In [95]:
# Random initial weights, one per feature column, drawn from np.random.rand.
w_init = np.random.rand(x.shape[1])
# Run the gradient-descent helper on the training split only; it returns the
# weights and a loss (see gradient_descent.least_squares_GD for the details).
w, loss = least_squares_GD(train_y, train_x, w_init, max_iters=100, gamma=0.3)

Gradient Descent(0/99): loss=9.606167510643457		0.407925
Gradient Descent(1/99): loss=4.437332362676688		0.622815
Gradient Descent(2/99): loss=2.4190050649254657		0.4459
Gradient Descent(3/99): loss=1.451067205450863		0.642675
Gradient Descent(4/99): loss=0.9587158184431638		0.5155
Gradient Descent(5/99): loss=0.7037169310322476		0.65858
Gradient Descent(6/99): loss=0.5704191801056481		0.612865
Gradient Descent(7/99): loss=0.5001257063758184		0.67442
Gradient Descent(8/99): loss=0.46263584727603985		0.681305
Gradient Descent(9/99): loss=0.44231537320308273		0.688985
Gradient Descent(10/99): loss=0.4310359407342719		0.700995
Gradient Descent(11/99): loss=0.42455438801652823		0.69714
Gradient Descent(12/99): loss=0.4206458103531692		0.704405
Gradient Descent(13/99): loss=0.41813739479820144		0.70142
Gradient Descent(14/99): loss=0.41640684319586313		0.705505
Gradient Descent(15/99): loss=0.4151211715622359		0.703835
Gradient Descent(16/99): loss=0.4141002293712916		0.70598
Gradient Desce

In [None]:
# Train on the training split only, as the gradient-descent cell does. Fitting
# on the full (y, x) would include the rows of test_x/test_y, so the accuracy
# computed on them afterwards would no longer be a held-out estimate.
w, loss = least_squares_SGD(train_y, train_x, w_init, 100, gamma=0.03)

In [96]:
# Evaluate the current weights `w` on the held-out split from separate_set;
# the returned value is shown as the cell output.
get_accuracy(test_x, test_y, w)

0.70532

In [None]:
y_test, x_test, ids_test = load_csv_data(data_path="datas/test.csv", sub_sample=False)
# `w` was fitted on features that went through remove_NaN and normalize, so the
# test features need the same preprocessing before prediction.
# NOTE(review): the training mean/std are not kept anywhere in this notebook,
# so the statistics are recomputed on the test file itself, and remove_NaN
# picks its columns from the test file -- confirm they match the training ones.
x_test = normalize(remove_NaN(x_test))
pred_y = predict_labels(w, x_test)


In [None]:
# Hand the test ids and their predicted labels to the submission helper, which
# is given "datas/submission.csv" as its target name, then confirm completion.
create_csv_submission(ids_test, pred_y, "datas/submission.csv")
print('Done !')


In [None]:
import itertools
import numpy as np
import matplotlib.pyplot as plt

# https://stackoverflow.com/a/7941594/4810319
def main():
    """Show a scatterplot matrix of the first 7 features of the global ``x``.

    Only the first 300 samples are plotted. The slice is transposed because
    scatterplot_matrix expects one variable per row.
    """
    # The demo's random data (and the global reseeding that went with it) was
    # dead code: it was overwritten by this slice before ever being used.
    data = x[0:300, 0:7].T
    fig = scatterplot_matrix(data, ['DER_mass_MMC', 'DER_mass_transverse_met_lep', 'DER_mass_vis', 'DER_pt_h', 'DER_deltaeta_jet_jet', 'DER_mass_jet_jet', 'DER_prodeta_jet_jet'],
            linestyle='none', marker='o', color='black', mfc='none')
    fig.suptitle('Simple Scatterplot Matrix')
    plt.show()

def scatterplot_matrix(data, names, **kwargs):
    """Plot a scatterplot matrix of subplots.

    Each row of ``data`` is one variable. Every variable is plotted against
    every other one, giving an nrows-by-nrows grid of subplots whose diagonal
    cells carry the labels from ``names``. The subplot in grid row ``r`` and
    grid column ``c`` shows variable ``c`` on its x axis and variable ``r`` on
    its y axis.

    Args:
        data: 2-D array of shape (numvars, numdata).
        names: labels for the diagonal subplots, one per variable.
        **kwargs: passed on to matplotlib's ``plot`` command.

    Returns:
        The matplotlib figure object containing the subplot grid.
    """
    numvars, _ = data.shape
    # squeeze=False keeps `axes` a 2-D array even for a single variable.
    fig, axes = plt.subplots(nrows=numvars, ncols=numvars, figsize=(8,8),
                             squeeze=False)
    fig.subplots_adjust(hspace=0.05, wspace=0.05)

    last = numvars - 1
    # Grid indices are used instead of ax.is_first_col()/is_last_row() & co.,
    # which were deprecated in Matplotlib 3.4 and removed in 3.6.
    for (row, col), ax in np.ndenumerate(axes):
        # Hide all ticks and labels
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)

        # Set up ticks only on one side for the "edge" subplots...
        if col == 0:
            ax.yaxis.set_ticks_position('left')
        if col == last:
            ax.yaxis.set_ticks_position('right')
        if row == 0:
            ax.xaxis.set_ticks_position('top')
        if row == last:
            ax.xaxis.set_ticks_position('bottom')

    # Plot the data: the x values come from the column's variable and the y
    # values from the row's variable, so the ticks shown on the grid edges
    # belong to the variable labelled on that row/column's diagonal cell.
    for i, j in zip(*np.triu_indices_from(axes, k=1)):
        for row, col in [(i, j), (j, i)]:
            axes[row, col].plot(data[col], data[row], **kwargs)

    # Label the diagonal subplots...
    for i, label in enumerate(names):
        axes[i,i].annotate(label, (0.5, 0.5), xycoords='axes fraction',
                ha='center', va='center')

    # Turn on the proper x or y axes ticks (alternating between the last and
    # the first row/column of the grid).
    for i, j in zip(range(numvars), itertools.cycle((-1, 0))):
        axes[j,i].xaxis.set_visible(True)
        axes[i,j].yaxis.set_visible(True)

    # With an odd number of variables the ticks of the last variable land on
    # the empty bottom-right diagonal cell; copy its limits from subplots that
    # actually show that variable.
    if numvars % 2:
        axes[-1,-1].set_xlim(axes[0,-1].get_xlim())
        axes[-1,-1].set_ylim(axes[-1,0].get_ylim())

    return fig

# Draw the scatterplot matrix (main reads the global feature matrix `x`).
main()