In [89]:
import numpy as np
import random
from helpers import *
from costs import *
from gradient_descent import *
from stochastic_gradient_descent import *

# Seed both generators. The random draws in this notebook (np.random.shuffle in
# separate_set, np.random.rand for w_init) come from NumPy's global RNG, which
# random.seed() alone does not touch.
random.seed(10)
np.random.seed(10)

In [169]:
# Load the training set. load_csv_data is not defined in this notebook; it comes
# from one of the star imports above (presumably helpers — confirm).
# The three returned values are bound to y, x and ids; sub_sample=False
# presumably requests the full file rather than a subset — TODO confirm.
y, x, ids = load_csv_data(data_path="datas/train.csv", sub_sample=False)


In [176]:
# Drop (not zero out) every column that contains the missing-value sentinel.
def remove_NaN(x, missing_value=-999):
    """Remove every column of ``x`` in which ``missing_value`` occurs.

    The affected columns are deleted entirely, so the result can have fewer
    columns than the input. The number of deleted columns is printed.

    Args:
        x: 2-D array-like, one row per sample and one column per feature.
        missing_value: sentinel that marks a missing entry (default -999).

    Returns:
        A new array containing only the columns free of ``missing_value``.
    """
    x = np.asarray(x)

    # One vectorised pass over the array instead of a Python loop over
    # every single cell.
    has_missing = (x == missing_value).any(axis=0)
    columns_with_NaN = np.flatnonzero(has_missing)

    x = np.delete(x, columns_with_NaN, axis=1)

    print("Cleaned " + str(len(columns_with_NaN)) + " columns")

    return x


In [177]:
def normalize(x):
    """Standardise each column of ``x`` to zero mean and (almost) unit spread.

    A tiny constant is added to every column's standard deviation so that a
    constant column yields zeros instead of a division by zero.
    """
    column_means = x.mean(axis=0)
    column_scales = x.std(axis=0) + 1e-10
    centered = x - column_means
    return centered / column_scales


def preprocess_data(x):
    """Clean and standardise a feature matrix.

    First drops the columns flagged by ``remove_NaN``, then standardises the
    remaining columns with ``normalize``. Both steps derive their parameters
    from ``x`` itself, so two different inputs are processed independently.
    """
    without_missing = remove_NaN(x)
    return normalize(without_missing)
# Rebind x to its preprocessed version; the raw features loaded earlier are no
# longer reachable under this name. The bare `x` displays the result.
x = preprocess_data(x)
x

Cleaned 11 columns


array([[ 0.06833197,  0.40768027, -0.46996624, ...,  0.38684673,
         1.04440205,  0.4125105 ],
       [ 0.55250482,  0.54013641, -0.15316749, ..., -0.35771893,
         0.02130497, -0.27381996],
       [ 3.19515553,  1.09655998, -0.34970965, ...,  0.40013535,
         0.02130497, -0.29396985],
       ..., 
       [ 0.31931645, -0.13086367, -0.28495489, ..., -0.08608887,
         0.02130497, -0.31701723],
       [-0.84532397, -0.30297338, -0.69737766, ..., -0.76742886,
        -1.00179211, -0.74543941],
       [ 0.66533608, -0.25352276, -0.79202769, ..., -0.87267059,
        -1.00179211, -0.74543941]])

In [178]:
def separate_set(x, y, train_ratio=0.8):
    """Shuffle the samples and split them into a training and a test part.

    Args:
        x: 2-D feature array, one row per sample.
        y: 1-D label array with one entry per row of ``x``.
        train_ratio: fraction of the samples placed in the training part
            (default 0.8); the rest forms the test part.

    Returns:
        ``(train_x, train_y, test_x, test_y)``. The shuffle uses NumPy's
        global RNG, so seed ``np.random`` for a reproducible split.

    Raises:
        ValueError: if ``train_ratio`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError("train_ratio must be between 0 and 1")

    # Put the labels in front of the features so that a single row shuffle
    # keeps every (label, features) pair together.
    x_and_y = np.concatenate((y.reshape((y.shape[0], 1)), x), axis=1)
    np.random.shuffle(x_and_y)

    count = x_and_y.shape[0]
    last_train_index = int(count * train_ratio)

    train_set = x_and_y[0:last_train_index, :]
    test_set = x_and_y[last_train_index:, :]

    # Column 0 holds the labels; the remaining columns are the features.
    train_y = train_set[:, 0]
    test_y = test_set[:, 0]

    train_x = train_set[:, 1:]
    test_x = test_set[:, 1:]

    return train_x, train_y, test_x, test_y

# Split the data once, then report the shape of each part, one per line, in the
# order train_x, train_y, test_x, test_y.
train_x, train_y, test_x, test_y = separate_set(x, y)

print(train_x.shape, train_y.shape, test_x.shape, test_y.shape, sep="\n")

(200000, 19)
(200000,)
(50000, 19)
(50000,)


In [180]:
# Random starting weights, one per feature column of x.
w_init = np.random.rand(x.shape[1])
# least_squares_GD is not defined in this notebook (presumably it comes from the
# gradient_descent star import — confirm). Its two return values are bound to
# w and loss.
w, loss = least_squares_GD(train_y, train_x, w_init, max_iters=100, gamma=0.3)

Gradient Descent(0/99): loss=5.59617539364443		0.42319
Gradient Descent(1/99): loss=2.5001047463576427		0.61715
Gradient Descent(2/99): loss=1.393074338097993		0.46595
Gradient Descent(3/99): loss=0.9010783906364982		0.644165
Gradient Descent(4/99): loss=0.6629962676706072		0.56048
Gradient Descent(5/99): loss=0.5434411941514207		0.665315
Gradient Descent(6/99): loss=0.48205495638705326		0.65575
Gradient Descent(7/99): loss=0.4499316523004009		0.68387
Gradient Descent(8/99): loss=0.4327644209422526		0.69095
Gradient Descent(9/99): loss=0.4233406696413214		0.69537
Gradient Descent(10/99): loss=0.4179787970595486		0.700535
Gradient Descent(11/99): loss=0.41478045311525763		0.700715
Gradient Descent(12/99): loss=0.4127571574707983		0.703565
Gradient Descent(13/99): loss=0.4113886040913678		0.703005
Gradient Descent(14/99): loss=0.4103973401057686		0.70473
Gradient Descent(15/99): loss=0.4096329887566649		0.704565
Gradient Descent(16/99): loss=0.409012344322813		0.70557
Gradient Descent(17

In [105]:
# NOTE(review): this rebinds w and loss, so later cells see the result of
# whichever training cell ran last. The positional 100 is presumably the
# iteration count (the log below counts 0..99) — confirm against the signature
# of least_squares_SGD, which is defined outside this notebook.
w, loss = least_squares_SGD(train_y, train_x, w_init, 100, gamma=0.01)

SGD (0/99): loss=9.491306913659226		0.57564
SGD (1/99): loss=8.834314810848172		0.571265
SGD (2/99): loss=7.816103257852662		0.57043
SGD (3/99): loss=7.823711805249579		0.57096
SGD (4/99): loss=7.218297244495725		0.570445
SGD (5/99): loss=7.305700470608432		0.574485
SGD (6/99): loss=7.264329729422425		0.5738
SGD (7/99): loss=7.006150239762559		0.57053
SGD (8/99): loss=6.960262243015965		0.57068
SGD (9/99): loss=5.802231588486342		0.55542
SGD (10/99): loss=5.824283840074395		0.555815
SGD (11/99): loss=2.1752018422353223		0.51689
SGD (12/99): loss=2.172737107948579		0.51491
SGD (13/99): loss=2.1742030083267343		0.515055
SGD (14/99): loss=2.2248484937244957		0.501445
SGD (15/99): loss=2.226512379714472		0.501965
SGD (16/99): loss=2.2126747616654177		0.50305
SGD (17/99): loss=2.14389957215831		0.50632
SGD (18/99): loss=2.0235792541988276		0.509495
SGD (19/99): loss=2.006647473992341		0.513785
SGD (20/99): loss=2.0176038854358604		0.511625
SGD (21/99): loss=2.012635899832488		0.512605
SGD (

In [156]:
# Evaluate the current w on the held-out split; the value is shown as the cell
# output. get_accuracy is defined outside this notebook.
# NOTE(review): the result depends on which training cell assigned w last.
get_accuracy(test_x, test_y, w)

0.641

In [181]:
# Load the submission data; the first returned value is discarded.
_, submission_x, submission_ids = load_csv_data(data_path="datas/test.csv", sub_sample=False)
# NOTE(review): preprocess_data derives the dropped columns and the
# mean/std from submission_x itself, independently of the training data.
# The predictions only make sense if the same columns end up being dropped
# as for training — verify.
submission_x = preprocess_data(submission_x)
# Uses whichever w is current in the kernel; predict_labels is defined outside
# this notebook.
pred_y = predict_labels(w, submission_x)

Cleaned 11 columns


In [182]:
# Hand the ids and predictions to create_csv_submission (defined outside this
# notebook) together with the target path, then print a completion message.
create_csv_submission(submission_ids, pred_y, "datas/submission.csv")
print('Done !')


Done !


In [None]:
import itertools
import numpy as np
import matplotlib.pyplot as plt

# https://stackoverflow.com/a/7941594/4810319
def main():
    """Show a scatterplot matrix for the first 7 columns of the global ``x``.

    Only the first 300 rows are plotted. The function also prints the first
    200 values of column 7, which is not one of the plotted columns.

    The placeholder random data of the original recipe, and the
    ``np.random.seed`` call that went with it, have been removed: the data was
    overwritten immediately, and the seed call silently reset NumPy's global
    RNG for every cell executed afterwards.

    NOTE(review): the labels below are hard-coded; whether they match columns
    0..6 of ``x`` depends on what ``x`` holds when this runs (e.g. after columns
    have been dropped by preprocessing) — confirm.
    """
    # scatterplot_matrix expects one variable per row, hence the transpose.
    data = x[0:300, 0:7].T
    print(x[0:200, 7])
    fig = scatterplot_matrix(data, ['DER_mass_MMC', 'DER_mass_transverse_met_lep', 'DER_mass_vis', 'DER_pt_h', 'DER_deltaeta_jet_jet', 'DER_mass_jet_jet', 'DER_prodeta_jet_jet'],
            linestyle='none', marker='o', color='black', mfc='none')
    fig.suptitle('Simple Scatterplot Matrix')
    plt.show()

def scatterplot_matrix(data, names, **kwargs):
    """Plot a scatterplot matrix of subplots.

    Each row of ``data`` is plotted against every other row, giving an
    nrows-by-nrows grid of subplots whose diagonal cells are labelled with
    ``names``.

    Args:
        data: 2-D array with one variable per row and one observation per
            column.
        names: labels for the diagonal subplots, one per row of ``data``.
        **kwargs: passed on to matplotlib's ``plot`` command.

    Returns:
        The matplotlib figure object containing the subplot grid.
    """
    numvars, _ = data.shape
    # squeeze=False keeps `axes` a 2-D array even when there is one variable.
    fig, axes = plt.subplots(nrows=numvars, ncols=numvars, figsize=(8,8),
                             squeeze=False)
    fig.subplots_adjust(hspace=0.05, wspace=0.05)

    last = numvars - 1
    for (row, col), ax in np.ndenumerate(axes):
        # Hide all ticks and labels
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)

        # Set up ticks only on one side for the "edge" subplots. Grid indices
        # are used instead of Axes.is_first_col() and friends, which newer
        # Matplotlib releases no longer provide.
        if col == 0:
            ax.yaxis.set_ticks_position('left')
        if col == last:
            ax.yaxis.set_ticks_position('right')
        if row == 0:
            ax.xaxis.set_ticks_position('top')
        if row == last:
            ax.xaxis.set_ticks_position('bottom')

    # Plot the data: fill both the (i, j) and the mirrored (j, i) cell.
    for i, j in zip(*np.triu_indices_from(axes, k=1)):
        for first, second in [(i,j), (j,i)]:
            axes[first,second].plot(data[first], data[second], **kwargs)

    # Label the diagonal subplots...
    for i, label in enumerate(names):
        axes[i,i].annotate(label, (0.5, 0.5), xycoords='axes fraction',
                ha='center', va='center')

    # Turn on the proper x or y axes ticks, alternating between the last
    # and the first row/column.
    for i, j in zip(range(numvars), itertools.cycle((-1, 0))):
        axes[j,i].xaxis.set_visible(True)
        axes[i,j].yaxis.set_visible(True)

    return fig

# Draw the scatterplot matrix; main() reads the notebook-level x.
main()

In [179]:
def least_squares(y, tx):
    """Fit least-squares weights by solving the normal equations.

    Solves ``(tx^T tx) w = tx^T y`` for ``w`` and prints the rank of the Gram
    matrix ``tx^T tx`` as a diagnostic.

    Args:
        y: target vector with one entry per row of ``tx``.
        tx: 2-D feature matrix, one row per sample.

    Returns:
        ``(w, loss)`` where ``loss`` is ``compute_loss(y, tx, w)``.

    Raises:
        numpy.linalg.LinAlgError: if the Gram matrix is singular.
    """
    gram = tx.T.dot(tx)
    print("Rank: " + str(np.linalg.matrix_rank(gram)))
    # Solve the linear system directly instead of forming the explicit inverse:
    # this is numerically more accurate and avoids the large intermediate
    # product inv(gram) @ tx.T.
    w = np.linalg.solve(gram, tx.T.dot(y))
    return w, compute_loss(y, tx, w)
    
# Closed-form fit on the training split; this rebinds w and loss.
w, loss = least_squares(train_y, train_x)
print("Loss: " + str(loss))
# The first "Accuracy" line is measured on the training split, the second on
# the held-out test split.
print("Accuracy: " + str(get_accuracy(train_x, train_y, w)))
print("Accuracy: " + str(get_accuracy(test_x, test_y, w)))

Rank: 19
Loss: 0.402476348718
Accuracy: 0.706655
Accuracy: 0.71032
