In [73]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the training data into feature matrix, class labels, and event ids:

In [74]:
from proj1_helpers import *
from implementations import *

In [75]:
# Path of the training CSV, relative to the notebook's working directory.
DATA_TRAIN_PATH = '../data/train.csv'
# Per the section heading: y = class labels, tX = feature matrix, ids = event ids.
# load_csv_data comes from one of the star imports (presumably proj1_helpers -- confirm).
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [76]:
# Work on a copy so the freshly loaded matrix tX is never modified by later cells.
X = tX.copy()

In [77]:
# Define some constants

M = X.shape[1] # number of features
UNDEF = -999. # value that marks an undefined entry; rows containing it are dropped by remove_undef_rows
JET_NUM_INDEX = 22 # the index of the PRI_jet_num feature

### Method definitions (move to another file eventually)

In [78]:
# group data by PRI_jet_num
def data_of_jet_num(X, y, jet_num):
    """Select the samples whose PRI_jet_num column equals `jet_num`.

    Args:
        X: 2-D feature matrix; column JET_NUM_INDEX is compared to `jet_num`.
        y: per-sample values indexed with the same row mask as X.
        jet_num: the jet number to select.

    Returns:
        (rows of X in the group, entries of y in the group, boolean row mask).
    """
    jet_column = X[:, JET_NUM_INDEX]
    selected = jet_column == jet_num
    return X[selected], y[selected], selected

In [119]:
# remove rows that still have undefined values
def remove_undef_rows(X,y,info=False):
    """Drop every sample that contains the UNDEF marker in any feature.

    Args:
        X: 2-D feature matrix (one sample per row).
        y: per-sample values aligned with the rows of X.
        info: when True, print how many samples were removed.

    Returns:
        (x_clean, y_clean): copies of X and y without the offending samples.
    """
    # One vectorised comparison replaces the Python loop over rows; the result
    # is the same set of row indices the loop used to collect.
    has_undef = (np.asarray(X) == UNDEF).any(axis=1)
    to_remove = np.flatnonzero(has_undef)

    x_clean = np.delete(X, to_remove, axis=0)
    # No axis here, exactly as before: y is treated as a flat vector.
    y_clean = np.delete(y, to_remove)
    if info:
        print('Removed',len(to_remove),'samples containing',UNDEF)
    return x_clean, y_clean

In [120]:
def remove_unwanted_cols(X,unwanted_cols):
    """Return a copy of X without the columns listed in `unwanted_cols`.

    Args:
        X: 2-D feature matrix.
        unwanted_cols: column indices to drop.

    Returns:
        A new array; X itself is left untouched.
    """
    return np.delete(X, unwanted_cols, axis=1)

In [130]:
def clean_data(X,y,unwanted_cols):
    """Drop the unwanted columns, then drop every sample that still contains UNDEF.

    Args:
        X: 2-D feature matrix.
        y: per-sample values aligned with the rows of X.
        unwanted_cols: column indices to remove before the row filter.

    Returns:
        (x_clean, y_clean) as produced by remove_undef_rows.
    """
    without_cols = remove_unwanted_cols(X, unwanted_cols)
    x_clean, y_clean = remove_undef_rows(without_cols, y)
    return x_clean, y_clean

In [82]:
# compute weights for columns we selected
def regress(X, y, lamb=0):
    """Fit weights with ridge_regression and return only the weight vector.

    Args:
        X: feature matrix of the selected columns.
        y: target values.
        lamb: regularisation strength forwarded to ridge_regression (default 0).

    Returns:
        The first element of the pair returned by ridge_regression.
    """
    # ridge_regression takes (y, X, lambda) and returns a pair; the second
    # element is not needed here.
    weights, _unused = ridge_regression(y, X, lamb)
    return weights

In [96]:
def predict(w, test_x):
    """Predict a class label for every row of `test_x` from the linear score test_x @ w.

    Args:
        w: weight vector.
        test_x: feature matrix whose columns match the entries of w.

    Returns:
        Float array holding 1.0 for a positive score and -1.0 for a negative or
        exactly-zero score. NaN scores stay NaN so that upstream problems remain
        visible.
    """
    scores = test_x @ w
    labels = np.sign(scores)*1.0
    # np.sign maps an exactly-zero score to 0, which is not one of the two class
    # labels; send those ties to the negative class instead.
    return np.where(labels == 0, -1.0, labels)

In [84]:
def accuracy(predictions, actual):
    """Return the fraction of entries of `predictions` that equal `actual`.

    Args:
        predictions: predicted labels.
        actual: reference labels, same length as predictions.

    Returns:
        Number of matching entries divided by len(actual).
    """
    hits = predictions == actual
    n_correct = np.sum(hits)
    return n_correct / len(actual)

### Implementation

In [85]:
# undefined columns for each group
# Each list holds the column indices dropped for one PRI_jet_num value (0..3).
# Column 22 (JET_NUM_INDEX) appears in every list: it is constant within a group.
unwanted0 = [4,5,6,8,9,12,22,23,24,25,26,27,28,29]
unwanted1 = [4,5,6,12,22,26,27,28]
unwanted2 = [22]
unwanted3 = [22]

# Indexed by jet number: unwanted_cols[i] is used for the group PRI_jet_num == i.
unwanted_cols = [unwanted0, unwanted1, unwanted2, unwanted3]

In [86]:
# Split the data into train / validation / test parts.
# NOTE(review): split_data comes from a star import; presumably 0.4 and 0.2 are the
# train and validation fractions (leaving the rest for test) -- confirm against its definition.
# NOTE(review): no random seed is set in this notebook; if split_data shuffles, the
# split (and the accuracy reported below) will differ between runs -- confirm.
TRAIN_X, TRAIN_Y, VAL_X, VAL_Y, TEST_X, TEST_Y = split_data(X,y,0.4,0.2)

### Train on our train set: 1 classifier per JET_NUM

In [131]:
# main training method, returns a list of 4 sets of weights, one for each classifier
def train(train_x, train_y, lamb=0):
    """Fit one model per PRI_jet_num value (0 to 3).

    Args:
        train_x: training feature matrix (all columns, all jet numbers).
        train_y: training labels aligned with the rows of train_x.
        lamb: regularisation strength forwarded to regress (default 0).

    Returns:
        List of 4 weight vectors; entry i belongs to the group PRI_jet_num == i.
    """
    def fit_group(jet_num):
        # samples belonging to this jet number
        group_x, group_y, _ = data_of_jet_num(train_x, train_y, jet_num)

        # drop the group's undefined columns, then any row still containing UNDEF
        features, targets = clean_data(group_x, group_y, unwanted_cols[jet_num])

        # build polynomial expansion
        ### TODO
        expanded = features

        # each model has its own weights
        return regress(expanded, targets, lamb)

    return [fit_group(jet_num) for jet_num in range(4)]

In [143]:
# Fit the four per-jet models on the training split (lamb keeps its default of 0).
weights = train(TRAIN_X, TRAIN_Y)

### Check accuracy with our test set

In [144]:
# return predictions for a given chunk of data (validation or test set)
def predict_all(test_x, test_y, weights):
    """Predict a label for every row of `test_x` using the per-jet models.

    Args:
        test_x: feature matrix (all columns, all jet numbers).
        test_y: only its shape is used for the output; its values are used
            solely to satisfy data_of_jet_num's signature.
        weights: list of weight vectors indexed by jet number (as built by train).

    Returns:
        Array of shape test_y.shape; rows not matched by any jet number 0..3
        keep the initial value 1.
    """
    predictions = np.ones(test_y.shape)

    for jet_num in range(4):
        # rows of this jet group plus the boolean mask locating them in test_x
        group_x, _, group_mask = data_of_jet_num(test_x, test_y, jet_num)

        # only the undefined columns are dropped here; no rows are removed,
        # so every sample receives a prediction
        features = remove_unwanted_cols(group_x, unwanted_cols[jet_num])

        # build polynomial expansion
        ### TODO
        expanded = features

        # write the group's labels back at their original positions
        predictions[group_mask] = predict(weights[jet_num], expanded)

    return predictions

In [147]:
# method that predicts and returns the accuracy
def check(test_x, test_y, weights):
    """Run predict_all on `test_x` and score the result against `test_y`.

    Returns:
        The value of accuracy(predictions, test_y).
    """
    return accuracy(predict_all(test_x, test_y, weights), test_y)

In [148]:
# Accuracy of the per-jet models on the held-out test split.
check(TEST_X, TEST_Y, weights)

0.75437

### Fine-tune lambda with the validation set

In [37]:
### TODO

# Generate predictions and save output in CSV format for submission:

In [90]:
def predict_for_submission(test_x, weights):
    """Predict labels for unlabeled data.

    predict_all requires a label array, so a placeholder vector of ones with
    one entry per row of `test_x` is passed in its place.
    """
    n_samples = test_x.shape[0]
    placeholder_y = np.ones((n_samples,))
    return predict_all(test_x, placeholder_y, weights)

In [70]:
# Path of the test CSV, relative to the notebook's working directory.
DATA_TEST_PATH = '../data/test.csv'
# The first returned value (the label slot) is discarded; only features and ids are kept.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
# Per-jet predictions for the test file, using the weights fitted by train().
y_pred = predict_for_submission(tX_test, weights)

In [None]:
OUTPUT_PATH = '../output/predictions_2.csv'
# y_pred is the per-jet prediction computed by predict_for_submission(tX_test, weights).
# The `predict(ws, tX_test)` call that used to sit here referenced an undefined name
# (`ws`) and would have replaced the per-jet predictions with a single-model
# prediction on the uncleaned feature matrix.
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)