In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
from implementations import *

In [3]:
# Location of the labelled training set, relative to the notebook's working directory.
DATA_TRAIN_PATH = '../data/train.csv'
# load_csv_data comes from one of the star-imported project modules; judging by the
# names it is unpacked into, it yields labels, features and event ids -- TODO confirm.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [4]:
# Keep the loaded matrix tX untouched: all further processing uses this copy.
X = tX.copy()

In [5]:
# Define some constants shared by the helper functions below

M = X.shape[1] # number of features (columns of X)
UNDEF = -999. # sentinel value that remove_undef_rows looks for to spot undefined entries
JET_NUM_INDEX = 22 # the index of the PRI_jet_num feature (column used to group the samples)

### Method definitions (move to another file eventually)

In [6]:
# group data by PRI_jet_num
def data_of_jet_num(X, y, jet_num):
    """Select the samples whose PRI_jet_num column equals ``jet_num``.

    Args:
        X: 2-D array of samples; column ``JET_NUM_INDEX`` is compared to ``jet_num``.
        y: labels aligned with the rows of ``X``, or ``None`` when there are none.
        jet_num: value the PRI_jet_num column must match.

    Returns:
        ``(X_group, y_group, mask)`` when ``y`` is given and ``(X_group, mask)``
        when ``y`` is ``None``. ``mask`` is the boolean row selector over ``X``,
        so callers can write results back to the right positions.
    """
    in_group = X[:, JET_NUM_INDEX] == jet_num
    selected = X[in_group]
    if y is not None:
        return selected, y[in_group], in_group
    return selected, in_group

In [7]:
# remove rows that still have undefined values
def remove_undef_rows(X,y,info=False):
    """Drop every sample (row) of ``X`` that contains the ``UNDEF`` sentinel.

    Args:
        X: 2-D array of samples, one row per sample.
        y: labels aligned with the rows of ``X``.
        info: when true, print how many samples were removed.

    Returns:
        ``(x_clean, y_clean)``: new arrays without the offending rows and
        without their labels. The inputs are not modified.
    """
    # One vectorized comparison over the whole matrix instead of a Python-level
    # loop with a membership test per row; the selected row indices are the same.
    to_remove = np.flatnonzero(np.any(X == UNDEF, axis=1))
    x_clean = np.delete(X, to_remove, axis=0)
    y_clean = np.delete(y, to_remove)
    if info:
        print('Removed',len(to_remove),'samples containing',UNDEF)
    return x_clean, y_clean

In [8]:
def remove_unwanted_cols(X,unwanted_cols):
    """Return a new array equal to ``X`` without the columns in ``unwanted_cols``.

    Args:
        X: 2-D array of samples.
        unwanted_cols: column indices to drop.

    Returns:
        A new array; ``X`` itself is left unchanged.
    """
    return np.delete(X, unwanted_cols, axis=1)

In [9]:
def clean_data(X,y,unwanted_cols):
    """Drop the given columns, then drop every row that still contains UNDEF.

    Args:
        X: 2-D array of samples.
        y: labels aligned with the rows of ``X``.
        unwanted_cols: column indices to remove before the row filter
            (this parameter shadows the notebook-level ``unwanted_cols`` list).

    Returns:
        ``(x_clean, y_clean)`` as produced by ``remove_undef_rows``.
    """
    without_cols = remove_unwanted_cols(X, unwanted_cols)
    x_clean, y_clean = remove_undef_rows(without_cols, y)
    return x_clean, y_clean

In [10]:
# compute weights for columns we selected
def regress(X, y, lamb=0):
    """Fit a ridge regression and return only its weights.

    Args:
        X: feature matrix.
        y: targets aligned with the rows of ``X``.
        lamb: regularisation strength forwarded to ``ridge_regression``.

    Returns:
        The first of the two values returned by ``ridge_regression``.
    """
    # ridge_regression takes the targets first, then the features.
    fitted = ridge_regression(y, X, lamb)
    weights, _ = fitted
    return weights

In [11]:
def accuracy(predictions, actual):
    """Fraction of entries of ``predictions`` that equal the matching entry of ``actual``.

    Args:
        predictions: array of predicted labels.
        actual: array of reference labels, same length as ``predictions``.

    Returns:
        Number of element-wise matches divided by ``len(actual)``.
    """
    n_correct = np.sum(predictions == actual)
    n_total = len(actual)
    return n_correct / n_total

### Implementation

In [12]:
# undefined columns for each group (one list of column indices per PRI_jet_num value)
# Index 22 is JET_NUM_INDEX itself and is dropped in every group.
unwanted0 = [4,5,6,8,9,12,22,23,24,25,26,27,28,29]
unwanted1 = [4,5,6,12,22,26,27,28]
unwanted2 = [22]
unwanted3 = [22]

# Indexed by jet number: unwanted_cols[i] is the list handed to the column-removal helpers for group i.
unwanted_cols = [unwanted0, unwanted1, unwanted2, unwanted3]

In [13]:
# Split the samples into train / validation / test parts. split_data is a project helper
# not shown here; presumably the remaining 1 - 0.4 - 0.2 = 0.4 of the samples form the
# test part -- TODO confirm.
# NOTE(review): if split_data shuffles, fix a random seed so the scores below are reproducible.
TRAIN_X, TRAIN_Y, VAL_X, VAL_Y, TEST_X, TEST_Y = split_data(X,y,train_ratio=0.4,validation_ratio=0.2)

### Train on our train set: 1 classifier per JET_NUM

In [14]:
#TODO Need to use crossvalidation to find the best degree (for now, 5 and 10 seem good)
#TODO Need to expand polynomial more (i.e., x1 * x2, x1 * x3)
def expand_poly(X, degree=10):
    """Append the element-wise powers 2..``degree`` of ``X`` as extra columns.

    Args:
        X: array of samples, one row per sample.
        degree: highest power to append; for ``degree < 2`` nothing is appended.

    Returns:
        A new array laid out as ``[X, X**2, ..., X**degree]`` column-wise.
        No constant column and no cross terms are added.
    """
    higher_powers = [np.power(X, d) for d in range(2, degree + 1)]
    if not higher_powers:
        # Nothing to append: hand back a plain copy of the input.
        return X.copy()
    return np.c_[tuple([X.copy()] + higher_powers)]

In [15]:
# main training method, returns a list of 4 sets of weights, one for each classifier
def train(train_x, train_y, lambs=(0, 0, 0, 0)):
    """Fit one ridge-regression model per PRI_jet_num value (0 to 3).

    For each jet number the matching samples are selected, the columns listed in
    ``unwanted_cols[i]`` and the rows still containing UNDEF are removed, the
    features are expanded with ``expand_poly`` (default degree) and the model is
    fitted with ``regress``.

    Args:
        train_x: 2-D array of training samples.
        train_y: labels aligned with the rows of ``train_x``.
        lambs: indexable of four regularisation strengths; ``lambs[i]`` is used
            for jet number ``i``. The default is an immutable tuple so the
            shared default object can never be modified between calls.

    Returns:
        List of four weight arrays, ordered by jet number.
    """
    # each model has its own weights, store them in a list
    weights = []

    # for every jet number
    for i in range(4):

        # get train data
        x, y, _ = data_of_jet_num(train_x, train_y, i)

        # clean training data (drops columns AND rows, unlike the prediction path)
        clean_x, clean_y = clean_data(x, y, unwanted_cols[i])

        expanded_x = expand_poly(clean_x)

        # fit the model
        weights.append(regress(expanded_x, clean_y, lambs[i]))

    return weights

In [16]:
# Baseline: fit the four per-jet models with the default lambdas (all 0).
weights = train(TRAIN_X, TRAIN_Y)

### Check accuracy with our test set

In [17]:
def predict(w, test_x):
    """Classify samples with a linear model, returning labels -1.0 or +1.0.

    Labels are assumed to be encoded as -1 / +1 -- TODO confirm against the loader.

    Args:
        w: weight vector of the model.
        test_x: 2-D array of samples whose columns match ``w``.

    Returns:
        Float array with +1.0 where the score ``test_x @ w`` is strictly
        positive and -1.0 everywhere else.
    """
    scores = np.dot(test_x, w)
    # np.sign would yield 0 for a score of exactly 0 (and nan for a nan score),
    # neither of which is a class label; thresholding always gives -1 or +1.
    return np.where(scores > 0, 1.0, -1.0)

In [18]:
# return predictions for a given chunk of data (validation or test set)
def predict_all(test_x, weights):
    """Predict a label for every row of ``test_x`` using the per-jet models.

    Args:
        test_x: 2-D array of samples; no rows are dropped here.
        weights: indexable of four weight arrays, ``weights[i]`` for jet number ``i``.

    Returns:
        1-D array with one prediction per row of ``test_x``. Rows whose
        PRI_jet_num is not one of 0..3 keep the initial value 1.
    """
    n_samples = test_x.shape[0]
    predictions = np.ones((n_samples,))

    for jet_num in range(4):
        # samples of this group and their positions in test_x
        group_x, group_mask = data_of_jet_num(test_x, None, jet_num)

        # same feature pipeline as in training, except that only columns are removed
        features = expand_poly(remove_unwanted_cols(group_x, unwanted_cols[jet_num]))

        # write the group's labels back at their original positions
        predictions[group_mask] = predict(weights[jet_num], features)

    return predictions

In [19]:
# method that predicts and returns the accuracy
def check(test_x, test_y, weights):
    """Predict labels for ``test_x`` and score them against ``test_y``.

    Args:
        test_x: 2-D array of samples.
        test_y: reference labels aligned with the rows of ``test_x``.
        weights: the four per-jet weight arrays passed on to ``predict_all``.

    Returns:
        The value of ``accuracy`` for the predictions of ``predict_all``.
    """
    return accuracy(predict_all(test_x, weights), test_y)

In [20]:
# Accuracy on the test split with the current `weights`.
check(TEST_X, TEST_Y, weights)

0.81204

### Tune lambda using the validation set

In [23]:
# return a list of 4 accuracies (1 for each jet num), to be used for finding best hyper parameters
def all_accuracies(val_x, val_y, weights):
    """Score each per-jet model separately on its own share of the data.

    Args:
        val_x: 2-D array of samples.
        val_y: labels aligned with the rows of ``val_x``.
        weights: indexable of four weight arrays, ``weights[i]`` for jet number ``i``.

    Returns:
        List of four accuracies, ordered by jet number.
    """
    scores = []

    for jet_num in range(4):
        # samples and labels of this group (the row mask is not needed here)
        group_x, group_y, _ = data_of_jet_num(val_x, val_y, jet_num)

        # only columns are removed, then the polynomial expansion is applied
        features = expand_poly(remove_unwanted_cols(group_x, unwanted_cols[jet_num]))

        group_pred = predict(weights[jet_num], features)
        scores.append(accuracy(group_pred, group_y))

    return scores

In [24]:
# get best lambdas by computing accuracy separately for each jet num
# disclaimer: this is far from optimal, but it runs fast enough, so there you go
# (every call to train refits all four models although only group i's lambda changes)

# candidate values: 0 followed by 10 log-spaced values from 1e-5 to 1
lambdas = np.logspace(-5, 0, 10)
lambdas = np.insert(lambdas, 0, 0)

# accuracies[i][j] = validation accuracy of group i when trained with lambdas[j]
accuracies = [[] for _ in range(4)]
for i in range(4):
    # all other groups keep lambda 0 while group i is swept
    lambs = [0,0,0,0]
    for j, lamb in enumerate(lambdas):
        lambs[i] = lamb
        # NOTE: this overwrites the notebook-level `weights` on every iteration
        weights = train(TRAIN_X, TRAIN_Y, lambs)
        accuracies[i].append(all_accuracies(VAL_X, VAL_Y, weights)[i])
        # progress counter, rewritten in place thanks to the leading '\r'
        print('\r%d / %d' % ((i)*len(lambdas)+(j+1), 4*len(lambdas)), end='')

44 / 44

In [25]:
# find best lambdas for each jet num
# accuracies holds one row per jet number and one column per candidate lambda;
# np.argmax returns the first maximum, i.e. the smallest candidate among ties.
best_ids = np.argmax(accuracies, axis=1)
best_lambdas = lambdas[best_ids]
best_lambdas

array([0.07742637, 0.00046416, 0.00599484, 0.07742637])

In [28]:
# Retrain the four models, each with the lambda that scored best on the validation split.
weights = train(TRAIN_X, TRAIN_Y, best_lambdas)

In [32]:
# Accuracy on the test split with the current `weights`.
check(TEST_X, TEST_Y, weights)

0.82473

# Generate predictions and save output in CSV format for submission:

In [33]:
# Location of the test file, relative to the notebook's working directory.
DATA_TEST_PATH = '../data/test.csv'
# The first value returned by load_csv_data is discarded; only the features and ids are kept.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [34]:
# predict_all removes columns only, never rows, so there is one prediction per test sample.
y_pred = predict_all(tX_test, weights)

In [35]:
# Destination of the submission file, relative to the notebook's working directory.
# NOTE(review): presumably the ../output directory has to exist beforehand -- confirm.
OUTPUT_PATH = '../output/predictions_4.csv'
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)