In [1]:
from project_helpers import *
import csv
import numpy as np
import pandas as pd
import math

In [2]:
def load_csv_data(data_path, sub_sample=False):
    """Load a challenge CSV file and return labels, features and event ids.

    The file is expected to have a header row, the event id in column 0, the
    class label ('s' or 'b') in column 1 and numeric features from column 2 on.

    Args:
        data_path: path of the CSV file to read.
        sub_sample: when True, keep only every 50th row.

    Returns:
        yb: float array of labels, -1 where the label is 'b' and 1 otherwise.
        input_data: 2-D float array with the feature columns.
        ids: integer array with the event ids.
    """
    # genfromtxt squeezes its result, so a one-row file would come back with
    # one dimension less; atleast_1d / atleast_2d restore the expected rank.
    y = np.atleast_1d(
        np.genfromtxt(data_path, delimiter=",", skip_header=1, dtype=str, usecols=1)
    )
    x = np.atleast_2d(np.genfromtxt(data_path, delimiter=",", skip_header=1))

    # The `np.int` alias was removed from NumPy; the builtin int is equivalent.
    ids = x[:, 0].astype(int)
    input_data = x[:, 2:]

    # convert class labels from strings to binary (-1, 1)
    yb = np.where(y == 'b', -1.0, 1.0)

    # sub-sample
    if sub_sample:
        yb = yb[::50]
        input_data = input_data[::50]
        ids = ids[::50]

    return yb, input_data, ids

def least_squares_GD(y, tx, initial_w, max_iters, gamma):
    """Linear regression fitted by batch gradient descent on the MSE loss.

    Args:
        y: 1-D array of targets, one per row of tx.
        tx: 2-D feature matrix.
        initial_w: starting weight vector (not modified).
        max_iters: number of gradient steps; 0 returns initial_w unchanged.
        gamma: step size.

    Returns:
        (w, loss): the final weights and the loss 0.5 * mean(e**2) evaluated
        at those final weights.
    """
    w = initial_w
    n_samples = y.shape[0]

    for _ in range(max_iters):
        # residuals and gradient of 0.5 * mean(e**2) at the current weights
        e = y - tx.dot(w)
        grad = -tx.T.dot(e) / n_samples

        # gradient descent step
        w = w - gamma * grad

    # Evaluate the loss at the weights that are actually returned. Reusing the
    # residuals from inside the loop would report the loss of the previous
    # iterate and would fail when max_iters is 0.
    e = y - tx.dot(w)
    loss = 0.5 * np.mean(e ** 2)

    return w, loss

def predict_labels(weights, data):
    """Turn linear scores data @ weights into class labels.

    Scores that are <= 0 become -1 and scores that are > 0 become 1; the
    labels are written into the score array, which is returned.
    """
    scores = np.dot(data, weights)
    non_positive = scores <= 0
    positive = scores > 0
    scores[non_positive] = -1
    scores[positive] = 1
    return scores

def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in csv format for submission to kaggle
    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string name of .csv output file to be created)

    The file has the header 'Id,Prediction' followed by one row per
    (id, prediction) pair, both written as integers. If ids and y_pred differ
    in length, the extra entries of the longer one are ignored (zip semantics).
    """
    # newline='' is required by the csv module: without it, platforms that
    # translate '\n' (e.g. Windows) end up with an empty line after every row.
    with open(name, 'w', newline='') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(
            {'Id': int(event_id), 'Prediction': int(label)}
            for event_id, label in zip(ids, y_pred)
        )

In [122]:
def AIC_forward(y, x, max_iters=3000, gamma=0.01, verbose=False):
    """Greedy forward selection of the columns of x using an AIC-style score.

    Column 0 is always kept (the callers place the intercept there). At each
    round every remaining column is tried in turn: a least-squares GD model is
    fitted on the picked columns plus the candidate, and the candidate with the
    lowest score is added if it improves on the best score so far. Selection
    stops at the first round that brings no strict improvement, or when no
    columns remain.

    The score is 2 * loss * n + 2 * k, with n the number of observations and
    k the number of columns in the model. Since the GD loss is half the mean
    squared error, the first term is the residual sum of squares.
    NOTE(review): this is RSS + 2k rather than the Gaussian log-likelihood form
    n * log(RSS / n) + 2k; kept as in the original, confirm this is intended.

    Args:
        y: 1-D array of targets.
        x: 2-D design matrix whose column 0 is always included.
        max_iters: gradient-descent iterations per candidate fit.
        gamma: gradient-descent step size per candidate fit.
        verbose: when True, print each candidate column set before fitting it.

    Returns:
        List of selected column indices, in the order they were added.
    """
    remaining = set(range(1, x.shape[1]))
    picked = [0]
    n_obs = y.shape[0]

    # Start from +inf rather than an arbitrary large constant, so the first
    # round is never rejected just because the scores are on a large scale.
    best_score = float("inf")

    while remaining:
        scored = []
        for covariate in sorted(remaining):
            columns = picked + [covariate]
            if verbose:
                print(columns)
            loss = least_squares_GD(
                y, x[:, columns], np.zeros(len(columns)), max_iters, gamma
            )[1]
            scored.append((2 * loss * n_obs + 2 * len(columns), covariate))

        round_score, best_cov = min(scored)

        # Stop unless the score strictly improves. This also covers a tie,
        # which previously left the loop condition true with nothing removed
        # and therefore never terminated.
        if not round_score < best_score:
            break

        remaining.remove(best_cov)
        picked.append(best_cov)
        best_score = round_score

    return picked

Exploratory Data Analysis:

In [3]:
# Load the training set into a DataFrame for exploration and preview its first rows
x_pd = pd.read_csv('Data/train.csv')
x_pd.head()

Unnamed: 0,Id,Prediction,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,...,PRI_met_phi,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt
0,100000,s,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,...,-0.277,258.733,2,67.435,2.15,0.444,46.062,1.24,-2.475,113.497
1,100001,b,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,...,-1.916,164.546,1,46.226,0.725,1.158,-999.0,-999.0,-999.0,46.226
2,100002,b,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,...,-2.186,260.414,1,44.251,2.053,-2.028,-999.0,-999.0,-999.0,44.251
3,100003,b,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,...,0.06,86.062,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0
4,100004,b,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,...,-0.871,53.131,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0


Many columns have missing values (encoded as -999.0). The missingness appears to depend on the value of PRI_jet_num.  
  
When jet_num=0:  
DER_deltaeta_jet_jet, DER_mass_jet_jet, DER_prodeta_jet_jet, DER_lep_eta_centrality, PRI_jet_leading_pt, PRI_jet_leading_eta, PRI_jet_leading_phi, PRI_jet_subleading_pt, PRI_jet_subleading_eta, PRI_jet_subleading_phi and PRI_jet_all_pt do not contain any information 
  
When jet_num=1:  
DER_deltaeta_jet_jet, DER_mass_jet_jet, DER_prodeta_jet_jet, DER_lep_eta_centrality, PRI_jet_subleading_pt, PRI_jet_subleading_eta, PRI_jet_subleading_phi do not contain any information  
  
When jet_num=2,3 every column contains relevant information  
  
The column DER_mass_MMC has missing values regardless of the jet_num value.

In [4]:
#x_pd.corr()

We observe that many columns are correlated. Strong correlation between covariates (multicollinearity) makes the coefficient estimates of a linear model unstable and may result in an ill-conditioned design matrix.

Model Building:

Approach 1: Fit the whole data set  
Approach 2: Split the data according to the jet_num value and fit models separately  
Approach 3: Build a new model based on AIC forward selection  
  
At each step we will use an 80/20 train/validation split, least squares GD and regularized least squares GD.

In [117]:
#Standardize data and add intercept
def set_up(tx):
    """Replace missing values, standardize every feature column and add an intercept.

    Steps:
      1. entries equal to -999 (the missing-value marker) are set to 0;
      2. each column is centred and scaled with its own mean and standard
         deviation (columns with zero spread are only centred);
      3. a column of ones is prepended as the intercept.

    The input array is left untouched; a new float array is returned.

    NOTE(review): the statistics come from the array passed in, so train and
    test sets prepared by separate calls are scaled with different means and
    deviations. Consider reusing the training statistics for the test data.

    Args:
        tx: 2-D array of features, one row per event.

    Returns:
        2-D float array with one more column than tx (intercept first).
    """
    # Float copy: avoids mutating the caller's data and integer truncation.
    features = np.array(tx, dtype=float)
    features[features == -999] = 0

    # Per-column statistics (axis=0). A single global mean/std would leave
    # the columns on very different scales.
    mean = features.mean(axis=0)
    std = features.std(axis=0)
    # Guard constant columns against division by zero.
    std = np.where(std == 0, 1.0, std)
    features = (features - mean) / std

    intercept = np.ones((features.shape[0], 1))
    return np.concatenate((intercept, features), axis=1)

In [118]:
def split_data(x, y, ratio, seed=1):
    """Randomly partition (x, y) into a training part and a test part.

    The global NumPy random generator is seeded with `seed`, the row indices
    are shuffled, and the first floor(ratio * len(y)) shuffled rows form the
    training part; the remaining rows form the test part.

    Returns:
        x_tr, x_te, y_tr, y_te
    """
    np.random.seed(seed)

    n_samples = len(y)
    shuffled = np.random.permutation(n_samples)
    cut = int(np.floor(ratio * n_samples))
    train_idx, test_idx = shuffled[:cut], shuffled[cut:]

    return x[train_idx], x[test_idx], y[train_idx], y[test_idx]

In [119]:
# Load the full training set as NumPy arrays:
# y (labels in {-1, 1}), tx (feature matrix), ids (event ids)
y, tx, ids = load_csv_data('Data/train.csv', sub_sample=False)

In [120]:
# Training events whose column 22 (presumably PRI_jet_num) equals 0
mask_j0 = tx[:, 22] == 0
# Feature columns kept for this subset
sel_j0 = [0, 1, 2, 3, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21]
tx_j0 = set_up(tx[mask_j0][:, sel_j0])
y_j0 = y[mask_j0]

In [125]:
#a = AIC_forward(y_j0, tx_j0)

In [75]:
# Training events whose column 22 (presumably PRI_jet_num) equals 1
mask_j1 = tx[:, 22] == 1
# Feature columns kept for this subset
sel_j1 = [0, 1, 2, 3, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 29]
tx_j1 = set_up(tx[mask_j1][:, sel_j1])
y_j1 = y[mask_j1]

In [76]:
# Training events whose column 22 (presumably PRI_jet_num) equals 2 or 3; all columns are kept
mask_j23 = (tx[:, 22] == 2) | (tx[:, 22] == 3)
tx_j23 = set_up(tx[mask_j23])
y_j23 = y[mask_j23]

In [98]:
# Fit a separate least-squares GD model on each jet-number subset
# (same settings for all three: zero initial weights, 5000 iterations, step size 1e-6)
(w0, loss0), (w1, loss1), (w23, loss23) = [
    least_squares_GD(labels, feats, np.zeros(feats.shape[1]), 5000, 0.000001)
    for labels, feats in ((y_j0, tx_j0), (y_j1, tx_j1), (y_j23, tx_j23))
]

Predicting test data:

In [103]:
# Load the full test set as NumPy arrays (same layout as the training arrays).
# y_t is built from the file's Prediction column and is not referenced again
# in the visible cells.
y_t, tx_t, ids_t = load_csv_data('Data/test.csv', sub_sample=False)

In [104]:
# Test events whose column 22 (presumably PRI_jet_num) equals 0, with the same columns as the training subset
mask_t_j0 = tx_t[:, 22] == 0
sel_j0 = [0, 1, 2, 3, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21]
tx_t_j0 = set_up(tx_t[mask_t_j0][:, sel_j0])
ids_t_j0 = ids_t[mask_t_j0]

In [106]:
# Test events whose column 22 (presumably PRI_jet_num) equals 1, with the same columns as the training subset
mask_t_j1 = tx_t[:, 22] == 1
sel_j1 = [0, 1, 2, 3, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 29]
tx_t_j1 = set_up(tx_t[mask_t_j1][:, sel_j1])
ids_t_j1 = ids_t[mask_t_j1]

In [107]:
# Test events whose column 22 (presumably PRI_jet_num) equals 2 or 3; all columns are kept
mask_t_j23 = (tx_t[:, 22] == 2) | (tx_t[:, 22] == 3)
tx_t_j23 = set_up(tx_t[mask_t_j23])
ids_t_j23 = ids_t[mask_t_j23]

In [190]:
# Predict labels for each test subset with the weights fitted on the matching training subset
y_pred_j0, y_pred_j1, y_pred_j23 = [
    predict_labels(weights, feats)
    for weights, feats in ((w0, tx_t_j0), (w1, tx_t_j1), (w23, tx_t_j23))
]

In [203]:
# Concatenate the per-subset ids and predictions in the same order (j0, j1, j23)
# so that position i of ids_sub matches position i of y_sub
ids_sub = [*ids_t_j0.tolist(), *ids_t_j1.tolist(), *ids_t_j23.tolist()]
y_sub = [*y_pred_j0.tolist(), *y_pred_j1.tolist(), *y_pred_j23.tolist()]

In [68]:
# NOTE(review): `pred` is not defined in the visible cells, and ids_t is in file
# order while the per-jet predictions are grouped by subset (ids_sub / y_sub).
# Presumably this call should pass ids_sub and y_sub -- confirm before enabling.
#create_csv_submission(ids_t, pred, 'pred full model unknown set to 0 LS GD')