In [3]:
from implementations import *
from project_helpers import *
import csv
import numpy as np
import pandas as pd
import math

In [56]:
def AIC_forward(y, x):
    """Greedy forward selection of the columns of ``x`` with an AIC-style score.

    Column 0 is always part of the model (presumably the intercept column --
    confirm with the caller). In every round each remaining column is added
    tentatively to the columns picked so far, the model is refit with
    ``least_squares`` and scored as::

        score = 2 * loss * n_samples + 2 * n_columns

    where ``loss`` is the second value returned by ``least_squares``. The best
    candidate of the round is kept only if its score is strictly lower than
    the score of the current model; otherwise the search stops.

    Parameters
    ----------
    y : array whose first dimension is the number of samples.
    x : 2-D array of candidate columns, indexed as ``x[:, columns]``.

    Returns
    -------
    list of int
        Indices of the selected columns, in the order they were picked,
        always starting with 0.
    """
    remaining = set(range(1, x.shape[1]))
    picked = [0]
    n_samples = y.shape[0]

    # Start from +inf instead of a finite magic number: with a finite sentinel
    # (previously 1e6) nothing was ever selected when every candidate scored
    # above it, and the function silently returned [0].
    current = float("inf")

    while remaining:
        scored = []
        for covariate in remaining:
            columns = picked + [covariate]
            loss = least_squares(y, x[:, columns])[1]
            scored.append((2 * loss * n_samples + 2 * len(columns), covariate))

        # Tuples compare on the score first, then on the column index (ties).
        new, best_cov = min(scored)

        # Explicit stop instead of the former float-equality loop condition.
        if not new < current:
            break

        remaining.remove(best_cov)
        picked.append(best_cov)
        current = new

    return picked

In [5]:
def split_data(x, y, ratio, seed=1):
    """Randomly partition ``x`` and ``y`` into two parts.

    The global NumPy random generator is reseeded with ``seed``, a random
    permutation of the ``len(y)`` row indices is drawn, and the first
    ``floor(ratio * len(y))`` permuted rows form the first part; the rest
    form the second part.

    Returns ``(x_first, x_second, y_first, y_second)``.
    """
    # Reseeding the global generator makes the split reproducible.
    np.random.seed(seed)

    n_rows = len(y)
    shuffled = np.random.permutation(n_rows)
    cut = int(np.floor(ratio * n_rows))
    first_idx, second_idx = shuffled[:cut], shuffled[cut:]

    return x[first_idx], x[second_idx], y[first_idx], y[second_idx]

Exploratory Data Analysis:

In [6]:
# Read the training CSV into a DataFrame and show its first rows.
# x_pd is only referenced again in the (commented-out) correlation cell; the
# modelling cells reload the same file with load_csv_data.
x_pd = pd.read_csv('Data/train.csv')
x_pd.head()

Unnamed: 0,Id,Prediction,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,...,PRI_met_phi,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt
0,100000,s,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,...,-0.277,258.733,2,67.435,2.15,0.444,46.062,1.24,-2.475,113.497
1,100001,b,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,...,-1.916,164.546,1,46.226,0.725,1.158,-999.0,-999.0,-999.0,46.226
2,100002,b,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,...,-2.186,260.414,1,44.251,2.053,-2.028,-999.0,-999.0,-999.0,44.251
3,100003,b,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,...,0.06,86.062,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0
4,100004,b,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,...,-0.871,53.131,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0


Many columns have missing values, encoded as -999. The missingness appears to depend on the value of PRI_jet_num.  
  
When jet_num=0:  
DER_deltaeta_jet_jet, DER_mass_jet_jet, DER_prodeta_jet_jet, DER_lep_eta_centrality, PRI_jet_leading_pt, PRI_jet_leading_eta, PRI_jet_leading_phi, PRI_jet_subleading_pt, PRI_jet_subleading_eta, PRI_jet_subleading_phi and PRI_jet_all_pt do not contain any information 
  
When jet_num=1:  
DER_deltaeta_jet_jet, DER_mass_jet_jet, DER_prodeta_jet_jet, DER_lep_eta_centrality, PRI_jet_subleading_pt, PRI_jet_subleading_eta, PRI_jet_subleading_phi do not contain any information  
  
When jet_num=2,3 every column contains relevant information  
  
The column DER_mass_MMC has missing values for all jet_num values

In [7]:
#x_pd.corr()

We observe that many columns are correlated. This may result in an ill-conditioned design matrix when computing inverses.

Model Building:

Approach 1: Fit the whole data set  
Approach 2: Split the data according to the jet_num value and fit models separately  
Approach 3: Build a new model based on AIC forward selection  
  
At each step we will use an 80/20 train/validation split (hold-out validation), together with least squares GD and regularized least squares GD.

In [8]:
# Replace missing-value placeholders and add the intercept column
def set_up(tx):
    """Return a design matrix built from ``tx`` without modifying ``tx``.

    Every entry equal to -999 (the missing-value placeholder) is replaced by 0
    and a column of ones is prepended as column 0, so feature ``k`` of the
    input ends up in column ``k + 1`` of the result.

    No standardisation is applied (that step was disabled in the notebook).

    Parameters
    ----------
    tx : 2-D array-like of shape (n_samples, n_features).

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_features + 1).
    """
    tx = np.asarray(tx)
    # np.where builds a new array, so the caller's data is left untouched
    # (the previous in-place assignment overwrote the argument).
    cleaned = np.where(tx == -999, 0, tx)
    intercept = np.ones((cleaned.shape[0], 1))
    return np.concatenate((intercept, cleaned), axis=1)

In [9]:
# Load the training CSV through the project helper; its three return values
# are stored as y, tx and ids.
# NOTE(review): the cells below treat column 22 of tx as PRI_jet_num --
# confirm against load_csv_data.
y, tx, ids = load_csv_data('Data/train.csv', sub_sample=False)

In [73]:
#tx = set_up(tx)

In [95]:
#w, loss = ridge_regression(y, tx, 10**-7)

In [12]:
# Training subset with tx[:, 22] == 0 (the jet_num = 0 group).
# sel_j0 keeps the listed feature columns only; it leaves out 4, 5, 6, 12 and
# 22-29 (presumably the columns described above as uninformative for
# jet_num = 0 -- verify the index/name mapping).
sel_j0 = [0,1,2,3,7,8,9,10,11,13,14,15,16,17,18,19,20,21]
tx_j0 = set_up(tx[tx[:,22]==0][:,sel_j0])
y_j0 = y[tx[:,22]==0]

In [27]:
# Training subset with tx[:, 22] == 1 (the jet_num = 1 group).
# sel_j1 keeps the listed feature columns only; it leaves out 4, 5, 6, 12, 22
# and 26-28 (presumably the columns described above as uninformative for
# jet_num = 1 -- verify the index/name mapping).
sel_j1 = [0,1,2,3,7,8,9,10,11,13,14,15,16,17,18,19,20,21,23,24,25,29]
tx_j1 = set_up(tx[tx[:,22]==1][:,sel_j1])
y_j1 = y[tx[:,22]==1]

In [34]:
# Training subset with tx[:, 22] equal to 2 or 3; every feature column is kept.
tx_j23 = set_up(tx[np.isin(tx[:,22], (2, 3))])
y_j23 = y[np.isin(tx[:,22], (2, 3))]

In [45]:
# Fit one ridge regression per jet_num group (0, 1, and 2/3 together), each on
# its own subset and with the same regularisation argument 10**-6. The two
# values returned by each call are stored as w<group> and loss<group>.
wj0, lossj0 = ridge_regression(y_j0, tx_j0, 10**-6)
wj1, lossj1 = ridge_regression(y_j1, tx_j1, 10**-6)
wj23, lossj23 = ridge_regression(y_j23, tx_j23, 10**-6)

Predicting test data:

In [96]:
# Load the test CSV with the same helper used for the training data.
# NOTE(review): y_t is not referenced by any later cell visible here.
y_t, tx_t, ids_t = load_csv_data('Data/test.csv', sub_sample=False)

In [97]:
# Replace the -999 placeholders with 0 and prepend the intercept column.
# From here on raw feature k sits at column k + 1 of tx_t.
# NOTE(review): not idempotent -- re-running this cell prepends another
# intercept column.
tx_t = set_up(tx_t)

In [99]:
# `w` (the full-data ridge weights) is not assigned by any live cell above --
# the fit only survives as a commented-out line -- so on a fresh kernel this
# cell raised NameError. Fit it here on training data prepared the same way as
# tx_t. set_up is given a copy so the raw `tx`, which the jet_num cells index
# by column, is never modified.
w, loss = ridge_regression(y, set_up(tx.copy()), 10**-7)
pred = predict_labels(w, tx_t)

In [104]:
# tx_t has already been through set_up(), so its column 0 is the intercept and
# raw feature k sits at column k + 1. Indexing tx_t directly with column 22 and
# sel_j0 therefore addressed different features than the training split did
# (and a second intercept was prepended). Strip the intercept first so the
# indices mean the same as for `tx`.
assert tx_t.shape[1] == tx.shape[1] + 1, "expected tx_t to carry exactly one intercept column"
tx_t_raw = tx_t[:, 1:]
mask_t_j0 = tx_t_raw[:, 22] == 0
sel_j0 = [0,1,2,3,7,8,9,10,11,13,14,15,16,17,18,19,20,21]
tx_t_j0 = set_up(tx_t_raw[mask_t_j0][:, sel_j0])
ids_t_j0 = ids_t[mask_t_j0]

In [106]:
# tx_t has already been through set_up(), so its column 0 is the intercept and
# raw feature k sits at column k + 1. Indexing tx_t directly with column 22 and
# sel_j1 therefore addressed different features than the training split did
# (and a second intercept was prepended). Strip the intercept first so the
# indices mean the same as for `tx`.
assert tx_t.shape[1] == tx.shape[1] + 1, "expected tx_t to carry exactly one intercept column"
tx_t_raw = tx_t[:, 1:]
mask_t_j1 = tx_t_raw[:, 22] == 1
sel_j1 = [0,1,2,3,7,8,9,10,11,13,14,15,16,17,18,19,20,21,23,24,25,29]
tx_t_j1 = set_up(tx_t_raw[mask_t_j1][:, sel_j1])
ids_t_j1 = ids_t[mask_t_j1]

In [107]:
# tx_t has already been through set_up(), so its column 0 is the intercept and
# raw feature k sits at column k + 1. Filtering on tx_t[:, 22] therefore tested
# a different feature than the training split did, and calling set_up again
# produced one column more than the training matrix has. Strip the intercept
# first so the indices mean the same as for `tx`.
assert tx_t.shape[1] == tx.shape[1] + 1, "expected tx_t to carry exactly one intercept column"
tx_t_raw = tx_t[:, 1:]
mask_t_j23 = (tx_t_raw[:, 22] == 2) | (tx_t_raw[:, 22] == 3)
tx_t_j23 = set_up(tx_t_raw[mask_t_j23])
ids_t_j23 = ids_t[mask_t_j23]

In [190]:
# Predict each jet_num group with the weights fitted on the matching training
# subset. Those weights are named wj0 / wj1 / wj23 in the fitting cell; the
# names w0 / w1 / w23 used here before are not assigned anywhere in the
# notebook and raised NameError on a fresh kernel.
y_pred_j0 = predict_labels(wj0, tx_t_j0)
y_pred_j1 = predict_labels(wj1, tx_t_j1)
y_pred_j23 = predict_labels(wj23, tx_t_j23)

In [203]:
# Stitch the per-group ids and predictions together in the same group order
# (jet 0, jet 1, jet 2/3) so that position i of ids_sub matches position i of
# y_sub.
ids_sub = [*ids_t_j0.tolist(), *ids_t_j1.tolist(), *ids_t_j23.tolist()]
y_sub = [*y_pred_j0.tolist(), *y_pred_j1.tolist(), *y_pred_j23.tolist()]

In [103]:
# Write the full-model predictions (pred, computed from tx_t, paired with ids_t)
# through the project helper, passing 'pred full model ridge reg' as the name.
# NOTE(review): the per-jet results (ids_sub / y_sub) are not written by this cell.
create_csv_submission(ids_t, pred, 'pred full model ridge reg')