In [534]:
from project_helpers import *
import csv
import numpy as np
import pandas as pd
import math

In [535]:
def sigmoid(z):
    """Return the logistic sigmoid of ``z`` (scalar or array, elementwise).

    The value is computed through the identity
    ``1 / (1 + exp(-z)) == 0.5 * (1 + tanh(z / 2))``. This is exact, not an
    approximation; the tanh form is used because ``np.exp(-z)`` overflows
    for large negative ``z`` while ``np.tanh`` saturates cleanly.
    """
    half_z = .5 * z
    return .5 * (1 + np.tanh(half_z))

In [536]:
def loss_f_lr(h, y):
    """Summed binary cross-entropy between predictions ``h`` and labels ``y``.

    Computes ``sum(-y*log(h + eps) - (1 - y)*log(1 - h + eps))``. The small
    constant ``eps`` is added inside both logarithms so that predictions of
    exactly 0 or 1 never produce ``log(0)``.
    """
    eps = 0.00001
    positive_term = -y * np.log(h + eps)
    negative_term = (1 - y) * np.log(1 - h + eps)
    return (positive_term - negative_term).sum()

In [537]:
def logistic_regression(y, tx, initial_w, max_iters, gamma, pandas=False):
    """Fit logistic regression with single-sample stochastic gradient descent.

    Parameters
    ----------
    y : array of labels, indexed like the rows of ``tx``
        (assumed to be 0/1 -- TODO confirm against callers).
    tx : 2-D feature array; ``tx[n]`` is the feature row of sample ``n``.
    initial_w : starting weight vector (not modified in place).
    max_iters : maximum number of SGD steps.
    gamma : step size.
    pandas : unused; kept only so existing call sites keep working.

    Returns
    -------
    (w, loss) : the final weights and the loss evaluated at those weights.

    Notes
    -----
    Each iteration evaluates the loss on the full data set and stops early
    when it changes by less than 1e-5 between two consecutive iterations.
    The sample for each step is drawn with ``np.random.randint``, so results
    depend on NumPy's global random state.
    """
    tolerance = 0.00001
    w = initial_w
    loss_prev = 1000000

    for _ in range(max_iters):
        # Predictions and full-data loss for the current weights.
        h = sigmoid(np.dot(tx, w))
        loss_curr = loss_f_lr(h, y)

        if abs(loss_prev - loss_curr) < tolerance:
            break
        loss_prev = loss_curr

        # Gradient of the loss on one uniformly drawn sample.
        n = np.random.randint(len(y))
        stoch_gradient = (h[n] - y[n]) * tx[n]

        w = w - gamma * stoch_gradient

    # Re-evaluate at the final weights: the `h` from the loop predates the
    # last update (and does not exist at all when max_iters == 0), so
    # reusing it would report a loss that does not belong to the returned w.
    loss = loss_f_lr(sigmoid(np.dot(tx, w)), y)

    return (w, loss)

In [538]:
def reg_logistic_regression(y, tx, lambda_, initial_w, max_iters, gamma):
    """Fit L2-regularized logistic regression with full-batch gradient descent.

    Parameters
    ----------
    y : array of labels, one per row of ``tx``
        (assumed to be 0/1 -- TODO confirm against callers).
    tx : 2-D feature array.
    lambda_ : weight of the L2 penalty ``(lambda_ / 2) * w.T @ w``.
    initial_w : starting weight vector (not modified in place).
    max_iters : number of gradient steps; there is no early stopping.
    gamma : step size.

    Returns
    -------
    (w, loss) : the final weights and the penalized loss at those weights.
    """
    w = initial_w

    for _ in range(max_iters):
        h = sigmoid(np.dot(tx, w))

        # Full-data gradient of the loss plus the gradient of the L2 penalty.
        gradient = np.dot(tx.T, h - y) + lambda_ * w
        w = w - gamma * gradient

    # Evaluate both the data term and the penalty at the final weights.
    # The `h` left over from the loop belongs to the weights before the last
    # step (and is undefined for max_iters == 0), which would make the two
    # terms of the reported loss inconsistent with each other.
    h = sigmoid(np.dot(tx, w))
    loss = loss_f_lr(h, y) + (lambda_ / 2) * np.dot(w.T, w)

    return (w, loss)

In [539]:
def proba(X, w):
    """Return sigmoid(X . w): the model's predicted probability per row of X."""
    scores = np.dot(X, w)
    return sigmoid(scores)

def predict(X, w, threshold=0.5):
    """Return a boolean per row of X: True where proba(X, w) >= threshold."""
    probabilities = proba(X, w)
    return probabilities >= threshold

In [540]:
def AIC_forward(y, x_pd):
    """Greedy forward selection of the columns of ``x_pd`` by AIC.

    The last column of ``x_pd`` is always kept (the callers in this notebook
    place the 'Intercept' column there). At every round each remaining
    column is tried in turn: a model is fitted with ``logistic_regression``
    on the already-picked columns plus the candidate, and scored with
    ``AIC = 2 * loss + 2 * number_of_columns``. The best candidate is added
    if it strictly lowers the AIC; otherwise selection stops.

    Parameters
    ----------
    y : label array passed through to ``logistic_regression``.
    x_pd : pandas DataFrame of candidate features (always-kept column last).

    Returns
    -------
    numpy array of the selected column names, in the order they were picked
    (the always-kept last column first).
    """
    x = np.array(x_pd)

    last_col = x_pd.shape[1] - 1
    left = set(range(x_pd.shape[1]))
    left.remove(last_col)

    picked = [last_col]

    # Start from +inf rather than a fixed large number, so the first round
    # is accepted no matter how large the summed loss is.
    current = np.inf

    while left:
        aics_cov = []

        for covariate in left:
            columns = picked + [covariate]
            loss = logistic_regression(y, x[:, columns], np.zeros(len(columns)), 100000, 0.01)[1]
            aic = 2*loss + 2*len(columns)
            aics_cov.append((aic, covariate))

        # Tuple comparison: lowest AIC wins, ties go to the lowest column index.
        new, best_cov = min(aics_cov)

        # Stop on no strict improvement. Written as `not <` so that a NaN
        # score also terminates the search instead of being accepted, and an
        # exact tie can no longer loop forever.
        if not new < current:
            break

        left.remove(best_cov)
        picked.append(best_cov)
        current = new

    return np.array(x_pd.columns)[picked]

In [541]:
#Load train data
#y, tx, ids = load_csv_data('Data/train.csv', sub_sample=True)
#intercept = np.ones((tx.shape[0], 1))
#tx = np.concatenate((intercept, tx), axis=1)

In [542]:
#Fit whole data
#w1, loss1 = logistic_regression(y, tx, np.zeros(tx.shape[1]), 100000, 0.01)

In [543]:
#Train validation
#preds = predict(tx,w1)
#(preds == y).mean()

**Loading Train Data**  

In [544]:
# Load a sub-sample of the training data: every 25th row of the CSV.
# .copy() turns the sub-sample into an independent frame, so the column
# assignments below do not write into a slice of the full frame (the
# situation pandas reports as SettingWithCopyWarning).
x = pd.read_csv('Data/train.csv').iloc[::25, :].copy()
x['Intercept'] = 1
# Encode the label column: 's' -> 1, anything else -> 0.
x["Prediction"] = (x["Prediction"] == 's').astype('int64')

In [545]:
# Training subgroup: rows with PRI_jet_num == 0.
rows_j0 = x.loc[x["PRI_jet_num"] == 0]
y_j0 = np.array(rows_j0["Prediction"])

# Label, id, the grouping column and the jet-specific columns are removed
# (presumably the jet columns carry no information when there are no jets
# -- TODO confirm).
dropped_cols_j0 = [
    'Prediction', 'DER_deltaeta_jet_jet',
    'DER_mass_jet_jet', 'DER_prodeta_jet_jet',
    'DER_lep_eta_centrality', 'PRI_jet_leading_pt',
    'PRI_jet_leading_eta', 'PRI_jet_leading_phi',
    'PRI_jet_subleading_pt', 'PRI_jet_subleading_eta',
    'PRI_jet_subleading_phi', 'PRI_jet_num', 'Id', 'PRI_jet_all_pt',
]
df_j0 = rows_j0.drop(columns=dropped_cols_j0)

# -999 in DER_mass_MMC is treated as a missing-value marker and replaced by
# the mean of the remaining values; mean_j0 is reused for the test data.
mass_j0 = df_j0['DER_mass_MMC']
mean_j0 = mass_j0[mass_j0 != -999].mean()
# Standardization is currently disabled:
#df_j0 = (df_j0 - df_j0.mean()) / df_j0.std()
df_j0 = df_j0.replace({'DER_mass_MMC': {-999: mean_j0}}).assign(Intercept=1)

In [546]:
# Training subgroup: rows with PRI_jet_num == 1.
rows_j1 = x.loc[x["PRI_jet_num"] == 1]
y_j1 = np.array(rows_j1["Prediction"])

# Label, id, the grouping column and the two-jet / subleading-jet columns
# are removed for this subgroup.
dropped_cols_j1 = [
    'PRI_jet_num', 'Prediction', 'Id', 'DER_deltaeta_jet_jet',
    'DER_mass_jet_jet', 'DER_prodeta_jet_jet',
    'DER_lep_eta_centrality', 'PRI_jet_subleading_pt',
    'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi',
]
df_j1 = rows_j1.drop(columns=dropped_cols_j1)

# -999 in DER_mass_MMC is treated as a missing-value marker and replaced by
# the mean of the remaining values; mean_j1 is reused for the test data.
mass_j1 = df_j1['DER_mass_MMC']
mean_j1 = mass_j1[mass_j1 != -999].mean()
# Standardization is currently disabled:
#df_j1 = (df_j1 - df_j1.mean()) / df_j1.std()
df_j1 = df_j1.replace({'DER_mass_MMC': {-999: mean_j1}}).assign(Intercept=1)

In [547]:
# Training subgroup: rows with PRI_jet_num equal to 2 or 3.
rows_j23 = x.loc[x["PRI_jet_num"].isin([2, 3])]
y_j23 = np.array(rows_j23["Prediction"])

# Only the label, the id and the grouping column are removed here.
df_j23 = rows_j23.drop(columns=['Prediction', 'Id', 'PRI_jet_num'])

# -999 in DER_mass_MMC is treated as a missing-value marker and replaced by
# the mean of the remaining values; mean_j23 is reused for the test data.
mass_j23 = df_j23['DER_mass_MMC']
mean_j23 = mass_j23[mass_j23 != -999].mean()
# Standardization is currently disabled:
#df_j23 = (df_j23 - df_j23.mean()) / df_j23.std()
df_j23 = df_j23.replace({'DER_mass_MMC': {-999: mean_j23}}).assign(Intercept=1)

**Performing forward selection for each subgroup**

In [548]:
# Columns used for the jet=0 model. Hard-coded here; presumably the saved
# result of the (slow) selection run below -- re-run it to confirm:
#ij0 = AIC_forward(y_j0, df_j0)
ij0 = np.array([
    'Intercept',
    'DER_mass_transverse_met_lep',
    'PRI_tau_pt',
    'DER_deltar_tau_lep',
    'DER_mass_vis',
    'DER_sum_pt',
    'PRI_met',
])

In [549]:
# Fit the jet=0 model on the selected columns only, starting from zero weights.
# The fit draws samples from NumPy's global random state and no seed is set
# in this notebook, so w0 / loss0 vary between runs.
df_jet0_aic = df_j0.loc[:, ij0]
x_jet0_aic = df_jet0_aic.to_numpy()

w0, loss0 = logistic_regression(
    y_j0,
    x_jet0_aic,
    initial_w=np.zeros(len(ij0)),
    max_iters=100000,
    gamma=0.01,
)

In [550]:
# Training accuracy for the jet=0 model: share of rows whose thresholded
# prediction equals the 0/1 label.
preds = predict(x_jet0_aic, w0)
np.mean(preds == y_j0)

0.7658536585365854

In [551]:
# Columns used for the jet=1 model. Hard-coded here; presumably the saved
# result of the (slow) selection run below -- re-run it to confirm:
#ij1 = AIC_forward(y_j1, df_j1)
ij1 = np.array([
    'Intercept',
    'DER_mass_transverse_met_lep',
    'PRI_tau_pt',
    'DER_met_phi_centrality',
    'PRI_met',
    'DER_pt_h',
    'DER_sum_pt',
    'DER_mass_vis',
    'DER_deltar_tau_lep',
    'DER_pt_tot',
    'PRI_met_sumet',
    'PRI_jet_leading_phi',
])

In [552]:
# Fit the jet=1 model on the selected columns only, starting from zero weights.
# The fit draws samples from NumPy's global random state and no seed is set
# in this notebook, so w1 / loss1 vary between runs.
df_jet1_aic = df_j1.loc[:, ij1]
x_jet1_aic = df_jet1_aic.to_numpy()

w1, loss1 = logistic_regression(
    y_j1,
    x_jet1_aic,
    initial_w=np.zeros(len(ij1)),
    max_iters=100000,
    gamma=0.01,
)

In [553]:
# Training accuracy for the jet=1 model: share of rows whose thresholded
# prediction equals the 0/1 label.
preds = predict(x_jet1_aic, w1)
np.mean(preds == y_j1)

0.645872715816005

In [554]:
# Columns used for the jet=2,3 model. Hard-coded here; presumably the saved
# result of the (slow) selection run below -- re-run it to confirm:
#ij23 = AIC_forward(y_j23, df_j23)
ij23 = np.array([
    'Intercept',
    'DER_deltaeta_jet_jet',
    'DER_met_phi_centrality',
    'PRI_tau_pt',
    'PRI_met_sumet',
    'DER_lep_eta_centrality',
    'DER_mass_transverse_met_lep',
    'DER_mass_jet_jet',
    'DER_pt_ratio_lep_tau',
    'DER_pt_h',
    'DER_deltar_tau_lep',
    'PRI_lep_pt',
    'DER_mass_vis',
    'PRI_jet_all_pt',
])

In [555]:
# Fit the jet=2,3 model on the selected columns only, starting from zero weights.
# The fit draws samples from NumPy's global random state and no seed is set
# in this notebook, so w23 / loss23 vary between runs.
df_jet23_aic = df_j23.loc[:, ij23]
x_jet23_aic = df_jet23_aic.to_numpy()

w23, loss23 = logistic_regression(
    y_j23,
    x_jet23_aic,
    initial_w=np.zeros(len(ij23)),
    max_iters=100000,
    gamma=0.01,
)

In [556]:
# Training accuracy for the jet=2,3 model: share of rows whose thresholded
# prediction equals the 0/1 label.
# NOTE(review): the output saved with this cell is below 0.5, i.e. worse than
# always guessing one class -- possibly related to the features not being
# standardized before SGD; worth investigating.
preds = predict(x_jet23_aic, w23)
np.mean(preds == y_j23)

0.44626407369498466

**Test Data Prediction**

In [557]:
# Load the full test set and append a constant 'Intercept' column.
x_t = pd.read_csv('Data/test.csv').assign(Intercept=1)

In [558]:
#Test for jet value=0

# Test rows with PRI_jet_num == 0, preprocessed like the matching training
# subgroup and scored with the jet=0 weights.
rows_j0t = x_t.loc[x_t["PRI_jet_num"] == 0]
ids_j0 = rows_j0t["Id"]

dropped_cols_j0t = [
    'Prediction', 'DER_deltaeta_jet_jet',
    'DER_mass_jet_jet', 'DER_prodeta_jet_jet',
    'DER_lep_eta_centrality', 'PRI_jet_leading_pt',
    'PRI_jet_leading_eta', 'PRI_jet_leading_phi',
    'PRI_jet_subleading_pt', 'PRI_jet_subleading_eta',
    'PRI_jet_subleading_phi', 'PRI_jet_num', 'Id', 'PRI_jet_all_pt',
]

# Standardization is currently disabled:
#df_j0t = (df_j0t - df_j0.mean()) / df_j0.std()
df_j0t = (
    rows_j0t
    .drop(columns=dropped_cols_j0t)
    # Impute with the mean computed on the training subgroup.
    .replace({'DER_mass_MMC': {-999: mean_j0}})
    .assign(Intercept=1)
    # Keep only the model's columns, in the order the weights expect.
    .loc[:, ij0]
)
x_j0t = np.array(df_j0t)
preds_j0t = predict(x_j0t, w0)

In [559]:
#Test for jet value=1

# Test rows with PRI_jet_num == 1, preprocessed like the matching training
# subgroup and scored with the jet=1 weights.
rows_j1t = x_t.loc[x_t["PRI_jet_num"] == 1]
ids_j1 = rows_j1t["Id"]

dropped_cols_j1t = [
    'PRI_jet_num', 'Prediction', 'Id', 'DER_deltaeta_jet_jet',
    'DER_mass_jet_jet', 'DER_prodeta_jet_jet',
    'DER_lep_eta_centrality', 'PRI_jet_subleading_pt',
    'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi',
]

# Standardization is currently disabled:
#df_j1t = (df_j1t - df_j1.mean()) / df_j1.std()
df_j1t = (
    rows_j1t
    .drop(columns=dropped_cols_j1t)
    # Impute with the mean computed on the training subgroup.
    .replace({'DER_mass_MMC': {-999: mean_j1}})
    .assign(Intercept=1)
    # Keep only the model's columns, in the order the weights expect.
    .loc[:, ij1]
)
x_j1t = np.array(df_j1t)
preds_j1t = predict(x_j1t, w1)

In [560]:
#Test for jet value=2,3

# Test rows with PRI_jet_num equal to 2 or 3, preprocessed like the matching
# training subgroup and scored with the jet=2,3 weights.
rows_j23t = x_t.loc[x_t["PRI_jet_num"].isin([2, 3])]
ids_j23 = rows_j23t["Id"]

# Standardization is currently disabled:
#df_j23t = (df_j23t - df_j23.mean()) / df_j23.std()
df_j23t = (
    rows_j23t
    .drop(columns=['Prediction', 'Id', 'PRI_jet_num'])
    # Impute with the mean computed on the training subgroup.
    .replace({'DER_mass_MMC': {-999: mean_j23}})
    .assign(Intercept=1)
    # Keep only the model's columns, in the order the weights expect.
    .loc[:, ij23]
)
x_j23t = np.array(df_j23t)
preds_j23t = predict(x_j23t, w23)

In [561]:
# Stitch the three subgroups back together. Ids and predictions are
# concatenated in the same group order (jet 0, jet 1, jet 2/3) so that each
# id stays aligned with its prediction.
ids = [
    row_id
    for group_ids in (ids_j0, ids_j1, ids_j23)
    for row_id in group_ids.tolist()
]
# Map the boolean predictions to the +1 / -1 encoding.
y_pred = [
    1 if is_positive else -1
    for group_preds in (preds_j0t, preds_j1t, preds_j23t)
    for is_positive in group_preds.tolist()
]

In [562]:
# Hand the aligned ids and +1/-1 predictions to the submission helper, which
# is not defined in this notebook (it comes in through the
# `from project_helpers import *` star import).
# NOTE(review): 'pred1' is presumably the output file name -- confirm
# against project_helpers.
create_csv_submission(ids, y_pred, 'pred1')