In [1]:
from project_helpers import load_csv_data
import csv
import numpy as np
import pandas as pd
import math

In [2]:
#Logistic sigmoid
def sigmoid(z):
    """Return the logistic sigmoid of ``z``, evaluated element-wise by numpy.

    The value is computed through the identity
    ``1 / (1 + exp(-z)) == 0.5 * (1 + tanh(z / 2))``.
    The tanh form is mathematically the same function, but it never
    evaluates ``exp`` on a large argument, so it does not emit overflow
    warnings for inputs of large magnitude.
    """
    half_z = .5 * z
    return .5 * (1 + np.tanh(half_z))

In [3]:
#Logistic (cross-entropy) loss
def loss_f_lr(h, y):
    """Return the summed logistic loss between ``h`` and ``y``.

    Computes ``sum(-y * log(h + eps) - (1 - y) * log(1 - h + eps))`` with
    ``eps = 1e-5``. The small offset keeps both logarithms finite when an
    entry of ``h`` is exactly 0 or 1.

    h : values passed to the logarithms (in this file, sigmoid outputs).
    y : targets weighting the two log terms (presumably 0/1 labels -- confirm
        against the caller).
    """
    epsilon = 1e-5
    positive_part = -y * np.log(h + epsilon)
    negative_part = (1 - y) * np.log(1 - h + epsilon)
    return (positive_part - negative_part).sum()

In [4]:
def logistic_regression(y, tx, initial_w, max_iters, gamma, pandas=False):
    """Fit a logistic regression with full-batch gradient descent.

    Parameters
    ----------
    y : targets, combined with the sigmoid outputs as ``h - y``
        (presumably 0/1 labels -- confirm against the caller).
    tx : design matrix; ``np.dot(tx, w)`` gives the linear scores.
    initial_w : starting weight vector. It is not modified in place; every
        step builds a new array.
    max_iters : number of gradient steps to perform.
    gamma : step size applied to the (un-normalised, summed) gradient.
    pandas : unused; kept only so existing calls keep working.

    Returns
    -------
    (w, loss) : the final weights and the logistic loss evaluated at those
        final weights.
    """
    w = initial_w

    for _ in range(max_iters):
        # Predicted probabilities for the current weights
        h = sigmoid(np.dot(tx, w))

        # Gradient of the summed logistic loss with respect to w
        gradient = np.dot(tx.T, h - y)

        # Gradient-descent step
        w = w - gamma * gradient

    # Evaluate the loss at the weights that are actually returned. Reusing the
    # probabilities from inside the loop would report the loss of the weights
    # *before* the last step (and of a placeholder when max_iters is 0).
    loss = loss_f_lr(sigmoid(np.dot(tx, w)), y)

    return (w, loss)

In [5]:
def reg_logistic_regression(y, tx, lambda_, initial_w, max_iters, gamma):
    """Fit an L2-regularised logistic regression with full-batch gradient descent.

    Identical to the unregularised fit except that ``lambda_ * w`` is added to
    the gradient and ``(lambda_ / 2) * w.w`` is added to the reported loss.

    Parameters
    ----------
    y : targets, combined with the sigmoid outputs as ``h - y``
        (presumably 0/1 labels -- confirm against the caller).
    tx : design matrix; ``np.dot(tx, w)`` gives the linear scores.
    lambda_ : regularisation strength.
    initial_w : starting weight vector. It is not modified in place.
    max_iters : number of gradient steps to perform.
    gamma : step size applied to the (un-normalised, summed) gradient.

    Returns
    -------
    (w, loss) : the final weights and the penalised loss evaluated at those
        final weights.
    """
    w = initial_w

    for _ in range(max_iters):
        h = sigmoid(np.dot(tx, w))

        # Data-term gradient plus the derivative of the L2 penalty
        gradient = np.dot(tx.T, h - y) + lambda_ * w
        w = w - gamma * gradient

    # Both parts of the objective are evaluated at the returned weights; the
    # data term must not reuse probabilities computed before the last step.
    h_final = sigmoid(np.dot(tx, w))
    loss = loss_f_lr(h_final, y) + (lambda_ / 2) * np.dot(w.T, w)

    return (w, loss)

In [6]:
def proba(X, w):
    """Return the sigmoid of the linear scores ``np.dot(X, w)``."""
    scores = np.dot(X, w)
    return sigmoid(scores)

def predict(X, w, threshold=0.5):
    """Return a boolean mask: True where ``proba(X, w)`` reaches ``threshold``."""
    probabilities = proba(X, w)
    return probabilities >= threshold

In [7]:
#Load train data
#y, tx, ids = load_csv_data('Data/train.csv', sub_sample=True)
#intercept = np.ones((tx.shape[0], 1))
#tx = np.concatenate((intercept, tx), axis=1)

In [8]:
#Fit whole data
#w1, loss1 = logistic_regression(y, tx, np.zeros(tx.shape[1]), 100000, 0.01)

In [9]:
#Accuracy on the training data itself (sanity check, not a held-out validation)
#preds = predict(tx,w1)
#(preds == y).mean()

In [10]:
#Load a sub sample of the train data (every 25th row)
# The frame is built in one chain. assign() returns a new DataFrame, so the
# derived columns are never written into a slice of the raw frame; assigning a
# column directly on the result of iloc triggers SettingWithCopyWarning on
# pandas versions without Copy-on-Write.
x = (
    pd.read_csv('Data/train.csv')
    .iloc[::25, :]
    .assign(
        # Constant column so the model gets a bias term
        Intercept=1,
        # Encode the label: 's' -> 1, anything else -> 0
        Prediction=lambda frame: frame["Prediction"].eq('s').astype('int64'),
    )
)

In [11]:
# Labels of the rows with PRI_jet_num == 0, split at row 3000:
# the first part is used for fitting, the remainder for evaluation.
y_p = np.array(x.loc[x["PRI_jet_num"] == 0, "Prediction"])
y_p_1, y_p_2 = y_p[:3000], y_p[3000:]

In [12]:
#Data for the first jet value (PRI_jet_num == 0)
# The label, the Id, the jet count and the listed columns are removed from the frame.
df_jet1 = x.loc[x["PRI_jet_num"] == 0].drop(
    columns=[
        'Prediction',
        'DER_deltaeta_jet_jet',
        'DER_mass_jet_jet',
        'DER_prodeta_jet_jet',
        'DER_lep_eta_centrality',
        'PRI_jet_leading_pt',
        'PRI_jet_leading_eta',
        'PRI_jet_leading_phi',
        'PRI_jet_subleading_pt',
        'PRI_jet_subleading_eta',
        'PRI_jet_subleading_phi',
        'PRI_jet_num',
        'Id',
    ]
)

In [13]:
#Covariates of the AIC-wise best model for the first jet value (PRI_jet_num == 0)
# 'Intercept' is kept as the last column.
df_jet1_aic = x.loc[
    x["PRI_jet_num"] == 0,
    [
        'DER_mass_transverse_met_lep',
        'PRI_tau_pt',
        'DER_deltar_tau_lep',
        'DER_mass_vis',
        'DER_sum_pt',
        'DER_pt_ratio_lep_tau',
        'PRI_met_sumet',
        'PRI_met',
        'DER_mass_MMC',
        'PRI_met_phi',
        'PRI_lep_eta',
        'Intercept',
    ],
]

# Plain numpy design matrix for the regression routines
x_jet1 = np.array(df_jet1_aic)

In [14]:
# Same split point (row 3000) as the labels: first part for fitting, rest for evaluation
x_jet1_1, x_jet1_2 = x_jet1[:3000], x_jet1[3000:]

In [15]:
# Size the initial weights from the design matrix rather than hard-coding 12,
# so the fit keeps working if the selected feature list changes.
w2, loss2 = logistic_regression(y_p_1, x_jet1_1, np.zeros(x_jet1_1.shape[1]), 100000, 0.01)

In [16]:
#Hold-out validation: accuracy on the rows that were not used for fitting
preds = predict(x_jet1_2, w2)
np.mean(preds == y_p_2)

0.8189944134078212

In [17]:
def AIC_forward(y, x_pd, max_iters=10000, gamma=0.01):
    """Greedy forward selection of covariates scored by AIC.

    The last column of ``x_pd`` is always included (the callers in this file
    put the intercept there -- confirm for other inputs). At every round each
    remaining column is tentatively added, a logistic regression is fitted
    with ``logistic_regression``, and the candidate with the lowest score is
    kept if it improves on the best score so far. Selection stops as soon as
    no candidate improves the score or no columns are left.

    The score is ``AIC = 2 * k + 2 * loss`` where ``k`` is the number of
    fitted coefficients and ``loss`` is the value returned by
    ``logistic_regression`` (used here as the negative log-likelihood; it is
    the result of a fixed number of gradient steps, so it only approximates
    the maximum-likelihood value).

    Parameters
    ----------
    y : targets passed through to ``logistic_regression``.
    x_pd : DataFrame of candidate covariates, always-included column last.
    max_iters : gradient steps per candidate fit (default 10000).
    gamma : step size per candidate fit (default 0.01).

    Returns
    -------
    numpy array with the names of the selected columns, in selection order
    (the always-included column first).

    Raises
    ------
    ValueError : if ``x_pd`` has no columns.
    """
    n_columns = x_pd.shape[1]
    if n_columns == 0:
        raise ValueError("x_pd must contain at least one column")

    features = np.array(x_pd)
    always_in = n_columns - 1
    remaining = set(range(n_columns)) - {always_in}
    picked = [always_in]

    # Start from +inf rather than a fixed number so that the first round is
    # never rejected just because the summed loss of a large dataset is big.
    best_aic = np.inf

    while remaining:
        scored = []
        for covariate in remaining:
            columns = picked + [covariate]
            loss = logistic_regression(
                y, features[:, columns], np.zeros(len(columns)), max_iters, gamma
            )[1]
            # Penalise model size; without the 2*k term the score is just the
            # training loss and larger models are never discouraged.
            aic = 2 * len(columns) + 2 * loss
            scored.append((aic, covariate))

        candidate_aic, best_cov = min(scored)

        # Stop when the best candidate does not strictly improve the score
        # (this also stops on NaN scores).
        if not candidate_aic < best_aic:
            break

        remaining.remove(best_cov)
        picked.append(best_cov)
        best_aic = candidate_aic

    return np.array(x_pd.columns)[picked]

In [18]:
#AIC_forward(y_p, df_jet4)