In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from implementations import *
%load_ext autoreload
%autoreload 2

In [2]:
# sigmoid (logistic) function
def logistic_function(x,w):
    """Evaluate the logistic (sigmoid) function at the linear scores ``x.dot(w)``.

    Parameters
    ----------
    x : array
        Feature matrix (anything exposing ``dot``).
    w : array
        Weight vector compatible with ``x.dot(w)``.

    Returns
    -------
    array
        ``1 / (1 + exp(-x.dot(w)))`` element-wise. NaN scores propagate
        as NaN.
    """
    z = np.asarray(x.dot(w))
    # exp() is only ever applied to a non-positive number, so it cannot
    # overflow. The naive exp(z) / (1 + exp(z)) turns into inf / inf = nan
    # once z exceeds ~709.
    e = np.exp(-np.abs(z))
    # Both branches are algebraically the sigmoid; pick the one matching
    # the sign of z.
    return np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))

In [3]:
# predictions with logistic regression
def logistic_prediction(w,x):
    """Predict -1/1 class labels with a logistic-regression weight vector.

    The sigmoid of a score is >= 0.5 exactly when the score is >= 0, so the
    linear score ``x.dot(w)`` is thresholded at 0 directly. This gives the
    same decision rule as comparing the sigmoid to 0.5, but needs no
    exponential (which can overflow for large scores) and evaluates the
    model only once.

    Parameters
    ----------
    w : array
        Weight vector.
    x : array
        Feature matrix with one sample per row.

    Returns
    -------
    array of shape ``(x.shape[0],)``
        1 where the score is >= 0, -1 where it is < 0. Entries whose score
        is NaN satisfy neither comparison and stay 0.
    """
    scores = x.dot(w)
    y_pred_l = np.zeros((x.shape[0],))
    y_pred_l[scores < 0] = -1
    y_pred_l[scores >= 0] = 1
    return y_pred_l

In [4]:
def standardize(tx):
    """Standardize every column of ``tx`` to zero mean and unit standard deviation.

    Parameters
    ----------
    tx : array
        Data with one sample per row; statistics are taken along axis 0.

    Returns
    -------
    array
        A new array ``(tx - mean) / std``; the input is not modified.
        Constant columns (standard deviation 0) are mapped to all zeros
        instead of NaN/inf.
    """
    mean = np.mean(tx, axis=0)
    std = np.std(tx, axis=0)
    # A constant column has zero spread; dividing by 1 keeps it finite
    # (it becomes all zeros after centring) instead of producing 0/0 = nan.
    std = np.where(std == 0, 1.0, std)
    return (tx - mean) / std

# Polynomial feature expansion; an offset column of ones is prepended.
def build_poly(x, degree):
    """Expand ``x`` with element-wise powers 1..degree and a leading column of ones.

    The result has ``1 + degree * x.shape[1]`` columns: first the offset
    column, then ``x``, then ``x**2``, ... up to ``x**degree``, each power
    occupying a contiguous group of ``x.shape[1]`` columns.
    """
    n_samples, n_features = x.shape
    expanded = np.empty((n_samples, 1 + degree * n_features))
    # Offset (all-ones) column goes first.
    expanded[:, 0] = 1.0
    for power in range(1, degree + 1):
        first_col = 1 + (power - 1) * n_features
        expanded[:, first_col:first_col + n_features] = x ** power
    return expanded


def build_k_indices(y, k_fold, seed):
    """Return a ``(k_fold, fold_size)`` array of shuffled row indices for k-fold CV.

    ``fold_size`` is ``int(len(y) / k_fold)``; leftover rows that do not
    fill a whole fold are dropped. The global numpy random generator is
    re-seeded with ``seed`` so the split is reproducible.
    """
    n_samples = y.shape[0]
    fold_size = int(n_samples / k_fold)
    np.random.seed(seed)
    shuffled = np.random.permutation(n_samples)
    # Consecutive chunks of the permutation form the folds.
    return shuffled[:k_fold * fold_size].reshape(k_fold, fold_size)

#Cross validation for logistic regression
def cross_validation_LR(y, x, k_fold, initial_w, max_iters, gamma, seed=1):
    """Run k-fold cross-validation of logistic regression.

    For each fold the remaining folds are used for training. Every column
    of ``x`` except column 0 is standardized with the mean/std of the
    training part (the same statistics are applied to the held-out part).
    Column 0 is left untouched because it is assumed to be the all-ones
    offset column.

    Parameters
    ----------
    y : array
        Labels. The accuracy computation maps them with ``y*2-1`` before
        comparing with the -1/1 predictions, i.e. 0/1 labels are expected.
    x : array
        Feature matrix, one sample per row, offset column first.
    k_fold : int
        Number of folds.
    initial_w, max_iters, gamma
        Forwarded unchanged to ``logistic_regression`` (defined outside this
        notebook cell).
    seed : int
        Seed used to build the fold indices.

    Returns
    -------
    loss_tr, loss_te : mean over the folds of the training loss returned by
        ``logistic_regression`` and of ``logistic_loss`` on the held-out fold.
    var_tr, var_te : variance over the folds of those losses.
    ws : element-wise mean of the per-fold weight vectors.
    acc : mean held-out accuracy over the folds, in percent.
    """
    loss_tr = []
    loss_te = []
    ws = []
    acc = []
    k_indices = build_k_indices(y, k_fold, seed)
    for k in range(k_fold):
        # k'th group is held out, all the others are used for training.
        idx_te = k_indices[k]
        idx_tr = np.delete(k_indices, k, 0).flatten()
        # Indexing a numpy array with an index array yields a copy, so the
        # in-place scaling below does not modify the caller's x.
        x_tr, y_tr = x[idx_tr], y[idx_tr]
        x_te, y_te = x[idx_te], y[idx_te]
        # Statistics come from the training part only and skip column 0.
        mean = np.mean(x_tr[:, 1:], axis=0)
        std = np.std(x_tr[:, 1:], axis=0)
        # A feature that is constant inside this training split has std 0;
        # dividing by 1 instead keeps the data finite (no NaN/inf).
        std = np.where(std == 0, 1.0, std)
        x_tr[:, 1:] = (x_tr[:, 1:] - mean) / std
        x_te[:, 1:] = (x_te[:, 1:] - mean) / std
        # Fit on the training part, evaluate on the held-out part.
        w, loss = logistic_regression(y_tr, x_tr, initial_w, max_iters, gamma, mode='GD')
        y_out_test = logistic_prediction(w, x_te)
        n_correct = np.count_nonzero(y_out_test == (y_te*2-1))
        accuracy = 100*n_correct/y_out_test.shape[0]
        loss_tr.append(loss)
        loss_te.append(logistic_loss(y_te, x_te, w))
        ws.append(w)
        acc.append(accuracy)
    var_tr = np.var(loss_tr)
    var_te = np.var(loss_te)
    loss_tr = np.mean(loss_tr)
    loss_te = np.mean(loss_te)
    ws = np.mean(np.asarray(ws), axis=0)
    acc = np.mean(np.asarray(acc), axis=0)
    return loss_tr, loss_te, var_tr, var_te, ws, acc

# First : exploring the data

We'll need to have a look at what the data is, how it is distributed for the different features, and start to get an intuition about what methods might work better for analysis and prediction later.

## Load the training data into feature matrix, class labels, and event ids:

In [5]:
# NOTE(review): star import placed mid-notebook; load_csv_data is presumably
# provided by proj1_helpers — confirm, and consider an explicit import in the
# first cell.
from proj1_helpers import *
# Location of the training CSV, relative to the notebook's working directory.
DATA_TRAIN_PATH = 'data/train.csv' 
# Unpacked as class labels (y), feature matrix (tX) and event ids (ids).
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [6]:
# Split the samples into the 4 jet groups (value of column 22) and keep, for
# each group, only the feature columns listed for that group.

# Columns kept for each jet number.
idx_jet_undef = [np.array([0,1,2,3,7,10,11,13,14,15,16,17,18,19,20,21]),
                np.array([0,1,2,3,7,8,9,10,11,13,14,15,16,17,18,19,20,21,23,24,25,29]),
                np.array([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,23,24,25,26,27,28,29]),
                np.array([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,23,24,25,26,27,28,29])]
# "No mass" variant: the same columns without the leading column 0.
idx_jet_undef_nm = [cols[1:].copy() for cols in idx_jet_undef]

y_jet, tx_jet = [], []
y_jet_nm, tx_jet_nm = [], []
# NOTE(review): both variants keep every row of the jet group; they differ
# only in whether column 0 is present. If the intent was to separate rows by
# whether column 0 is defined, a row filter is missing — confirm.
for jet in range(4):
    idx_jet = tX[:,22]==jet
    y_jet.append(y[idx_jet])
    tx_jet.append(tX[idx_jet][:,idx_jet_undef[jet]])
    y_jet_nm.append(y[idx_jet])
    tx_jet_nm.append(tX[idx_jet][:,idx_jet_undef_nm[jet]])

for jet in range(4):
    print('Jet {:} shape is {:}'.format(jet,tx_jet[jet].shape))

Jet 0 shape is (99913, 16)
Jet 1 shape is (77544, 22)
Jet 2 shape is (50379, 29)
Jet 3 shape is (22164, 29)


# Actual predictions start from here

After having looked at the data we will now do some actual predictions using different models and parameters.

In [7]:
# List concatenation: entries 0-3 are the per-jet sets (tx_jet / y_jet),
# entries 4-7 are the per-jet sets without column 0 (tx_jet_nm / y_jet_nm).
tx_full = tx_jet+tx_jet_nm
y_full = y_jet+y_jet_nm
# for y_i in y_full:
#     y_i[y_i==-1] = 0

In [8]:
# Logistic regression cross-validation on each of the 8 data subsets
k_fold=6
degree = 1
max_iters_LR=4000
seed = 1
# Mean cross-validated accuracy (in percent) of every subset
acc = np.zeros((8))
for i in range(8):
    tx_i = tx_full[i]
    y_i = (y_full[i]+1)/2 # Transforming y into an array of 0/1 instead of -1/1
    # Step size divided by the approximate number of training samples in a
    # fold, i.e. len(y_i)*(k_fold-1)/k_fold
    gamma=(1e-1)/(len(y_i)*(k_fold-1)/k_fold)
    # Expand the features once and reuse the matrix for both the weight
    # shape and the cross-validation call
    tx_poly_i = build_poly(tx_i,degree)
    initial_w = np.zeros(tx_poly_i.shape[1],)
    _, _, _, _, ws, acc[i] = cross_validation_LR(y_i, tx_poly_i, k_fold, initial_w, max_iters_LR, gamma, seed)
    print('set {:} - accuracy:{:}'.format(i,acc[i]))

set 0 - accuracy:82.60269036752342
set 1 - accuracy:71.23310636541835
set 2 - accuracy:73.98364300460538
set 3 - accuracy:73.09149972929076
set 4 - accuracy:82.5876771558972
set 5 - accuracy:71.2021561951924
set 6 - accuracy:73.91019533111006
set 7 - accuracy:72.9741923840462


In [9]:
# Observing LR: train on each full subset and watch the loss converge
w_final=[]
max_iter = 4000
# Training stops once two consecutive per-sample losses differ by less than this
threshold = 1e-6
for i in range(8):
    losses = []
    tx_i = standardize(tx_full[i])
    y_i = (y_full[i]+1)/2
    # The expanded matrix does not change between gradient steps, so it is
    # built once per subset rather than on every iteration
    tx_poly_i = build_poly(tx_i,1)
    w_l = np.zeros(tx_poly_i.shape[1],)
    #Dividing the gamma by the number of samples (it is the same as dividing the gradient)
    gamma=(1e-1)/len(y_i)
    # n_iter rather than iter, so the builtin iter() is not shadowed in the kernel
    for n_iter in range(max_iter):
        # One gradient step per call (max_iters=1), continuing from the current weights
        w_l, loss_l = logistic_regression(y_i,tx_poly_i, w_l, 1, gamma,mode='GD')
        #dividing the loss by the number of samples
        loss_l=loss_l/len(y_i)
        if n_iter % 100 == 0:
            print("set={i} Current iteration={iter},set={i}, loss={l}".format(i=i,iter=n_iter, l=loss_l))
        #convergence criterion
        losses.append(loss_l)
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break
    w_final.append(w_l)
#visualitation
#print('for {}, iteration {} w: {}, loss: {}'.format(i,iter,w_l,loss_l))

set=0 Current iteration=0,set=0, loss=0.6763342208300869
set=0 Current iteration=100,set=0, loss=0.44181866378390905
set=0 Current iteration=200,set=0, loss=0.4266163193775543
set=0 Current iteration=300,set=0, loss=0.42045239192561606
set=0 Current iteration=400,set=0, loss=0.41627032198006464
set=0 Current iteration=500,set=0, loss=0.413002880724155
set=0 Current iteration=600,set=0, loss=0.41032980288810084
set=0 Current iteration=700,set=0, loss=0.40809406858891834
set=0 Current iteration=800,set=0, loss=0.4061969316287591
set=0 Current iteration=900,set=0, loss=0.4045694993759145
set=0 Current iteration=1000,set=0, loss=0.4031612584273638
set=0 Current iteration=1100,set=0, loss=0.4019339936782408
set=0 Current iteration=1200,set=0, loss=0.4008580992495643
set=0 Current iteration=1300,set=0, loss=0.3999101868913331
set=0 Current iteration=1400,set=0, loss=0.3990714714640561
set=0 Current iteration=1500,set=0, loss=0.39832664775185944
set=0 Current iteration=1600,set=0, loss=0.3976

set=6 Current iteration=800,set=6, loss=0.519160784826803
set=6 Current iteration=900,set=6, loss=0.51852982744917
set=6 Current iteration=1000,set=6, loss=0.5180301135857194
set=6 Current iteration=1100,set=6, loss=0.517628733987101
set=6 Current iteration=1200,set=6, loss=0.5173026598357782
set=6 Current iteration=1300,set=6, loss=0.5170353008655324
set=6 Current iteration=1400,set=6, loss=0.5168144035461005
set=6 Current iteration=1500,set=6, loss=0.5166307238736993
set=6 Current iteration=1600,set=6, loss=0.5164771624380179
set=6 Current iteration=1700,set=6, loss=0.5163481839883266
set=6 Current iteration=1800,set=6, loss=0.5162394173445031
set=7 Current iteration=0,set=7, loss=0.6842676826862074
set=7 Current iteration=100,set=7, loss=0.5458683678702808
set=7 Current iteration=200,set=7, loss=0.5381030531952116
set=7 Current iteration=300,set=7, loss=0.5350853761408234
set=7 Current iteration=400,set=7, loss=0.533235238059444
set=7 Current iteration=500,set=7, loss=0.531970624439

In [10]:
# Display the final weight vector obtained for data subset 3.
w_final[3]

array([-1.02795223,  0.25527513, -0.38023046, -0.44671939,  0.53818345,
       -0.3173693 ,  0.61231853,  0.08435523,  0.55453496, -0.00776721,
       -0.05334809, -0.4503705 ,  0.26252371,  0.26490822,  0.30989789,
       -0.03427286, -0.00147702,  0.51932606, -0.00753477,  0.00411351,
        0.14747072,  0.02058241, -0.18124601, -0.33660195, -0.0014882 ,
       -0.00396532, -0.06499279,  0.01742307,  0.007029  , -0.27387289])

In [14]:
#from proj1_helpers import *
#DATA_TEST_PATH = '../data/test.csv' 
# test_y, test_X, ids = load_csv_data(DATA_TRAIN_PATH)

## Generate predictions using only features with no errors throughout

This enables us to use some of the methods from the course directly, without having to adjust some of the functionality to account for the fact that a lot of errors are in the dataset. First let us see which features from the test dataset are error free.

With this figured out we can now extract the valid columns from test and train data, do some training and testing on data, then generate answers for the test data and submit to aicrowd !

# Save prediction output in csv format for submission:

In [15]:
#DATA_TEST_PATH = '' # TODO: download test data and supply path here 
#_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [16]:
#OUTPUT_PATH = '' # TODO: fill in desired name of output file for submission
#y_pred = predict_labels(weights, tX_test)
#create_csv_submission(ids_test, y_pred, OUTPUT_PATH)