In [1]:
#main notebook
import numpy as np
from numpy.linalg import inv
from numpy import linalg as la
from implementations import *


In [2]:
#load data
import datetime
from helpers import *
import csv




In [4]:
def load_csv_data(data_path, sub_sample=False, set_outliers_zero=False, skip_n_header=1, skip_n_footer=0, cross_validation=0):
    """Loads data and returns y (class labels), tX (features) and ids (event ids)

    The file is read with the event id in column 0, the class label in
    column 1 and the features in every remaining column.

    Arguments: data_path (path of the .csv file to read)
               sub_sample (when True, keep only every 50th row)
               set_outliers_zero (when True, replace every feature equal to -999 by 0)
               skip_n_header (number of lines skipped at the top of the file)
               skip_n_footer (number of lines skipped at the bottom of the file)
               cross_validation (0: no split, 1-4: hold-out split, see below)

    Returns: (yb, input_data, ids) when cross_validation is not in 1..4.
             Otherwise a 6-tuple
             (y_train, x_train, ids_train, y_held, x_held, ids_held) with
               1: train = first 2/3 of the rows, held-out = the rest
               2: train = first 1/3 of the rows, held-out = the rest
               3: held-out = first 1/3 of the rows, train = the rest
               4: held-out = first 2/3 of the rows, train = the rest
             Labels equal to 'b' are encoded as -1, every other label as +1.
    """
    read_options = dict(delimiter=",", skip_header=skip_n_header, skip_footer=skip_n_footer)
    y = np.genfromtxt(data_path, dtype=str, usecols=1, **read_options)
    x = np.genfromtxt(data_path, **read_options)
    # np.int was removed from NumPy (1.24); the builtin int is the supported spelling.
    ids = x[:, 0].astype(int)
    input_data = x[:, 2:]

    # convert class labels from strings to binary (-1,1)
    yb = np.ones(len(y))
    yb[y == 'b'] = -1

    # remove outliers by setting the values to 0 (one vectorised pass instead
    # of a Python-level loop over every cell)
    if set_outliers_zero:
        input_data[input_data == -999.] = 0.

    # sub-sample
    if sub_sample:
        yb = yb[::50]
        input_data = input_data[::50]
        ids = ids[::50]

    # cross-validation samples
    # mode -> (cut position expressed in thirds, True when the head is the training part)
    split_modes = {1: (2, True), 2: (1, True), 3: (1, False), 4: (2, False)}
    if cross_validation in split_modes:
        thirds, head_is_train = split_modes[cross_validation]
        n = int(thirds * len(yb) / 3)
        head = (yb[:n], input_data[:n, :], ids[:n])
        # Open-ended slice: the tail runs to the very last row, so no sample is lost.
        tail = (yb[n:], input_data[n:, :], ids[n:])
        return head + tail if head_is_train else tail + head

    return yb, input_data, ids


In [76]:
def indexes_by_jet_num(x):
    """Group the row indexes of x by the integer value found in column 22.

    Returns a list of four lists; the k-th list holds, in increasing order,
    the indexes of the rows whose column 22 truncates to k.
    """
    buckets = [[] for _ in range(4)]
    row_count = np.shape(x)[0]
    row = 0
    while row < row_count:
        # column 22 is read as the jet-num class of the row
        buckets[int(x[row, 22])].append(row)
        row += 1
    return buckets

def standardize(x):
    """Centre x on its global mean and scale it by its global standard deviation.

    When the standard deviation is zero the centred values are returned
    unscaled, which avoids a division by zero.
    """
    centered = x - np.mean(x)
    spread = np.std(centered)
    if spread == 0:
        return centered
    return centered / spread


def create_tx(datas):
    """Build the design matrix: a column of ones followed by the standardised features.

    Each column of datas is centred on its own mean and, when its standard
    deviation is non-zero, divided by it (constant columns become all zeros).

    Arguments: datas (2-D array of features, one row per sample)
    Returns: tx (array with one more column than datas; column 0 is all ones)

    The input array is left untouched: the work is done on a float copy, so
    the caller's data is not overwritten and integer input is not truncated
    when the standardised values are stored.
    """
    features = np.array(datas, dtype=float)  # always a fresh float copy
    centered = features - features.mean(axis=0)
    spread = centered.std(axis=0)
    # Constant columns: divide by 1 so they stay at zero instead of producing NaN.
    spread[spread == 0] = 1.0
    tx = np.c_[np.ones(features.shape[0]), centered / spread]

    return tx

def change_to_int(y_evaluated, adapted_mean=True) :
    """Turn real-valued scores into a list of -1 / +1 labels.

    Arguments: y_evaluated (1-D sequence of real-valued scores)
               adapted_mean (when False the decision threshold is 0; when True
                             the threshold is the score found at position
                             int(len/2 - sum/2) of the sorted scores)
    Returns: list with -1 for every score strictly below the threshold and 1
             otherwise; an empty list for empty input.

    With adapted_mean=True the position can fall outside the sorted array when
    the scores are strongly biased. A position below 0 means "label everything
    +1" and is clamped to 0 (the minimum score); a position past the end means
    "label everything -1" and uses an infinite threshold. Without this, a
    negative position wrapped round to the wrong quantile and a too-large one
    raised IndexError.
    """
    n_scores = len(y_evaluated)
    if n_scores == 0:
        return []

    special_mean = 0

    if adapted_mean :
        sorted_y = np.sort(y_evaluated)
        position = int(n_scores/2 - np.sum(y_evaluated)/2)
        if position >= n_scores:
            special_mean = np.inf
        else:
            special_mean = sorted_y[max(position, 0)]

    return [-1 if score < special_mean else 1 for score in y_evaluated]

In [190]:
def do_a_cross_test_col(col):
    """Hold-out check of gradient descent and ridge regression on selected columns.

    Loads "train.csv" with -999 values zeroed and hold-out mode 3, keeps only
    the feature columns listed in col, fits both models on the training part
    and prints the fraction of held-out labels each one predicts correctly.

    Arguments: col (column index/indexes applied as datas[:, col])
    Returns: None (results are printed)

    The printed figure is 1 - misclassification rate, i.e. an accuracy; it
    used to be labelled "error", which was misleading.
    """
    #loads the datas
    y, datas, ids, y_cross, datas_cross, ids_cross = load_csv_data("train.csv",set_outliers_zero=True,cross_validation=3)
    #create the good datas
    tx = create_tx(datas[:,col])
    tx_cross = create_tx(datas_cross[:,col])

    #finds the w via gradient descent (the returned loss is not needed here)
    _, w_GD = least_squares_GD(y,tx, np.zeros(np.shape(tx)[1]),500,0.1)
    #finds by ridge regression
    _, w_ridge = ridge_regression(y, tx, 0.00001)

    def held_out_accuracy(w, adapted_mean):
        # Labels are -1/+1, so every mismatch adds exactly 4 to the squared difference.
        y_pred = np.asarray(change_to_int(tx_cross @ w, adapted_mean=adapted_mean))
        return 1 - np.sum((y_pred - y_cross)**2)/4/len(y_pred)

    print("GD has accuracy : ", held_out_accuracy(w_GD, True))
    print("GD has accuracy  with normal mean: ", held_out_accuracy(w_GD, False))
    print("Ridge has accuracy : ", held_out_accuracy(w_ridge, True))

    return 

In [191]:
def do_a_cross_test():
    """Hold-out check of gradient descent and ridge regression on all columns.

    Loads "train.csv" with -999 values zeroed and hold-out mode 3, fits both
    models on the training part and prints the fraction of held-out labels
    each one predicts correctly.

    Returns: None (results are printed)

    The printed figure is 1 - misclassification rate, i.e. an accuracy; it
    used to be labelled "error", which was misleading.
    """
    #loads the datas
    y, datas, ids, y_cross, datas_cross, ids_cross = load_csv_data("train.csv",set_outliers_zero=True,cross_validation=3)

    #create the good datas
    tx = create_tx(datas)
    tx_cross = create_tx(datas_cross)

    #finds the w via gradient descent (the returned loss is not needed here)
    _, w_GD = least_squares_GD(y,tx, np.zeros(np.shape(tx)[1]),500,0.1)
    #finds by ridge regression
    _, w_ridge = ridge_regression(y, tx, 0.00001)

    def held_out_accuracy(w, adapted_mean):
        # Labels are -1/+1, so every mismatch adds exactly 4 to the squared difference.
        y_pred = np.asarray(change_to_int(tx_cross @ w, adapted_mean=adapted_mean))
        return 1 - np.sum((y_pred - y_cross)**2)/4/len(y_pred)

    print("GD has accuracy : ", held_out_accuracy(w_GD, True))
    print("GD has accuracy  with normal mean: ", held_out_accuracy(w_GD, False))
    print("Ridge has accuracy : ", held_out_accuracy(w_ridge, True))

    return 

In [116]:
def AIC_forward(y, x_pd):
    """Greedy forward selection of the columns of x_pd.

    Starting from the last column alone, each round fits least_squares_GD
    (zero initial weights, arguments 500 and 0.1) once per remaining column
    and adds the column giving the smallest returned loss, as long as that
    loss is strictly lower than the reference loss of the previous round.
    Every candidate column list and its loss are printed.

    Despite the name, the score is the raw loss: no AIC penalty on the number
    of columns is applied.

    Arguments: y (targets passed to least_squares_GD)
               x_pd (2-D array-like exposing .shape and convertible by np.array)
    Returns: picked (list of selected column indexes, in selection order)
    """
    NO_REFERENCE = 1000000.0  # sentinel: any realistic loss is below it

    x = np.array(x_pd)
    n_columns = x_pd.shape[1]

    left = set(range(n_columns))
    left.remove(n_columns-1)

    picked = [n_columns-1]

    current = NO_REFERENCE
    improved = True

    # An explicit flag replaces the former `current == new` loop condition,
    # which spun forever when the best candidate exactly tied the reference
    # loss (e.g. an all-zero column leaves the loss unchanged).
    while left and improved:

        aics_cov = []

        for covariate in left:
            columns = picked + [covariate]
            print(columns)
            loss = least_squares_GD(y, x[:,columns], np.zeros(len(columns)), 500, 0.1)[0]

            aic =  loss
            print(aic)
            aics_cov.append((aic, covariate))

        aics_cov.sort()
        new, best_cov = aics_cov[0]

        improved = current > new
        if improved:
            left.remove(best_cov)
            picked.append(best_cov)
            # NOTE(review): picking column 0 resets the reference loss, so the
            # next round always accepts its best candidate. The reason is not
            # evident from this code; behaviour preserved as-is.
            current = NO_REFERENCE if best_cov == 0 else new

    return picked

In [77]:
# Load the whole training set: default options, so no -999 replacement and no hold-out split.
y,datas,ids= load_csv_data("train.csv")

# Design matrix: standardised feature columns preceded by a column of ones.
tx = create_tx(datas)

In [73]:
# Sanity check: tx must have exactly one more column than datas (the leading column of ones).
np.shape(tx), np.shape(datas)

((250000, 31), (250000, 30))

In [180]:

# Greedy forward column selection on the full design matrix.
# Prints every candidate column list followed by its loss, so the output is long.
col=AIC_forward(y,tx)


[30, 0]
0.44236848302360704
[30, 1]
0.47045942120177625
[30, 2]
0.44263877315614747
[30, 3]
0.4918533740332261
[30, 4]
0.4827066967631343
[30, 5]
0.48994210046756687
[30, 6]
0.4834201722755375
[30, 7]
0.4900338081214138
[30, 8]
0.4889095283718636
[30, 9]
0.4892793151069034
[30, 10]
0.4882509132996921
[30, 11]
0.4726333746479609
[30, 12]
0.46665784968683993
[30, 13]
0.48996794224880036
[30, 14]
0.47333984992950223
[30, 15]
0.49187425411915486
[30, 16]
0.49186369681213993
[30, 17]
0.4893468667815268
[30, 18]
0.49187519113986095
[30, 19]
0.49186635716642196
[30, 20]
0.4903337584149498
[30, 21]
0.49185601804666823
[30, 22]
0.4912951070422874
[30, 23]
0.49104344226846036
[30, 24]
0.48812134851106204
[30, 25]
0.48849192436123123
[30, 26]
0.48849207554953195
[30, 27]
0.49011316705492414
[30, 28]
0.489974766653299
[30, 29]
0.48997548004164504
[30, 0, 1]
0.42095270475377633
[30, 0, 2]
0.3931320567081473
[30, 0, 3]
0.4423466575852262
[30, 0, 4]
0.43319998031513424
[30, 0, 5]
0.4404353840195668
[

0.3568788138858521
[30, 0, 2, 14, 12, 6, 27, 8, 28]
0.3570716939800004
[30, 0, 2, 14, 12, 6, 27, 8, 29]
0.3570707339582783
[30, 0, 2, 14, 12, 6, 27, 8, 3, 1]
0.3530232221781391
[30, 0, 2, 14, 12, 6, 27, 8, 3, 4]
0.3509747463425182
[30, 0, 2, 14, 12, 6, 27, 8, 3, 5]
0.35350220537017063
[30, 0, 2, 14, 12, 6, 27, 8, 3, 7]
0.3534903024438435
[30, 0, 2, 14, 12, 6, 27, 8, 3, 9]
0.3530482670864082
[30, 0, 2, 14, 12, 6, 27, 8, 3, 10]
0.3506728336206975
[30, 0, 2, 14, 12, 6, 27, 8, 3, 11]
0.35312218449355487
[30, 0, 2, 14, 12, 6, 27, 8, 3, 13]
0.35350101908861536
[30, 0, 2, 14, 12, 6, 27, 8, 3, 15]
0.353430446461486
[30, 0, 2, 14, 12, 6, 27, 8, 3, 16]
0.3534290952885271
[30, 0, 2, 14, 12, 6, 27, 8, 3, 17]
0.349090432729666
[30, 0, 2, 14, 12, 6, 27, 8, 3, 18]
0.3534307908365048
[30, 0, 2, 14, 12, 6, 27, 8, 3, 19]
0.35342686397518547
[30, 0, 2, 14, 12, 6, 27, 8, 3, 20]
0.3514180741748112
[30, 0, 2, 14, 12, 6, 27, 8, 3, 21]
0.35343073010197606
[30, 0, 2, 14, 12, 6, 27, 8, 3, 22]
0.3533993866094654

0.3402970168457784
[30, 0, 2, 14, 12, 6, 27, 8, 3, 17, 11, 20, 24, 23, 22, 15]
0.34032241478619757
[30, 0, 2, 14, 12, 6, 27, 8, 3, 17, 11, 20, 24, 23, 22, 16]
0.3403217967531837
[30, 0, 2, 14, 12, 6, 27, 8, 3, 17, 11, 20, 24, 23, 22, 18]
0.34032238366269885
[30, 0, 2, 14, 12, 6, 27, 8, 3, 17, 11, 20, 24, 23, 22, 19]
0.3403191774326544
[30, 0, 2, 14, 12, 6, 27, 8, 3, 17, 11, 20, 24, 23, 22, 21]
0.34032256477076506
[30, 0, 2, 14, 12, 6, 27, 8, 3, 17, 11, 20, 24, 23, 22, 25]
0.340305467344393
[30, 0, 2, 14, 12, 6, 27, 8, 3, 17, 11, 20, 24, 23, 22, 26]
0.3403056052834298
[30, 0, 2, 14, 12, 6, 27, 8, 3, 17, 11, 20, 24, 23, 22, 28]
0.3402947837380795
[30, 0, 2, 14, 12, 6, 27, 8, 3, 17, 11, 20, 24, 23, 22, 29]
0.34029397187779764
[30, 0, 2, 14, 12, 6, 27, 8, 3, 17, 11, 20, 24, 23, 22, 1, 4]
0.34007950368219914
[30, 0, 2, 14, 12, 6, 27, 8, 3, 17, 11, 20, 24, 23, 22, 1, 5]
0.34011971581324196
[30, 0, 2, 14, 12, 6, 27, 8, 3, 17, 11, 20, 24, 23, 22, 1, 7]
0.3401093469171188
[30, 0, 2, 14, 12, 6, 

0.34002639055471817
[30, 0, 2, 14, 12, 6, 27, 8, 3, 17, 11, 20, 24, 23, 22, 1, 4, 7, 25, 19, 26, 9, 29]
0.3400259331173768
[30, 0, 2, 14, 12, 6, 27, 8, 3, 17, 11, 20, 24, 23, 22, 1, 4, 7, 25, 19, 26, 9, 18, 5]
0.3400274834867215
[30, 0, 2, 14, 12, 6, 27, 8, 3, 17, 11, 20, 24, 23, 22, 1, 4, 7, 25, 19, 26, 9, 18, 10]
0.34002573182268364
[30, 0, 2, 14, 12, 6, 27, 8, 3, 17, 11, 20, 24, 23, 22, 1, 4, 7, 25, 19, 26, 9, 18, 13]
0.34002691881654057
[30, 0, 2, 14, 12, 6, 27, 8, 3, 17, 11, 20, 24, 23, 22, 1, 4, 7, 25, 19, 26, 9, 18, 15]
0.3400255808569479
[30, 0, 2, 14, 12, 6, 27, 8, 3, 17, 11, 20, 24, 23, 22, 1, 4, 7, 25, 19, 26, 9, 18, 16]
0.34002522375610705
[30, 0, 2, 14, 12, 6, 27, 8, 3, 17, 11, 20, 24, 23, 22, 1, 4, 7, 25, 19, 26, 9, 18, 21]
0.3400252237728793
[30, 0, 2, 14, 12, 6, 27, 8, 3, 17, 11, 20, 24, 23, 22, 1, 4, 7, 25, 19, 26, 9, 18, 28]
0.340025618830447
[30, 0, 2, 14, 12, 6, 27, 8, 3, 17, 11, 20, 24, 23, 22, 1, 4, 7, 25, 19, 26, 9, 18, 29]
0.340025147336279
[30, 0, 2, 14, 12, 6,

In [121]:
# Fit w on all columns with least-squares gradient descent, starting from zero weights.
# 500 and 0.1 are presumably the iteration count and the step size — confirm against
# implementations.least_squares_GD.
L,w =least_squares_GD(y,tx, np.zeros(np.shape(tx)[1]),500,0.1)


62.68005490172589

In [182]:

# Hand-written column order; presumably copied from the forward-selection log above
# (that log is truncated, so the tail of the list cannot be checked here — verify).
colset = [30, 0, 2, 14, 12, 6, 27, 8, 3, 17, 11, 20, 24, 23, 22, 1, 4, 7, 25, 19, 26, 9, 18, 29, 16, 21, 15]
# Displays how many columns were kept.
np.shape(colset)

(27,)

In [55]:
# Fit w on all columns with ridge regression; 0.00001 is presumably the regularisation
# strength — confirm against implementations.ridge_regression.
# Rebinds the same names L and w as the gradient-descent cell.
L,w = ridge_regression(y, tx, 0.00001)


In [25]:
# Load the test set with -999 values replaced by 0, then build its design matrix.
# y_real is whatever sits in the label column of test.csv, mapped to -1/+1 by load_csv_data.
y_real, data_test, id_test = load_csv_data("test.csv",set_outliers_zero=True)
tx_test = create_tx(data_test)

In [26]:
# Raw real-valued scores for the test events (one per row of tx_test); not yet -1/+1 labels.
y_evaluated = tx_test @ w

In [110]:
def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in csv format for submission to kaggle
    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string name of .csv output file to be created)

    The file has the header "Id,Prediction" followed by one row per
    (id, label) pair. A label equal to -1 is written as "-1", any other
    value as "1". Ids are converted with int().
    """
    fieldnames = ['Id', 'Prediction']
    # newline='' hands line-ending control to the csv module, as its
    # documentation requires; otherwise rows end in \r\r\n on Windows.
    with open(name, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(
            {'Id': int(event_id), 'Prediction': '-1' if label == -1 else '1'}
            for event_id, label in zip(ids, y_pred)
        )

In [30]:
# Threshold the raw scores into -1/+1 labels before writing them out.
# y_predicted was not defined anywhere above this cell, so a fresh top-to-bottom
# run failed here with NameError.
# NOTE(review): the original thresholding cell is gone; change_to_int's default
# (adapted_mean=True) is assumed — confirm.
y_predicted = change_to_int(y_evaluated)
create_csv_submission(id_test, y_predicted, "sample_submission_5.csv")

In [47]:
# Second approach: fit a separate model for each jet-num class instead of one global model.

In [132]:
# Reload the training set with -999 values replaced by 0, then group the row
# indexes by jet-num class: I[k] lists the rows whose column 22 equals k.
y,datas,ids= load_csv_data("train.csv",set_outliers_zero=True)
I = indexes_by_jet_num(datas)

In [133]:
# Build the data and fit least squares by gradient descent for each of the 4 jet-num classes.
# One slot per class: event ids, labels, design matrix and fitted weights.
IDS = [[],[],[],[]]
Y = [[],[],[],[]]
TX = [[],[],[],[]]
W = [[],[],[],[]]
# COLS[k] lists the feature columns used for class k (presumably hand-copied from
# per-class forward-selection runs that are not part of this notebook — verify).
COLS = [[0, 2, 14, 8, 3, 17, 11, 20, 12, 9, 22, 1, 18, 21, 10, 4, 15, 16],[0, 2, 14, 12, 8, 3, 17, 11, 9, 20, 22, 4, 1, 18, 21, 15, 16, 26, 10, 19, 25, 24],[5, 2, 12, 14, 13, 6, 22, 8, 4, 27, 3, 17, 11, 20, 1, 0, 7, 9, 18, 29, 25, 26, 28, 21, 19, 15, 24],[0, 4, 2, 6, 14, 12, 8, 13, 5, 3, 24, 1, 11, 17, 22, 20, 7, 15, 21, 28, 27, 18, 25, 19, 29, 9, 16, 10, 26, 23]]

for i in range(4):
    # datas[I[i]] keeps the rows of class i; [:,COLS[i]] then keeps that class's columns.
    TX[i] = create_tx(datas[I[i]][:,COLS[i]])
    IDS[i] = ids[I[i]]
    Y[i] = y[I[i]]
    # Only the weights are kept; the loss L is overwritten on every iteration.
    L,W[i] = least_squares_GD(Y[i],TX[i], np.zeros(np.shape(TX[i])[1]),500,0.1)

In [134]:
# Load the test set with the same preprocessing as the training set used for the
# per-jet models: -999 values replaced by 0. Loading it without that option fed
# raw -999 values into the standardisation and the predictions.
y_unknown, data_test, id_test = load_csv_data("test.csv", set_outliers_zero=True)

In [140]:
# Predict the labels of the test events, one jet-num class at a time.
# I_test[k] lists the test rows of class k, in increasing row order.
I_test = indexes_by_jet_num(data_test)
TX_test = [[],[],[],[]]
Y_predicted = [[],[],[],[]]
for i in range(4):
    # Same column subset as the model of class i.
    # NOTE(review): create_tx standardises each test subset with its own
    # statistics, not with those of the training subset.
    TX_test[i] = create_tx(data_test[I_test[i]][:,COLS[i]])
    # Scores -> -1/+1 labels using the adaptive threshold of change_to_int.
    Y_predicted[i] = change_to_int(TX_test[i] @ W[i],adapted_mean=True)

In [141]:
# Interleave the per-class predictions back into the original row order of the test set.
col_22 = data_test[:,22]
l = len(col_22)
y_predicted = np.zeros(l)
# counters[k] is the number of class-k predictions already consumed
counters=[0,0,0,0]
for i, raw_jet in enumerate(col_22):
    jet_num = int(raw_jet)
    consumed = counters[jet_num]
    y_predicted[i] = Y_predicted[jet_num][consumed]
    counters[jet_num] = consumed + 1
y_predicted, np.shape(y_predicted)

(array([-1., -1., -1., ...,  1.,  1., -1.]), (568238,))

In [138]:
# Sanity check: the number of ids must match the number of predictions.
np.shape(id_test)

(568238,)

In [142]:
# Write the per-jet-num predictions to the submission file.
create_csv_submission(id_test, y_predicted, "sample_submission_8.csv")