## Import libraries

In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from helpers import *
from utils import *
from implementations import *
from preprocessing import *
%load_ext autoreload
%autoreload 2

## Load the data

In [2]:
y, X, ids = load_csv_data(path="data/train.csv")
print("Input data shape", X.shape)
print("Labels data shape", y.shape)

Input data shape (250000, 30)
Labels data shape (250000,)


In [3]:
y[np.where(y == -1)] = 0
y

array([1., 0., 0., ..., 1., 0., 0.])

In [4]:
X

array([[ 138.47 ,   51.655,   97.827, ...,    1.24 ,   -2.475,  113.497],
       [ 160.937,   68.768,  103.235, ..., -999.   , -999.   ,   46.226],
       [-999.   ,  162.172,  125.953, ..., -999.   , -999.   ,   44.251],
       ...,
       [ 105.457,   60.526,   75.839, ..., -999.   , -999.   ,   41.992],
       [  94.951,   19.362,   68.812, ..., -999.   , -999.   ,    0.   ],
       [-999.   ,   72.756,   70.831, ..., -999.   , -999.   ,    0.   ]])

In [5]:
columns_remove = [14,15,17,18]
X = np.delete(X, columns_remove, axis=1)
X.shape

(250000, 26)

In [6]:
# deal with DER_mass_MMC missing value
# fill the missing with mean of background signal
DER_mass_MMC = X[:,0]
DER_mass_MMC_b_mean = np.mean(np.delete(DER_mass_MMC, np.where((DER_mass_MMC == -999) & (y == 1) )))
X[:,0] = np.where(DER_mass_MMC == -999, DER_mass_MMC_b_mean, DER_mass_MMC)
X

array([[ 138.47      ,   51.655     ,   97.827     , ...,    1.24      ,
          -2.475     ,  113.497     ],
       [ 160.937     ,   68.768     ,  103.235     , ..., -999.        ,
        -999.        ,   46.226     ],
       [ -38.12677709,  162.172     ,  125.953     , ..., -999.        ,
        -999.        ,   44.251     ],
       ...,
       [ 105.457     ,   60.526     ,   75.839     , ..., -999.        ,
        -999.        ,   41.992     ],
       [  94.951     ,   19.362     ,   68.812     , ..., -999.        ,
        -999.        ,    0.        ],
       [ -38.12677709,   72.756     ,   70.831     , ..., -999.        ,
        -999.        ,    0.        ]])

In [7]:
X = np.where(X == -999,np.NaN,X)
columns_to_drop, columns_to_fill, feature_medians = calculate_feature_medians(X)
X = fill_features_with_median(X)
X

array([[ 1.38470000e+02,  5.16550000e+01,  9.78270000e+01, ...,
         1.24000000e+00, -2.47500000e+00,  1.13497000e+02],
       [ 1.60937000e+02,  6.87680000e+01,  1.03235000e+02, ...,
        -1.00000000e-02, -2.00000000e-03,  4.62260000e+01],
       [-3.81267771e+01,  1.62172000e+02,  1.25953000e+02, ...,
        -1.00000000e-02, -2.00000000e-03,  4.42510000e+01],
       ...,
       [ 1.05457000e+02,  6.05260000e+01,  7.58390000e+01, ...,
        -1.00000000e-02, -2.00000000e-03,  4.19920000e+01],
       [ 9.49510000e+01,  1.93620000e+01,  6.88120000e+01, ...,
        -1.00000000e-02, -2.00000000e-03,  0.00000000e+00],
       [-3.81267771e+01,  7.27560000e+01,  7.08310000e+01, ...,
        -1.00000000e-02, -2.00000000e-03,  0.00000000e+00]])

In [8]:
columns_angle = [16,21,24]

In [9]:
X,y ,outliers= remove_outliers(X,y)
X.shape

(248568, 26)

In [11]:
X_angle = X[:,columns_angle]
X_rest = np.delete(X, columns_angle, axis=1)
X_poly = np.hstack((X_rest,np.sin(X_angle), np.cos(X_angle)))
X_final = build_poly_feature(X_poly,degree=5)
X_final,_,_=standardize(X_final)
X_final.shape

(248568, 551)

In [12]:
X_final

array([[ 0.54729814,  0.07448065,  0.44342679, ...,  1.74230828,
        -1.10649753, -1.58048415],
       [ 0.84345629,  0.57048453,  0.58407074, ...,  0.03818481,
        -0.46920231,  0.02203605],
       [-1.78058608,  3.27770982,  1.17488976, ...,  0.70078161,
        -0.83354747, -1.19346534],
       ...,
       [ 0.11212349,  0.33159808, -0.12840739, ..., -1.35218725,
        -1.43086659,  0.86603805],
       [-0.02636574, -0.86150095, -0.3111561 , ...,  1.43653027,
         1.09839009,  0.88508235],
       [-1.78058608,  0.68607289, -0.25864868, ...,  0.22578121,
         0.00876516,  0.88508235]])

In [13]:
x_tr, x_val, y_tr, y_val = split_data(add_bias_term(X_final), y, 0.8)

print("Training data shapes", x_tr.shape, y_tr.shape)
print("Validation data shapes", x_val.shape, y_val.shape)

Training data shapes (198854, 552) (198854,)
Validation data shapes (49714, 552) (49714,)


## Train

In [14]:
from implementations import *
weights, loss = ridge_regression(
    y_tr, x_tr, 10e-5)
loss

0.06972252741748963

In [15]:
compute_score(y_tr, x_tr, weights,f='linear')

0.8090508614360284

In [16]:
compute_score(y_val, x_val, weights,f='linear')

0.8083034959971034

In [17]:
from implementations import *
weights, loss = reg_logistic_regression(
    y_tr, x_tr, 0, np.zeros(x_tr.shape[1]), max_iters=5000, gamma=0.2)

Iteration 1/5000: loss=0.6931471805599453
Iteration 201/5000: loss=0.44091210182841784
Iteration 401/5000: loss=0.4294702191167116
Iteration 601/5000: loss=0.4238938113473563
Iteration 801/5000: loss=0.42040552426660877
Iteration 1001/5000: loss=0.41792522907063706
Iteration 1201/5000: loss=0.41599021402970576
Iteration 1401/5000: loss=0.41436979033014026
Iteration 1601/5000: loss=0.41300644492689853
Iteration 1801/5000: loss=0.41180855368183433
Iteration 2001/5000: loss=0.41074134744985014
Iteration 2201/5000: loss=0.4097764781228711
Iteration 2401/5000: loss=0.4089033449665712
Iteration 2601/5000: loss=0.40811355004526145
Iteration 2801/5000: loss=0.407405804545608
Iteration 3001/5000: loss=0.40679458195802237
Iteration 3201/5000: loss=0.4062751713032512
Iteration 3401/5000: loss=0.40581382600648136
Iteration 3601/5000: loss=0.4053993732031573
Iteration 3801/5000: loss=0.405023821187766
Iteration 4001/5000: loss=0.4046798563412289
Iteration 4201/5000: loss=0.40435800843235675
Iterati

In [18]:
print("Training score", compute_score(y_tr, x_tr, weights))
print("Validation score", compute_score(y_val, x_val, weights))

Training score 0.820506502257938
Validation score 0.8204932212254094


In [19]:
f1_score(y_val, x_val, weights)

0.7232696601339619

## Testing

In [24]:
_, XTest, idsTest = load_csv_data(path="data/test.csv")

print("Input data shape", XTest.shape)
# print("Labels data shape", y.shape)

Input data shape (568238, 30)


In [25]:
XTest = np.delete(XTest, columns_remove, axis=1)
XTest[:,0] = np.where(XTest[:,0] == -999, DER_mass_MMC_b_mean, XTest[:,0])
XTest = np.where(XTest == -999,np.NaN,XTest)
XTest=fill_features_with_median(XTest,columns_to_fill,feature_medians)
XTest

array([[-3.81267771e+01,  7.95890000e+01,  2.39160000e+01, ...,
        -1.00000000e-02, -2.00000000e-03,  0.00000000e+00],
       [ 1.06398000e+02,  6.74900000e+01,  8.79490000e+01, ...,
        -1.00000000e-02, -2.00000000e-03,  4.75750000e+01],
       [ 1.17794000e+02,  5.62260000e+01,  9.63580000e+01, ...,
        -1.00000000e-02, -2.00000000e-03,  0.00000000e+00],
       ...,
       [ 1.08497000e+02,  9.83700000e+00,  6.51490000e+01, ...,
        -1.00000000e-02, -2.00000000e-03,  0.00000000e+00],
       [ 9.67110000e+01,  2.00060000e+01,  6.69420000e+01, ...,
        -1.00000000e-02, -2.00000000e-03,  3.08630000e+01],
       [ 9.23730000e+01,  8.01090000e+01,  7.76190000e+01, ...,
        -1.00000000e-02, -2.00000000e-03,  0.00000000e+00]])

In [26]:
XTest_angle = XTest[:, columns_angle]
XTest_rest = np.delete(XTest, columns_angle, axis=1)
XTest_poly = np.hstack((XTest_rest,np.sin(XTest_angle), np.cos(XTest_angle)))
Xtest_final = build_poly_feature(XTest_poly,degree=5)
Xtest_final,_,_=standardize(Xtest_final)
Xtest_final.shape

(568238, 551)

In [27]:
Xtest_final = add_bias_term(Xtest_final)
Xtest_final.shape

(568238, 552)

In [28]:
yTest = np.array([linear(x, weights) for x in Xtest_final])

In [29]:
yTest

array([0, 0, 0, ..., 1, 0, 0])

In [30]:
yTest[np.where(yTest == 0)] = -1
print(yTest)

[-1 -1 -1 ...  1 -1 -1]


In [31]:
create_csv_submission(idsTest, yTest, "1029.csv")

In [32]:
(yTest==1).sum()/len(yTest)

0.2501927009457305