In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [2]:
from implementations import *
from proj1_helpers import *

## Load the training data into feature matrix, class labels, and event ids:

In [6]:
from proj1_helpers import *
DATA_TRAIN_PATH = 'data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Data cleaning

* handle missing data : here we replace the missing data (the −999.0 values) by the mean of the other observations

In [7]:
# for each feature, replace the missing value by the mean of the observed data

no_val = -999.0
for j in range(tX.shape[1]):
    mean = np.mean(tX[:, j] != no_val)
    tX[:, j][tX[:, j] == no_val] = mean

* remove uncorrelated features

In [8]:
def pearson(X, Y):
    return np.cov(X, Y)[0, 1] / (np.std(X) * np.std(Y))

nb_features = tX.shape[1]

# correlation between feature and label

lab_matrix = np.zeros(nb_features)
for j in range(tX.shape[1]):
    cor = pearson(tX[:, j], y)    
    lab_matrix[j] = cor


lab_matrix[np.absolute(lab_matrix) < 0.03 ]

array([-0.01405533,  0.01224553, -0.01528749, -0.00094325, -0.00440256,
       -0.03194771,  0.00151624,  0.00412546,  0.02246584,  0.00747537,
       -0.03122378, -0.03160598, -0.01687982, -0.02245849])

In [62]:
# correlation between features
feat_matrix = np.zeros((nb_features, nb_features))
for i in range(nb_features):
    for j in range(nb_features):
        cor = pearson(tX[:, i], tX[:, j])
        feat_matrix[i, j] = cor
        if np.absolute(cor) >= 0.5 :
            print(i, j)
        
feat_matrix

0 0
0 2
1 1
2 0
2 2
2 7
3 3
3 7
3 9
3 11
3 19
3 21
3 22
3 23
3 26
3 29
4 4
4 5
4 6
4 12
4 22
4 26
5 4
5 5
5 6
5 9
5 12
5 22
5 26
5 29
6 4
6 5
6 6
6 12
7 2
7 3
7 7
8 8
9 3
9 5
9 9
9 13
9 19
9 21
9 22
9 23
9 26
9 29
10 10
10 16
11 3
11 11
12 4
12 5
12 6
12 12
13 9
13 13
14 14
14 17
15 15
16 10
16 16
17 14
17 17
18 18
19 3
19 9
19 19
19 23
19 29
20 20
21 3
21 9
21 21
21 22
21 23
21 26
21 29
22 3
22 4
22 5
22 9
22 21
22 22
22 23
22 26
22 29
23 3
23 9
23 19
23 21
23 22
23 23
23 26
23 29
24 24
25 25
26 3
26 4
26 5
26 9
26 21
26 22
26 23
26 26
26 29
27 27
28 28
29 3
29 5
29 9
29 19
29 21
29 22
29 23
29 26
29 29


array([[ 1.00020004e+00, -1.66561592e-01,  6.72396272e-01,
         1.57433081e-01,  6.86968090e-02,  6.05265594e-02,
        -2.82428860e-02,  4.50957782e-01,  3.05017723e-02,
         2.01725578e-01,  2.77296948e-02,  2.31719253e-01,
         3.61563783e-02,  2.59172678e-01,  2.05400351e-02,
        -1.38174796e-03,  2.73975126e-01, -1.48475748e-02,
         2.65693135e-02, -8.01380915e-02,  2.09611254e-02,
         1.97837274e-01,  1.39157703e-01,  1.44318200e-01,
        -3.95191993e-02, -5.28985122e-02,  7.68605514e-02,
        -3.41906373e-02, -2.50395557e-02,  1.23260885e-01],
       [-1.66561592e-01,  1.00020004e+00,  2.09193913e-01,
        -2.68996335e-01, -1.87564817e-01, -1.78423344e-01,
         1.12424234e-01,  6.94216772e-02,  9.50492514e-03,
        -1.60317604e-01,  3.42170231e-01, -4.39804499e-01,
        -1.21022910e-01, -1.54319249e-01,  1.07678496e-02,
         1.90075777e-02,  3.10452671e-01, -3.64396323e-03,
         6.05810953e-03,  1.79022163e-01, -3.92927303e-

## Apply methods

### Gradient descent

In [38]:
w_initial = np.array([0 for _ in range(tX.shape[1])])
max_iters = 50
gamma = 0.7
weights, loss = least_squares_GD(y, tX, w_initial, max_iters, gamma)

In [39]:
print(weights, loss)

[ 1.32393860e+251  5.15667824e+250  9.58694478e+250  1.12044675e+251
  1.99652335e+249  3.44287680e+251 -9.57727049e+248  2.48536433e+249
  2.95665223e+250  2.83705867e+251  1.69658336e+249  3.16110981e+248
  4.96867741e+248  5.25854670e+250 -1.90442678e+247  4.45580236e+246
  5.99095621e+250 -1.23645488e+247  1.66615787e+248  6.37052052e+250
  8.49397069e+247  3.48458615e+251  1.84304865e+249  1.09417700e+251
  1.12349261e+248  1.05186312e+248  4.52019213e+250  1.40282358e+248
  4.78967058e+247  1.71210818e+251] inf


## Generate predictions and save ouput in csv format for submission:

In [10]:
DATA_TEST_PATH = 'data/test.csv'
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [31]:
OUTPUT_PATH = 'data/predict.csv'
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)