## Import libraries

In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from helpers import *
from utils import *
from implementations import *
from feature_filling import *
%load_ext autoreload
%autoreload 2

## Load the data

In [2]:
# Read the training file: labels (y), feature matrix (X) and the ids column.
y, X, ids = load_csv_data(path="resources/train.csv")

# Quick sanity check on the dimensions of what was loaded.
print("Input data shape", np.shape(X))
print("Labels data shape", np.shape(y))

Input data shape (250000, 30)
Labels data shape (250000,)


In [35]:
# Replace the -999 placeholder with NaN (in place) so the later median-filling
# step can detect the gaps.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0 and
# raises AttributeError there.
X[X == -999] = np.nan

# Recode the negative class from -1 to 0 (in place). A boolean mask is
# sufficient, so the extra np.where call is not needed.
y[y == -1] = 0
y

[ 51.655  68.768 162.172 ...  60.526  19.362  72.756]


array([1., 0., 0., ..., 1., 0., 0.])

In [9]:
# Split features and labels into a training part and a validation part;
# 0.8 is the ratio handed to split_data.
# NOTE(review): if split_data shuffles, fix a seed so the split is
# reproducible — TODO confirm against its implementation.
x_tr, x_val, y_tr, y_val = split_data(X, y, 0.8)

print("Training data shapes", *(part.shape for part in (x_tr, y_tr)))
print("Validation data shapes", *(part.shape for part in (x_val, y_val)))

Training data shapes (200000, 30) (200000,)
Validation data shapes (50000, 30) (50000,)


## Fill the data

In [54]:
# Compute the fill values from the training split only.
columns_with_missing_features, feature_medians = calculate_feature_medians(x_tr)

# One print call, one item per line: the affected column indices first, then
# the shape of the medians array.
print(columns_with_missing_features, feature_medians.shape, sep="\n")

[12 23 24 25 26 27 28]
(7,)


In [61]:
# Fill the gaps in BOTH splits with the medians computed on the training split.
# Previously only x_tr was filled, so x_val (cut from the same NaN-marked X)
# still carried NaNs into standardisation and scoring.
# NOTE(review): in the output last saved with this cell, columns 4-6 of the
# first row held the values listed as medians for columns 12, 23 and 24 --
# verify that fill_features_with_median pairs each median with its own column.
x_tr = fill_features_with_median(x_tr, columns_with_missing_features, feature_medians)
x_val = fill_features_with_median(x_val, columns_with_missing_features, feature_medians)

print(columns_with_missing_features)
print(feature_medians)

# Make leftover gaps visible right here instead of discovering them as NaN
# losses during training.
print(
    "Remaining NaNs (train, validation):",
    int(np.isnan(x_tr).sum()),
    int(np.isnan(x_val).sum()),
)

[12 23 24 25 26 27 28]
[ 4.5100e-01  6.5477e+01 -3.0000e-03 -3.4000e-02  4.7876e+01 -1.2000e-02
 -1.0000e-03]


array([[ 1.27492e+02,  1.53650e+01,  8.51300e+01,  6.44700e+00,
         4.51000e-01,  6.54770e+01, -3.00000e-03,  2.93200e+00,
         6.44700e+00,  8.54190e+01,  5.02000e-01, -1.29400e+00,
         4.51000e-01,  5.68780e+01,  6.26000e-01,  2.70000e-01,
         2.85400e+01, -1.01000e-01, -2.57000e+00,  3.63630e+01,
        -3.05200e+00,  1.47057e+02,  0.00000e+00,  6.54770e+01,
        -3.00000e-03, -3.40000e-02,  4.78760e+01, -1.20000e-02,
        -1.00000e-03,  0.00000e+00],
       [ 9.43680e+01,  3.10450e+01,  6.64580e+01,  4.58220e+01,
         2.10300e+00,  1.16093e+02, -1.00000e+00,  2.31700e+00,
         2.46690e+01,  1.55259e+02,  1.26200e+00,  1.40400e+00,
         3.40000e-01,  3.11650e+01,  1.74000e-01,  1.81000e-01,
         3.93160e+01,  7.68000e-01,  2.42100e+00,  1.54200e+01,
         1.05700e+00,  2.48829e+02,  2.00000e+00,  5.39770e+01,
         7.27000e-01, -2.45200e+00,  3.08000e+01, -1.37600e+00,
        -1.07700e+00,  8.47780e+01],
       [ 2.32744e+02,  2.19210

In [None]:
# Standardise each split, then add the bias term. Only element [0] of what
# standardize() returns is used (presumably the transformed data -- TODO confirm).
# NOTE(review): each split goes through its own standardize() call; confirm
# whether x_val should instead reuse the statistics of x_tr.
# NOTE(review): the results overwrite x_tr / x_val, so re-running this cell
# transforms the already-transformed arrays a second time.
x_tr, x_val = (add_bias_term(standardize(split)[0]) for split in (x_tr, x_val))

print("Training data shapes", np.shape(x_tr))
print("Validation data shapes", np.shape(x_val))

## Train

In [None]:
# Initial weights: all zeros, one entry per column of x_tr.
initial_w = np.zeros(x_tr.shape[1])

# The third positional argument (0) is presumably the regularisation strength,
# i.e. no penalty -- TODO confirm against reg_logistic_regression's signature.
weights, loss = reg_logistic_regression(
    y_tr, x_tr, 0, initial_w, max_iters=10000, gamma=0.2
)

In [None]:
# Score the trained weights on each split, in the same order as before:
# training first, then validation.
for caption, targets, features in (
    ("Training score", y_tr, x_tr),
    ("Validation score", y_val, x_val),
):
    print(caption, compute_score(targets, features, weights))

In [None]:
# Plot the loss values returned by training against their position in the
# sequence, using the explicit figure/axes interface.
fig, ax = plt.subplots()
ax.plot(loss)  # x defaults to 0..len(loss)-1, same as the former np.arange
ax.set_xlabel("Iteration")  # label was misspelled "Iterarion"
ax.set_ylabel("Loss")
ax.set_title("Training loss")
plt.show()

## Testing

In [None]:
# Read the test file with the same loader used for training; the first
# returned value (the labels slot in the training call) is not used here.
_, XTest, idsTest = load_csv_data(path="resources/test.csv")

print("Input data shape", np.shape(XTest))

In [None]:
# Give the test features the same missing-value treatment as the training
# data: turn the -999 placeholder into NaN, then fill those gaps with the
# medians computed on the training split. Without this the model is applied
# to raw -999 values it never saw during training.
XTest[XTest == -999] = np.nan
XTest = fill_features_with_median(XTest, columns_with_missing_features, feature_medians)

# NOTE(review): standardize() is called on the test set by itself here --
# confirm whether it should reuse the statistics of the training split.
# NOTE(review): XTest is overwritten, so re-running this cell adds the bias
# term a second time.
XTest = add_bias_term(standardize(XTest)[0])
XTest.shape

In [None]:
# Call predictions() once per test row with the trained weights and collect
# the results into a single array.
yTest = np.asarray([predictions(sample, weights) for sample in XTest])

In [None]:
# Switch every 0 prediction to -1 (in place), then show the result.
yTest[yTest == 0] = -1
print(yTest)

In [None]:
# Write the test ids together with their predicted labels to the submission file.
submission_file = "01.csv"
create_csv_submission(idsTest, yTest, submission_file)

In [None]:
# Fraction of test rows whose predicted label equals 1.
np.sum(yTest == 1) / np.shape(yTest)[0]