In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from helpers import *
from utils import *
from implementations import *
from feature_filling import *
%load_ext autoreload
%autoreload 2

## Load data

In [2]:
# Load labels, feature matrix and ids from the training CSV via the project helper.
train_path = "../data/train.csv"
y, X, ids = load_csv_data(path=train_path)

# Report the dimensions of the loaded arrays.
print("Input data shape", np.shape(X))
print("Labels data shape", np.shape(y))

Input data shape (250000, 30)
Labels data shape (250000,)


In [3]:
# Recode every -1 label as 0 in place; all other label values are left untouched.
y[y == -1] = 0
y

array([1., 0., 0., ..., 1., 0., 0.])

In [4]:
# Remove five feature columns by position.
# NOTE(review): the reason these particular columns are dropped is not recorded here.
# Not idempotent: X is rebound to the reduced matrix, so re-running this cell
# deletes the same positions again from the already reduced X.
dropped_columns = [15, 18, 20, 25, 28]
X = np.delete(X, dropped_columns, axis=1)
X.shape

(250000, 25)

In [5]:
# Inspect the feature matrix; the displayed rows contain -999.0 entries.
X

array([[ 138.47 ,   51.655,   97.827, ...,   46.062,    1.24 ,  113.497],
       [ 160.937,   68.768,  103.235, ..., -999.   , -999.   ,   46.226],
       [-999.   ,  162.172,  125.953, ..., -999.   , -999.   ,   44.251],
       ...,
       [ 105.457,   60.526,   75.839, ..., -999.   , -999.   ,   41.992],
       [  94.951,   19.362,   68.812, ..., -999.   , -999.   ,    0.   ],
       [-999.   ,   72.756,   70.831, ..., -999.   , -999.   ,    0.   ]])

In [6]:
# Turn every -999.0 entry into NaN (in place) so that the cells below can
# locate those entries with np.isnan.
X[X == -999.0] = np.nan
X

array([[138.47 ,  51.655,  97.827, ...,  46.062,   1.24 , 113.497],
       [160.937,  68.768, 103.235, ...,     nan,     nan,  46.226],
       [    nan, 162.172, 125.953, ...,     nan,     nan,  44.251],
       ...,
       [105.457,  60.526,  75.839, ...,     nan,     nan,  41.992],
       [ 94.951,  19.362,  68.812, ...,     nan,     nan,   0.   ],
       [    nan,  72.756,  70.831, ...,     nan,     nan,   0.   ]])

In [7]:
# Split features and labels with the project helper, passing 0.7 as the ratio.
# NOTE(review): whether split_data shuffles or uses a fixed seed is not visible
# here — confirm in the helper if reproducibility matters.
x_tr, x_val, y_tr, y_val = split_data(X, y, 0.7)

# Print the four resulting shapes on one line.
print(*(part.shape for part in (x_tr, x_val, y_tr, y_val)))

(175000, 25) (75000, 25) (175000,) (75000,)


#### Impute missing values in the training split

In [8]:
# Positions of the training columns that contain no NaN in any row.
full_columns_idx = np.flatnonzero(~np.isnan(x_tr).any(axis=0))
full_columns_idx

array([ 1,  2,  3,  7,  8,  9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 24],
      dtype=int64)

In [9]:
# Gather the complete columns into a separate matrix used as regression input.
# This is a copy, so later in-place writes to x_tr do not change it.
learning_matrix = np.take(x_tr, full_columns_idx, axis=1)
learning_matrix.shape

(175000, 16)

In [10]:
# Positions of the training columns that contain at least one NaN.
none_columns_idx = np.flatnonzero(np.isnan(x_tr).any(axis=0))
none_columns_idx

array([ 0,  4,  5,  6, 12, 20, 21, 22, 23], dtype=int64)

In [11]:
# Fill each incomplete training column with predictions from a ridge regression
# that maps the complete columns to that column. The regression is fitted on the
# rows where the column is present (third argument 10, presumably the
# regularization strength — confirm in ridge_regression).
for column in none_columns_idx:
    target = x_tr[:, column]  # view into x_tr: assignments below update x_tr in place
    missing = np.isnan(target)
    observed = ~missing

    weights, _ = ridge_regression(target[observed], learning_matrix[observed], 10)

    # A simpler alternative noted by the author: fill with the median of the observed values.
    target[missing] = learning_matrix[missing] @ weights

In [12]:
# Sanity check: number of NaN entries left in x_tr after the imputation loop.
np.isnan(x_tr).sum()

0

#### Impute missing values in the validation split

In [13]:
# Number of NaN entries in the validation split before they are filled.
np.isnan(x_val).sum()

390139

In [14]:
# Positions of the validation columns that contain no NaN in any row.
# NOTE(review): this overwrites the index set computed from the training split
# and derives it from x_val instead; nothing here guarantees the two sets match.
full_columns_idx = np.flatnonzero(~np.isnan(x_val).any(axis=0))
full_columns_idx

array([ 1,  2,  3,  7,  8,  9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 24],
      dtype=int64)

In [15]:
# Gather the complete validation columns into a separate matrix (a copy, so
# later in-place writes to x_val do not change it). Rebinds learning_matrix.
learning_matrix = np.take(x_val, full_columns_idx, axis=1)
learning_matrix.shape

(75000, 16)

In [16]:
# Positions of the validation columns that contain at least one NaN.
none_columns_idx = np.flatnonzero(np.isnan(x_val).any(axis=0))
none_columns_idx

array([ 0,  4,  5,  6, 12, 20, 21, 22, 23], dtype=int64)

In [17]:
# Fill each incomplete validation column with predictions from a ridge
# regression that maps the complete columns to that column (third argument 10,
# presumably the regularization strength — confirm in ridge_regression).
# NOTE(review): the regression is refitted on the validation rows themselves
# rather than reusing weights learned on the training split, so the two splits
# are imputed by different models and validation statistics enter the
# preprocessing — consider fitting once on training data and applying it here.
for column in none_columns_idx:
    target = x_val[:, column]  # view into x_val: assignments below update x_val in place
    missing = np.isnan(target)
    observed = ~missing

    weights, _ = ridge_regression(target[observed], learning_matrix[observed], 10)

    # A simpler alternative noted by the author: fill with the median of the observed values.
    target[missing] = learning_matrix[missing] @ weights

In [18]:
# Sanity check: number of NaN entries left in x_val after the imputation loop.
np.isnan(x_val).sum()

0

## Train and evaluate

In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
# Standardize the training features with the project helper and rebind x_tr to
# the result. The helper's second and third return values are discarded.
# NOTE(review): those are presumably the mean and std used for scaling —
# confirm in the helper; keeping them would allow the validation split to be
# scaled with the training statistics.
x_tr, _, _ = standardize(x_tr)

In [21]:
# Inspect the training features after standardization.
x_tr

array([[ 5.37940932e-01, -4.81422716e-01,  1.14489702e-01, ...,
        -3.40144526e-01, -5.15103744e-01,  9.84106347e-04],
       [ 1.03838817e+00, -4.78646444e-01,  6.41188255e-01, ...,
        -3.96586657e-01, -5.15328707e-01, -5.15670557e-01],
       [ 2.62998419e-01,  2.97230604e-01, -8.81031549e-02, ...,
        -4.02793378e-01, -5.15913877e-01, -5.15670557e-01],
       ...,
       [ 9.07270926e-01,  1.37986186e-01,  5.06865974e-01, ...,
        -3.96285985e-01, -5.15376719e-01, -5.15670557e-01],
       [ 9.20573452e-01,  5.61608923e-01,  2.65716159e-01, ...,
        -3.12641495e-01, -5.16011433e-01, -1.24966480e-01],
       [ 5.24509776e-01, -4.40818391e-01,  2.34834147e-01, ...,
        -3.47473328e-01, -5.15850696e-01, -1.46351281e-01]])

In [22]:
# Check the shape of the training matrix that is passed to the classifier.
x_tr.shape

(175000, 25)

In [23]:
# Inspect the training labels (the -1 labels were recoded to 0 before the split).
y_tr

array([0., 1., 0., ..., 0., 1., 0.])

In [24]:
# Check the shape of the training label vector.
y_tr.shape

(175000,)

In [25]:
# Configure the logistic-regression classifier:
#   C=10                     inverse regularization strength (larger = weaker penalty)
#   class_weight="balanced"  reweights classes inversely to their frequency
#   max_iter=5000            iteration cap for the solver
#   random_state=0           fixed seed
clf = LogisticRegression(
    C=10,
    class_weight="balanced",
    max_iter=5000,
    random_state=0,
)

# fit() returns the estimator itself, so rebinding keeps the cell output empty.
clf = clf.fit(x_tr, y_tr)

In [26]:
# Mean accuracy of the fitted classifier on the training split.
clf.score(x_tr, y_tr)

0.7376285714285714

In [27]:
# Standardize the validation features, then report the classifier's mean
# accuracy on the validation split.
# NOTE(review): standardize() is applied to x_val on its own, so the scaling
# parameters appear to come from the validation split rather than from the
# training split the classifier was fitted on — confirm against the helper and
# consider reusing the training statistics so both splits share one scale.
x_val, _, _ = standardize(x_val)
clf.score(x_val, y_val)

0.7377066666666666