## Imports and global variables

In [199]:
import numpy as np

import sys
sys.path.append("./src/")

In [200]:
from implementations import *
from helpers import *

%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [201]:
DATA_FOLDER = './data/'
TRAIN_FILE = './train.csv'
TEST_FILE = './test.csv'

NAN_VALUE = -999.0
INTEGER_COLUMN = 22 # 24 in raw csv file, but 23 when id and prediction column are removed

# Import dataset
Import dataset and apply basic transformations to be able to use it afterwards : 
- Replace labels chars by integers
- Standardize training df
- Replace missing values by the mean

In [202]:
# Load data
x_tr, y_tr = load_data(DATA_FOLDER + TRAIN_FILE, sub_sample=False)
x_te, y_te = load_data(DATA_FOLDER + TEST_FILE, sub_sample=False)

print("x_tr shape : ", x_tr.shape)
print("x_te shape : ", x_te.shape)

# Define missing values as NAN
x_tr[x_tr == NAN_VALUE] = np.nan
x_te[x_te == NAN_VALUE] = np.nan

# Standardize before replacing missing values
x_tr, mean_x, std_x = standardize_training(x_tr, missing_values=True)
x_te = standardize_test(x_te, mean_x, std_x)

# Replace missing data by the mean
x_tr = replace_nan_by_means(x_tr, mean_dataset=mean_x)
x_te = replace_nan_by_means(x_te, mean_dataset=mean_x)

assert(x_tr[np.isnan(x_tr)].shape[0] == 0)
assert(x_te[np.isnan(x_te)].shape[0] == 0)

print("x_tr range : %f %f" % (np.nanmin(x_tr), np.nanmax(x_tr)))
print("x_te range : %f %f" % (np.nanmin(x_te), np.nanmax(x_te)))

x_tr shape :  (250000, 30)
x_te shape :  (568238, 30)
x_tr range : -4.811018 371.783360
x_te range : -5.074944 371.783360


In [204]:
# Add bias term to x
xt_tr = add_offset(x_tr)
xt_te = add_offset(x_te)

# Fit a model
w, loss_tr = least_squares(y_tr, x_tr)
loss_te = compute_mse(y_te, x_te, w)

print("Training loss : %f" % loss_tr)
print("Test loss : %f" % loss_te)

Training loss : 0.088658
Test loss : 0.253838
