## Imports and global variables

In [None]:
import sys
import math
import numpy as np
import matplotlib.pyplot as plt

# Add ./src/ to the module search path; presumably this is where the
# project modules imported in the next cell live -- TODO confirm.
sys.path.append("./src/")

In [None]:
from implementations import *
from helpers import *

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
# Location of the csv files. Paths are built by plain concatenation,
# i.e. DATA_FOLDER + TRAIN_FILE and DATA_FOLDER + TEST_FILE.
DATA_FOLDER = './data/'
TRAIN_FILE = './train.csv'
TEST_FILE = './test.csv'

# Sentinel used in the loaded data for a missing value; it is replaced by
# np.nan right after loading.
NAN_VALUE = -999.0
INTEGER_COLUMN = 22 # column 24 in the raw csv file, i.e. 22 once the id and prediction columns are removed (24 - 2); presumably the only integer-valued feature -- TODO confirm

# For debug purpose only: forwarded to load_data as `sub_sample`.
SUB_SAMPLE = True

# Import dataset

In [None]:
# Read the training and test sets from the data folder. SUB_SAMPLE is
# forwarded to load_data unchanged.
train_path = DATA_FOLDER + TRAIN_FILE
test_path = DATA_FOLDER + TEST_FILE
x_tr, y_tr = load_data(train_path, sub_sample=SUB_SAMPLE)
x_te, y_te = load_data(test_path, sub_sample=SUB_SAMPLE)

print("x_tr shape : {}, y_tr shape : {}".format(x_tr.shape, y_tr.shape))
print("x_te shape : {}, y_te shape : {}".format(x_te.shape, y_te.shape))

# Turn the missing-value sentinel into a real NaN, in place, so that the
# NaN-aware numpy routines used later can skip those entries.
for _features in (x_tr, x_te):
    _features[_features == NAN_VALUE] = np.nan

# Data exploration

Features with too many NaN values

In [None]:
# Share of missing values above which a feature is reported as mostly empty.
MAX_NAN_RATIO = 0.5

# Number of NaN entries per column, then the fraction of samples that are
# missing in each column. The divisor is the number of rows (samples), not
# the number of columns, so that the ratio lies in [0, 1].
nb_nan = np.count_nonzero(np.isnan(x_tr), axis=0)
nan_ratio = nb_nan / x_tr.shape[0]

# Strict comparison, matching both the printed messages and the (disabled)
# column filter `nan_ratio <= MAX_NAN_RATIO` used in the cleaning cell.
print("Nb Columns with > {:.2f} nan ratio : {}".format(MAX_NAN_RATIO, np.count_nonzero(nan_ratio > MAX_NAN_RATIO)))

# Column names are taken from the csv header line. The line terminator is
# dropped so that the last name does not carry a trailing newline.
with open(DATA_FOLDER + TRAIN_FILE) as dataset:
    col_names = dataset.readline().rstrip('\r\n').split(',')

# The first two header entries are skipped because x_tr does not contain
# them (feature i of x_tr is header entry i + 2).
nan_col_names = [
    col_name
    for col_name, ratio in zip(col_names[2:], nan_ratio)
    if ratio > MAX_NAN_RATIO
]

print("Columns with > {:.2f} nan ratio :".format(MAX_NAN_RATIO))
print(nan_col_names)


In [None]:
# One histogram per feature on a 5 x 6 grid, filled column by column.
fig, axs = plt.subplots(5, 6, sharex=False, sharey=False, figsize=(10, 6))

# The first two header entries have no counterpart in x_tr, hence the [2:].
for col_idx, title in enumerate(col_names[2:]):
    grid_col, grid_row = divmod(col_idx, 5)
    ax = axs[grid_row, grid_col]

    # Missing values are left out of the histogram.
    values = x_tr[:, col_idx]
    ax.hist(values[~np.isnan(values)], bins=20)
    ax.set_title(title)

plt.tight_layout()

# fig.title("Histograms of raw features (exclude NAN values)")
plt.show()

In [None]:
# Display one header name. Header entry 6 corresponds to feature column 4
# of x_tr, since the first two header entries are not part of x_tr.
col_names[6]

In [None]:
# Histogram and box plot of a single feature, looked up by its header name.
col_name_to_plot = 'DER_pt_h'

# Header positions are shifted by 2 with respect to the columns of x_tr.
col_idx_to_plot = [
    header_idx - 2
    for header_idx, header_name in enumerate(col_names)
    if header_name == col_name_to_plot
]
col_to_plot = x_tr[:, col_idx_to_plot]

# Keep only the observed (non-NaN) values; both plots use the same data.
col_nan = col_to_plot[~np.isnan(col_to_plot)]

fig, axs = plt.subplots(1, 2)
hist_ax, box_ax = axs

hist_ax.hist(col_nan, bins=20)
hist_ax.set_title(col_name_to_plot)

box_ax.boxplot(col_nan)
box_ax.set_title(col_name_to_plot)

plt.show()

In [None]:
# Log-log plot of the observed (non-NaN) values of the feature selected in
# the previous cell, drawn against their position in the array.
plt.loglog(col_to_plot[~np.isnan(col_to_plot)])
# A separating space keeps the feature name readable in the title.
plt.title(col_name_to_plot + " log log plot")
plt.show()

In [None]:
# heatmap
# np.corrcoef(col_to_plot)

# Feature engineering / Data cleaning

Basic transformations
- Remove features which contain too many NaN values, because they don't carry enough information
- Standardize the training data
- Replace missing values

In [None]:
# Optional step, currently disabled: drop the columns whose NaN ratio
# exceeds the threshold.
# x_tr = x_tr[:, nan_ratio <= MAX_NAN_RATIO]
# x_te = x_te[:, nan_ratio <= MAX_NAN_RATIO]

print("x_tr shape : {}".format(x_tr.shape))
print("x_te shape : {}".format(x_te.shape))

# Standardize first, while the missing entries are still NaN. The test set
# reuses the statistics returned for the training set.
x_tr, mean_x, std_x = standardize_training(x_tr, missing_values=True)
x_te = standardize_test(x_te, mean_x, std_x)

# Fill the remaining NaN entries using the training means.
# NOTE(review): mean_x is what standardize_training returned; if that is the
# mean of the raw data, the standardized features should arguably be filled
# with their own mean instead -- confirm against replace_nan_by_means.
x_tr = replace_nan_by_means(x_tr, mean_data=mean_x)
x_te = replace_nan_by_means(x_te, mean_data=mean_x)

# No missing value may be left after imputation.
assert not np.isnan(x_tr).any()
assert not np.isnan(x_te).any()

tr_low, tr_high = np.nanmin(x_tr), np.nanmax(x_tr)
te_low, te_high = np.nanmin(x_te), np.nanmax(x_te)
print("x_tr range :{} {}".format(tr_low, tr_high))
print("x_te range :{} {}".format(te_low, te_high))

# Model fitting and predictions

In [None]:
# Add offset term to x
xt_tr = add_offset(x_tr)
xt_te = add_offset(x_te)

# Fit a model on the offset-augmented features. The same augmented form must
# be used for evaluation, otherwise the weight vector and the feature matrix
# do not describe the same model.
w, loss_tr = least_squares(y_tr, xt_tr)
loss_te = compute_mse(y_te, xt_te, w)

print("Training loss : {}".format(loss_tr))
print("Test loss : {}".format(loss_te))