### HERE IS OUR PARAMETERS VALIDATION FILE

In [34]:
# Useful starting lines
%matplotlib inline

import random
from datetime import datetime

import numpy as np
import numpy as np
import matplotlib.pyplot as plt
import time


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
from helpers import *
from helpers_create_data import *
from implementations import *

### LOADING THE CSV DATA INTO ARRAYS

In [4]:
x_train, x_test, y_train, train_ids, test_ids = load_csv_data("./dataset", sub_sample=False)

#### FEATURE CHOICES :
From the dataset, we filter the features in 3 different ways :

1. Using (almost) all the features

2. Choosing 19 features (taken mostly from https://medium.com/@alexteboul17/building-predictive-models-for-heart-disease-using-the-2015-behavioral-risk-factor-surveillance-b786368021ab)

3. From the 19 features, we pick the 10 best ones using the best correlations towards the y variable

### DATA PREPROCESSING

#### 1. All features

In [36]:
# Change all the elements with -1 by 0
y_train_working = y_train.copy()
y_train_working[y_train_working == -1] = 0
# Make y have the correct shape
y_train_working = y_train_working.reshape(-1, 1)

# Shuffle the data
np.random.seed(6)
indices = np.arange(x_train.shape[0])
np.random.shuffle(indices)
X_shuffled = x_train[indices]
y_train_working_shuffled = y_train_working[indices]

# Split the data into training and validation sets (90% training, 10% validation)
X_train_all, X_val_all, Y_train_all, Y_val_all = split_train_val(X_shuffled, y_train_working_shuffled, 10, 9)

# we drop the rows that has a NaN percentage of {threshold} because we assume that they don't offer much information
X_tr_all, Y_tr_all = drop_rows_with_nan(X_train_all, Y_train_all, threshold=0.4)

# we process the dataset by replacing the remaining NaNs by column with the mode of the feature column that have less than 10 unique values and by its 
# mean if the feature column has more than 10 unique values. Also we remove the columns that have extremely low variance as this column 
# doesn't offer any information and we might encouter numerical issues when standardizing.
X_tr_all, X_val_all, X_test_all = process_datasets(X_tr_all, X_val_all, x_test, unique_values_thresh=10)

# We now balance the data to a slightly more balanced ratio of 0s and 1s
X_tr_all, Y_tr_all = undersampling_oversampling(X_tr_all, Y_tr_all, ratio_majority=0.5, ratio_majority_to_minority=2)

# We add a column of ones (bias term) to the dataset
X_tr_all = np.c_[np.ones((X_tr_all.shape[0], 1)), X_tr_all]
X_val_all = np.c_[np.ones((X_val_all.shape[0], 1)), X_val_all]
X_test_all = np.c_[np.ones((X_test_all.shape[0], 1)), X_test_all]

#### 2. 19 Best Features

In [24]:
# We had to create the make_data function (-> helpers_create_data) because we manually preprocess 
# each feature in this case and do all the necessary further preprocessing within the function 
X_tr_19, Y_tr_19, X_val_19, Y_val_19, X_test_19 = make_data('./dataset/x_train.csv', './dataset/x_test.csv', x_train, x_test, y_train, replace=False)

# Same as for all features, we re-balance the dataset 
X_tr_19, Y_tr_19 = undersampling_oversampling(X_tr_19, Y_tr_19, ratio_majority=1, ratio_majority_to_minority=2)

# We add a column of ones (bias term) before training
X_tr_19 = np.c_[np.ones((X_tr_19.shape[0], 1)), X_tr_19]
X_val_19 = np.c_[np.ones((X_val_19.shape[0], 1)), X_val_19]
X_test_19 = np.c_[np.ones((X_test_19.shape[0], 1)), X_test_19]

#Reshape form (#points,1) to (#points,) in order to use the implemented logistic regression function
Y_tr_19 = Y_tr_19.reshape(-1)
Y_val_19 = Y_val_19.reshape(-1)

In [80]:
X_tr_19[:, 1:].shape

(234226, 19)

#### 3. 10 Best Features

In [None]:
# We compute the correlations of all the feature variables with the output variable
correlations = np.zeros(X_tr_19.shape[1])
for i in range(len(correlations)):
    if i == 0:
        continue
    else:
        correlations[i] = np.abs(np.corrcoef(X_tr_19[:,i], Y_tr_19)[0,1])

# We take the 10 biggest ones
best_10_idx = np.argsort(correlations)[:-11:-1]

# We use those 10 features for the dataset
X_tr_10 = X_tr_19[:,best_10_idx]
X_val_10 = X_val_19[:,best_10_idx]
X_test_10 = X_test_19[:,best_10_idx]
Y_tr_10 = Y_tr_19
Y_val_10 = Y_val_19

# We add a column of ones (bias term) before training
X_tr_10 = np.c_[np.ones((X_tr_10.shape[0], 1)), X_tr_10]
X_val_10 = np.c_[np.ones((X_val_10.shape[0], 1)), X_val_10]
X_test_10 = np.c_[np.ones((X_test_10.shape[0], 1)), X_test_10]

### MODELS TRAINING

#### 1. All features
(takes a while to run...)

In [None]:
max_iter = 10000

lambdas = np.logspace(-7,-1,7)
gammas = np.array([0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1])

# add 0 to lambdas to test the case without regularization
lambdas = np.insert(lambdas, 0, 0)

accuracies_all = np.zeros((8,7))
f1_scores_all = np.zeros((8,7))

ws_all = []
losses_all = []

for i, lambda_ in enumerate(lambdas):
    for j, gamma in enumerate(gammas):
        w_reg_all = np.zeros(X_tr_all.shape[1])
        
        #Reshape form (#points,1) to (#points,) in order to use the implemented logistic regression function
        Y_tr_all = Y_tr_all.reshape(-1)
        Y_val_all = Y_val_all.reshape(-1)

        #Train model (-> our train set) using stocha logistic regression
        w, loss = reg_logistic_regression(Y_tr_all, X_tr_all, lambda_, w_reg_all, max_iter, gamma)
        Y_pred_all = prediction(X_val_all, w)

        accuracies_all[i,j] = compute_accuracy(Y_val_all, Y_pred_all)
        f1_scores_all[i,j] = f1(Y_pred_all, Y_val_all)
        ws_all.append(w)
        losses_all.append(loss)
        
best_idx_f1_all = np.unravel_index(np.argmax(f1_scores_all), f1_scores_all.shape)
best_idx_acc_all = np.unravel_index(np.argmax(accuracies_all), accuracies_all.shape)

In [57]:

print(f'When using all features, the best index for accuracy is {best_idx_acc_all} and the best one for f1 score is {best_idx_f1_all} on the val set.')
print(f'When using all features, the best setting for f1 score is lambda = {lambdas[best_idx_f1_all[0]]} and gamma = {gammas[best_idx_f1_all[1]]}.')
print(f'When using all features, the best setting for accuracy is lambda = {lambdas[best_idx_acc_all[0]]} and gamma = {gammas[best_idx_acc_all[1]]}.')
print(f'\nWhen using all features, the best accuracy is {accuracies_all[best_idx_acc_all]} and the best f1 score is {f1_scores_all[best_idx_f1_all]} on the val set.')

When using all features, the best index for accuracy is (5, 6) and the best one for f1 score is (0, 5) on the val set.
When using all features, the best setting for f1 score is lambda = 0.0 and gamma = 0.05.
When using all features, the best setting for accuracy is lambda = 0.001 and gamma = 0.1.

When using all features, the best accuracy is 91.5399384390333 and the best f1 score is 0.42239900171580097 on the val set.


#### 2. 19 Features
takes a looooong time... (please don't rerun)

In [97]:
max_iter = 10000

lambdas = np.logspace(-6,-1,6)
gammas = np.array([0.001, 0.005, 0.01, 0.05, 0.1])

# add 0 to lambdas to test the case without regularization
lambdas = np.insert(lambdas, 0, 0)

accuracies_19 = np.zeros((7,5))
f1_scores_19 = np.zeros((7,5))

ws_19 = []
losses_19 = []

for i, lambda_ in enumerate(lambdas):
    for j, gamma in enumerate(gammas):
        # Initialize the weights
        w_reg_19 = np.zeros(X_tr_19.shape[1])

        #Train model (-> our train set) using stocha logistic regression
        w, loss = reg_logistic_regression(Y_tr_19, X_tr_19, lambda_, w_reg_19, max_iter, gamma)
        Y_pred_19 = prediction(X_val_19, w)

        accuracies_19[i,j] = compute_accuracy(Y_val_19, Y_pred_19)
        f1_scores_19[i,j] = f1(Y_pred_19, Y_val_19)
        ws_19.append(w)
        losses_19.append(loss)
        
best_idx_f1_19 = np.unravel_index(np.argmax(f1_scores_19), f1_scores_19.shape)
best_idx_acc_19 = np.unravel_index(np.argmax(accuracies_19), accuracies_19.shape)

print(f'When using 19 features, the best index for accuracy is {best_idx_acc_19} and the best one for f1 score is {best_idx_f1_19} on the val set.')
print(f'When using 19 features, the best setting for f1 score is lambda = {lambdas[best_idx_f1_19[0]]} and gamma = {gammas[best_idx_f1_19[1]]}.')
print(f'When using 19 features, the best setting for accuracy is lambda = {lambdas[best_idx_acc_19[0]]} and gamma = {gammas[best_idx_acc_19[1]]}.')
print(f'\nWhen using 19 features, the best accuracy is {accuracies_19[best_idx_acc_19]} and the best f1 score is {f1_scores_19[best_idx_acc_19]} on the val set.')

When using 19 features, the best index for accuracy is (0, 3) and the best one for f1 score is (1, 1) on the val set.
When using 19 features, the best setting for f1 score is lambda = 1e-06 and gamma = 0.005.
When using 19 features, the best setting for accuracy is lambda = 0.0 and gamma = 0.05.

When using 19 features, the best accuracy is 85.31679517264499 and the best f1 score is 0.4114341558758856 on the val set.


In [60]:
print(f'When using 19 features, the best index for accuracy is {best_idx_acc_19} and the best one for f1 score is {best_idx_f1_19} on the val set.')
print(f'When using 19 features, the best setting for f1 score is lambda = {lambdas[best_idx_f1_19[0]]} and gamma = {gammas[best_idx_f1_19[1]]}.')
print(f'When using 19 features, the best setting for accuracy is lambda = {lambdas[best_idx_acc_19[0]]} and gamma = {gammas[best_idx_acc_19[1]]}.')
print(f'\nWhen using 19 features, the best accuracy is {accuracies_19[best_idx_acc_19]} and the best f1 score is {f1_scores_19[best_idx_f1_19]} on the val set.')

When using 19 features, the best index for accuracy is (1, 5) and the best one for f1 score is (2, 5) on the val set.
When using 19 features, the best setting for f1 score is lambda = 1e-05 and gamma = 0.01.
When using 19 features, the best setting for accuracy is lambda = 1e-06 and gamma = 0.01.

When using 19 features, the best accuracy is 84.82613598268979 and the best f1 score is 0.406396779244017 on the val set.


#### 3. 10 Features

In [98]:
max_iter = 10000

lambdas = np.logspace(-6,-1,6)
gammas = np.array([0.001, 0.005, 0.01, 0.05, 0.1])

# add 0 to lambdas to test the case without regularization
lambdas = np.insert(lambdas, 0, 0)

accuracies_10 = np.zeros((7,5))
f1_scores_10 = np.zeros((7,5))

ws_10 = []
losses_10 = []

for i, lambda_ in enumerate(lambdas):
    for j, gamma in enumerate(gammas):
        w_reg_10 = np.zeros(X_tr_10.shape[1])

        #Train model (-> our train set) using stocha logistic regression
        w, loss = reg_logistic_regression(Y_tr_10, X_tr_10, lambda_, w_reg_10, max_iter, gamma)
        Y_pred_10 = prediction(X_val_10, w)

        accuracies_10[i,j] = compute_accuracy(Y_val_10, Y_pred_10)
        f1_scores_10[i,j] = f1(Y_pred_10, Y_val_10)
        ws_10.append(w)
        losses_10.append(loss)
        
best_idx_f1_10 = np.unravel_index(np.argmax(f1_scores_10), f1_scores_10.shape)
best_idx_acc_10 = np.unravel_index(np.argmax(accuracies_10), accuracies_10.shape)

# We pick our w according to the optimal f1_score
#best_w_10 = ws_10[np.argmax(f1_scores_10)]

print(f'When using 10 features, the best index for accuracy is {best_idx_acc_10} and the best one for f1 score is {best_idx_f1_10} on the val set.')
print(f'When using 10 features, the best setting for f1 score is lambda = {lambdas[best_idx_f1_19[0]]} and gamma = {gammas[best_idx_f1_19[1]]}.')
print(f'When using 10 features, the best setting for accuracy is lambda = {lambdas[best_idx_acc_19[0]]} and gamma = {gammas[best_idx_acc_19[1]]}.')
print(f'\nWhen using 10 features, the best accuracy is {accuracies_10[best_idx_acc_10]} and the best f1 score is {f1_scores_10[best_idx_f1_10]} on the val set.')

When using 10 features, the best index for accuracy is (0, 0) and the best one for f1 score is (3, 1) on the val set.
When using 10 features, the best setting for f1 score is lambda = 1e-06 and gamma = 0.005.
When using 10 features, the best setting for accuracy is lambda = 0.0 and gamma = 0.05.

When using 10 features, the best accuracy is 84.46042726967971 and the best f1 score is 0.39509954058192953 on the val set.
