In [63]:
import numpy as np

from preprocessing import final_preprocessing
from utils import load_csv_data,train_test_split, predict_labels
from implementations import least_squares,predict_probabilities, least_squares_GD, least_squares_batch_GD, least_squares_SGD, logistic_regression, reg_logistic_regression, ridge_regression
from metrics import accuracy_score
from helper import tune_hyperparameter
# Enable IPython's autoreload extension in mode 2, so edited modules are
# reloaded before each cell executes and the kernel need not be restarted.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [64]:
# Load the training data.
# Paths are relative to the notebook's working directory.
DATA_TRAIN_PATH = '../data/train.csv'
# NOTE(review): DATA_TEST_PATH is defined but not used by any cell shown in this notebook.
DATA_TEST_PATH = '../data/test.csv'
# load_csv_data returns three values; only the first two (bound to y and X) are used, the third is discarded.
y, X, _ = load_csv_data(DATA_TRAIN_PATH)

In [65]:
# Preprocess X (as loaded above) with the project's final_preprocessing helper.
# The result is kept under a new name, so X itself is not rebound by this cell.
# The recorded run of this cell took roughly 49 s (30 progress-bar steps).
X_processed = final_preprocessing(X)

100%|██████████| 30/30 [00:49<00:00,  1.66s/it]


In [66]:
# Fixed settings shared by the cells below.
seed = 23       # passed as `seed` to tune_hyperparameter and as `random_state` to the split
k_fold = 4      # passed as `k_fold` to tune_hyperparameter
max_iters = 20  # passed as `max_iters` to tune_hyperparameter

# Split with test_size=0.2. The split is seeded from `seed` rather than a second
# hard-coded 23, so changing the seed above changes the whole experiment consistently.
X_train, y_train, X_test, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=seed)

In [67]:
# Least squares: fit on the training split, then score on the held-out split.

# least_squares returns two values; only the first (bound to w_ls) is used here.
w_ls, _ = least_squares(y_train, X_train)
# NOTE(review): predict_labels presumably returns hard labels, so `pred_prob`
# holds labels rather than probabilities despite its name - confirm.
pred_prob = predict_labels(X_test, w_ls)
accuracy = accuracy_score(y_test, pred_prob)
print(f"Least squares- accuracy on test data: {accuracy!s}")

Least squares- accuracy on test data: 0.80614


In [68]:
# Least squares GD: tune gamma with tune_hyperparameter, then score on the held-out split.

# Five log-spaced gamma candidates from 1e-10 to 1e-4.
# NOTE(review): the recorded run selected 1e-4, the largest candidate in this
# grid, so the best value may lie beyond the searched range.
gammas = np.logspace(-10, -4, 5)
gamma_gd, _, w_gd = tune_hyperparameter(
    y_train,
    X_train,
    seed=seed,
    max_iters=max_iters,
    k_fold=k_fold,
    hyperparameters=gammas,
    regression_function=least_squares_GD,
)
pred_prob = predict_labels(X_test, w_gd)
accuracy = accuracy_score(y_test, pred_prob)
print(f"Least squares GD- accuracy on test data: {accuracy!s}")

Best hyperparameters: hyperparameter 1 = 0.0001 hyperparameter 2 = 0 accuracy: 0.69285
Least squares GD- accuracy on test data: 0.6971


In [69]:
# Least squares SGD: tune gamma with tune_hyperparameter, then score on the held-out split.

# Ten log-spaced gamma candidates from 1e-5 to 1.
gammas = np.logspace(-5, 0, 10)
gamma_sgd, _, w_sgd = tune_hyperparameter(
    y_train,
    X_train,
    seed=seed,
    max_iters=max_iters,
    k_fold=k_fold,
    hyperparameters=gammas,
    regression_function=least_squares_SGD,
)
pred_prob = predict_labels(X_test, w_sgd)
accuracy = accuracy_score(y_test, pred_prob)
print(f"Least squares SGD- accuracy on test data: {accuracy!s}")

Best hyperparameters: hyperparameter 1 = 0.0004641588833612782 hyperparameter 2 = 0 accuracy: 0.68948
Least squares SGD- accuracy on test data: 0.69528


In [70]:
# Least squares batch GD: tune gamma and batch size together, then score on the held-out split.

# Five log-spaced gamma candidates from 1e-5 to 1, combined with three batch sizes.
gammas = np.logspace(-5, 0, 5)
batch_sizes = [10, 100, 1000]
# The batch sizes go in as the second hyperparameter grid; the second returned value is discarded.
gamma_batch_gd, _, w_batch_gd = tune_hyperparameter(
    y_train,
    X_train,
    seed=seed,
    max_iters=max_iters,
    k_fold=k_fold,
    hyperparameters=gammas,
    hyperparameters_2=batch_sizes,
    regression_function=least_squares_batch_GD,
)
pred_prob = predict_labels(X_test, w_batch_gd)
accuracy = accuracy_score(y_test, pred_prob)
print(f"Least squares batch GD- accuracy on test data: {accuracy!s}")

Best hyperparameters: hyperparameter 1 = 0.0031622776601683794 hyperparameter 2 = 100 accuracy: 0.71581
Least squares batch GD- accuracy on test data: 0.72114


In [71]:
# Ridge regression: tune lambda with tune_hyperparameter, then score on the held-out split.

# Ten log-spaced lambda candidates from 1e-10 to 1.
lambdas = np.logspace(-10, 0, 10)
lambda_ridge_r, _, w_ridge_r = tune_hyperparameter(
    y_train,
    X_train,
    seed=seed,
    max_iters=max_iters,
    k_fold=k_fold,
    hyperparameters=lambdas,
    regression_function=ridge_regression,
)
pred_prob = predict_labels(X_test, w_ridge_r)
accuracy = accuracy_score(y_test, pred_prob)
print(f"Ridge regression- accuracy on test data: {accuracy!s}")

Best hyperparameters: hyperparameter 1 = 2.782559402207126e-06 hyperparameter 2 = 0 accuracy: 0.80363
Ridge regression- accuracy on test data: 0.80606


In [72]:
# New training and test data for the logistic regression cells: the targets are
# `y == 1`, i.e. a boolean array that is True where the original label equals 1.
#
# NOTE: this rebinds X_train, y_train, X_test and y_test. Re-running any of the
# least-squares or ridge cells after this one would use these targets, so
# re-run the first split cell before going back to them.
#
# The split is seeded from `seed` (defined with the other fixed settings)
# rather than a second hard-coded 23, so a single value controls both splits.
X_train, y_train, X_test, y_test = train_test_split(X_processed, y == 1, test_size=0.2, random_state=seed)

In [73]:
# Logistic regression: tune gamma with tune_hyperparameter, then score on the held-out split.

# Ten log-spaced gamma candidates from 0.1 to 1.
gammas = np.logspace(-1, 0, 10)
# NOTE(review): the first returned value is bound to `lambda_log_reg`, but the
# grid searched here is `gammas`; presumably it holds the best gamma - verify
# against tune_hyperparameter.
lambda_log_reg, _, w_log_reg = tune_hyperparameter(
    y_train,
    X_train,
    seed=seed,
    max_iters=max_iters,
    k_fold=k_fold,
    hyperparameters=gammas,
    regression_function=logistic_regression,
)
# Threshold the predicted probabilities at 0.5, giving a boolean prediction array.
pred_prob = predict_probabilities(X_test, w_log_reg) > 0.5
accuracy = accuracy_score(y_test, pred_prob)
print(f"Logistic regression- accuracy on test data: {accuracy!s}")

Best hyperparameters: hyperparameter 1 = 0.16681005372000587 hyperparameter 2 = 0 accuracy: 0.809765
Logistic regression- accuracy on test data: 0.81222


In [74]:
# Regularized logistic regression: tune gamma and lambda together, then score on the held-out split.

# Five log-spaced candidates from 0.1 to 1 for each of gamma and lambda.
# NOTE(review): the recorded run selected hyperparameter 2 = 0.1, the smallest
# lambda candidate, so the best value may lie below the searched range.
gammas = np.logspace(-1, 0, 5)
lambdas = np.logspace(-1, 0, 5)
# NOTE(review): the first returned value is bound to `lambda_reg_log_r`, but
# the first grid passed is `gammas`; presumably it holds the best gamma, and
# the second returned value is discarded - verify against tune_hyperparameter.
lambda_reg_log_r, _, w_reg_log_r = tune_hyperparameter(
    y_train,
    X_train,
    seed=seed,
    max_iters=max_iters,
    k_fold=k_fold,
    hyperparameters=gammas,
    hyperparameters_2=lambdas,
    regression_function=reg_logistic_regression,
)
# Threshold the predicted probabilities at 0.5, giving a boolean prediction array.
pred_prob = predict_probabilities(X_test, w_reg_log_r) > 0.5
accuracy = accuracy_score(y_test, pred_prob)
print(f"Regularized logistic regression- accuracy on test data: {accuracy!s}")


Best hyperparameters: hyperparameter 1 = 0.1778279410038923 hyperparameter 2 = 0.1 accuracy: 0.80461
Regularized logistic regression- accuracy on test data: 0.80814


Experiments were also run with a higher `max_iters` value, but since the results did not change,
we kept `max_iters = 20` for faster reproducibility. The best method is hence logistic regression, which achieved the highest test accuracy (0.81222).