In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the data (both train and test data) into feature matrix, class labels, and event ids:


In [2]:
from proj1_helpers import *

In [3]:
# Path to the training CSV, relative to the notebook's working directory.
DATA_TRAIN_PATH = '../data/train.csv'
# load_csv_data is presumably provided by the proj1_helpers star import; per the section
# heading it yields class labels, the feature matrix and the event ids.
y, train_input_data, ids = load_csv_data(DATA_TRAIN_PATH)

In [4]:
# Path to the test CSV, relative to the notebook's working directory.
DATA_TEST_PATH = '../data/test.csv' 
# NOTE(review): y_test is later used to compute a "test accuracy"; confirm that the test
# file actually carries real labels and not placeholders.
y_test, test_input_data, ids_test = load_csv_data(DATA_TEST_PATH)

In [5]:
# Sanity check: the training feature matrix and label vector must have the same number of rows.
train_input_data.shape, y.shape

((250000, 30), (250000,))

In [6]:
# Sanity check: the test feature matrix and y_test must have the same number of rows.
test_input_data.shape, y_test.shape

((568238, 30), (568238,))

## Preprocess the data

In [7]:
from preprocessing import *

In [8]:
# Preprocess the raw training features. preprocess_data presumably comes from the
# `preprocessing` star import; its exact steps are not visible in this notebook.
tX = preprocess_data(train_input_data)

In [9]:
# Preprocess the raw test features with the same function that is used for the training set.
# NOTE(review): if preprocess_data derives statistics (e.g. mean/std) from its input, the
# test set is transformed with its own statistics instead of the training ones — confirm.
tX_test = preprocess_data(test_input_data)

## Do your crazy machine learning thing here :)

In [10]:
from implementations import *
from cross_validation import *
import time

For each model, we use 4-fold cross-validation to find the optimal hyper-parameters. Then we run the model with those parameters and produce the submission.

In [11]:
# Number of folds passed to the select_best_hyperparams_* grid searches.
k_fold = 4

Then we use 5-fold cross-validation to estimate the variance of the accuracy and the loss.

In [12]:
# Number of folds passed to accuracy_variance when estimating the spread of RMSE and accuracy.
k_fold_acc = 5

In [13]:
# Default random seed. Each model section below assigns its own seed before running,
# so the models do not all share the same seed.
seed = 1

### Ridge Regression

In [48]:
# Seed handed to the ridge-regression cross-validation helpers in this section.
seed = 6

In [40]:
# Grid search for ridge regression: polynomial degrees 1..11 crossed with 30 lambdas
# log-spaced between 1e-4 and 1e-1, evaluated with k_fold-fold cross-validation.
degrees = np.arange(1, 12)
lambdas = np.logspace(-4, -1, 30)

# Time the search; the recorded run took roughly 27 minutes.
start_time = time.time()
(degree_RR,
 lambda_RR,
 rmse_RR,
 best_per_degree) = select_best_hyperparams_ridge_regression(
    y, tX, k_fold, degrees, lambdas, seed
)
end_time = time.time()

print(f"The execution time is: {end_time-start_time}")

The execution time is: 1623.5960257053375


In [41]:
# Degree, lambda and RMSE selected by the ridge-regression grid search.
degree_RR, lambda_RR, rmse_RR

(9, 0.00041753189365604, 0.7402028046388752)

In [42]:
# Per-degree summary returned by the grid search; judging from the recorded output it is
# [degrees, lambda kept for each degree, RMSE for each degree].
best_per_degree

[array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]),
 [0.0001,
  0.0001,
  0.0001,
  0.0001,
  0.00016102620275609394,
  0.00032903445623126676,
  0.00221221629107045,
  0.0017433288221999873,
  0.00041753189365604,
  0.03039195382313198,
  0.04893900918477494],
 [0.8259859221699424,
  0.7845683560286713,
  0.7741277820976487,
  0.765394011611308,
  0.7575835736840003,
  0.7528053028955657,
  0.7466392103356925,
  0.7435133308421813,
  0.7402028046388752,
  0.7456305727496825,
  0.756661484831989]]

In [43]:
# First entry of the summary: the polynomial degrees that were searched.
best_per_degree[0]

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [44]:
# Second entry: the lambda recorded for each degree (presumably the best one per degree).
best_per_degree[1]

[0.0001,
 0.0001,
 0.0001,
 0.0001,
 0.00016102620275609394,
 0.00032903445623126676,
 0.00221221629107045,
 0.0017433288221999873,
 0.00041753189365604,
 0.03039195382313198,
 0.04893900918477494]

In [45]:
# Third entry: the RMSE recorded for each degree; its minimum equals rmse_RR in the recorded run.
best_per_degree[2]

[0.8259859221699424,
 0.7845683560286713,
 0.7741277820976487,
 0.765394011611308,
 0.7575835736840003,
 0.7528053028955657,
 0.7466392103356925,
 0.7435133308421813,
 0.7402028046388752,
 0.7456305727496825,
 0.756661484831989]

In [24]:
# Hyper-parameters used for the final ridge-regression fit below.
degree_RR, lambda_RR, rmse_RR

(9, 0.00041753189365604, 0.7402028046388752)

In [25]:
# build polynomials
# Train and test features are expanded with the same degree so the fitted weights apply to both.
# Note: these two names are reassigned by the logistic-regression sections further down.
extended_feature_matrix_train = build_poly(tX, degree_RR)
extended_feature_matrix_test = build_poly(tX_test, degree_RR)

In [26]:
# Fit ridge regression on the full (polynomially expanded) training set with the selected lambda.
weights_RR, loss_RR = ridge_regression(y, extended_feature_matrix_train, lambda_RR)

In [27]:
# Loss returned by ridge_regression for the fit on the full training set.
loss_RR

0.2717943911777913

In [28]:
# train accuracy of the ridge-regression weights on the full training set
accuracy(y, extended_feature_matrix_train, weights_RR)

0.818672

In [29]:
# test accuracy
# NOTE(review): the recorded value (0.305) is far below the train accuracy (0.819), which
# suggests y_test may not contain real labels — confirm before relying on this number.
accuracy(y_test, extended_feature_matrix_test, weights_RR)

0.3050183197885393

In [None]:
# cross-validation for accuracy
# Evaluate with the degree selected for ridge regression (degree_RR) so the variance estimate
# describes the same model that was fitted above. The two zeros fill the positions that the
# logistic-regression calls use for max_iters and gamma, which do not apply here.
rmse_tr_RR, rmse_te_RR, acc_RR = accuracy_variance(y, tX, k_fold_acc, degree_RR, 'ridge_regression', lambda_RR, 0, 0, seed)
# Advance the seed for the next evaluation. Re-running this cell advances it again, so
# re-execute the `seed = ...` cell of this section first for reproducible results.
seed += 1

In [None]:
# Box plots of the cross-validated RMSE (train vs. test folds) and accuracy for ridge regression.
# The Axes handles are kept (not overwritten by boxplot's return value) so they can be labelled.
fig, (ax_rmse, ax_acc) = plt.subplots(1, 2, figsize=(10, 5))
ax_rmse.boxplot([rmse_tr_RR, rmse_te_RR], labels=['train', 'test'])
ax_rmse.set_title('Ridge regression: RMSE')
ax_rmse.set_ylabel('RMSE')
ax_acc.boxplot(acc_RR)
ax_acc.set_title('Ridge regression: accuracy')
ax_acc.set_ylabel('accuracy')
fig.tight_layout()

### Logistic regression using gradient descent

In [None]:
# Seed handed to the logistic-regression cross-validation helpers in this section.
seed = 8

In [None]:
# cross-validation
# Grid search for logistic regression: polynomial degrees 1..11 crossed with 30 step sizes
# (gammas) log-spaced between 1e-4 and 1e-1, 300 iterations per fit.
# NOTE(review): the search receives y in its original encoding while the final fit below uses
# y_01 — confirm that the helper converts the labels internally.
degrees = np.arange(1, 12)
gammas = np.logspace(-4, -1, 30)

start_time = time.time()
(degree_LOG,
 max_iters_LOG,
 gamma_LOG,
 rmse_LOG) = select_best_hyperparams_logistic_regression(
    y, tX, k_fold, degrees, 300, gammas, seed
)
seed += 1
end_time = time.time()

print(f"The execution time is: {end_time-start_time}")

In [None]:
# Hyper-parameters and RMSE returned by the logistic-regression grid search.
degree_LOG, max_iters_LOG, gamma_LOG, rmse_LOG

In [None]:
# build polynomials
# Reassigns the shared extended_feature_matrix_* names with the degree selected for
# logistic regression; the ridge-regression matrices are no longer available after this cell.
extended_feature_matrix_train = build_poly(tX, degree_LOG)
extended_feature_matrix_test = build_poly(tX_test, degree_LOG)

In [None]:
# We need to transform the categories from {-1,1} to {0,1}:
# the logistic-regression fit below is trained on y_01 rather than y.
y_01 = change_11_to_01_categories(y)

In [None]:
# Start gradient descent from the all-zero weight vector, one weight per expanded feature column.
initial_w = np.zeros(extended_feature_matrix_train.shape[1])
weights_LOG, loss_LOG = logistic_regression(
    y_01,
    extended_feature_matrix_train,
    initial_w,
    max_iters_LOG,
    gamma_LOG,
)

In [None]:
# Loss returned by logistic_regression for the fit on the full training set.
loss_LOG

In [None]:
# train accuracy of the logistic-regression weights, scored against the original labels y
accuracy(y, extended_feature_matrix_train, weights_LOG)

In [None]:
# test accuracy
# NOTE(review): only meaningful if y_test holds real labels — confirm.
accuracy(y_test, extended_feature_matrix_test, weights_LOG)

In [None]:
# cross-validation for accuracy
# Evaluate with the degree selected for logistic regression (degree_LOG) so the variance
# estimate describes the same model that was fitted above. The 0 fills the lambda position,
# which does not apply to unregularised logistic regression.
rmse_tr_LOG, rmse_te_LOG, acc_LOG = accuracy_variance(y, tX, k_fold_acc, degree_LOG, 'logistic_regression', 0, max_iters_LOG, gamma_LOG, seed)
# Advance the seed for the next evaluation. Re-running this cell advances it again, so
# re-execute the `seed = ...` cell of this section first for reproducible results.
seed += 1

In [None]:
# Box plots of the cross-validated RMSE (train vs. test folds) and accuracy for logistic regression.
# The Axes handles are kept (not overwritten by boxplot's return value) so they can be labelled.
fig, (ax_rmse, ax_acc) = plt.subplots(1, 2, figsize=(10, 5))
ax_rmse.boxplot([rmse_tr_LOG, rmse_te_LOG], labels=['train', 'test'])
ax_rmse.set_title('Logistic regression: RMSE')
ax_rmse.set_ylabel('RMSE')
ax_acc.boxplot(acc_LOG)
ax_acc.set_title('Logistic regression: accuracy')
ax_acc.set_ylabel('accuracy')
fig.tight_layout()

### Regularized logistic regression using gradient descent

In [None]:
# Seed handed to the regularized-logistic-regression cross-validation helpers in this section.
seed = 10

In [None]:
# cross-validation
# Grid search for regularized logistic regression: polynomial degrees 1..11 crossed with
# 30 lambdas and 30 gammas, each log-spaced between 1e-4 and 1e-1 (9900 combinations per fold,
# so expect a long run).
start_time = time.time()

degrees = np.arange(1, 12)
lambdas = np.logspace(-4, -1, 30)
gammas = np.logspace(-4, -1, 30)
# The search returns only degree, lambda, gamma and RMSE — not the iteration count — so the
# count is kept in max_iters_RLOG, which the training and accuracy_variance cells below read.
max_iters_RLOG = 300

degree_RLOG, lambda_RLOG, gamma_RLOG, rmse_RLOG = select_best_hyperparams_reg_logistic_regression(y, tX, k_fold, degrees, lambdas, max_iters_RLOG, gammas, seed)
seed += 1

end_time = time.time()
print(f"The execution time is: {end_time-start_time}")

In [None]:
# Hyper-parameters and RMSE returned by the regularized-logistic-regression grid search.
degree_RLOG, lambda_RLOG, gamma_RLOG, rmse_RLOG

In [None]:
# build polynomials
# Reassigns the shared extended_feature_matrix_* names with the degree selected for
# regularized logistic regression; matrices built by earlier sections are replaced.
extended_feature_matrix_train = build_poly(tX, degree_RLOG)
extended_feature_matrix_test = build_poly(tX_test, degree_RLOG)

In [None]:
# We need to transform the categories from {-1,1} to {0,1}:
# the regularized logistic-regression fit below is trained on y_01 rather than y.
y_01 = change_11_to_01_categories(y)

In [None]:
# Start gradient descent from the all-zero weight vector, one weight per expanded feature column.
initial_w = np.zeros(extended_feature_matrix_train.shape[1])
weights_RLOG, loss_RLOG = reg_logistic_regression(
    y_01,
    extended_feature_matrix_train,
    lambda_RLOG,
    initial_w,
    max_iters_RLOG,
    gamma_RLOG,
)

In [None]:
# Loss returned by reg_logistic_regression for the fit on the full training set.
loss_RLOG

In [None]:
# train accuracy of the regularized logistic-regression weights, scored against the original labels y
accuracy(y, extended_feature_matrix_train, weights_RLOG)

In [None]:
# test accuracy
# NOTE(review): only meaningful if y_test holds real labels — confirm.
accuracy(y_test, extended_feature_matrix_test, weights_RLOG)

In [None]:
# cross-validation for accuracy
# Evaluate with the degree selected for regularized logistic regression (degree_RLOG) so the
# variance estimate describes the same model that was fitted above.
# NOTE(review): the method string is 'logistic_regression' although a lambda is supplied; this
# looks like a copy-paste leftover. Confirm which identifier accuracy_variance expects for the
# regularized variant and update the string if needed.
rmse_tr_RLOG, rmse_te_RLOG, acc_RLOG = accuracy_variance(y, tX, k_fold_acc, degree_RLOG, 'logistic_regression', lambda_RLOG, max_iters_RLOG, gamma_RLOG, seed)
# Advance the seed for the next evaluation. Re-running this cell advances it again, so
# re-execute the `seed = ...` cell of this section first for reproducible results.
seed += 1

In [None]:
# Box plots of the cross-validated RMSE (train vs. test folds) and accuracy for regularized
# logistic regression. The Axes handles are kept (not overwritten by boxplot's return value)
# so they can be labelled.
fig, (ax_rmse, ax_acc) = plt.subplots(1, 2, figsize=(10, 5))
ax_rmse.boxplot([rmse_tr_RLOG, rmse_te_RLOG], labels=['train', 'test'])
ax_rmse.set_title('Regularized logistic regression: RMSE')
ax_rmse.set_ylabel('RMSE')
ax_acc.boxplot(acc_RLOG)
ax_acc.set_title('Regularized logistic regression: accuracy')
ax_acc.set_ylabel('accuracy')
fig.tight_layout()

## Compare RMSE and accuracies

In [None]:
# Gather the accuracy_variance results of every model for the comparison plots below.
# NOTE(review): the *_GD, *_SGD and *_LS variables are not assigned in any cell visible in this
# notebook; unless they are produced elsewhere, this cell raises NameError on a fresh kernel.
rmse_tr = [rmse_tr_GD, rmse_tr_SGD, rmse_tr_LS, rmse_tr_RR, rmse_tr_LOG, rmse_tr_RLOG]
rmse_te = [rmse_te_GD, rmse_te_SGD, rmse_te_LS, rmse_te_RR, rmse_te_LOG, rmse_te_RLOG]
acc = [acc_GD, acc_SGD, acc_LS, acc_RR, acc_LOG, acc_RLOG]

In [None]:
# Box-plot tick labels, listed in the same model order as rmse_tr / rmse_te / acc.
names = ['GD', 'SGD', 'LS', 'RidReg', 'LogReg', 'RegLogReg']

In [None]:
# Train RMSE of every model, one box per model. An explicit Axes is used so the figure can
# carry a title and axis label (boxplot itself returns a dict of artists, not an Axes).
fig, ax = plt.subplots()
ax.boxplot(rmse_tr, labels=names)
ax.set_title('Train RMSE per model')
ax.set_ylabel('RMSE')

In [None]:
# Test RMSE of every model, one box per model. An explicit Axes is used so the figure can
# carry a title and axis label (boxplot itself returns a dict of artists, not an Axes).
fig, ax = plt.subplots()
ax.boxplot(rmse_te, labels=names)
ax.set_title('Test RMSE per model')
ax.set_ylabel('RMSE')

In [None]:
# Accuracy of every model, one box per model. An explicit Axes is used so the figure can
# carry a title and axis label (boxplot itself returns a dict of artists, not an Axes).
fig, ax = plt.subplots()
ax.boxplot(acc, labels=names)
ax.set_title('Accuracy per model')
ax.set_ylabel('accuracy')

In [None]:
# Side-by-side comparison of all models: train RMSE, test RMSE and accuracy.
# The Axes handles are kept (not overwritten by boxplot's return value) so each panel is titled.
fig, (ax_tr, ax_te, ax_acc) = plt.subplots(1, 3, figsize=(10, 5))
ax_tr.boxplot(rmse_tr, labels=names)
ax_tr.set_title('Train RMSE')
ax_te.boxplot(rmse_te, labels=names)
ax_te.set_title('Test RMSE')
ax_acc.boxplot(acc, labels=names)
ax_acc.set_title('Accuracy')
fig.tight_layout()

## Generate predictions and save the output in CSV format for submission:

In [None]:
OUTPUT_PATH = '../data/submission.csv'
# Pick the model to submit explicitly: the weights and the polynomial degree must come from the
# same model. The test features are rebuilt here because extended_feature_matrix_test is
# reassigned by every model section, so its current content depends on which cell ran last.
# Ridge regression is used; swap in weights_LOG/degree_LOG or weights_RLOG/degree_RLOG to
# submit one of the other models.
submission_weights, submission_degree = weights_RR, degree_RR
y_pred = predict_labels(submission_weights, build_poly(tX_test, submission_degree))
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)