In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the data (both train and test data) into feature matrix, class labels, and event ids:


In [2]:
from proj1_helpers import *

In [3]:
# Location of the training CSV, relative to the notebook's working directory.
DATA_TRAIN_PATH = '../data/train.csv'
# load_csv_data returns three values, unpacked here as class labels, raw feature matrix and event ids.
y, train_input_data, ids = load_csv_data(DATA_TRAIN_PATH)

In [4]:
# Location of the test CSV, relative to the notebook's working directory.
DATA_TEST_PATH = '../data/test.csv' 
# Same loader as for the training file; ids_test is what the submission cell writes out.
# NOTE(review): if test.csv carries no ground-truth labels, y_test only holds placeholder
# values and the "test accuracy" cells further down are not meaningful — confirm.
y_test, test_input_data, ids_test = load_csv_data(DATA_TEST_PATH)

In [5]:
train_input_data.shape, y.shape

((250000, 30), (250000,))

In [6]:
test_input_data.shape, y_test.shape

((568238, 30), (568238,))

## Preprocess the data

In [7]:
from preprocessing import *

In [8]:
# Preprocessed training features; tX is the matrix every model below is fitted on.
tX = preprocess_data(train_input_data)

In [9]:
# The test features go through the same preprocess_data call, independently of the training set.
# NOTE(review): if preprocess_data derives statistics (e.g. mean/std) from its input, train and
# test end up transformed with different statistics — confirm against its implementation.
tX_test = preprocess_data(test_input_data)

## Do your crazy machine learning thing here :)

In [10]:
from implementations import *
from cross_validation import *
import time

For each model, we use 4-fold cross-validation to find the optimal hyper-parameters. We then train the model with those hyper-parameters and produce the submissions.

In [11]:
k_fold = 4

Then we use 5-fold cross-validation to estimate the variance of the accuracy and the loss.

In [12]:
k_fold_acc = 5

In [13]:
# Seed handed to the cross-validation helpers. Each model section re-assigns it
# (1, 3, 5, 6, 8, 10) and increments it after every seeded call, so no two calls
# in this notebook receive the same seed value.
seed = 1

### Linear Regression Using Gradient Descent

In [14]:
# Section-local seed for gradient descent (same value as the initial assignment above,
# repeated so that this section can be re-run on its own).
seed = 1

In [None]:
# cross-validation
# Hyper-parameter search for least-squares gradient descent using k_fold folds.
# The wall-clock duration of the whole search is measured and printed at the end.
start_time = time.time()

# Search grid: degrees 1..11, 7 iteration budgets and 30 log-spaced gammas in [1e-4, 1e-1]
# (11 * 7 * 30 = 2310 combinations, each evaluated by the helper).
degrees = np.arange(1, 12)
max_iters = [10, 50, 80, 100, 250, 500, 1000]
gammas = np.logspace(-4, -1, 30)

# The helper returns the selected degree, iteration budget and gamma together with an RMSE value
# (presumably the best cross-validated RMSE — confirm in the helper's implementation).
degree_GD, max_iters_GD, gamma_GD, rmse_GD = select_best_hyperparams_least_squares_GD(y, tX, k_fold, degrees, max_iters, gammas, seed)
# Advance the seed so the next seeded call in this section receives a different value.
seed += 1

end_time = time.time()
print(f"The execution time is: {end_time-start_time}")

In [None]:
degree_GD, max_iters_GD, gamma_GD, rmse_GD

In [None]:
# build polynomials
# Expand train and test features with the degree selected for gradient descent.
# The extended_feature_matrix_* names are shared by every model section, so they
# always hold the expansion of whichever section ran last.
extended_feature_matrix_train = build_poly(tX, degree_GD)
extended_feature_matrix_test = build_poly(tX_test, degree_GD)

In [None]:
# Start gradient descent from the all-zero weight vector, one entry per expanded feature column.
initial_w = np.zeros((extended_feature_matrix_train.shape[1]))

# Besides the final weights and loss, the *_complete variant also returns `ws` and `losses`,
# which are plotted below (presumably the per-iteration history — confirm in implementations).
weights_GD, loss_GD, ws, losses = least_squares_GD_complete(y, extended_feature_matrix_train, initial_w, max_iters_GD, gamma_GD)
# display
# ax0/ax1 stay bound to the Axes objects: plot() returns a list of Line2D artists,
# and assigning that back to the Axes name would lose the handle needed for titles.
fig, [ax0, ax1] = plt.subplots(1, 2, figsize=(10,5))
ax0.plot(losses)
ax0.set_title('Least-squares GD: loss')
ax1.plot(ws)
ax1.set_title('Least-squares GD: weights');

In [None]:
loss_GD

In [None]:
# train accuracy
accuracy(y, extended_feature_matrix_train, weights_GD)

In [None]:
# test accuracy
# Scores the GD weights on the expanded test matrix against y_test.
# NOTE(review): only meaningful if y_test holds real labels — confirm what load_csv_data
# returns for the test file.
accuracy(y_test, extended_feature_matrix_test, weights_GD)

In [None]:
# cross-validation for accuracy
# Re-evaluate the selected GD hyper-parameters with k_fold_acc folds; the three results feed
# the box plots below. The literal 0 fills the positional slot where the ridge-regression
# call passes its lambda, which this method does not use.
rmse_tr_GD, rmse_te_GD, acc_GD = accuracy_variance(y, tX, k_fold_acc, degree_GD, 'least_squares_GD', 0, max_iters_GD, gamma_GD, seed)
# Advance the seed after use, as after every seeded call in this notebook.
seed += 1

In [None]:
# Box plots of the values returned by accuracy_variance for gradient descent:
# train vs. test RMSE on the left, accuracy on the right.
fig, [ax1, ax2] = plt.subplots(1, 2, figsize=(10,5))
# boxplot() returns a dict of artists, so its result is deliberately not assigned back to ax1/ax2.
ax1.boxplot([rmse_tr_GD,rmse_te_GD])
ax1.set_xticklabels(['train', 'test'])
ax1.set_title('GD: RMSE')
ax2.boxplot(acc_GD)
ax2.set_title('GD: accuracy');

### Linear Regression using SGD

In [None]:
seed = 3

In [None]:
# cross-validation
# Hyper-parameter search for least-squares stochastic gradient descent using k_fold folds.
# The wall-clock duration of the whole search is measured and printed at the end.
start_time = time.time()

# Search grid: degrees 1..11, 9 iteration budgets (up to 5000) and 30 log-spaced gammas
# in [1e-4, 1e-1], i.e. 11 * 9 * 30 = 2970 combinations.
degrees = np.arange(1, 12)
max_iters = [10, 50, 80, 100, 250, 500, 1000, 2500, 5000]
gammas = np.logspace(-4, -1, 30)

# The helper returns the selected degree, iteration budget and gamma together with an RMSE value.
degree_SGD, max_iters_SGD, gamma_SGD, rmse_SGD = select_best_hyperparams_least_squares_SGD(y, tX, k_fold, degrees, max_iters, gammas, seed)
# Advance the seed so the next seeded call in this section receives a different value.
seed += 1

end_time = time.time()
print(f"The execution time is: {end_time-start_time}")

In [None]:
degree_SGD, max_iters_SGD, gamma_SGD, rmse_SGD

In [None]:
# build polynomials
# Expand train and test features with the degree selected for SGD; this overwrites the
# shared extended_feature_matrix_* names left behind by the previous section.
extended_feature_matrix_train = build_poly(tX, degree_SGD)
extended_feature_matrix_test = build_poly(tX_test, degree_SGD)

In [None]:
# Fix NumPy's global RNG before the stochastic run so re-running this cell is repeatable
# (assuming the SGD implementation draws from np.random — confirm in implementations).
np.random.seed(42)

# Start from the all-zero weight vector, one entry per expanded feature column.
initial_w = np.zeros((extended_feature_matrix_train.shape[1]))

# Besides the final weights and loss, the *_complete variant also returns `ws` and `losses`,
# which are plotted below (presumably the per-iteration history — confirm in implementations).
weights_SGD, loss_SGD, ws, losses = least_squares_SGD_complete(y, extended_feature_matrix_train, initial_w, max_iters_SGD, gamma_SGD)

# display
# The Axes array is left intact: plot() returns a list of Line2D artists, and storing
# that back into axes[i] would overwrite the Axes handles.
fig, axes = plt.subplots(1, 2, figsize=(10,5))
axes[0].plot(losses)
axes[0].set_title('Least-squares SGD: loss')
axes[1].plot(ws)
axes[1].set_title('Least-squares SGD: weights');

In [None]:
loss_SGD

In [None]:
# train accuracy
accuracy(y, extended_feature_matrix_train, weights_SGD)

In [None]:
# test accuracy
accuracy(y_test, extended_feature_matrix_test, weights_SGD)

In [None]:
# cross-validation for accuracy
# Re-evaluate the selected SGD hyper-parameters with k_fold_acc folds; the three results feed
# the box plots below. The literal 0 fills the positional slot where the ridge-regression
# call passes its lambda, which this method does not use.
rmse_tr_SGD, rmse_te_SGD, acc_SGD = accuracy_variance(y, tX, k_fold_acc, degree_SGD, 'least_squares_SGD', 0, max_iters_SGD, gamma_SGD, seed)
# Advance the seed after use, as after every seeded call in this notebook.
seed += 1

In [None]:
# Box plots of the values returned by accuracy_variance for SGD:
# train vs. test RMSE on the left, accuracy on the right.
fig, [ax1, ax2] = plt.subplots(1, 2, figsize=(10,5))
# boxplot() returns a dict of artists, so its result is deliberately not assigned back to ax1/ax2.
ax1.boxplot([rmse_tr_SGD,rmse_te_SGD])
ax1.set_xticklabels(['train', 'test'])
ax1.set_title('SGD: RMSE')
ax2.boxplot(acc_SGD)
ax2.set_title('SGD: accuracy');

### Least Squares regression

In [None]:
seed = 5

In [None]:
# Least squares is fitted directly on the preprocessed features tX: this section has no
# hyper-parameter search and no polynomial expansion.
weights_LS, loss_LS = least_squares(y, tX)

In [None]:
loss_LS

In [None]:
# train accuracy
accuracy(y, tX, weights_LS)

In [None]:
# test accuracy
accuracy(y_test, tX_test, weights_LS)

In [None]:
# cross-validation for accuracy
# Degree is fixed to 1 here (no degree was searched for least squares); the three zeros fill the
# lambda / max_iters / gamma slots that the other models' calls use.
rmse_tr_LS, rmse_te_LS, acc_LS = accuracy_variance(y, tX, k_fold_acc, 1, 'least_squares', 0, 0, 0, seed)
# Advance the seed after use, as after every seeded call in this notebook.
seed += 1

In [None]:
# Box plots of the values returned by accuracy_variance for least squares:
# train vs. test RMSE on the left, accuracy on the right.
fig, [ax1, ax2] = plt.subplots(1, 2, figsize=(10,5))
# boxplot() returns a dict of artists, so its result is deliberately not assigned back to ax1/ax2.
ax1.boxplot([rmse_tr_LS,rmse_te_LS])
ax1.set_xticklabels(['train', 'test'])
ax1.set_title('Least squares: RMSE')
ax2.boxplot(acc_LS)
ax2.set_title('Least squares: accuracy');

### Ridge Regression

In [None]:
seed = 6

In [None]:
# cross-validation
# Hyper-parameter search for ridge regression using k_fold folds, over degrees 1..11 and
# 30 log-spaced lambdas in [1e-4, 1e-1] (11 * 30 = 330 combinations).
start_time = time.time()

degrees = np.arange(1,12)
lambdas = np.logspace(-4, -1, 30)
# The helper returns the selected degree and lambda together with an RMSE value.
degree_RR, lambda_RR, rmse_RR = select_best_hyperparams_ridge_regression(y, tX, k_fold, degrees, lambdas, seed)
# Advance the seed so the next seeded call in this section receives a different value.
seed += 1

end_time = time.time()
# Report the elapsed time like the other cross-validation cells do; without this line
# end_time was computed but the measurement was never shown.
print(f"The execution time is: {end_time-start_time}")

In [None]:
degree_RR, lambda_RR, rmse_RR

In [None]:
# build polynomials
# Expand train and test features with the degree selected for ridge regression; this overwrites
# the shared extended_feature_matrix_* names left behind by the previous section.
extended_feature_matrix_train = build_poly(tX, degree_RR)
extended_feature_matrix_test = build_poly(tX_test, degree_RR)

In [None]:
# Fit ridge regression on the degree_RR expansion with the lambda chosen by cross-validation.
weights_RR, loss_RR = ridge_regression(y, extended_feature_matrix_train, lambda_RR)

In [None]:
loss_RR

In [None]:
# train accuracy
accuracy(y, extended_feature_matrix_train, weights_RR)

In [None]:
# test accuracy
accuracy(y_test, extended_feature_matrix_test, weights_RR)

In [None]:
# cross-validation for accuracy
# Evaluate ridge regression with the degree selected for ridge regression (degree_RR).
# Passing degree_SGD here would score a different polynomial expansion than the one
# lambda_RR was tuned for. The two zeros fill the max_iters / gamma slots ridge does not use.
rmse_tr_RR, rmse_te_RR, acc_RR = accuracy_variance(y, tX, k_fold_acc, degree_RR, 'ridge_regression', lambda_RR, 0, 0, seed)
# Advance the seed after use, as after every seeded call in this notebook.
seed += 1

In [None]:
# Box plots of the values returned by accuracy_variance for ridge regression:
# train vs. test RMSE on the left, accuracy on the right.
fig, [ax1, ax2] = plt.subplots(1, 2, figsize=(10,5))
# boxplot() returns a dict of artists, so its result is deliberately not assigned back to ax1/ax2.
ax1.boxplot([rmse_tr_RR, rmse_te_RR])
ax1.set_xticklabels(['train', 'test'])
ax1.set_title('Ridge regression: RMSE')
ax2.boxplot(acc_RR)
ax2.set_title('Ridge regression: accuracy');

### Logistic regression using gradient descent

In [None]:
seed = 8

In [None]:
# cross-validation
# Hyper-parameter search for logistic regression using k_fold folds.
# The wall-clock duration of the whole search is measured and printed at the end.
start_time = time.time()

# Search grid: degrees 1..11, 8 iteration budgets (up to 2500) and 30 log-spaced gammas
# in [1e-4, 1e-1], i.e. 11 * 8 * 30 = 2640 combinations.
degrees = np.arange(1, 12)
max_iters = [10, 50, 80, 100, 250, 500, 1000, 2500]
gammas = np.logspace(-4, -1, 30)

# The helper receives y with its original labels; the {0,1} remapping in this section happens only
# before the final fit (presumably the helper converts internally — confirm).
degree_LOG, max_iters_LOG, gamma_LOG, rmse_LOG = select_best_hyperparams_logistic_regression(y, tX, k_fold, degrees, max_iters, gammas, seed)
# Advance the seed so the next seeded call in this section receives a different value.
seed += 1

end_time = time.time()
print(f"The execution time is: {end_time-start_time}")

In [None]:
degree_LOG, max_iters_LOG, gamma_LOG, rmse_LOG

In [None]:
# build polynomials
# Expand train and test features with the degree selected for logistic regression; this overwrites
# the shared extended_feature_matrix_* names left behind by the previous section.
extended_feature_matrix_train = build_poly(tX, degree_LOG)
extended_feature_matrix_test = build_poly(tX_test, degree_LOG)

In [None]:
# We need to transform the categories from {-1,1} to {0,1}
# y itself is left untouched; the remapped copy y_01 is what the logistic fit below receives.
y_01 = change_11_to_01_categories(y)

In [None]:
# Start from the all-zero weight vector, one entry per expanded feature column.
initial_w = np.zeros((extended_feature_matrix_train.shape[1]))

# Fit on the {0,1}-encoded labels with the iteration budget and gamma chosen by cross-validation.
weights_LOG, loss_LOG = logistic_regression(y_01, extended_feature_matrix_train, initial_w, max_iters_LOG, gamma_LOG)

In [None]:
loss_LOG

In [None]:
# train accuracy
# NOTE(review): the weights were fitted on y_01 ({0,1}) but accuracy is computed against y
# ({-1,1}) — confirm that accuracy() maps predictions to the {-1,1} encoding.
accuracy(y, extended_feature_matrix_train, weights_LOG)

In [None]:
# test accuracy
accuracy(y_test, extended_feature_matrix_test, weights_LOG)

In [None]:
# cross-validation for accuracy
# Evaluate logistic regression with the degree selected for logistic regression (degree_LOG).
# Passing degree_SGD here would score a different polynomial expansion than the one
# max_iters_LOG / gamma_LOG were tuned for. The literal 0 fills the unused lambda slot.
rmse_tr_LOG, rmse_te_LOG, acc_LOG = accuracy_variance(y, tX, k_fold_acc, degree_LOG, 'logistic_regression', 0, max_iters_LOG, gamma_LOG, seed)
# Advance the seed after use, as after every seeded call in this notebook.
seed += 1

In [None]:
# Box plots of the values returned by accuracy_variance for logistic regression:
# train vs. test loss (the rmse_* values) on the left, accuracy on the right.
fig, [ax1, ax2] = plt.subplots(1, 2, figsize=(10,5))
# boxplot() returns a dict of artists, so its result is deliberately not assigned back to ax1/ax2.
ax1.boxplot([rmse_tr_LOG, rmse_te_LOG])
ax1.set_xticklabels(['train', 'test'])
ax1.set_title('Logistic regression: RMSE')
ax2.boxplot(acc_LOG)
ax2.set_title('Logistic regression: accuracy');

### Regularized logistic regression using gradient descent

In [None]:
seed = 10

In [None]:
# cross-validation
# Hyper-parameter search for regularized logistic regression using k_fold folds.
# The wall-clock duration of the whole search is measured and printed at the end.
start_time = time.time()

# Search grid: degrees 1..11, 30 log-spaced lambdas, 8 iteration budgets and 30 log-spaced gammas,
# i.e. 11 * 30 * 8 * 30 = 79200 combinations — by far the largest grid in this notebook.
degrees = np.arange(1, 12)
lambdas = np.logspace(-4, -1, 30)
max_iters = [10, 50, 80, 100, 250, 500, 1000, 2500]
gammas = np.logspace(-4, -1, 30)

# The helper returns the selected degree, lambda, iteration budget and gamma together with an RMSE value.
degree_RLOG, lambda_RLOG, max_iters_RLOG, gamma_RLOG, rmse_RLOG = select_best_hyperparams_reg_logistic_regression(y, tX, k_fold, degrees, lambdas, max_iters, gammas, seed)
# Advance the seed so the next seeded call in this section receives a different value.
seed += 1

end_time = time.time()
print(f"The execution time is: {end_time-start_time}")

In [None]:
degree_RLOG, lambda_RLOG, max_iters_RLOG, gamma_RLOG, rmse_RLOG

In [None]:
# build polynomials
# Expand train and test features with the degree selected for regularized logistic regression.
# This is the last section to assign the shared extended_feature_matrix_* names, so these are
# the matrices still in scope when the submission cell at the end of the notebook runs.
extended_feature_matrix_train = build_poly(tX, degree_RLOG)
extended_feature_matrix_test = build_poly(tX_test, degree_RLOG)

In [None]:
# We need to transform the categories from {-1,1} to {0,1}
# Recomputed here (same call as in the logistic-regression section) so this section runs on its own.
y_01 = change_11_to_01_categories(y)

In [None]:
# Start from the all-zero weight vector, one entry per expanded feature column.
initial_w = np.zeros((extended_feature_matrix_train.shape[1]))

# Fit on the {0,1}-encoded labels; note that lambda comes before initial_w in this signature,
# unlike logistic_regression which takes no lambda.
weights_RLOG, loss_RLOG = reg_logistic_regression(y_01, extended_feature_matrix_train, lambda_RLOG, initial_w, max_iters_RLOG, gamma_RLOG)

In [None]:
loss_RLOG

In [None]:
# train accuracy
# NOTE(review): the weights were fitted on y_01 ({0,1}) but accuracy is computed against y
# ({-1,1}) — confirm that accuracy() maps predictions to the {-1,1} encoding.
accuracy(y, extended_feature_matrix_train, weights_RLOG)

In [None]:
# test accuracy
accuracy(y_test, extended_feature_matrix_test, weights_RLOG)

In [None]:
# cross-validation for accuracy
# Evaluate with the degree selected for regularized logistic regression (degree_RLOG).
# Passing degree_SGD here would score a different polynomial expansion than the one
# lambda_RLOG / max_iters_RLOG / gamma_RLOG were tuned for.
# NOTE(review): the method name is 'logistic_regression' even though lambda_RLOG is passed.
# If accuracy_variance has a separate regularized mode, its name should probably be used
# here, otherwise these results may duplicate the unregularized model — confirm against
# accuracy_variance's implementation before changing the string.
rmse_tr_RLOG, rmse_te_RLOG, acc_RLOG = accuracy_variance(y, tX, k_fold_acc, degree_RLOG, 'logistic_regression', lambda_RLOG, max_iters_RLOG, gamma_RLOG, seed)
# Advance the seed after use, as after every seeded call in this notebook.
seed += 1

In [None]:
# Box plots of the values returned by accuracy_variance for regularized logistic regression:
# train vs. test loss (the rmse_* values) on the left, accuracy on the right.
fig, [ax1, ax2] = plt.subplots(1, 2, figsize=(10,5))
# boxplot() returns a dict of artists, so its result is deliberately not assigned back to ax1/ax2.
ax1.boxplot([rmse_tr_RLOG, rmse_te_RLOG])
ax1.set_xticklabels(['train', 'test'])
ax1.set_title('Regularized logistic regression: RMSE')
ax2.boxplot(acc_RLOG)
ax2.set_title('Regularized logistic regression: accuracy');

## Compare RMSE and accuracies

In [None]:
# Gather the accuracy_variance results of all six models, always in the same order
# (GD, SGD, LS, RR, LOG, RLOG), so that index i refers to the same model in every list.
# Requires every model section above to have been run.
rmse_tr = [rmse_tr_GD, rmse_tr_SGD, rmse_tr_LS, rmse_tr_RR, rmse_tr_LOG, rmse_tr_RLOG]
rmse_te = [rmse_te_GD, rmse_te_SGD, rmse_te_LS, rmse_te_RR, rmse_te_LOG, rmse_te_RLOG]
acc = [acc_GD, acc_SGD, acc_LS, acc_RR, acc_LOG, acc_RLOG]

In [None]:
# Tick labels for the comparison plots; the order must match the rmse_tr / rmse_te / acc lists.
names = ['GD', 'SGD', 'LS', 'RidReg', 'LogReg', 'RegLogReg']

In [None]:
# Training RMSE of all models side by side.
# Despite its name, `ax` receives the dict of artists that boxplot() returns, not an Axes;
# the assignment mainly keeps that dict out of the cell output.
ax = plt.boxplot(rmse_tr, labels=names)

In [None]:
# Test RMSE of all models side by side.
# Despite its name, `ax` receives the dict of artists that boxplot() returns, not an Axes;
# the assignment mainly keeps that dict out of the cell output.
ax = plt.boxplot(rmse_te, labels=names)

In [None]:
# Accuracy of all models side by side.
# Despite its name, `ax` receives the dict of artists that boxplot() returns, not an Axes;
# the assignment mainly keeps that dict out of the cell output.
ax = plt.boxplot(acc, labels=names)

In [None]:
# The three comparisons in a single figure: train RMSE, test RMSE and accuracy per model.
fig, [ax0, ax1, ax2] = plt.subplots(1, 3, figsize=(10,5))
# Each panel is titled so it can be read on its own. The Axes names are not rebound to
# boxplot()'s return value (a dict of artists), and the tick labels are set on the Axes
# after drawing, one label per box in `names` order.
for panel, values, title in ((ax0, rmse_tr, 'Train RMSE'), (ax1, rmse_te, 'Test RMSE'), (ax2, acc, 'Accuracy')):
    panel.boxplot(values)
    panel.set_xticklabels(names, rotation=45)
    panel.set_title(title)
fig.tight_layout()

## Generate predictions and save output in csv format for submission:

In [None]:
# Destination of the submission file, relative to the notebook's working directory.
OUTPUT_PATH = '../data/submission.csv'
# Predict with the regularized-logistic weights. The name `w` used here before is not defined
# anywhere in this notebook, so the cell failed on a fresh kernel. extended_feature_matrix_test
# was last rebuilt with degree_RLOG, which makes weights_RLOG the weight vector that matches it.
# To submit another model, rebuild the test matrix with that model's degree and pass its weights.
y_pred = predict_labels(weights_RLOG, extended_feature_matrix_test)
# Write the event ids of the test set together with their predicted labels.
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)