# Project 1 - Team BAK

## Step 1 - Load and Preprocess Data

In [None]:
#Import some libraries
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import datetime
%load_ext autoreload
%autoreload 2

from pre_process import *
from cross_validation import *
from implementations import *

In [None]:
# Load the raw training and test sets from CSV, then pre-process them together.
# `x_test` is first bound to the raw test features and then rebound to the
# pre-processed version returned by `pre_process_data`.
y, x, ids = load_csv_data('train.csv')
# The first value returned for the test file is discarded
# (presumably placeholder labels - TODO confirm against `load_csv_data`).
_, x_test, ids_test = load_csv_data('test.csv')
x_train, x_test = pre_process_data(x, x_test)

In [None]:
# Sanity check: shapes of the labels, features and ids for both data sets.
tuple(arr.shape for arr in (y, x_train, ids, x_test, ids_test))

### HYPERPARAMETER SELECTION

In [None]:
# Candidate parameters explored by the selection cells below
alphas_candidates = [5, 7, 9]
degrees_candidates = [3, 6]

# Constants shared by the cells below
SEED = 7         # forwarded to select_parameters_jet and build_k_indices
K = 3            # number of folds (also the bound of the cross-validation loop)
JET_COLUMN = 16  # feature column compared against 0..3 to split samples per jet class

# Methods that can be selected; each selection cell picks one by index.
methods = [
    mean_squared_error_gd,
    mean_squared_error_sgd,
    least_squares,
    ridge_regression,
    logistic_regression_gradient_descent,
    logistic_regression_regularized_gradient_descent,
]

# One [opt_alpha, opt_degree, opt_lambda, accuracy] entry is appended per selection cell.
params_per_method = []

In [None]:
# MSE gradient descent: parameter selection per jet class.
# No lambda candidates are supplied for this method.
lambdas_candidates = None
method = methods[0]

opt_degree, opt_lambda, opt_alpha, accuracy = select_parameters_jet(
    y, x_train, method,
    degrees_candidates, alphas_candidates,
    K, SEED, lambdas_candidates,
)

# Record the selection for this method before reporting it.
params_per_method.append([opt_alpha, opt_degree, opt_lambda, accuracy])

print('Optimal alphas per jet_class:', opt_alpha)
print('Optimal degrees per jet_class:', opt_degree)
print('Optimal lambdas per jet_class:', opt_lambda)
print('Maximum accuracy predicted per jet_class:', accuracy)

In [None]:
# MSE stochastic gradient descent: parameter selection per jet class.
# No lambda candidates are supplied for this method.
lambdas_candidates = None
method = methods[1]

opt_degree, opt_lambda, opt_alpha, accuracy = select_parameters_jet(
    y, x_train, method,
    degrees_candidates, alphas_candidates,
    K, SEED, lambdas_candidates,
)

# Record the selection for this method before reporting it.
params_per_method.append([opt_alpha, opt_degree, opt_lambda, accuracy])

print('Optimal alphas per jet_class:', opt_alpha)
print('Optimal degrees per jet_class:', opt_degree)
print('Optimal lambdas per jet_class:', opt_lambda)
print('Maximum accuracy predicted per jet_class:', accuracy)

In [None]:
# Least squares: parameter selection per jet class.
# No lambda candidates are supplied for this method.
lambdas_candidates = None
method = methods[2]

opt_degree, opt_lambda, opt_alpha, accuracy = select_parameters_jet(
    y, x_train, method,
    degrees_candidates, alphas_candidates,
    K, SEED, lambdas_candidates,
)

# Record the selection for this method before reporting it.
params_per_method.append([opt_alpha, opt_degree, opt_lambda, accuracy])

print('Optimal alphas per jet_class:', opt_alpha)
print('Optimal degrees per jet_class:', opt_degree)
print('Optimal lambdas per jet_class:', opt_lambda)
print('Maximum accuracy predicted per jet_class:', accuracy)

In [None]:
# Ridge regression: parameter selection per jet class, this time with a set
# of lambda candidates to choose from.
lambdas_candidates = [1e-05, 1e-06, 1e-07]
method = methods[3]

opt_degree, opt_lambda, opt_alpha, accuracy = select_parameters_jet(
    y, x_train, method,
    degrees_candidates, alphas_candidates,
    K, SEED, lambdas_candidates,
)

# Record the selection for this method before reporting it.
params_per_method.append([opt_alpha, opt_degree, opt_lambda, accuracy])

print('Optimal alphas per jet_class:', opt_alpha)
print('Optimal degrees per jet_class:', opt_degree)
print('Optimal lambdas per jet_class:', opt_lambda)
print('Maximum accuracy predicted per jet_class:', accuracy)

In [None]:
# Logistic regression: parameter selection per jet class.
# Labels are remapped so that entries equal to 1 stay 1 and everything else becomes 0.
y_log = np.where(y == 1, 1, 0)

# NOTE: this rebinds the notebook-wide `degrees_candidates`; re-running an
# earlier selection cell afterwards will see [1] instead of the initial list.
degrees_candidates = [1]
lambdas_candidates = None
method = methods[4]

opt_degree, opt_lambda, opt_alpha, accuracy = select_parameters_jet(
    y_log, x_train, method,
    degrees_candidates, alphas_candidates,
    K, SEED, lambdas_candidates,
    log=True,
)

# Record the selection for this method before reporting it.
params_per_method.append([opt_alpha, opt_degree, opt_lambda, accuracy])

print('Optimal alphas per jet_class:', opt_alpha)
print('Optimal degrees per jet_class:', opt_degree)
print('Optimal lambdas per jet_class:', opt_lambda)
print('Maximum accuracy predicted per jet_class:', accuracy)

In [None]:
# REGULARIZED LOGISTIC REGRESSION
# Parameter selection per jet class, with lambda candidates for the penalty.
method = methods[5]
lambdas_candidates = [1e-05, 1e-07]
# NOTE: this rebinds the notebook-wide `degrees_candidates`; re-running an
# earlier selection cell afterwards will see [1] instead of the initial list.
degrees_candidates = [1]
# Labels remapped so that entries equal to 1 stay 1 and everything else becomes 0.
y_log = np.where(y == 1, 1, 0)

# The remapped labels `y_log` are passed here (not the raw `y`), matching the
# plain logistic-regression cell, which is also run with log=True.
opt_degree, opt_lambda, opt_alpha, accuracy = select_parameters_jet(
    y_log, x_train, method, degrees_candidates,
    alphas_candidates, K, SEED, lambdas_candidates, log=True,
)
print('Optimal alphas per jet_class:', opt_alpha)
print('Optimal degrees per jet_class:', opt_degree)
print('Optimal lambdas per jet_class:', opt_lambda)
print('Maximum accuracy predicted per jet_class:', accuracy)
params_per_method.append([opt_alpha, opt_degree, opt_lambda, accuracy])

In [None]:
# Cross-validate the most recently selected `method` with its optimal parameters.

# Split data in k-fold (K folds; SEED is forwarded to the splitter).
k_indices = build_k_indices(y, K, SEED)

# Only the two regularised methods keep their selected lambda. A membership
# test is required here: joining the two inequalities with `or` is true for
# every method, which would always discard the selected lambda.
if method not in (ridge_regression, logistic_regression_regularized_gradient_descent):
    opt_lambda = None

# NOTE(review): the raw labels `y` are used here even when `method` is one of
# the logistic variants, whose selection cells work with `y_log` - confirm
# what `cross_validation_result` expects.
accs_train = []
accs_test = []

# One train/test accuracy pair per held-out fold.
for k in range(K):
    acc_train, acc_test = cross_validation_result(y, x_train, method, k_indices, k, opt_degree, opt_alpha, opt_lambda)
    accs_train.append(acc_train)
    accs_test.append(acc_test)

for i in range(len(accs_train)):
    print("Iter %d: Training accuracy: %f / Test accuracy : %f" % (i, accs_train[i], accs_test[i]))

print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))

In [None]:
# Test-set predictions, filled in one jet class at a time.
method_pred = np.zeros(x_test.shape[0])

# Boolean row masks, keyed by jet class (0..3), selecting the samples whose
# JET_COLUMN value equals that class.
jet_train_class = {jet: x_train[:, JET_COLUMN] == jet for jet in range(4)}
jet_test_class = {jet: x_test[:, JET_COLUMN] == jet for jet in range(4)}

for i in range(4):
    # Restrict labels and features to the current jet class.
    # NOTE(review): `y_log` is used whatever `method` is selected - confirm
    # this is intended for the non-logistic methods.
    y_jet = y_log[jet_train_class[i]]
    x_jet = x_train[jet_train_class[i]]
    x_jet_test = x_test[jet_test_class[i]]

    # Transform the training and test subsets with this class's selected alpha and degree
    x_jet, x_jet_test = modify_data(x_jet, x_jet_test, opt_alpha[i], opt_degree[i])

    # Train the currently selected method on this class
    # NOTE(review): called with labels and features only - confirm that every
    # entry of `methods` accepts this two-argument call.
    best_w, _ = method(y_jet, x_jet)

    # Predict on the matching test rows and store them at their original positions
    pred = get_predictions(best_w, x_jet_test)
    method_pred[jet_test_class[i]] = pred

method_pred

In [None]:
def savePredictions(pred, title="submission"):
    """Write the test ids and their predictions to ``<title>.csv``.

    The notebook-level ``ids_test`` is paired column-wise with ``pred``, the
    result is converted to strings, an ``Id,Prediction`` header row is put in
    front, and everything is saved comma-separated.

    Args:
        pred: predictions, one per entry of ``ids_test``.
        title: output file name without the ``.csv`` extension.

    NOTE(review): ids and predictions are combined into a single array before
    being converted to strings, so they share one dtype (float predictions
    would make the ids print as floats too) - confirm this matches the
    expected submission format.
    """
    header = ["Id", "Prediction"]
    body = np.c_[ids_test, pred].astype(str)
    np.savetxt(title + ".csv", np.insert(body, 0, header, axis=0), fmt="%s", delimiter=",")

# Save the predictions computed for the selected method; `method_pred` is the
# array filled by the per-jet prediction cell.
savePredictions(method_pred)