In [None]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from implementations import *
from helpers import *
import seaborn as sns
import math
from proj1_helpers import *
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [None]:

# Location of the training CSV, relative to the notebook's working directory.
DATA_TRAIN_PATH = '../data/train.csv'  
# load_csv_data's three return values are unpacked as class labels (y),
# feature matrix (tX) and event ids (ids).
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Do your crazy machine learning thing here :) ...

## Generate predictions and save output in csv format for submission:

In [None]:
DATA_TEST_PATH = '../data/test.csv' # TODO: download the test data and supply its path here
# Same loader as for the training set. In the cells shown, y_test is only passed
# to partition_data and used for its length when assembling the final labels.
y_test, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
# Split the training set into four batches, one partition_data call per value
# 0..3 of the third argument (presumably a category id — confirm in
# partition_data). Each call's three return values are unpacked as
# (index, labels, features) for that batch.
index0, y0, x_train0 = partition_data(tX, y, 0)
index1, y1, x_train1 = partition_data(tX, y, 1)
index2, y2, x_train2 = partition_data(tX, y, 2)
index3, y3, x_train3 = partition_data(tX, y, 3)

# Apply the same partitioning to the test set. The index*_t values are used
# later to write each batch's predictions back into a full-length label array.
index0_t, y0_t, x_test0 = partition_data(tX_test, y_test, 0)
index1_t, y1_t, x_test1 = partition_data(tX_test, y_test, 1)
index2_t, y2_t, x_test2 = partition_data(tX_test, y_test, 2)
index3_t, y3_t, x_test3 = partition_data(tX_test, y_test, 3)

We standardize the 4 batches of data, and add a bias component/feature to each batch.

In [None]:
# Standardize each training batch on its own. standardize_data's three return
# values are unpacked as (standardized features, mean, std) — presumably
# per-feature statistics; confirm in standardize_data.
std_train0, mean_x0, std_x0 = standardize_data(x_train0)
std_train1, mean_x1, std_x1 = standardize_data(x_train1)
std_train2, mean_x2, std_x2 = standardize_data(x_train2)
std_train3, mean_x3, std_x3 = standardize_data(x_train3)

# Prepend a column of ones (bias term) as the first column of every training batch.
std_train0 = np.hstack((np.ones((x_train0.shape[0],1)),std_train0))
std_train1 = np.hstack((np.ones((x_train1.shape[0],1)),std_train1))
std_train2 = np.hstack((np.ones((x_train2.shape[0],1)),std_train2))
std_train3 = np.hstack((np.ones((x_train3.shape[0],1)),std_train3))

# Test batches are scaled with the mean/std of the matching TRAINING batch, so
# train and test features share the same transformation, then get the same bias column.
# NOTE(review): a zero entry in std_x* would make this division produce inf/NaN —
# verify that standardize_data never returns a zero std.
std_test0 = np.hstack((np.ones((x_test0.shape[0],1)),(x_test0-mean_x0)/std_x0))
std_test1 = np.hstack((np.ones((x_test1.shape[0],1)),(x_test1-mean_x1)/std_x1))
std_test2 = np.hstack((np.ones((x_test2.shape[0],1)),(x_test2-mean_x2)/std_x2))
std_test3 = np.hstack((np.ones((x_test3.shape[0],1)),(x_test3-mean_x3)/std_x3))

We try to find any correlations between features of the standardized data, by taking a look at their correlation matrices. 

In [None]:

# Correlation matrix of every standardized training batch.
# Batch 0: the bias column (first) and the last column are removed BEFORE
# computing the correlations.
# Batches 1-3: correlations are computed on the full matrix and only the first
# COLUMN of the result is dropped afterwards.
# NOTE(review): the two slicing schemes differ (input vs. output, columns only
# vs. rows and columns) — confirm this asymmetry is intended.
corr_mat_0 = compute_corr(std_train0[:, 1:-1])
corr_mat_1, corr_mat_2, corr_mat_3 = (
    compute_corr(batch)[:, 1:]
    for batch in (std_train1, std_train2, std_train3)
)

corrs = [corr_mat_0, corr_mat_1, corr_mat_2, corr_mat_3]

# One visualisation per batch, in batch order.
for corr_mat in corrs:
    visualize_corr(corr_mat)

We observe that, among the 4 batches of partitioned data, the first batch contains a feature that is perfectly correlated with another feature!
Furthermore, in the second, third, and fourth batches, some pairs of features have very high correlation coefficients (greater than 0.95).

Here, we search for the combination of lambda and polynomial expansion degree that yields the most accurate ridge-regression model under cross-validation.

In [None]:
# NOTE: disabled scratch cell, kept for reference only. It compared the output
# of expand_features and build_poly on the first test batch, then tried
# forward_selection and cross_validation.
# NOTE(review): the last two calls pair training labels (y0) with test
# features (std_test0) — fix before re-enabling.
# X = expand_features(std_test0,2)
# x_o = build_poly(std_test0,2)
# print(std_test0[0:2])
# print("----------------")
# print("Antoine's Version")
# print(x_o[0:2,0:60])
# print("----------------")
# print("Mine")
# print(X[0:2,])
# forward_selection(y0,std_test0)
# cross_validation(11, y0, expand_features(std_test0,12),model='ridge_reg',lambda_=0.5,logging=True)


In [None]:
def find_best_params(y, tx, K, max_degree=13, lambdas=None, seed=0):
    """Grid-search the polynomial degree and ridge penalty by cross-validation.

    Every degree in ``1..max_degree`` is combined with every value in
    ``lambdas``; each pair is scored with ``cross_validation_2`` using the
    ``'ridge_reg'`` model, and the pair with the highest score is reported.

    Parameters
    ----------
    y : labels, forwarded unchanged to ``cross_validation_2``.
    tx : feature matrix, expanded with ``build_poly_2(tx, degree)``.
    K : forwarded as the first argument of ``cross_validation_2``.
    max_degree : int, largest polynomial degree tried (inclusive).
    lambdas : optional iterable of ridge penalties. Defaults to
        ``np.logspace(-4, -2, 10)`` followed by ``0`` (no regularisation).
    seed : seed forwarded to ``cross_validation_2`` (default 0).

    Returns
    -------
    (best_d_l, acc) where ``best_d_l`` is the winning ``(degree, lambda)``
    tuple and ``acc`` is its cross-validation score. On ties, the first pair
    in iteration order (degree outer, lambda inner) wins.

    Raises
    ------
    ValueError
        If the grid is empty (``max_degree < 1`` or no lambdas).
    """
    degrees = np.arange(1, max_degree + 1)
    if lambdas is None:
        lambdas = np.append(np.logspace(-4, -2, 10), 0)
    else:
        lambdas = np.asarray(lambdas, dtype=float).ravel()

    if len(degrees) == 0 or len(lambdas) == 0:
        raise ValueError("find_best_params needs max_degree >= 1 and at least one lambda")

    candidates = []
    scores = []
    for d in degrees:
        # The expansion depends only on the degree, so build it once per degree
        # instead of once per (degree, lambda) pair.
        # Assumes cross_validation_2 does not modify its inputs in place.
        expanded = build_poly_2(tx, d)
        for l in lambdas:
            candidates.append((d, l))
            scores.append(
                cross_validation_2(K, y, expanded, model='ridge_reg', logging=False, lambda_=l, seed=seed)
            )

    best_d_l, acc = candidates[np.argmax(scores)], np.max(scores)
    print("Best parameters: for  polynomial degree ={}, lambda={} Acuracy:{}".format(best_d_l[0], best_d_l[1], acc))
    return best_d_l, acc
    

In [None]:
# Run the (degree, lambda) grid search on each of the four training batches
# with K=4. Each call prints its best combination; the returned values are not
# stored, so the chosen hyper-parameters have to be copied by hand into the
# fitting cell.
find_best_params(y0,std_train0,4)
find_best_params(y1,std_train1,4)
find_best_params(y2,std_train2,4)
find_best_params(y3,std_train3,4)

In [None]:
# Fit one ridge-regression model per batch on the polynomially expanded
# training features (degree 12 for batches 0-2, degree 13 for batch 3).
# The lambda_ values coincide, after truncation, with points of the
# np.logspace(-4, -2, 10) grid searched by find_best_params — presumably the
# winners of that search; re-check them if the grid or the data change.
# The degree used here must match the degree used to expand the corresponding
# test batch in the prediction cell.
w_0 = run_model(y0,build_poly_2(std_train0,12),model='ridge_reg', lambda_=0.0001668)
w_1 = run_model(y1,build_poly_2(std_train1,12),model='ridge_reg', lambda_=0.0007742)
w_2 = run_model(y2,build_poly_2(std_train2,12),model='ridge_reg', lambda_=0.0012915)
w_3 = run_model(y3,build_poly_2(std_train3,13),model='ridge_reg', lambda_=0.0007742)

In [None]:
# Predict labels for each test batch with that batch's weights. The test
# features are expanded with the same polynomial degree that was used to fit
# the corresponding model (12, 12, 12, 13).
label_0 =predict_labels(w_0,build_poly_2(std_test0,12))
label_1 =predict_labels(w_1,build_poly_2(std_test1,12))
label_2 =predict_labels(w_2,build_poly_2(std_test2,12))
label_3 =predict_labels(w_3,build_poly_2(std_test3,13))

In [None]:
# Reassemble the per-batch predictions into one array aligned with the test rows.
# Start from NaN instead of uninitialised memory (np.empty), so that a row not
# covered by any of the four batch indices is detectable rather than silently
# holding an arbitrary value.
labels = np.full(len(y_test), np.nan)
labels[index0_t] = label_0
labels[index1_t] = label_1
labels[index2_t] = label_2
labels[index3_t] = label_3

# Fail loudly before a submission is written from incomplete predictions.
if np.isnan(labels).any():
    raise ValueError(
        "labels contain NaN: a test row was not assigned to any batch "
        "or a prediction is NaN"
    )

In [None]:
# Destination of the submission CSV. Change the file name here to keep several
# submissions side by side.
# OUTPUT_PATH must be defined in this cell: it was previously only present in a
# commented-out line, which made the call below fail with NameError on a fresh kernel.
OUTPUT_PATH = '../data/submission-ridge.csv'
create_csv_submission(ids_test, labels, OUTPUT_PATH)