# Project 1 - Team BAK

## Step 1 - Getting started

In [8]:
#Import some libraries
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import datetime
from helpfulfun import *
from pre_process import pre_process_data
from ridge_regression_helpers import *
from cross_validation import cross_validation_jet


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Load the training and test sets with load_csv_data (brought in by one of the
# star imports above). Each call is unpacked into a (labels, features, ids)
# triple; the first element of the test triple is discarded.
# NOTE(review): later cells feed `y` straight into linear solvers, so the labels
# are presumably numeric already -- confirm against load_csv_data.
y, x, ids = load_csv_data('train.csv')
_, x_test, ids_test = load_csv_data('test.csv')

In [4]:
# Sanity check: display the shapes of the loaded arrays as the cell output.
y.shape, x.shape, ids.shape, x_test.shape, ids_test.shape

((250000,), (250000, 30), (250000,), (568238, 30), (568238,))

# Step 2 - Implement ML Methods

### Linear regression using gradient descent

In [9]:
def mean_squared_error_gd(y, tx, max_iters, gamma):
    """Fit linear-regression weights with full-batch gradient descent.

    The weights start at zero and are sized from ``tx`` itself (one entry per
    column), so the function no longer depends on the notebook-global ``x``.

    Args:
        y: numpy array of shape=(N, ), the targets
        tx: numpy array of shape=(N, D), the feature matrix
        max_iters: a scalar denoting the total number of iterations of GD
        gamma: a scalar denoting the stepsize

    Returns:
        w: numpy array of shape=(D, ), the weights after the last update
        loss: the value of compute_loss(y, tx, w) for the returned w
            (0 when max_iters <= 0, i.e. when no update was performed)
    """
    w = np.zeros(tx.shape[1], dtype=float)

    for _ in range(max_iters):
        # compute_gradient is a project helper (presumably the MSE gradient).
        w = w - gamma * compute_gradient(y, tx, w)

    # Evaluate the loss once, on the weights that are actually returned.
    loss = compute_loss(y, tx, w) if max_iters > 0 else 0
    return w, loss

### Linear regression using stochastic gradient descent

In [10]:
def mean_squared_error_sgd(y, tx, max_iters, gamma, batch_size=1000):
    """Fit linear-regression weights with mini-batch stochastic gradient descent.

    The weights start at zero and are sized from ``tx`` itself (one entry per
    column), so the function no longer depends on the notebook-global ``x``.

    Args:
        y: numpy array of shape=(N, ), the targets
        tx: numpy array of shape=(N, D), the feature matrix
        max_iters: a scalar denoting the total number of iterations of SGD
        gamma: a scalar denoting the stepsize
        batch_size: number of data points drawn for each stochastic gradient
            (defaults to 1000, the value that used to be hard-coded)

    Returns:
        w: numpy array of shape=(D, ), the weights after the last update
        loss: the value of compute_loss on the full (y, tx) for the returned w
            (0 when max_iters <= 0, i.e. when no update was performed)
    """
    w = np.zeros(tx.shape[1], dtype=float)

    for _ in range(max_iters):
        # batch_iter is a project helper; one shuffled mini-batch is requested per iteration.
        for minibatch_y, minibatch_tx in batch_iter(y, tx, batch_size, num_batches=1, shuffle=True):
            w = w - gamma * compute_gradient(minibatch_y, minibatch_tx, w)

    # The full-data loss is only needed for the returned weights, so compute it
    # once here instead of on every iteration.
    loss = compute_loss(y, tx, w) if max_iters > 0 else 0
    return w, loss

### Least squares regression using normal equations

In [11]:
def least_squares(y, tx):
    """Solve ordinary least squares through the normal equations.

    Args:
        y: numpy array holding the targets
        tx: numpy array holding the feature matrix (one row per sample)

    Returns:
        A pair (weights, mse): the solution of (tx^T tx) w = tx^T y and the
        residual sum of squares scaled by 1 / (2 * len(y)).
    """
    gram = np.dot(tx.T, tx)
    moment = np.dot(tx.T, y)
    weights = np.linalg.solve(gram, moment)

    residual = y - np.dot(tx, weights)
    scale = 1 / (2 * len(y))
    return weights, scale * np.dot(residual.T, residual)

### Ridge regression using normal equations


In [12]:
def ridge_regression(y, tx, lambda_):
    """Solve ridge regression through the regularised normal equations.

    Args:
        y: numpy array holding the targets
        tx: numpy array holding the feature matrix (one row per sample)
        lambda_: regularisation strength; it is scaled by 2 * len(y) before
            being added to the diagonal of tx^T tx

    Returns:
        A pair (w, loss): the solved weights and compute_mse(y, tx, w).
        compute_mse is a project helper (presumably the unregularised MSE).
    """
    n_features = tx.shape[1]
    penalty = lambda_ * 2 * len(y)

    lhs = np.dot(tx.T, tx) + penalty * np.eye(n_features)
    rhs = np.dot(tx.T, y)
    weights = np.linalg.solve(lhs, rhs)

    return weights, compute_mse(y, tx, weights)

### Logistic regression using gradient descent

In [14]:
def logistic_regression_gradient_descent(y, x, max_iter=1200, threshold=1e-8, gamma=.5):
    """Fit logistic-regression weights with gradient descent.

    Args:
        y: numpy array holding the labels
        x: numpy array holding the feature matrix, used as-is (no bias column is added here)
        max_iter: maximum number of gradient steps (default 1200)
        threshold: stop once two consecutive losses differ by less than this (default 1e-8)
        gamma: step size (default 0.5)

    Returns:
        A pair (w, loss): the final weights and the last loss reported by
        learning_by_gradient_descent. If no step was performed (max_iter <= 0),
        the loss of the zero weights from calculate_loss_lr is returned instead.
    """
    tx = x
    w = np.zeros((tx.shape[1],), dtype=float)
    losses = []

    # `step` rather than `iter`, so the builtin iter() is not shadowed.
    for step in range(max_iter):
        # learning_by_gradient_descent is a project helper returning (loss, updated w).
        loss, w = learning_by_gradient_descent(y, tx, w, gamma)
        if step % 100 == 0:
            print("Current iteration={i}, loss={l}".format(i=step, l=loss))
        losses.append(loss)
        # Convergence criterion: the loss has stopped changing.
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break

    final_loss = calculate_loss_lr(y, tx, w)
    print("loss={l}".format(l=final_loss))

    return w, (losses[-1] if losses else final_loss)

### Regularized logistic regression using gradient descent

In [15]:
def logistic_regression_regularized_gradient_descent(y, x, gamma, lambda_=0.1, max_iter=10000, threshold=1e-8):
    """Fit penalised logistic-regression weights with gradient descent.

    Args:
        y: numpy array holding the labels
        x: numpy array holding the feature matrix, used as-is (no bias column is added here)
        gamma: step size
        lambda_: regularisation strength passed to learning_by_penalized_gradient (default 0.1)
        max_iter: maximum number of gradient steps (default 10000)
        threshold: stop once two consecutive losses differ by less than this (default 1e-8)

    Returns:
        A pair (w, loss): the final weights and the last loss reported by
        learning_by_penalized_gradient. If no step was performed (max_iter <= 0),
        the loss of the zero weights from calculate_loss_lr is returned instead.
    """
    tx = x
    w = np.zeros((tx.shape[1],), dtype=float)
    losses = []

    # `step` rather than `iter`, so the builtin iter() is not shadowed.
    for step in range(max_iter):
        # learning_by_penalized_gradient is a project helper returning (loss, updated w).
        loss, w = learning_by_penalized_gradient(y, tx, w, gamma, lambda_)
        if step % 100 == 0:
            print("Current iteration={i}, loss={l}".format(i=step, l=loss))
        losses.append(loss)
        # Convergence criterion: the loss has stopped changing.
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break

    # Note: this final figure comes from calculate_loss_lr, not from the penalised helper.
    final_loss = calculate_loss_lr(y, tx, w)
    print("loss={l}".format(l=final_loss))

    return w, (losses[-1] if losses else final_loss)

# Cross Validation
In this section we run cross-validation on the split and pre-processed datasets, using pre-processing parameters that are common to all the methods. We then choose the method with the highest accuracy and explore the best pre-processing parameters for that specific model. In our case this will probably be ridge regression... let's see.

# Get estimates from Ridge-Regression

In [None]:
# Ridge regression: grid search over the candidate hyper-parameters.
# select_parameters_ridge_regression_jet is a project helper; judging by the
# prints below it returns one optimum per jet class.
seed = 7

# candidate parameters
# Degree 3 was missing (2 was listed twice) although it is the optimum used in
# the cells below, so the grid is now a contiguous 2..10.
degrees_candidates = [2, 3, 4, 5, 6, 7, 8, 9, 10]
alphas_candidates = [2, 3, 4, 5, 6, 7, 8, 9]
lambdas_candidates = [1e-02, 1e-03, 1e-06, 1e-07]


k_fold = 3

opt_degree, opt_lambda, opt_alpha, accuracy = select_parameters_ridge_regression_jet(
    y, x, degrees_candidates, lambdas_candidates, alphas_candidates, k_fold, seed)
print('Optimal alphas per jet_class:',opt_alpha)
print('Optimal degrees per jet_class:',opt_degree)
print('Optimal lambdas per jet_class:',opt_lambda)
print('Maximum accuracy predicted per jet_class:',accuracy)

In [None]:
# Hyper-parameters, one entry per jet class
degrees = [3, 3, 6, 6]
alphas = [7, 9, 5, 5]
lambdas = [0.00021544346900318823, 1e-07, 4.641588833612773e-06, 0.00021544346900318823]

# Build the k-fold index sets (`seed` is the one set in the grid-search cell)
k_fold = 3
k_indices = build_k_indices(y, k_fold, seed)

# Run every fold first, then split the (train, test) accuracy pairs
fold_scores = [
    cross_validation_jet(y, x, ridge_regression, k_indices, fold, degrees, alphas, lambdas)
    for fold in range(k_fold)
]
accs_train = [train_acc for train_acc, _test_acc in fold_scores]
accs_test = [test_acc for _train_acc, test_acc in fold_scores]

for i, (train_acc, test_acc) in enumerate(zip(accs_train, accs_test)):
    print("Iter %d: Training accuracy: %f / Test accuracy : %f" % (i, train_acc, test_acc))

print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))

In [None]:
# One prediction slot per test row, filled jet class by jet class below
ridge_pred = np.zeros(x_test.shape[0])

# Hyper-parameters, one entry per jet class
degrees = [3, 3, 6, 6]
alphas = [7, 9, 5, 5]
lambdas = [0.00021544346900318823, 1e-07, 4.641588833612773e-06, 0.00021544346900318823]

# Boolean row masks keyed by the value (0..3) found in column 22
# (presumably the jet-number feature -- confirm against the dataset description)
jet_train_class = {jet: x[:, 22] == jet for jet in range(4)}
jet_test_class = {jet: x_test[:, 22] == jet for jet in range(4)}

for jet in range(4):
    train_mask = jet_train_class[jet]
    test_mask = jet_test_class[jet]

    # Pre-process train and test rows of this class together, then expand both
    # with build_poly at the class-specific degree
    x_jet, x_jet_test = pre_process_data(x[train_mask], x_test[test_mask], alphas[jet])
    x_jet = build_poly(x_jet, degrees[jet])
    x_jet_test = build_poly(x_jet_test, degrees[jet])

    # Fit ridge regression on this class only
    best_w, _ = ridge_regression(y[train_mask], x_jet, lambdas[jet])

    # Write the class predictions back into their original test-row positions
    ridge_pred[test_mask] = transform_binary(best_w, x_jet_test)

ridge_pred

In [None]:
def savePredictions(pred, title="submission", ids=None):
    """Write predictions to ``<title>.csv`` with an ``Id,Prediction`` header.

    Ids and predictions are converted to text separately. Stacking them into a
    single numeric array first would upcast the ids to float and write them as
    ``350000.0`` (and labels as ``-1.0``).

    Args:
        pred: array-like of predictions, one per id. Values that are all
            integral (e.g. -1.0 / 1.0) are written as integers; anything else
            is written unchanged as floats.
        title: output path without the ``.csv`` extension.
        ids: array-like of row ids; defaults to the notebook-global ``ids_test``.

    Raises:
        ValueError: if ``ids`` and ``pred`` do not have the same length.
    """
    if ids is None:
        ids = ids_test
    ids = np.asarray(ids).ravel().astype(int)
    pred = np.asarray(pred).ravel()
    if ids.shape[0] != pred.shape[0]:
        raise ValueError(
            "ids and pred must have the same length, got %d and %d"
            % (ids.shape[0], pred.shape[0])
        )

    # Only drop the decimal part when nothing is lost by doing so.
    if np.all(np.mod(pred, 1) == 0):
        pred = pred.astype(int)

    rows = np.c_[ids.astype(str), pred.astype(str)]
    # comments="" stops savetxt from prefixing the header line with "# ".
    np.savetxt(title + ".csv", rows, fmt="%s", delimiter=",",
               header="Id,Prediction", comments="")

# Write the ridge-regression predictions using the default title, i.e. to "submission.csv".
savePredictions(ridge_pred)

## Step 4 - Get Predictions

In [None]:
# Zero vector with one entry per column of x.
# NOTE(review): no cell visible in this notebook reads initial_w; the GD/SGD
# functions above build their own zero initialisation -- candidate for removal.
initial_w = np.zeros((x.shape[1],), dtype=float)

In [None]:
# Mean Squared Error Gradient Descent
# Train on the arrays from the load cell: `y_tr` was not defined until a later
# cell and `x_te` was never defined, so this cell failed on a fresh kernel.
# NOTE(review): assumes `y` already holds the numeric labels -- confirm.
w_gd, loss_gd = mean_squared_error_gd(y, x, max_iters=150, gamma=.005)
pred_gd = get_predictions(x_test, w_gd)
print("MSE - GD Loss: ", loss_gd)

In [None]:
# Mean Squared Error Stochastic Gradient Descent
# Train on the arrays from the load cell: `y_tr` was not defined until a later
# cell and `x_te` was never defined, so this cell failed on a fresh kernel.
# NOTE(review): assumes `y` already holds the numeric labels -- confirm.
w_sgd, loss_sgd = mean_squared_error_sgd(y, x, max_iters=150, gamma=.005)
pred_sgd = get_predictions(x_test, w_sgd)
print("MSE - SGD Loss: ", loss_sgd)

In [None]:
# Least Squares
# Train on the arrays from the load cell (`y_tr` / `x_te` were not defined at
# this point on a fresh kernel).
# NOTE(review): assumes `y` already holds the numeric labels -- confirm.
w_ls, loss_ls = least_squares(y, x)
pred_ls = get_predictions(x_test, w_ls)
# Report the least-squares loss; this line used to print loss_sgd by mistake.
print("MSE - LS Loss: ", loss_ls)

In [None]:
# Ridge Regression
# TODO: add split data
# ridge_regression returns (weights, loss); unpack both so that only the weights
# are handed to get_predictions (the whole tuple used to be passed as `w_rr`).
# The unused seed/degree/split_ratio assignments were dropped: they only fed a
# commented-out demo call, and `seed = 56` silently overwrote the seed used by
# the cross-validation cells.
# NOTE(review): trains on `y` / predicts on `x_test` from the load cell, since
# `y_tr` / `x_te` were not defined at this point on a fresh kernel.
w_rr, loss_rr = ridge_regression(y, x, .0005)
pred_rr = get_predictions(x_test, w_rr)
print("MSE - RR Loss: ", loss_rr)

In [None]:
# Logistic Regression Gradient Descent
# Logistic regression needs {0, 1} labels. `y` is numeric (it is fed directly to
# the linear solvers above), so the old comparison `y == "s"` never matched and
# produced an all-zero label vector.
# NOTE(review): assumes the signal class is encoded as 1 in `y` -- confirm
# against load_csv_data.
y_tr = np.where(y == 1, 1, 0)
w_logreg, loss_logreg = logistic_regression_gradient_descent(y_tr, x)
# `x_te` was never defined; predict on the test features from the load cell.
pred_log = get_predictions(x_test, w_logreg)
print("RMSE - LogRed Loss: ", loss_logreg)
# Restore the {-1, 1} encoding for cells that reuse y_tr.
y_tr = np.where(y == 1, 1, -1)

In [None]:
# Write the logistic-regression predictions.
# NOTE(review): same title as the other savePredictions calls in this notebook,
# so each call overwrites the previous "submission.csv".
savePredictions(pred_log, title="submission")

In [None]:
# Regularized Logistic Regression with GD
# Logistic regression needs {0, 1} labels. `y` is numeric (it is fed directly to
# the linear solvers above), so the old comparison `y == "s"` never matched and
# produced an all-zero label vector.
# NOTE(review): assumes the signal class is encoded as 1 in `y` -- confirm
# against load_csv_data.
y_tr = np.where(y == 1, 1, 0)
w_reglog, loss_reglog = logistic_regression_regularized_gradient_descent(y_tr, x, .001)
# `x_te` was never defined; predict on the test features from the load cell.
pred_reglog = get_predictions(x_test, w_reglog)
# Earlier submission scored 0.680; at the time the loss was increasing while the
# predictions improved -- re-check now that the labels are built correctly.

In [None]:
# Write the regularised logistic-regression predictions.
# NOTE(review): same title as the other savePredictions calls in this notebook,
# so each call overwrites the previous "submission.csv".
savePredictions(pred_reglog, title="submission")