# Project 1 - Team BAK

## Step 1 - Getting started

In [None]:
#Import some libraries
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import datetime
from helpfulfun import *
%load_ext autoreload
%autoreload 2

In [None]:
# Load the training and test sets with the project helper; each call is
# unpacked as (labels, feature matrix, sample ids).
y, x, ids = load_csv_data('train.csv')
# The first value returned for the test file is discarded.
_, x_test, ids_test = load_csv_data('test.csv')

In [None]:
# Sanity check: display the shapes of everything that was just loaded.
y.shape, x.shape, ids.shape, x_test.shape, ids_test.shape

In [None]:
def pre_process_data(x_train, x_test, alpha=0):
    """
    Preprocess a train/test pair of feature matrices.

    Steps, in order:
    - mark the zeros of the last training column as missing (-999),
    - impute missing values with `impute_missing`,
    - feature study: drop the columns judged useless (see the "plots" file),
    - impute outliers with `outliers`, using alpha-percentiles,
    - standardize both sets with the statistics returned for the training set.

    The caller's `x_train` array is left untouched: the function works on a
    copy of it.

    Args:
        x_train: 2-D numpy array of training features.
        x_test: 2-D numpy array of test features with the same columns.
        alpha: percentile parameter forwarded to `outliers`.

    Returns:
        (x_train, x_test): the preprocessed feature matrices.
    """
    # Missing values:

    # Work on a copy so that flagging the zeros below does not silently
    # modify the array owned by the caller.
    x_train = x_train.copy()

    # Consider the 0s in the last column ('PRI_jet_all_pt') as missing values.
    # NOTE(review): only the training set is flagged here; zeros in the last
    # column of x_test are left as they are - confirm this is intended.
    x_train[:, -1] = np.where(x_train[:, -1] == 0, -999, x_train[:, -1])

    # Impute missing data (see the impute_missing function in helpfulfun)
    x_train, x_test = impute_missing(x_train, x_test)

    # Feature study:
    # Delete useless features (see the "plots" file for the reasoning)
    useless_features = [15, 16, 18, 19, 20, 21]
    x_train = np.delete(x_train, useless_features, 1)
    x_test = np.delete(x_test, useless_features, 1)

    # Impute outliers, judged through the alpha-percentiles
    x_train = outliers(x_train, alpha)
    x_test = outliers(x_test, alpha)

    # Standardization: the test set reuses the training mean and std
    x_train, mean_x_train, std_x_train = standardize(x_train)
    x_test, _, _ = standardize(x_test, mean_x_train, std_x_train)

    return x_train, x_test


In [None]:
def select_parameters_ridge_regression_jet(y,x,degrees,lambdas,alphas,k_fold,seed):
    """
    Run the ridge-regression parameter search separately on every jet subset.

    The training set is split according to the value (0, 1, 2 or 3) stored in
    column 22 of x; for each subset the (degree, lambda, alpha) tuple with the
    best cross-validated accuracy is selected.

    Returns four lists (best degrees, best lambdas, best alphas, best
    accuracies), each holding one entry per jet value.
    """
    jet_column = x[:, 22]

    # one (degree, lambda, alpha, accuracy) result per jet value, in order
    per_jet = []
    for jet in (0, 1, 2, 3):
        rows = jet_column == jet
        per_jet.append(
            select_parameters_ridge_regression(degrees, lambdas, alphas, k_fold, y[rows], x[rows], seed)
        )

    par_degree = [best[0] for best in per_jet]
    par_lamb = [best[1] for best in per_jet]
    par_alpha = [best[2] for best in per_jet]
    accuracy = [best[3] for best in per_jet]

    return par_degree, par_lamb, par_alpha, accuracy

def select_parameters_ridge_regression(degrees, lambdas, alphas, k_fold, y, x, seed):
    """
    Grid-search (degree, lambda, alpha) for ridge regression with k-fold CV.

    Every combination of the candidate values is scored with the mean test
    accuracy over the k folds; the best-scoring combination is returned. In
    case of a tie the first combination in iteration order (degrees outermost,
    alphas innermost) wins.

    Args:
        degrees: candidate polynomial degrees.
        lambdas: candidate ridge penalties.
        alphas: candidate outlier percentiles (forwarded to the preprocessing).
        k_fold: number of folds.
        y: labels of the training set.
        x: feature matrix of the training set.
        seed: seed forwarded to `build_k_indices`.

    Returns:
        (best_degree, best_lamb, best_alpha, accu). The three parameters are
        returned exactly as they appear in the candidate lists, so an integer
        degree stays an integer and can be passed on to `build_poly`.

    Raises:
        ValueError: if one of the candidate lists is empty.
    """
    # split data in k fold
    k_indices = build_k_indices(y, k_fold, seed)

    # Parameters and scores are kept in two parallel lists: packing them into
    # a single float array would turn the integer degrees into floats.
    candidates = []
    mean_accuracies = []
    for degree in degrees:
        for lamb in lambdas:
            for alpha in alphas:
                accuracy_test = [
                    cross_validation(y, x, ridge_regression, k_indices, k, degree, alpha, lamb)[1]
                    for k in range(k_fold)
                ]
                candidates.append((degree, lamb, alpha))
                mean_accuracies.append(np.mean(accuracy_test))

    if not candidates:
        raise ValueError("degrees, lambdas and alphas must all be non-empty")

    # parameters that maximize the accuracy
    ind_best = int(np.argmax(mean_accuracies))
    best_degree, best_lamb, best_alpha = candidates[ind_best]
    accu = mean_accuracies[ind_best]

    return best_degree, best_lamb, best_alpha, accu

def cross_validation(y, x, method, k_indices, k, degree, alpha, lamb=None, log=False, **kwargs):
    """Run one fold of k-fold cross-validation for a given training method.

    Fold `k` of `k_indices` is held out as the test set and all other folds
    form the training set. Both parts are preprocessed with
    `pre_process_data`, expanded with `build_poly`, and the model returned by
    `method` is scored on each of them.

    Args:
        y: labels.
        x: feature matrix, one row per label.
        method: training function called as method(y, tx, **kwargs), or as
            method(y, tx, lamb, **kwargs) when `lamb` is given; it must return
            a pair whose first element is the weight vector.
        k_indices: array of row indices, one row per fold.
        k: index of the fold used as test set.
        degree: polynomial degree passed to `build_poly`.
        alpha: outlier parameter passed to `pre_process_data`.
        lamb: optional regularisation strength (ridge regression in this case).
        log: if true, labels are predicted with `predict_labels_logistic`,
            otherwise with `transform_binary`.
        **kwargs: extra keyword arguments forwarded to `method`.

    Returns:
        (acc_train, acc_test): accuracies computed by `compute_accuracy`.
    """
    # fold k is the test set, all the other folds are used for training
    test_indeces = k_indices[k]
    train_indeces = np.delete(k_indices, (k), axis=0).ravel()

    # build the two data sets
    x_train = x[train_indeces, :]
    x_test = x[test_indeces, :]
    y_train = y[train_indeces]
    y_test = y[test_indeces]

    # data pre-processing
    x_train, x_test = pre_process_data(x_train, x_test, alpha)

    # transformation
    x_train = build_poly(x_train, degree)
    x_test = build_poly(x_test, degree)

    # compute weights using given method
    if lamb is None:
        weights, _ = method(y_train, x_train, **kwargs)
    else:
        weights, _ = method(y_train, x_train, lamb, **kwargs)  # ridge regression in this case

    # predict with the function matching the kind of model
    predict = predict_labels_logistic if log else transform_binary
    y_train_pred = predict(weights, x_train)
    y_test_pred = predict(weights, x_test)

    # compute accuracy for train and test data
    acc_train = compute_accuracy(y_train_pred, y_train)
    acc_test = compute_accuracy(y_test_pred, y_test)

    return acc_train, acc_test


def cross_validation_jet(y, x, method, k_indices, k, degrees, alphas, lambdas=None, log=False, **kwargs):
    """
    Run one fold of k-fold cross-validation with a separate model per jet value.

    Fold `k` of `k_indices` is held out as the test set and all other folds
    form the training set. Both parts are split according to the value
    (0, 1, 2 or 3) found in column 22 of x; each subset is preprocessed,
    expanded with `build_poly` and fitted with `method` using its own
    parameters. The predictions of the four models are merged before the
    accuracies are computed. Rows whose column 22 holds any other value keep
    the initial prediction 0.

    Args:
        y: labels.
        x: feature matrix, one row per label.
        method: training function called as method(y, tx, **kwargs), or as
            method(y, tx, lambdas[jet], **kwargs) when `lambdas` is given; it
            must return a pair whose first element is the weight vector.
        k_indices: array of row indices, one row per fold.
        k: index of the fold used as test set.
        degrees: polynomial degree for each jet value (indexable by 0..3).
        alphas: outlier parameter for each jet value (indexable by 0..3).
        lambdas: optional regularisation strength for each jet value; may be a
            list or a numpy array.
        log: if true, labels are predicted with `predict_labels_logistic`,
            otherwise with `predict_labels`.
        **kwargs: extra keyword arguments forwarded to `method`.

    Returns:
        (acc_train, acc_test): accuracies computed by `compute_accuracy` over
        the whole training part and the whole test part of the fold.
    """
    # fold k is the test set, all the other folds are used for training
    test_indeces = k_indices[k]
    train_indeces = np.delete(k_indices, (k), axis=0).ravel()

    x_train_all_jets = x[train_indeces, :]
    x_test_all_jets = x[test_indeces, :]
    y_train_all_jets = y[train_indeces]
    y_test_all_jets = y[test_indeces]

    predict = predict_labels_logistic if log else predict_labels

    # initialize output vectors; they are filled subset by subset
    y_train_pred = np.zeros(len(y_train_all_jets))
    y_test_pred = np.zeros(len(y_test_all_jets))

    for jet in range(4):
        # rows of the training / test part that belong to this jet value
        train_rows = x_train_all_jets[:, 22] == jet
        test_rows = x_test_all_jets[:, 22] == jet

        x_train = x_train_all_jets[train_rows]
        x_test = x_test_all_jets[test_rows]
        y_train = y_train_all_jets[train_rows]

        # data pre-processing and polynomial expansion
        x_train, x_test = pre_process_data(x_train, x_test, alphas[jet])
        x_train = build_poly(x_train, degrees[jet])
        x_test = build_poly(x_test, degrees[jet])

        # compute weights using given method; `is None` keeps the test valid
        # when lambdas is a numpy array
        if lambdas is None:
            weights, _ = method(y_train, x_train, **kwargs)
        else:
            weights, _ = method(y_train, x_train, lambdas[jet], **kwargs)

        # predict
        y_train_pred[train_rows] = predict(weights, x_train)
        y_test_pred[test_rows] = predict(weights, x_test)

    # compute accuracy for train and test data
    acc_train = compute_accuracy(y_train_pred, y_train_all_jets)
    acc_test = compute_accuracy(y_test_pred, y_test_all_jets)

    return acc_train, acc_test


In [None]:
# Ridge regression: per-jet hyper-parameter search

seed = 7

# candidate parameters
# Degrees 2..10, each listed once: a repeated entry would only be
# cross-validated a second time for the same result, and degree 3 must be in
# the grid to be selectable.
degrees_candidates = [2,3,4,5,6,7,8,9,10]
alphas_candidates=[2,3,4,5,6,7,8,9]
lambdas_candidates = [1e-02, 1e-03, 1e-06, 1e-07]


k_fold = 3

# One (degree, lambda, alpha, accuracy) result per jet value
opt_degree, opt_lambda, opt_alpha, accuracy = select_parameters_ridge_regression_jet(y,x,degrees_candidates,lambdas_candidates,
                                                                  alphas_candidates,k_fold,seed)
print('Optimal alphas per jet_class:',opt_alpha)
print('Optimal degrees per jet_class:',opt_degree)
print('Optimal lambdas per jet_class:',opt_lambda)
print('Maximum accuracy predicted per jet_class:',accuracy)

In [None]:
# Per-jet parameters: entry i is used for the subset with jet value i
degrees = [3, 3, 6, 6]
alphas = [7, 9, 5, 5]
lambdas = [0.00021544346900318823, 1e-07, 4.641588833612773e-06, 0.00021544346900318823]


# Split data in k-fold
k_fold = 3
k_indices = build_k_indices(y, k_fold, seed)

# (training accuracy, test accuracy) of every fold
fold_accuracies = []
for k in range(k_fold):
    fold_accuracies.append(
        cross_validation_jet(y, x, ridge_regression, k_indices, k, degrees, alphas, lambdas)
    )
accs_train, accs_test = (list(column) for column in zip(*fold_accuracies))

for i, (acc_train, acc_test) in enumerate(zip(accs_train, accs_test)):
    print("Iter %d: Training accuracy: %f / Test accuracy : %f" % (i, acc_train, acc_test))

print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))

In [None]:
# Final model: one ridge regression per jet value, trained on the whole
# training set and used to predict the matching rows of the test set.
ridge_pred = np.zeros(x_test.shape[0])

degrees = [3, 3, 6, 6]
alphas = [7, 9, 5, 5]
lambdas = [0.00021544346900318823, 1e-07, 4.641588833612773e-06, 0.00021544346900318823]


# Boolean row masks keyed by the jet value stored in column 22
jet_train_class = {jet: x[:, 22] == jet for jet in range(4)}
jet_test_class = {jet: x_test[:, 22] == jet for jet in range(4)}


for i in range(4):
    train_rows = jet_train_class[i]
    test_rows = jet_test_class[i]
    y_jet = y[train_rows]

    # Pre-processing and transformation of the training set and test set
    x_jet, x_jet_test = pre_process_data(x[train_rows], x_test[test_rows], alphas[i])
    x_jet = build_poly(x_jet, degrees[i])
    x_jet_test = build_poly(x_jet_test, degrees[i])

    # Train the model through Ridge Regression
    best_w, _ = ridge_regression(y_jet, x_jet, lambdas[i])

    # Prediction, written back at the positions of this jet subset
    pred = transform_binary(best_w, x_jet_test)
    ridge_pred[test_rows] = pred

ridge_pred

In [None]:
def savePredictions(pred, title="submission"):
    """Write the predictions next to the test ids in '<title>.csv'.

    The file starts with the header row "Id,Prediction"; each following row
    pairs one entry of the global `ids_test` with the matching entry of pred.
    """
    header = ["Id", "Prediction"]
    rows = np.c_[ids_test, pred].astype(str)
    table = np.insert(rows, 0, header, axis=0)
    np.savetxt(title + ".csv", table, fmt="%s", delimiter=",")

savePredictions(ridge_pred)

# End of new part. Previous work:

In [None]:
#### import the datasets

# Read both CSV files, skipping the header line. unpack=True makes genfromtxt
# hand the data back transposed, so `train[i]` / `test[i]` presumably hold the
# i-th CSV column - TODO confirm against the files.
train_list = np.genfromtxt("train.csv", dtype=None, delimiter=",", skip_header =1, unpack=True, encoding=None)
train = np.array(train_list)

test_list = np.genfromtxt("test.csv", dtype=None, delimiter=",", skip_header =1, unpack=True, encoding=None)
test = np.array(test_list)

In [None]:
# Test features: everything from index 2 onwards, transposed back
x_te = test[2:].T

In [None]:
# NOTE: this rebinds the global `y` that was filled by load_csv_data in the
# first part of the notebook.
y = train[1]
# Training features: everything from index 2 onwards, transposed back
x_tr = train[2:].T
print("x: ", x_tr.shape, " y: ", y.shape)

## Step 2 - Preprocessing

In [None]:
# Numeric copy of the training features
x_tr_float = x_tr.astype(float)

In [None]:
# Display one sample value before the median replacement
x_tr_float[2,0]

In [None]:
# Project helper; presumably replaces the missing entries by the column
# median - confirm in helpfulfun.
replaced=replace_with_median(x_tr_float)

In [None]:
# Compare the same entry before and after the replacement
print(x_tr_float[2,0],replaced[2,0])

In [None]:
# Run the project's colinearity check on the raw and on the median-filled
# features (second argument 4; see colinearity_check in helpfulfun for its
# meaning).
features_VIF=colinearity_check(x_tr_float,4)
features_VIF_replaced=colinearity_check(replaced,4)

In [None]:
print(features_VIF)

In [None]:
print(features_VIF_replaced)

In [None]:
# Encode the labels: "s" becomes 1, everything else becomes -1
y = train[1]
y_tr = np.where(y == "s",1,-1)
print(y_tr.shape, " and y[:5]: ", y_tr[:5])

In [None]:
# delete features with more than 30% NaN values
x_tr_prep = np.delete(x_tr, 0, 1) # infer this one later
x_te_prep = np.delete(x_te, 0, 1) # infer this one later
# Each index below refers to the array as it is *after* all previous
# deletions, which is why the same index is repeated to remove a run of
# adjacent columns.
# NOTE(review): because column 0 has already been removed above, these indices
# remove the original x_tr columns 5, 6, 7, 13 and 23-29 (plus column 0 itself)
# - confirm that these are the intended features.
for i in [4, 4, 4, 9, 18, 18, 18, 18, 18, 18, 18]:
    x_tr_prep = np.delete(x_tr_prep, i, 1)
    x_te_prep = np.delete(x_te_prep, i, 1)
x_tr_prep = x_tr_prep.astype(float)
x_te_prep = x_te_prep.astype(float)
x_tr_prep.shape

In [None]:
x, mean_x, std_x = standardize(x_tr_prep)
x_te, _, _ = standardize(x_te_prep)

In [None]:
x.shape, x_te.shape

In [None]:
def _load_matrix(path):
    """Read a header-less, comma-separated float matrix from `path`."""
    columns = np.genfromtxt(path, dtype=float, delimiter=",", skip_header =0, unpack=True, encoding=None)
    # unpack=True hands the data back transposed; flip it back
    return np.array(columns).T

# Alternative preprocessed versions of the training / test features
x_tr_2 = _load_matrix("x_tr_2.csv")
x_te_2 = _load_matrix("x_te_2.csv")

x_tr_3 = _load_matrix("x_tr_3_no_out.csv")
x_te_3 = _load_matrix("x_te_3_no_out.csv")

x_tr_4 = _load_matrix("x_tr_4_no_out.csv")
x_te_4 = _load_matrix("x_te_4_no_out.csv")

In [None]:
x_tr_4.shape, x_te_4.shape

In [None]:
x, mean_x, std_x = standardize(x_tr_4)
x_te, _, _ = standardize(x_te_4)
#x = x_tr_2
#x_te = x_te_2

## Step 3 - Implement ML Methods

#### Linear regression using gradient descent

In [None]:
def mean_squared_error_gd(y, tx, max_iters, gamma):
    """Linear regression with the Gradient Descent (GD) algorithm.

    The weights start at zero and are updated max_iters times with
    w <- w - gamma * gradient, the gradient being computed on the whole data
    set by `compute_gradient`.

    Args:
        y: numpy array of shape=(N, )
        tx: numpy array of shape=(N, D)
        max_iters: a scalar denoting the total number of iterations of GD
        gamma: a scalar denoting the stepsize

    Returns:
        w: numpy array of shape (D, ), the weights after the last iteration
        loss: the value of compute_loss(y, tx, w) for the returned w
    """
    # The weights are sized from tx itself, not from the notebook-level
    # variable x, so the function works on any feature matrix.
    w = np.zeros((tx.shape[1],), dtype=float)
    for _ in range(max_iters):
        gradient = compute_gradient(y, tx, w)
        w = w - gamma * gradient

    # Evaluate the loss once, for the weights that are actually returned
    loss = compute_loss(y, tx, w)
    return w, loss
    

#### Linear regression using stochastic gradient descent


In [None]:
def mean_squared_error_sgd(y, tx, max_iters, gamma):
    """Linear regression with the Stochastic Gradient Descent algorithm (SGD).

    The weights start at zero. Each of the max_iters iterations draws one
    shuffled mini-batch of 1000 samples with `batch_iter` and takes one step
    w <- w - gamma * gradient using the gradient of that mini-batch.

    Args:
        y: numpy array of shape=(N, )
        tx: numpy array of shape=(N, D)
        max_iters: a scalar denoting the total number of iterations of SGD
        gamma: a scalar denoting the stepsize

    Returns:
        w: numpy array of shape (D, ), the weights after the last iteration
        loss: the value of compute_loss(y, tx, w) on the full data set for the
            returned w
    """
    # The weights are sized from tx itself, not from the notebook-level
    # variable x, so the function works on any feature matrix.
    w = np.zeros((tx.shape[1],), dtype=float)
    batch_size = 1000

    for _ in range(max_iters):
        for minibatch_y, minibatch_tx in batch_iter(y, tx, batch_size, num_batches=1, shuffle=True):
            gradient = compute_gradient(minibatch_y, minibatch_tx, w)
            w = w - gamma * gradient

    # Evaluate the loss once, for the weights that are actually returned
    loss = compute_loss(y, tx, w)
    return w, loss
    

#### Build polynomial basis functions, which can be used with Least Squares and Ridge Regression

In [None]:
def build_poly(x, degree):
    """Expand x with polynomial basis functions, for j=0 up to j=degree.

    Args:
        x: numpy array holding N samples, either 1-D of shape (N,) or 2-D of
            shape (N, D).
        degree: integer, the highest power to include.

    Returns:
        poly: numpy array with N rows: a first column of ones (j=0) followed,
            for j = 1..degree, by x raised element-wise to the power j.
    """
    bias = np.ones(len(x))
    powers = [np.power(x, j) for j in range(1, degree + 1)]
    # column_stack turns the 1-D pieces into columns and keeps 2-D pieces as is
    return np.column_stack([bias] + powers)

#### Least squares regression using normal equations

In [None]:
def least_squares(y, tx):
    """Least squares regression using the normal equations.

    Args:
        y: numpy array of shape (N, )
        tx: numpy array of shape (N, D)

    Returns:
        opt_weights: numpy array of shape (D, ) minimising the squared error
        mse: the loss 1/(2N) * ||y - tx.w||^2 for these weights
    """
    gram = tx.T.dot(tx)
    moment = tx.T.dot(y)
    try:
        opt_weights = np.linalg.solve(gram, moment)
    except np.linalg.LinAlgError:
        # Singular normal equations (e.g. duplicated or constant-zero
        # columns): fall back to the minimum-norm least-squares solution
        # instead of failing.
        opt_weights = np.linalg.lstsq(tx, y, rcond=None)[0]
    e = y - tx.dot(opt_weights)
    mse = 1/(2*len(y)) * e.T.dot(e)
    return opt_weights, mse

In [None]:
def polynomial_regression_ls(y, x, degrees=(1, 3, 7, 12)):
    """Fit least squares on polynomial expansions of x and keep the best one.

    For every candidate degree the data is expanded with `build_poly`, fitted
    with `least_squares`, and scored with the training RMSE
    sqrt(2 * loss).

    Args:
        y: labels.
        x: input data, as accepted by `build_poly`.
        degrees: candidate polynomial degrees (must not be empty).

    Returns:
        (w, rmse): weights and training RMSE of the degree with the lowest
        RMSE.
    """
    ws = []
    losses = []
    for degree in degrees:
        tx = build_poly(x, degree)
        w, loss = least_squares(y, tx)
        ws.append(w)
        losses.append(np.sqrt(2 * loss))

    ind = int(np.argmin(losses))
    return ws[ind], losses[ind]

#### Ridge regression using normal equations

In [None]:
def ridge_regression_demo(x, y, degree, ratio, seed):
    """Ridge regression demo: pick the lambda with the lowest training RMSE.

    The data passed in is expanded with `build_poly` and fitted with
    `ridge_regression` for 15 lambdas spaced logarithmically between 1e-5 and
    1.

    Args:
        x: input data, as accepted by `build_poly`.
        y: labels.
        degree: polynomial degree of the expansion.
        ratio: currently unused, reserved for the train/test split (see TODO).
        seed: currently unused, reserved for the train/test split (see TODO).

    Returns:
        (weights, rmse): weights and training RMSE of the best lambda.
    """
    lambdas = np.logspace(-5, 0, 15)
    # TODO split and add test data

    tx_tr = build_poly(x, degree)
    ws = []
    rmse_tr = []
    for lambda_ in lambdas:
        fitted = ridge_regression(y, tx_tr, lambda_)
        # ridge_regression is used in this notebook both as `w = ...` and as
        # `w, _ = ...`; accept either return shape.
        weight = fitted[0] if isinstance(fitted, tuple) else fitted
        ws.append(weight)
        rmse_tr.append(compute_rmse(y, tx_tr, weight))

    ind = int(np.argmin(rmse_tr))
    return ws[ind], rmse_tr[ind]


#### Logistic regression using gradient descent or SGD (y ∈ {0, 1})

In [None]:
def logistic_regression_gradient_descent(y, x):
    """Fit a logistic regression with gradient descent, from zero weights.

    Runs at most 1200 steps of `learning_by_gradient_descent` with step size
    0.5 and stops as soon as two consecutive losses differ by less than 1e-8.
    The loss is printed every 100 steps and once more at the end.

    Returns:
        (w, loss): the final weights and the last loss recorded in the loop.
    """
    step_size = .5
    tolerance = 1e-8
    n_steps = 1200

    # the features are used as they are, no column is added
    tx = x
    w = np.zeros((tx.shape[1],), dtype=float)
    history = []

    for step in range(n_steps):
        # one descent step: returns the loss and the updated weights
        loss, w = learning_by_gradient_descent(y, tx, w, step_size)
        if step % 100 == 0:
            print("Current iteration={i}, loss={l}".format(i=step, l=loss))
        history.append(loss)
        # convergence criterion
        converged = len(history) > 1 and np.abs(history[-1] - history[-2]) < tolerance
        if converged:
            break
    print("loss={l}".format(l=calculate_loss_lr(y, tx, w)))

    return w, history[-1]

#### Regularized Logistic Regression

In [None]:
def logistic_regression_regularized_gradient_descent(y, x, gamma, lambda_=0.1, max_iter=10000, threshold=1e-8):
    """Fit a penalized logistic regression with gradient descent.

    The weights start at zero and are updated with
    `learning_by_penalized_gradient` until two consecutive losses differ by
    less than `threshold` or `max_iter` steps have been taken. The loss is
    printed every 100 steps and once more at the end.

    Args:
        y: labels.
        x: feature matrix, used as it is (no column is added).
        gamma: step size of the descent.
        lambda_: penalty strength passed to `learning_by_penalized_gradient`.
        max_iter: maximum number of descent steps (at least 1).
        threshold: convergence tolerance on the change of the loss.

    Returns:
        (w, loss): the final weights and the last loss recorded in the loop.

    Raises:
        ValueError: if max_iter is smaller than 1.
    """
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")

    losses = []

    # build tx
    tx = x
    w = np.zeros((tx.shape[1],), dtype=float)

    # start the logistic regression
    for step in range(max_iter):
        # get loss and update w.
        loss, w = learning_by_penalized_gradient(y, tx, w, gamma, lambda_)
        # log info
        if step % 100 == 0:
            print("Current iteration={i}, loss={l}".format(i=step, l=loss))
        # converge criterion
        losses.append(loss)
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break
    print("loss={l}".format(l=calculate_loss_lr(y, tx, w)))
    return w, losses[-1]

In [None]:
def logistic_regression_newton_method(y, x):
    """Fit a logistic regression with Newton's method, from zero weights.

    A column of ones is prepended to x. At most 100 steps of
    `learning_by_newton_method` are taken with step size 1; the loop stops as
    soon as two consecutive losses differ by less than 1e-8. The loss is
    printed at every step and once more at the end.

    Returns:
        (w, loss): the final weights, of shape (D + 1, 1), and the last loss
        recorded in the loop.
    """
    n_steps = 100
    tolerance = 1e-8
    step_size = 1.

    # design matrix with a leading column of ones
    ones = np.ones((y.shape[0], ))
    tx = np.c_[ones, x]
    w = np.zeros((tx.shape[1], 1))
    history = []

    for step in range(n_steps):
        # one Newton step: returns the loss and the updated weights
        loss, w = learning_by_newton_method(y, tx, w, step_size)
        print("Current iteration={i}, the loss={l}".format(i=step, l=loss))

        history.append(loss)
        # convergence criterion
        if len(history) > 1 and np.abs(history[-1] - history[-2]) < tolerance:
            break
    print("loss={l}".format(l=calculate_loss(y, tx, w)))

    return w, history[-1]

## Step 4 - Get Predictions

In [None]:
def get_predictions(x, best_w, ids=None, threshold=.5):
    """Turn linear scores into -1/1 labels and build the submission table.

    Args:
        x: feature matrix of shape (N, D).
        best_w: weights; x.dot(best_w) must hold N values.
        ids: sample ids, one per row of x. Defaults to the notebook-level
            `test[0]`, which is what the function always used before.
        threshold: scores strictly below it are labelled -1, the others 1.
            NOTE(review): the right cut-off depends on how the model was
            trained (label encoding, raw score vs. probability); the default
            keeps the previous value.

    Returns:
        A string array of shape (N + 1, 2): the header row
        ["Id", "Prediction"] followed by one (id, label) row per sample.
    """
    if ids is None:
        ids = test[0]
    preds = x.dot(best_w).reshape((x.shape[0],))
    y_te = np.where(preds < threshold, -1, 1)
    # Both columns are converted to text before being joined, so the header
    # row can be inserted whatever the dtype of the ids.
    y_pred = np.c_[np.asarray(ids).astype(str), y_te.astype(str)]
    print(y_pred[0:5])
    y_pred = np.insert(y_pred, 0, ["Id", "Prediction"], axis=0)
    return y_pred

In [None]:
def savePredictions(pred, title="submission"):
    """Save an already assembled prediction table as '<title>.csv'.

    Every entry of pred is written as text, the entries of a row being
    separated by commas.
    """
    filename = title + ".csv"
    np.savetxt(filename, pred, delimiter=",", fmt="%s")

In [None]:
# Zero vector with one entry per feature. It is not passed to the GD / SGD
# calls below, which take no initial weights.
initial_w = np.zeros((x.shape[1],), dtype=float)

In [None]:
# Mean Squared Error Gradient Descent
# Trained on the labels held in y_tr at this point of the notebook.
w_gd, loss_gd = mean_squared_error_gd(y_tr, x, max_iters=150, gamma=.005)
pred = get_predictions(x_te, w_gd)
print("MSE - GD Loss: ", loss_gd)

In [None]:
# Mean Squared Error Stochastic Gradient Descent
# Same settings as the GD run above; `pred` is overwritten.
w_sgd, loss_sgd = mean_squared_error_sgd(y_tr, x, max_iters=150, gamma=.005)
pred = get_predictions(x_te, w_sgd)
print("MSE - SGD Loss: ", loss_sgd)

In [None]:
# Least Squares
w_ls, loss_ls = least_squares(y_tr, x)
#w_poly, loss_poly = polynomial_regression_ls(y_tr, x)
pred = get_predictions(x_te, w_ls)
# Report the loss of this fit (loss_ls), not the one left over from the SGD cell
print("MSE - LS Loss: ", loss_ls)

In [None]:
# Ridge Regression
# TODO: add split data
# seed, degree and split_ratio are only used by the commented-out demo call.
seed = 56
degree = 7
split_ratio = 0.5
#w_rr, loss_rr = ridge_regression_demo(x, y, degree, split_ratio, seed)
# NOTE(review): the result is bound to a single name here, while other cells of
# this notebook unpack ridge_regression as `weights, _ = ...` - confirm which
# return shape the current helper has.
w_rr = ridge_regression(y_tr, x, .0005 )
pred_rr = get_predictions(x_te, w_rr)
#print("MSE - RR Loss: ", loss_rr)

In [None]:
# Logistic Regression Gradient Descent
# The labels are re-encoded as 0/1 for this run and switched back to -1/1 at
# the end of the cell.
y_tr = np.where(y == "s",1,0)
w_logreg, loss_logreg = logistic_regression_gradient_descent(y_tr, x)
pred_log = get_predictions(x_te, w_logreg)
print("RMSE - LogRed Loss: ", loss_logreg)
y_tr = np.where(y == "s",1,-1)

In [None]:
# Writes submission.csv
savePredictions(pred_log, title="submission")

In [None]:
# Regularized Logistic Regression with GD
# NOTE: y_tr stays encoded as 0/1 after this cell (it is not switched back).
y_tr = np.where(y == "s",1,0)
# .001 is the step size (third parameter, named gamma)
w_reglog, loss_reglog = logistic_regression_regularized_gradient_descent(y_tr, x, .001)
pred_reglog = get_predictions(x_te, w_reglog)
# submission : 0.680 - unclear why the loss is increasing while the prediction got better

In [None]:
# Overwrites the submission.csv written above (same title)
savePredictions(pred_reglog, title="submission")

## Step 5 - Cross-Validation

In [None]:
def cross_validation(method, y, x, k_indices, k, gamma, lambda_ridge, degree, lambda_logistic):
    """Train `method` on all folds but the k-th and score it on both parts.

    Despite their names, the first two returned values are accuracies (the
    fraction of labels predicted correctly by `get_predictions_cv`), not
    losses.

    NOTE: another function named cross_validation, with a different
    signature, is defined earlier in this notebook; running this cell
    replaces it.

    Args:
        method: name of the training routine, one of "ridge_regression_demo",
            "least_squares", "mean_squared_error_gd",
            "mean_squared_error_sgd", "logistic_regression_gradient_descent",
            "logistic_regression_regularized_gradient_descent".
        y: labels.
        x: feature matrix, one row per label.
        k_indices: array of row indices, one row per fold.
        k: index of the fold used as test set.
        gamma: step size for the GD / SGD methods.
        lambda_ridge: penalty for ridge regression.
        degree: polynomial degree for ridge regression.
        lambda_logistic: value forwarded to the regularized logistic
            regression (see the note in the code).

    Returns:
        (loss_tr, loss_te, best_w): training accuracy, test accuracy and the
        fitted weights.

    Raises:
        ValueError: if `method` is not one of the supported names.
    """
    te_indice = k_indices[k]
    tr_indice = k_indices[~(np.arange(k_indices.shape[0]) == k)].reshape(-1)
    y_te, y_tr = y[te_indice], y[tr_indice]
    x_te, x_tr = x[te_indice], x[tr_indice]

    if method == "ridge_regression_demo":
        # form data with polynomial degree
        x_tr = build_poly(x_tr, degree)
        x_te = build_poly(x_te, degree)
        fitted = ridge_regression(y_tr, x_tr, lambda_ridge)
        # ridge_regression is used in this notebook both as `w = ...` and as
        # `w, _ = ...`; accept either return shape.
        best_w = fitted[0] if isinstance(fitted, tuple) else fitted
    elif method == "least_squares":
        best_w, _ = least_squares(y_tr, x_tr)
    elif method == "mean_squared_error_gd":
        best_w, _ = mean_squared_error_gd(y_tr, x_tr, max_iters=150, gamma=gamma)
    elif method == "mean_squared_error_sgd":
        best_w, _ = mean_squared_error_sgd(y_tr, x_tr, max_iters=150, gamma=gamma)
    elif method == "logistic_regression_gradient_descent":
        best_w, _ = logistic_regression_gradient_descent(y_tr, x_tr)
    elif method == "logistic_regression_regularized_gradient_descent":
        # NOTE(review): lambda_logistic lands in the parameter that
        # logistic_regression_regularized_gradient_descent calls `gamma`
        # (its step size) - confirm this is intended.
        best_w, _ = logistic_regression_regularized_gradient_descent(y_tr, x_tr, lambda_logistic)
    else:
        raise ValueError("unknown method: {}".format(method))

    # Fraction of correct predictions. np.mean gives one number for every
    # method; wrapping the comparison in a list before calling sum() returned
    # a whole array instead of a count.
    loss_tr = np.mean(get_predictions_cv(x_tr, best_w) == y_tr)
    loss_te = np.mean(get_predictions_cv(x_te, best_w) == y_te)

    return loss_tr, loss_te, best_w


In [None]:
def _cv_over_settings(method, y, tx, k_indices, k_fold, gamma, settings):
    """Cross-validate every (lambda_ridge, degree, lambda_logistic) setting and return the mean train / test scores per setting."""
    losses_tr = []
    losses_te = []
    for step, (lambda_ridge, degree, lambda_logistic) in enumerate(settings, start=1):
        fold_tr = []
        fold_te = []
        print(step, "/" , len(settings))
        for k in range(k_fold):
            loss_tr, loss_te, _ = cross_validation(method, y, tx, k_indices, k, gamma, lambda_ridge, degree, lambda_logistic)
            fold_tr.append(loss_tr)
            fold_te.append(loss_te)
            print("step : ", step, "k_fold : ", k+1,"/", k_fold)
        losses_tr.append(np.mean(fold_tr))
        losses_te.append(np.mean(fold_te))
    print("losses train = ", losses_tr, "\n\n", "losses test = ", losses_te)
    return (losses_tr, losses_te)


def k_fold_cross_validation(method, y, tx, max_iters, gamma, lambdas_ridge, lambdas_logistic):
    """Run 10-fold cross-validation (seed 7) for one of the training methods.

    The scores come from `cross_validation`; they are called losses in the
    code and in the printed output.

    Args:
        method: name of the training routine (see `cross_validation`).
        y: labels.
        tx: feature matrix, one row per label.
        max_iters: currently unused; kept for interface compatibility.
        gamma: step size forwarded to `cross_validation`.
        lambdas_ridge: candidate penalties, used by "ridge_regression_demo".
        lambdas_logistic: candidate values, used by
            "logistic_regression_regularized_gradient_descent".

    Returns:
        - for the methods without a parameter grid: (loss_tr, loss_te), the
          mean training and test score over the folds;
        - for ridge regression (polynomial degree 7) and the regularized
          logistic regression: (losses_tr, losses_te), two lists holding the
          mean score for every candidate value.

    Raises:
        ValueError: if `method` is not one of the supported names.
    """
    seed = 7
    k_fold = 10

    if method in ("mean_squared_error_gd", "mean_squared_error_sgd",
                  "least_squares", "logistic_regression_gradient_descent"):
        # split data in k fold
        k_indices = build_k_indices(y, k_fold, seed)
        # scores of training data and test data, one per fold
        losses_tr = []
        losses_te = []
        # these methods use neither a penalty nor a polynomial degree
        lambda_ridge = 0
        lambda_logistic = 0
        degree = 0
        for k in range(k_fold):
            print("k fold = ", k+1 , "/", k_fold)
            loss_tr, loss_te,_ = cross_validation(method,y, tx, k_indices, k, gamma, lambda_ridge, degree, lambda_logistic)
            losses_tr.append(loss_tr)
            losses_te.append(loss_te)

        return (np.mean(losses_tr), np.mean(losses_te))

    if method == "ridge_regression_demo":
        k_indices = build_k_indices(y, k_fold, seed)
        # Degree 7 for every candidate penalty: with degree 0 the expansion
        # would contain nothing but the column of ones. The logistic value is
        # not used by this method, so it is set to 0.
        degree = 7
        settings = [(lambda_ridge, degree, 0) for lambda_ridge in lambdas_ridge]
        #cross_validation_visualization(lambdas, losses_tr, losses_te)
        return _cv_over_settings(method, y, tx, k_indices, k_fold, gamma, settings)

    if method == "logistic_regression_regularized_gradient_descent":
        k_indices = build_k_indices(y, k_fold, seed)
        # neither the ridge penalty nor the degree is used by this method
        settings = [(0, 0, lambda_logistic) for lambda_logistic in lambdas_logistic]
        return _cv_over_settings(method, y, tx, k_indices, k_fold, gamma, settings)

    # "polynomial_regression_ls" and "logistic_regression_newton_method" are
    # not implemented here.
    raise ValueError("unknown method: {}".format(method))
        
    #if method == "logistic_regression_newton_method":
        #####

In [None]:
# Ridge regression run, currently disabled:
# y_tr = np.where(y == "s",1,-1)
# k_fold_cross_validation("ridge_regression_demo", y_tr, x,  150, .5, (4,2,1,.5), (4,2,1,.5))

In [None]:
# Logistic regression on 0/1 labels. Arguments after x: max_iters, gamma,
# lambdas_ridge, lambdas_logistic.
y_tr = np.where(y == "s",1,0)
k_fold_cross_validation("logistic_regression_gradient_descent", y_tr, x,  150, .5, (4,2,1,.5), (4,2,1,.5))

In [None]:
# Regularized logistic regression, reusing the 0/1 labels set in the previous
# cell; the last tuple holds the candidate values that are cross-validated.
k_fold_cross_validation("logistic_regression_regularized_gradient_descent", 
                        y_tr, x,  150, .5, (4,2,1,.5), (0.001, 0.1, 0.5))