In [15]:
import numpy as np
from implementations import *
from proj1_helpers import *

In [16]:
# Load the three values load_csv_data returns for each CSV file
# (bound here as predictions, data and ids).
training_pred, training_data, ids_tr = load_csv_data("../data/train.csv")
testing_pred, testing_data, ids_te = load_csv_data("../data/test.csv")
# Overwrite every -999 entry with 0, in place.
# NOTE(review): -999 looks like a missing-value sentinel — confirm against the dataset description.
training_data[training_data == -999] = 0
testing_data[testing_data == -999] = 0

In [20]:
def build_poly(x, degree):
    """Expand x into the polynomial basis [1, x, x**2, ..., x**degree].

    The result starts with a column of ones, followed by the element-wise
    powers of x for every exponent from 1 up to and including degree.
    """
    # Bias column first, then one block of columns per exponent.
    blocks = [np.ones((len(x), 1))]
    blocks.extend(np.power(x, exponent) for exponent in range(1, degree + 1))
    return np.column_stack(blocks)

In [10]:
def split_nparts(array, n):
    """Lazily cut array into n consecutive slices of near-equal length.

    When len(array) is not a multiple of n, the leading slices each receive
    one extra element. Returns a generator over the slices.
    """
    base, extra = divmod(len(array), n)
    # Slice boundaries: part i starts at i*base plus one for every earlier, longer part.
    bounds = [i * base + min(i, extra) for i in range(n + 1)]
    return (array[start:stop] for start, stop in zip(bounds, bounds[1:]))

In [11]:
def get_split_indexes(x, y, k_fold, seed=1):
    """Build shuffled k-fold test/train index sets for cross-validation.

    The row indices 0..len(x)-1 are shuffled with a seeded permutation and cut
    into k_fold consecutive test folds of near-equal size (when len(x) is not
    a multiple of k_fold, the leading folds hold one extra index). Each
    training set is the sorted complement of the matching test fold.

    Args:
        x: feature rows; only its length is used.
        y: labels; must have the same length as x.
        k_fold: number of folds (>= 1).
        seed: value passed to np.random.seed before shuffling. This reseeds
            NumPy's global random state.

    Returns:
        (index_split_te, index_split_tr):
        index_split_te is a list of k_fold 1-D integer arrays of test indices.
        index_split_tr holds the matching training indices with integer
        dtype: a 2-D array of shape (k_fold, n_train) when every fold has the
        same size, otherwise a list of k_fold 1-D arrays (folds of unequal
        size cannot be stacked into one rectangular array).

    Raises:
        ValueError: if k_fold < 1 or x and y differ in length.
    """
    n_samples = len(x)
    if k_fold < 1:
        raise ValueError("k_fold must be at least 1")
    if len(y) != n_samples:
        raise ValueError("x and y must contain the same number of samples")

    # Set seed on the global generator.
    np.random.seed(seed)

    # Generate random indices and cut them into k_fold test folds; array_split
    # tolerates a sample count that is not a multiple of k_fold.
    indices = np.random.permutation(n_samples)
    index_split_te = np.array_split(indices, k_fold)

    # Training indices of a fold = every row that is not in its test fold.
    all_rows = np.arange(n_samples)
    index_split_tr = [np.setdiff1d(all_rows, fold_te) for fold_te in index_split_te]

    # Keep the rectangular 2-D layout whenever all training sets have equal size.
    if len({len(fold_tr) for fold_tr in index_split_tr}) == 1:
        index_split_tr = np.array(index_split_tr)

    return index_split_te, index_split_tr

In [12]:
def standardize(x_tr, isTestingData = False, x_te = None):
    """Standardize features column-wise using training-set statistics.

    Each column of x_tr is centred on its mean and divided by its standard
    deviation. When isTestingData is true and x_te is given, x_te is
    transformed with the same training mean and standard deviation, so no
    statistic is computed from the testing data.

    Columns whose standard deviation is 0 (constant columns) are only
    centred; dividing them by 0 would fill them with NaN/inf.

    Args:
        x_tr: training data, samples along axis 0.
        isTestingData: whether x_te should be standardized as well.
        x_te: optional testing data with the same columns as x_tr.

    Returns:
        (std_data, std_data_te): the standardized training data and the
        standardized testing data; the latter is None when no testing data
        was requested or supplied.
    """
    mean_tr = np.mean(x_tr, axis=0)
    centered_data = x_tr - mean_tr

    scale_tr = np.std(centered_data, axis=0)
    # Guard constant columns: they stay at 0 after centring instead of 0/0.
    scale_tr = np.where(scale_tr == 0, 1.0, scale_tr)

    std_data = centered_data / scale_tr

    # Always bind the second return value; leaving it undefined made a call
    # without testing data fail with UnboundLocalError.
    std_data_te = None
    if isTestingData and x_te is not None:
        std_data_te = (x_te - mean_tr) / scale_tr

    return std_data, std_data_te

In [41]:
def cross_validation(optim_method, loss_function, tx, y, indexes_te, indexes_tr,
                    k_fold, isBuildPoly = False, args_optim = (), args_loss = ()):
    """Run k-fold cross-validation and average the per-fold errors and accuracy.

    For every fold, the rows of tx / y selected by indexes_te[fold] and
    indexes_tr[fold] are standardized with the training statistics, a model
    is fitted with optim_method(y_tr, x_tr, *args_optim) -> (w, err_tr), and
    it is scored with loss_function(y_te, x_te, w, *args_loss) and with the
    share of predict_labels(w, x_te) outputs that equal y_te.

    Set isBuildPoly when tx starts with a column of ones: that first column
    is then excluded from standardization, because its standard deviation
    is 0.

    Returns:
        (mean train error, mean test error, sqrt(2 * mean train error),
         sqrt(2 * mean test error), mean accuracy)
    """
    train_errors = []
    test_errors = []
    accuracies = []

    for fold in range(k_fold):
        te_rows = indexes_te[fold]
        x_te = tx[te_rows]
        y_te = y[te_rows]
        tr_rows = (indexes_tr[fold]).astype(int)
        x_tr = tx[tr_rows]
        y_tr = y[tr_rows]

        if isBuildPoly:
            # Column 0 holds only ones (added with the polynomial features);
            # it is skipped so that it is not divided by a std of 0.
            scaled_tr, scaled_te = standardize(x_tr[:, 1:], True, x_te[:, 1:])
            x_tr[:, 1:] = scaled_tr
            x_te[:, 1:] = scaled_te
        else:
            x_tr, x_te = standardize(x_tr, True, x_te)

        w, err_tr = optim_method(y_tr, x_tr, *args_optim)
        err_te = loss_function(y_te, x_te, w, *args_loss)

        # Fraction of test predictions that match the true labels.
        y_predicted = predict_labels(w, x_te)
        matches = np.equal(y_predicted, y_te)
        accuracies.append(np.sum(matches / len(y_te)))

        train_errors.append(err_tr)
        test_errors.append(err_te)

    mean_err_tr = np.mean(train_errors)
    mean_err_te = np.mean(test_errors)

    return (mean_err_tr,
            mean_err_te,
            np.sqrt(2 * mean_err_tr),
            np.sqrt(2 * mean_err_te),
            np.mean(accuracies))

In [42]:
#tx = training_data 
# Degree-2 polynomial expansion: a leading column of ones, then training_data
# raised to the powers 1 and 2.
tx = build_poly(training_data, 2) 
initial_w = np.zeros(tx.shape[1])  # one zero-initialised weight per column of tx
max_iters = 40
gamma = 0.01

k_fold = 4
indexes_te, indexes_tr = get_split_indexes(training_data, training_pred, k_fold)
# The positional True is isBuildPoly: tx starts with build_poly's ones column,
# which cross_validation then leaves out of standardization.
cross_validation(least_squares_GD, compute_mse, tx, training_pred, indexes_te, indexes_tr,
                 k_fold, True, (initial_w, max_iters, gamma))

(0.80131906480355064,
 0.80440364983593959,
 1.265953446856203,
 1.2683876771996325,
 0.67111600000000005)

# CV for least squares GD

In [38]:
# The optimiser arguments must be passed by keyword: the 8th positional
# parameter of cross_validation is isBuildPoly, so a positional tuple would be
# read as that flag and args_optim would stay empty.
cross_validation(least_squares_GD, compute_mse, x, y, index_te, index_tr, k_fold,
                 args_optim=(w, 10, 0.01))

(0.48432933737208833, 0.49975411098122596, 0.5)

# CV for least squares SGD

In [39]:
# The optimiser arguments must be passed by keyword: the 8th positional
# parameter of cross_validation is isBuildPoly, so a positional tuple would be
# read as that flag and args_optim would stay empty.
cross_validation(least_squares_SGD, compute_mse, x, y, index_te, index_tr, k_fold,
                 args_optim=(w, 10, 0.01))

(0.49266641255846966, 0.50013040213696258, 0.5)

# CV for least squares analytical

In [40]:
# No extra optimiser arguments are passed, so args_optim keeps its empty default.
cross_validation(optim_method=least_squares, loss_function=compute_mse, tx=x, y=y,
                 indexes_te=index_te, indexes_tr=index_tr, k_fold=k_fold)

(0.35742270690435518, 0.60197946830835392, 0.5)

# CV for ridge regression

In [41]:
lambda_ = 0.1
# lambda_ must reach the optimiser through args_optim by keyword: the 8th
# positional parameter of cross_validation is isBuildPoly, so a positional
# tuple would be read as that flag and args_optim would stay empty.
cross_validation(ridge_regression, compute_mse, x, y, index_te, index_tr, k_fold,
                 args_optim=(lambda_,))

(0.39641465920534397, 0.49848741293835275, 0.5)

# CV for logistic regression

In [42]:
# The optimiser arguments must be passed by keyword: the 8th positional
# parameter of cross_validation is isBuildPoly, so a positional tuple would be
# read as that flag and args_optim would stay empty.
cross_validation(logistic_regression, loss_logistic_regression, x, y, index_te, index_tr, k_fold,
                 args_optim=(w, 10, 0.01))

(0.67701474914971094, 0.69302239588289749, 0.5)

# CV for regularized logistic regression

In [43]:
lambda_ = 0.1
# Both tuples must be passed by keyword: positionally the first one would bind
# to isBuildPoly and the second to args_optim, leaving args_loss empty.
cross_validation(reg_logistic_regression, reg_logistic_regression_loss, x, y, index_te, index_tr, k_fold,
                 args_optim=(lambda_, w, 10, 0.1), args_loss=(lambda_,))

(0.57261241819429809, 0.69540134788635488, 0.66666666666666663)