In [32]:
import numpy as np
from implementations import *
from proj1_helpers import *

In [33]:
def split_nparts(array, n):
    """Lazily cut `array` into `n` consecutive slices of near-equal length.

    With ``base, extra = divmod(len(array), n)``, the first `extra` slices
    hold ``base + 1`` elements and the remaining ones hold ``base``.

    Args:
        array: any sliceable sequence supporting ``len``.
        n: number of slices to produce.

    Returns:
        A generator yielding the `n` slices in order.
    """
    base, extra = divmod(len(array), n)
    # Cut point number j sits after j full parts, plus one extra element for
    # each of the first `extra` parts that precede it.
    bounds = [j * base + min(j, extra) for j in range(n + 1)]
    return (array[start:stop] for start, stop in zip(bounds[:-1], bounds[1:]))

In [34]:
def get_split_indexes(x, y, k_fold, seed=1):
    """Build shuffled test/train row indices for k-fold cross-validation.

    The row indices ``0 .. len(y) - 1`` are shuffled and cut into `k_fold`
    consecutive test folds whose sizes differ by at most one. The training
    indices of a fold are all remaining rows, in ascending order.

    Args:
        x: feature array; only its number of rows is used.
        y: label array; must have as many entries as `x` has rows.
        k_fold: number of folds (at least 1).
        seed: value passed to ``np.random.seed`` before shuffling.

    Returns:
        A tuple ``(index_split_te, index_split_tr)``.
        index_split_te: list of `k_fold` 1-D integer arrays (test rows).
        index_split_tr: training rows per fold. This is a 2-D integer array
            of shape ``(k_fold, n_train)`` when every fold has the same
            number of training rows, and a list of `k_fold` 1-D integer
            arrays otherwise (the sample count is not a multiple of
            `k_fold`). Either way ``index_split_tr[i]`` is the index array
            of fold ``i``.

    Raises:
        ValueError: if `k_fold` is smaller than 1 or `x` and `y` do not have
            the same number of rows.
    """
    n_samples = len(y)
    if len(x) != n_samples:
        raise ValueError(
            "x and y must have the same number of rows (got %d and %d)"
            % (len(x), n_samples))
    if k_fold < 1:
        raise ValueError("k_fold must be at least 1 (got %r)" % (k_fold,))

    # Seed the global NumPy generator so the shuffle is reproducible.
    np.random.seed(seed)
    indices = np.random.permutation(n_samples)

    # array_split tolerates uneven splits: the first len % k_fold folds get one
    # extra element, which is the same partition split_nparts produces.
    index_split_te = np.array_split(indices, k_fold)

    # Complement of each test fold; setdiff1d returns sorted integer indices.
    all_rows = np.arange(n_samples)
    index_split_tr = [np.setdiff1d(all_rows, fold) for fold in index_split_te]

    # Keep the 2-D array layout whenever the folds are equally sized; ragged
    # folds cannot be stacked, so they stay a list of arrays.
    if len({len(rows) for rows in index_split_tr}) == 1:
        index_split_tr = np.array(index_split_tr)

    return index_split_te, index_split_tr

In [35]:
def standardize(x_tr, isTestingData = False, x_te = None):
    """Standardize features column-wise using training-set statistics.

    Each column of `x_tr` is centered on its mean and divided by its
    standard deviation. When `isTestingData` is true and `x_te` is given,
    `x_te` is transformed with the mean and standard deviation of `x_tr`,
    so no statistic is computed from the test data.

    Args:
        x_tr: training data; statistics are taken along axis 0.
        isTestingData: whether `x_te` should be standardized as well.
        x_te: optional test data with the same columns as `x_tr`.

    Returns:
        A tuple ``(std_data, std_data_te)``. `std_data_te` is None when no
        test data was standardized.
    """
    mean_tr = np.mean(x_tr, axis=0)
    centered_data = x_tr - mean_tr

    scale = np.std(centered_data, axis=0)
    # A constant column has zero standard deviation; divide by 1 instead so
    # it maps to 0 rather than NaN/inf.
    scale = np.where(scale == 0, 1.0, scale)
    std_data = centered_data / scale

    # Always bound, so the return below cannot hit an undefined name.
    std_data_te = None
    if isTestingData and x_te is not None:
        std_data_te = (x_te - mean_tr) / scale

    return std_data, std_data_te

In [36]:
def cross_validation(optim_method, loss_function, tx, y, indexes_te, indexes_tr,
                    k_fold, args_optim = (), args_loss = ()):
    """Run k-fold cross-validation and average the per-fold results.

    For every fold the rows are selected from `tx` and `y`, both parts are
    passed through `standardize`, the model is fitted with
    ``optim_method(y_tr, x_tr, *args_optim)`` and scored on the held-out rows
    with ``loss_function(y_te, x_te, w, *args_loss)``. Accuracy is the share
    of held-out labels matched by ``predict_labels(w, x_te)``.

    Args:
        optim_method: callable returning ``(w, train_loss)``.
        loss_function: callable returning the loss on the held-out rows.
        tx: feature array indexed by row.
        y: label array indexed by row.
        indexes_te: per-fold test row indices.
        indexes_tr: per-fold training row indices (cast to int before use).
        k_fold: number of folds to iterate over.
        args_optim: extra positional arguments for `optim_method`.
        args_loss: extra positional arguments for `loss_function`.

    Returns:
        Tuple ``(mean train loss, mean test loss, mean accuracy)``.
    """
    train_losses = []
    test_losses = []
    fold_accuracies = []

    for fold in range(k_fold):
        test_rows = indexes_te[fold]
        x_te, y_te = tx[test_rows], y[test_rows]

        # Training indices may be stored as floats, so cast before indexing.
        train_rows = (indexes_tr[fold]).astype(int)
        x_tr, y_tr = tx[train_rows], y[train_rows]

        x_tr, x_te = standardize(x_tr, True, x_te)

        w, err_tr = optim_method(y_tr, x_tr, *args_optim)
        err_te = loss_function(y_te, x_te, w, *args_loss)

        hits = np.equal(predict_labels(w, x_te), y_te)
        fold_accuracies.append(np.sum(hits/len(y_te)))

        train_losses.append(err_tr)
        test_losses.append(err_te)

    return np.mean(train_losses), np.mean(test_losses), np.mean(fold_accuracies)

In [37]:
# Toy dataset: six samples with two features each, labels in {-1, 1}.
x = np.array([[0.1,0.2],[0.1,0.33],[0.11,0.4],[0.1,0.9],[0.2,0.8],[0.1,0.1]])
y = np.array([1,1,-1,-1,1,-1])
# Zero vector with one entry per feature, passed to the iterative optimizers below.
w = np.zeros(x.shape[1])

k_fold = 3

# Use k_fold rather than repeating the literal, so the folds always match the
# k_fold value handed to cross_validation in the cells below.
index_te, index_tr = get_split_indexes(x, y, k_fold)
print(index_te, index_tr)

[array([2, 1]), array([4, 0]), array([3, 5])] [[ 0.  3.  4.  5.]
 [ 1.  2.  3.  5.]
 [ 0.  1.  2.  4.]]


# CV for least squares GD

In [38]:
# args_optim is forwarded to least_squares_GD after (y_tr, x_tr); the values
# are presumably (initial weights, iterations, step size) - confirm against
# the signature in implementations.
cross_validation(
    least_squares_GD,
    compute_mse,
    x,
    y,
    index_te,
    index_tr,
    k_fold,
    args_optim=(w, 10, 0.01),
)

(0.48432933737208833, 0.49975411098122596, 0.5)

# CV for least squares SGD

In [39]:
# args_optim is forwarded to least_squares_SGD after (y_tr, x_tr); the values
# are presumably (initial weights, iterations, step size) - confirm against
# the signature in implementations.
cross_validation(
    least_squares_SGD,
    compute_mse,
    x,
    y,
    index_te,
    index_tr,
    k_fold,
    args_optim=(w, 10, 0.01),
)

(0.49266641255846966, 0.50013040213696258, 0.5)

# CV for least squares analytical

In [40]:
# No extra optimizer arguments: least_squares is called with (y_tr, x_tr) only.
cross_validation(
    optim_method=least_squares,
    loss_function=compute_mse,
    tx=x,
    y=y,
    indexes_te=index_te,
    indexes_tr=index_tr,
    k_fold=k_fold,
)

(0.35742270690435518, 0.60197946830835392, 0.5)

# CV for ridge regression

In [41]:
lambda_ = 0.1
# lambda_ is forwarded to ridge_regression after (y_tr, x_tr).
cross_validation(
    ridge_regression,
    compute_mse,
    x,
    y,
    index_te,
    index_tr,
    k_fold,
    args_optim=(lambda_,),
)

(0.39641465920534397, 0.49848741293835275, 0.5)

# CV for logistic regression

In [42]:
# args_optim is forwarded to logistic_regression after (y_tr, x_tr).
# NOTE(review): y holds labels in {-1, 1}; confirm that logistic_regression and
# loss_logistic_regression expect that encoding rather than {0, 1}.
cross_validation(
    logistic_regression,
    loss_logistic_regression,
    x,
    y,
    index_te,
    index_tr,
    k_fold,
    args_optim=(w, 10, 0.01),
)

(0.67701474914971094, 0.69302239588289749, 0.5)

# CV for regularized logistic regression

In [43]:
lambda_ = 0.1
# lambda_ goes to both the optimizer (first extra argument) and the loss.
# NOTE(review): y holds labels in {-1, 1}; confirm that reg_logistic_regression
# expects that encoding rather than {0, 1}.
cross_validation(
    reg_logistic_regression,
    reg_logistic_regression_loss,
    x,
    y,
    index_te,
    index_tr,
    k_fold,
    args_optim=(lambda_, w, 10, 0.1),
    args_loss=(lambda_,),
)

(0.57261241819429809, 0.69540134788635488, 0.66666666666666663)