In [1]:
import csv
import numpy as np
import matplotlib.pyplot as plt

%reload_ext autoreload
%autoreload 2
from implementations import *

In [2]:
def split_data(x, y, ratio, seed):
    """Randomly partition a dataset into a training part and a validation part.

    Args:
        x:     samples, indexable by an integer index array along the first axis
        y:     targets aligned with x (same number of rows)
        ratio: fraction of the rows that goes to the training part
        seed:  value used to seed NumPy's global random generator

    Returns:
        x_tr, x_val, y_tr, y_val: the training/validation samples and targets
    """
    np.random.seed(seed)
    n_rows = len(x)
    shuffled = np.random.permutation(n_rows)
    # The first floor(ratio * n_rows) shuffled rows are used for training,
    # everything after that cut is used for validation.
    cut = int(np.floor(ratio * n_rows))
    train_rows, val_rows = shuffled[:cut], shuffled[cut:]
    return x[train_rows], x[val_rows], y[train_rows], y[val_rows]

In [3]:
def _accuracy(Y_pred, Y_true):
    # This function calculates prediction accuracy
    acc = 1 - np.mean(np.abs(Y_pred - Y_true))
    return acc

In [4]:
def build_k_indices(num, k, seed):
    """Build the shuffled sample indices of every fold for k-fold cross validation.

    Args:
        num:  total number of samples to distribute over the folds
        k:    K in K-fold, i.e. the number of folds
        seed: the random seed (seeds NumPy's global random generator)

    Returns:
        A 2D array of shape=(k, int(num / k)); row i holds the sample indices
        of fold i. When num is not a multiple of k, the trailing indices of
        the permutation are not assigned to any fold.
    """
    fold_size = int(num / k)
    np.random.seed(seed)
    shuffled = np.random.permutation(num)
    folds = []
    for fold in range(k):
        start = fold * fold_size
        folds.append(shuffled[start: start + fold_size])
    return np.array(folds)

In [5]:
def cross_validation(y, x, k_indices, k):
    """Split the train dataset into train and validation parts for fold k.

    The rows listed in k_indices[k] form the validation part; the rows listed
    in all other folds form the training part. Rows are selected with a single
    index operation per output instead of appending element by element, which
    copied the growing array on every iteration.

    Args:
        y:         targets, first axis indexes the samples
        x:         samples, first axis indexes the samples
        k_indices: 2D array of sample indices, one row per fold
        k:         index of the fold used for validation

    Returns:
        x_train: samples of the training folds
        x_test:  samples of the validation fold
        y_train: training targets as a column vector of shape (n_train, 1)
        y_test:  validation targets as a flat vector of shape (n_test,)

    Note:
        y_train and y_test intentionally keep their historical shapes: y_train
        is a column while y_test is flat. Flatten or reshape before comparing
        y_test element-wise with column-shaped predictions.
    """
    y = np.asarray(y)
    x = np.asarray(x)
    k_indices = np.asarray(k_indices)

    test_rows = k_indices[k]
    train_rows = np.delete(k_indices, k, axis=0).ravel()

    # Concatenating onto an empty float array keeps the dtype promotion of the
    # previous np.append-based implementation (e.g. integer labels -> float64).
    empty = np.array([])
    y_test = np.concatenate((empty, y[test_rows].ravel()))
    y_train = np.concatenate((empty, y[train_rows].ravel()))

    return x[train_rows], x[test_rows], y_train.reshape(-1, 1), y_test

In [6]:
from helpers import load_csv_data
from helpers import create_csv_submission

# load dataset
y_train, tx_train, ids_train = load_csv_data("train.csv")
y_test, tx_test, ids_test = load_csv_data("test.csv")

# Logistic regression works with labels in {0, 1}, so every -1 training label
# is remapped to 0 in place (labels equal to 1 are left untouched).
y_train[y_train == -1] = 0

# tx_train = np.c_[np.ones((tx_train.shape[0], 1)), tx_train]
# tx_test = np.c_[np.ones((tx_test.shape[0], 1)), tx_test]

In [7]:
# extract usable data                TBD
# y_train, tx_train, ids_train
# y_test, tx_test, ids_test
# x_tr, x_val, y_tr, y_val

# data preprocessing -- mean subtraction and normalization                TBD
# y_train, tx_train, ids_train
# y_test, tx_test, ids_test

In [8]:
# y_train = y_train[0:1000].reshape(1000,1)
# tx_train = tx_train[0:1000]
# tx_test = tx_test[0:100]
# ids_test = ids_test[0:100]
# Store the training labels as a column vector of shape (num, 1), where num is
# the number of training samples.
num = len(y_train)
y_train = y_train.reshape(num, 1)

In [9]:
# Cross-validation setup: the seed fixes the shuffling, k_fold is the number of folds.
seed = 123
k_fold = 5
# One row of shuffled sample indices per fold.
k_indices = build_k_indices(num=num, k=k_fold, seed=seed)
# x_tr, x_val, y_tr, y_val = cross_validation(y_train, tx_train, k_indices, k=3)
# x_tr.shape, x_val.shape, y_tr.shape, y_val.shape

In [10]:
# x_tr, x_val, y_tr, y_val = split_data(tx_train, y_train, 0.75, 123)
# y_tr = y_tr[0:5000].reshape(5000,1)
# x_tr = x_tr[0:5000]
# y_val = y_val[0:500].reshape(500,1)
# x_val = x_val[0:500]
# tx_test = tx_test[0:500]
# ids_test = ids_test[0:500]

In [11]:
# Hyper-parameters shared by the training cells below.
# lambda_   -> passed to reg_logistic_regression and ridge_regression
# max_iters -> passed to the iterative (gradient based) training functions
# gamma     -> passed to the iterative training functions
#              (presumably the step size -- confirm in implementations.py)
max_iters = 100
gamma = 1e-9
lambda_ = 0.001

## Gradient Descent using logistic regression

In [None]:
# k-fold cross validation of logistic_regression: each fold is used once for
# validation while the model is trained on the remaining folds.
losses = []
accs = []
for k in range(k_fold):
    x_tr, x_val, y_tr, y_val = cross_validation(y_train, tx_train, k_indices, k)
    # training starts from an all-zero weight column, one weight per feature
    initial_w = np.zeros((x_tr.shape[1], 1))
    w, loss = logistic_regression(y_tr, x_tr, initial_w, max_iters, gamma)
    y_pred = predict_labels(w, x_val)
    # the accuracy is computed once per fold and reused for the report below
    acc = _accuracy(y_pred, y_val)
    accs.append(acc)
    losses.append(loss)
    print("test prediction acc of fold " + str(k) + " is " + str(acc))
print("Average test prediction accuracy over " + str(k_fold) + " folds is " + str(np.mean(accs)))
print("Average loss over " + str(k_fold) + " folds is " + str(np.mean(losses)))

# create_csv_submission(ids_test, predictions, "submission_logistic_regression.csv")

Average loss with Gradient Descent(GD) using Logistic Regression:  0.6850475886633434


## Gradient Descent using regularized logistic regression

In [None]:
# k-fold cross validation of reg_logistic_regression (penalty weight lambda_):
# each fold is used once for validation while the model is trained on the
# remaining folds.
losses = []
accs = []
for k in range(k_fold):
    x_tr, x_val, y_tr, y_val = cross_validation(y_train, tx_train, k_indices, k)
    # training starts from an all-zero weight column, one weight per feature
    initial_w = np.zeros((x_tr.shape[1], 1))
    w, loss = reg_logistic_regression(y_tr, x_tr, lambda_, initial_w, max_iters, gamma)
    y_pred = predict_labels(w, x_val)
    # the accuracy is computed once per fold and reused for the report below
    acc = _accuracy(y_pred, y_val)
    accs.append(acc)
    losses.append(loss)
    print("test prediction acc of fold " + str(k) + " is " + str(acc))
print("Average test prediction accuracy over " + str(k_fold) + " folds is " + str(np.mean(accs)))
print("Average loss over " + str(k_fold) + " folds is " + str(np.mean(losses)))

# create_csv_submission(ids_test, predictions, "submission_reg_logistic_regression.csv")

Average loss with Gradient Descent(GD) using Regularized Logistic Regression:  0.6850475886638354


In [None]:
## Gradient Descent (GD) algorithm using mean squared error

In [24]:
# k-fold cross validation of mean_squared_error_gd: each fold is used once for
# validation while the model is trained on the remaining folds.
losses = []
accs = []
for k in range(k_fold):
    x_tr, x_val, y_tr, y_val = cross_validation(y_train, tx_train, k_indices, k)
    # training starts from an all-zero weight column, one weight per feature
    initial_w = np.zeros((x_tr.shape[1], 1))
    w, loss = mean_squared_error_gd(y_tr, x_tr, initial_w, max_iters, gamma)
    y_pred = predict_labels(w, x_val)
    # the accuracy is computed once per fold and reused for the report below
    acc = _accuracy(y_pred, y_val)
    accs.append(acc)
    losses.append(loss)
    print("test prediction acc of fold " + str(k) + " is " + str(acc))
print("Average test prediction accuracy over " + str(k_fold) + " folds is " + str(np.mean(accs)))
print("Average loss over " + str(k_fold) + " folds is " + str(np.mean(losses)))

# create_csv_submission(ids_test, predictions, "submission_mean_squared_error_gd.csv")


Average loss with Gradient Descent(GD):  0.15795799708846436
test prediction acc of fold 0 is 0.635
Average loss with Gradient Descent(GD):  0.16126874247942374
test prediction acc of fold 1 is 0.6599999999999999
Average loss with Gradient Descent(GD):  0.15550824691841403
test prediction acc of fold 2 is 0.62
Average loss with Gradient Descent(GD):  0.16385478688187705
test prediction acc of fold 3 is 0.685
Average loss with Gradient Descent(GD):  0.16095864919180175
test prediction acc of fold 4 is 0.655
Average test prediction accuracy over 5 folds is 0.651
Average loss over 5 folds is 0.14974622349778105


In [None]:
## Stochastic Gradient Descent (SGD) algorithm using mean squared error

In [25]:
# k-fold cross validation of mean_squared_error_sgd: each fold is used once for
# validation while the model is trained on the remaining folds.
losses = []
accs = []
for k in range(k_fold):
    x_tr, x_val, y_tr, y_val = cross_validation(y_train, tx_train, k_indices, k)
    # training starts from an all-zero weight column, one weight per feature
    initial_w = np.zeros((x_tr.shape[1], 1))
    w, loss = mean_squared_error_sgd(y_tr, x_tr, initial_w, max_iters, gamma)
    y_pred = predict_labels(w, x_val)
    # the accuracy is computed once per fold and reused for the report below
    acc = _accuracy(y_pred, y_val)
    accs.append(acc)
    losses.append(loss)
    print("test prediction acc of fold " + str(k) + " is " + str(acc))
print("Average test prediction accuracy over " + str(k_fold) + " folds is " + str(np.mean(accs)))
print("Average loss over " + str(k_fold) + " folds is " + str(np.mean(losses)))


# create_csv_submission(ids_test, predictions, "submission_mean_squared_error_sgd.csv")


Average loss with Stochastic Gradient Descent(SGD):  0.17249324095030835
test prediction acc of fold 0 is 0.635
Average loss with Stochastic Gradient Descent(SGD):  0.1708745343707575
test prediction acc of fold 1 is 0.6599999999999999
Average loss with Stochastic Gradient Descent(SGD):  0.16063356367274031
test prediction acc of fold 2 is 0.62
Average loss with Stochastic Gradient Descent(SGD):  0.15484728776554804
test prediction acc of fold 3 is 0.685
Average loss with Stochastic Gradient Descent(SGD):  0.1683924914865847
test prediction acc of fold 4 is 0.655
Average test prediction accuracy over 5 folds is 0.651
Average loss over 5 folds is 0.16025889510562785


In [None]:
## Least Squares solution

In [None]:
# k-fold cross validation of least_squares: each fold is used once for
# validation while the model is fitted on the remaining folds.
# least_squares takes only the targets and the samples, so no initial weight
# vector is built here.
losses = []
accs = []
for k in range(k_fold):
    x_tr, x_val, y_tr, y_val = cross_validation(y_train, tx_train, k_indices, k)
    w, loss = least_squares(y_tr, x_tr)
    y_pred = predict_labels(w, x_val)
    # the accuracy is computed once per fold and reused for the report below
    acc = _accuracy(y_pred, y_val)
    accs.append(acc)
    losses.append(loss)
    print("test prediction acc of fold " + str(k) + " is " + str(acc))
print("Average test prediction accuracy over " + str(k_fold) + " folds is " + str(np.mean(accs)))
print("Average loss over " + str(k_fold) + " folds is " + str(np.mean(losses)))


# create_csv_submission(ids_test, predictions, "submission_least_squares.csv")


NameError: name 'k_fold' is not defined

In [None]:
## Ridge Regression

In [27]:
# k-fold cross validation of ridge_regression (penalty weight lambda_): each
# fold is used once for validation while the model is fitted on the remaining
# folds.
# ridge_regression takes only the targets, the samples and lambda_, so no
# initial weight vector is built here.
losses = []
accs = []
for k in range(k_fold):
    x_tr, x_val, y_tr, y_val = cross_validation(y_train, tx_train, k_indices, k)
    w, loss = ridge_regression(y_tr, x_tr, lambda_)
    y_pred = predict_labels(w, x_val)
    acc = _accuracy(y_pred, y_val)
    accs.append(acc)
    losses.append(loss)
    print("test prediction acc of fold " + str(k) + " is " + str(acc))
print("Average test prediction accuracy over " + str(k_fold) + " folds is " + str(np.mean(accs)))
print("Average loss over " + str(k_fold) + " folds is " + str(np.mean(losses)))


# create_csv_submission(ids_test, predictions, "submission_ridge_regression.csv")


Loss with Ridge Regression:  0.09262448403953812
test prediction acc of fold 0 is 0.5783
Loss with Ridge Regression:  0.0917095006290168
test prediction acc of fold 1 is 0.5880000000000001
Loss with Ridge Regression:  0.08798840439255518
test prediction acc of fold 2 is 0.5816
Loss with Ridge Regression:  0.09172792878266332
test prediction acc of fold 3 is 0.611
Loss with Ridge Regression:  0.08961510249957844
test prediction acc of fold 4 is 0.59455
Average test prediction accuracy over 5 folds is 0.59069
Average loss over 5 folds is 0.09073308406867037
