# Machine Learning (CS-433)
## Class Project 1
Simon Canales, Jordan Willemin, Claudio Loureiro

In [26]:
import numpy as np

from proj1_helpers import *
from implementations import *
from utils import *

# Plot libraries
import matplotlib
import matplotlib.pyplot as plt

In [27]:
# Load the training and test CSV files. Each load_csv_data call returns three
# values, bound here to *_pred, *_data and ids_* (presumably labels, feature
# matrix and sample ids -- confirm the return order in proj1_helpers).
# The paths are relative to the notebook's working directory.
training_pred, training_data, ids_tr = load_csv_data("../data/train.csv")
testing_pred, testing_data, ids_te = load_csv_data("../data/test.csv")

# Preprocessing

In [31]:
def adjust_wrong_data(data, y, wrongValue, data_set):
    """Replace corrupted (sentinel) entries of every feature with a mean of clean entries.

    ``data`` is modified in place and the same array object is returned.

    Parameters
    ----------
    data : 2-D numpy array
        One row per sample, one column per feature.
    y : array-like of labels aligned with the rows of ``data``
        Only read when ``data_set == "train"``; the values -1 and 1 select
        the two classes. Rows with any other label are never modified.
    wrongValue : scalar
        Sentinel that marks a corrupted entry (matched with ``==``).
    data_set : str
        ``"train"``: a corrupted entry takes the mean of the uncorrupted
        entries of the same feature *and the same class*.
        ``"test"``: a corrupted entry takes the mean of all uncorrupted
        entries of the same feature.
        Any other value leaves ``data`` untouched.

    Returns
    -------
    The ``data`` array that was passed in.

    Notes
    -----
    * If a feature has no uncorrupted entry in the relevant group, its
      corrupted entries are left at ``wrongValue`` instead of being
      overwritten with nan (the mean of an empty selection).
    * ``"train"`` mode chooses the fill value from the label, so the filled
      features carry label information; scores measured on data imputed this
      way should be interpreted with that in mind.
    """

    def _fill_with_clean_mean(column, rows):
        # `column` is a view into `data`, so the masked assignment below
        # writes straight into the caller's array.
        corrupted = column == wrongValue
        to_fill = rows & corrupted
        clean = rows & ~corrupted
        # Skip when there is nothing to fill or nothing to average over.
        if to_fill.any() and clean.any():
            column[to_fill] = column[clean].mean()

    n_rows, n_features = np.shape(data)[0], np.shape(data)[1]

    if data_set == "train":
        labels = np.asarray(y)
        # The two classes cover disjoint rows, so filling one class cannot
        # change the clean mean of the other.
        class_rows = (labels == -1, labels == 1)
        for i in range(n_features):
            for rows in class_rows:
                _fill_with_clean_mean(data[:, i], rows)
    elif data_set == "test":
        every_row = np.ones(n_rows, dtype=bool)
        for i in range(n_features):
            _fill_with_clean_mean(data[:, i], every_row)

    return data


In [29]:
# Sentinel value that marks a corrupted entry in the feature matrix.
wrong_value = -999
# Impute the training features in "train" mode (per-class feature means).
# adjust_wrong_data writes into the array it is given and returns it, so
# training_data is modified in place here.
# NOTE(review): "train" mode picks the fill value from the label, so the
# imputed features depend on training_pred; cross-validation scores computed
# afterwards on this matrix may be optimistic -- confirm this is intended.
# NOTE(review): testing_data is not passed through adjust_wrong_data in this cell.
training_data = adjust_wrong_data(training_data, training_pred, wrong_value, "train")

In [3]:
# Overwrite, in place, every entry equal to the -999 sentinel with 0 in both
# feature matrices.
# NOTE(review): when the notebook is run top to bottom, training_data has
# already been imputed by adjust_wrong_data, so this mainly changes
# testing_data; train and test then use different fill strategies (class
# means vs. 0) -- confirm this is intended.
training_data[training_data==-999] = 0
testing_data[testing_data==-999] = 0

# Least Squares Gradient Descent

In [4]:
# Settings for this run: feature-expansion degree, number of folds, and the
# iteration count / step size handed to least_squares_GD.
poly_degree, k_fold = 2, 4
max_iters, gamma = 40, 0.02

# Build the model matrix with build_poly and start from an all-zero weight
# vector with one entry per column of tx.
tx = build_poly(training_data, poly_degree)
initial_w = np.zeros(tx.shape[1])

# Fold indices for the k_fold splits, then cross-validate least_squares_GD
# with compute_mse as the loss function.
indexes_te, indexes_tr = get_split_indexes(training_data, training_pred, k_fold)
(mse_tr_mean, mse_te_mean,
 rmse_tr_mean, rmse_te_mean,
 accuracy) = cross_validation(
    least_squares_GD, compute_mse,
    tx, training_pred,
    indexes_te, indexes_tr, k_fold,
    True,  # positional flag; its meaning is defined by cross_validation
    (initial_w, max_iters, gamma),
)

# Report the scores measured on the held-out folds.
print('Mse: ', format(mse_te_mean))
print('Rmse: ', format(rmse_te_mean))
print('Accuracy: ', format(accuracy))

Mse:  0.80619754333174
Rmse:  1.2698011996621676
Accuracy:  0.7162200000000001


# Least Squares Stochastic Gradient Descent

In [12]:
# Settings for this run: feature-expansion degree, number of folds, and the
# iteration count / step size handed to least_squares_SGD.
poly_degree, k_fold = 2, 4
max_iters, gamma = 200, 0.001

# Build the model matrix with build_poly and start from an all-zero weight
# vector with one entry per column of tx.
tx = build_poly(training_data, poly_degree)
initial_w = np.zeros(tx.shape[1])

# Fold indices for the k_fold splits, then cross-validate least_squares_SGD
# with compute_mse as the loss function.
indexes_te, indexes_tr = get_split_indexes(training_data, training_pred, k_fold)
(mse_tr_mean, mse_te_mean,
 rmse_tr_mean, rmse_te_mean,
 accuracy) = cross_validation(
    least_squares_SGD, compute_mse,
    tx, training_pred,
    indexes_te, indexes_tr, k_fold,
    True,  # positional flag; its meaning is defined by cross_validation
    (initial_w, max_iters, gamma),
)

# Report the scores measured on the held-out folds.
print('Mse: ', format(mse_te_mean))
print('Rmse: ', format(rmse_te_mean))
print('Accuracy: ', format(accuracy))

Mse:  0.3711797218912981
Rmse:  0.8616028341310144
Accuracy:  0.8706880000000001


# Least Squares

In [16]:
# Settings for this run: feature-expansion degree and number of folds.
poly_degree, k_fold = 2, 4

# Build the model matrix with build_poly.
tx = build_poly(training_data, poly_degree)
# All-zero vector with one entry per column of tx. It is assigned here but
# not passed to cross_validation in this cell.
initial_w = np.zeros(tx.shape[1])

# Fold indices for the k_fold splits, then cross-validate least_squares
# with compute_mse as the loss function (no extra model arguments).
indexes_te, indexes_tr = get_split_indexes(training_data, training_pred, k_fold)
(mse_tr_mean, mse_te_mean,
 rmse_tr_mean, rmse_te_mean,
 accuracy) = cross_validation(
    least_squares, compute_mse,
    tx, training_pred,
    indexes_te, indexes_tr, k_fold,
    True,  # positional flag; its meaning is defined by cross_validation
)

# Report the scores measured on the held-out folds.
print('Mse: ', format(mse_te_mean))
print('Rmse: ', format(rmse_te_mean))
print('Accuracy: ', format(accuracy))

Mse:  0.34374992828993883
Rmse:  0.8291561111032576
Accuracy:  0.9085400000000002


# Ridge regression 

In [19]:
# Settings for this run: feature-expansion degree, number of folds, and the
# regularisation strength handed to ridge_regression.
poly_degree, k_fold = 2, 4
lambda_ = 1e-6

# Build the model matrix with build_poly.
tx = build_poly(training_data, poly_degree)
# All-zero vector with one entry per column of tx. It is assigned here but
# not passed to cross_validation in this cell.
initial_w = np.zeros(tx.shape[1])

# Fold indices for the k_fold splits, then cross-validate ridge_regression
# with compute_mse as the loss function.
indexes_te, indexes_tr = get_split_indexes(training_data, training_pred, k_fold)
(mse_tr_mean, mse_te_mean,
 rmse_tr_mean, rmse_te_mean,
 accuracy) = cross_validation(
    ridge_regression, compute_mse,
    tx, training_pred,
    indexes_te, indexes_tr, k_fold,
    True,  # positional flag; its meaning is defined by cross_validation
    (lambda_,),
)

# Report the scores measured on the held-out folds.
print('Mse: ', format(mse_te_mean))
print('Rmse: ', format(rmse_te_mean))
print('Accuracy: ', format(accuracy))

Mse:  0.34370970218773156
Rmse:  0.8291075951741506
Accuracy:  0.9085280000000002


# Logistic regression

In [20]:
# Settings for this run: feature-expansion degree, number of folds, and the
# iteration count / step size handed to logistic_regression.
poly_degree, k_fold = 3, 4
max_iters, gamma = 100, 0.01

# Build the model matrix with build_poly and start from an all-zero weight
# vector with one entry per column of tx.
tx = build_poly(training_data, poly_degree)
initial_w = np.zeros(tx.shape[1])

# Fold indices for the k_fold splits, then cross-validate logistic_regression
# with loss_logistic_regression as the loss function.
indexes_te, indexes_tr = get_split_indexes(training_data, training_pred, k_fold)
(mse_tr_mean, mse_te_mean,
 rmse_tr_mean, rmse_te_mean,
 accuracy) = cross_validation(
    logistic_regression, loss_logistic_regression,
    tx, training_pred,
    indexes_te, indexes_tr, k_fold,
    True,  # positional flag; its meaning is defined by cross_validation
    (initial_w, max_iters, gamma),
)

# The value reported as "Negative log likelihood" is the second value returned
# by cross_validation (stored in mse_te_mean).
# NOTE(review): the saved output of this cell is negative, which a negative
# log-likelihood over {0, 1} labels cannot be -- check whether the labels are
# mapped from {-1, 1} to {0, 1} before the loss is evaluated.
print('Negative log likelihood: ', format(mse_te_mean))
print('Accuracy: ', format(accuracy))

Negative log likelihood:  -0.8262917604900777
Accuracy:  0.8764520000000002


# Regularized Logistic Regression

In [21]:
# Settings for this run: feature-expansion degree, number of folds, the
# regularisation strength, and the iteration count / step size handed to
# reg_logistic_regression.
poly_degree, k_fold = 3, 4
lambda_ = 0.001
max_iters, gamma = 100, 0.01

# Build the model matrix with build_poly and start from an all-zero weight
# vector with one entry per column of tx.
tx = build_poly(training_data, poly_degree)
initial_w = np.zeros(tx.shape[1])

# Fold indices for the k_fold splits, then cross-validate
# reg_logistic_regression with reg_logistic_regression_loss as the loss.
indexes_te, indexes_tr = get_split_indexes(training_data, training_pred, k_fold)
(mse_tr_mean, mse_te_mean,
 rmse_tr_mean, rmse_te_mean,
 accuracy) = cross_validation(
    reg_logistic_regression, reg_logistic_regression_loss,
    tx, training_pred,
    indexes_te, indexes_tr, k_fold,
    True,  # positional flag; its meaning is defined by cross_validation
    (lambda_, initial_w, max_iters, gamma),
    (lambda_,),  # second argument tuple; presumably forwarded to the loss -- confirm
)

# The value reported as "Negative log likelihood" is the second value returned
# by cross_validation (stored in mse_te_mean).
# NOTE(review): the saved output of this cell is negative, which a negative
# log-likelihood over {0, 1} labels cannot be -- check whether the labels are
# mapped from {-1, 1} to {0, 1} before the loss is evaluated.
print('Negative log likelihood: ', format(mse_te_mean))
print('Accuracy: ', format(accuracy))

Negative log likelihood:  -0.8249747825530349
Accuracy:  0.8764520000000002
