In [1]:
import numpy as np
#from proj1_helpers import *
import matplotlib.pyplot as plt
from implementations import *
from helpers import *
from data_cleaning import *
from data_processing import *
from helper_functions import *


In [2]:
TRAIN_PATH = '../dataset/train.csv'
TEST_PATH = '../dataset/test.csv'

In [3]:
y_train, x_train, ids = load_csv_data(TRAIN_PATH)
y_test , x_test, ids_test = load_csv_data(TEST_PATH)

### DIFFERENT MODEL TRIALS USING CROSS VALIDATION

In [4]:
import cross_validation as cv

#### 1. Stochastic gradient descent

In [5]:
y = y_train.copy()
tX_train, masks_train = cleaning(x_train)
tX_test, masks_test = cleaning(x_test)

# Stochastic gradient descent
max_iters = 50
gamma = 1e-10
degree = 7
k_fold = 10
seed = 10

accuracies = []
weights_SGD = []
total_loss = 0

for i in range(len(masks_train)):
    train_data = tX_train[masks_train[i]]
    train_y = y[masks_train[i]]
    test_data = tX_test[masks_test[i]]
        
    # Data processing
    train_data = process_data(train_data)
    test_data = process_data(test_data)

    # Build poly
    train_phi = build_polynomial(train_data, degree)
    test_phi = build_polynomial(test_data, degree)
        
    w_init = np.zeros(train_phi.shape[1])
    w, _ = mean_squared_error_sgd(train_y, train_phi, w_init, max_iters, gamma)

    k_indices = cv.build_k_indices(train_y, k_fold, seed)

    for k in range(k_fold):
        accuracy = cv.cross_validation(train_y, train_phi, k_indices, k, mean_squared_error_sgd, initial_w = w_init, max_iters = max_iters,
        gamma = gamma)
        accuracies.append(accuracy)

print('Accuracy for SGD:', np.mean(accuracies))
print('std: ', np.std(accuracies))


  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,
  X_clean[:,i] = (X_clean[:,i] - np.mean(X_clean[:,i])) / np.std(X_clean[:,i])


Accuracy for SGD: 0.574473982657946
std:  0.05152257965744931


#### 2.  Ridge Regression :

In [6]:
y = y_train.copy()
tX_train, masks_train = cleaning(x_train)
tX_test, masks_test = cleaning(x_test)

# Parameters
degree = 7
lambda_ = 1e-5
accuracies = []
seed = 10
k_fold = 10

for i in range(len(masks_train)):
    
    train_data = tX_train[masks_train[i]]
    train_y = y[masks_train[i]]
    test_data = tX_test[masks_test[i]]

    train_data = process_data(train_data)
    test_data = process_data(test_data)

    # Build poly
    train_phi = build_polynomial(train_data, degree)
    test_phi = build_polynomial(test_data, degree)
    
    # Obtain weight
    weight, _ = ridge_regression(train_y, train_phi, lambda_)

    # Compute accuracy using cross validation
    k_indices = cv.build_k_indices(train_y, k_fold, seed)

    for k in range(k_fold):
        accuracy = cv.cross_validation(train_y, train_phi, k_indices, k, ridge_regression, lambda_=lambda_)
        accuracies.append(accuracy)

print('Accuracy for Ridge regression is:', np.mean(accuracies))
print('std: ', np.std(accuracies))

Accuracy for Ridge regression is: 0.8219350855490302
std:  0.015870927048251337


#### 3. GRADIENT DESCENT

In [7]:
y = y_train.copy()
tX_train, masks_train = cleaning(x_train)
tX_test, masks_test = cleaning(x_test)

# Gradient descent parameters
max_iters = 50
gamma = 1e-10
degree = 4
seed = 10
k_fold = 10

accuracies = []

for i in range(len(masks_train)):
    train_data = tX_train[masks_train[i]]
    train_y = y[masks_train[i]]
    test_data = tX_test[masks_test[i]]
        
    # Data processing
    train_data = process_data(train_data)
    test_data = process_data(test_data)

    # Build poly
    train_phi = build_polynomial(train_data, degree)
    test_phi = build_polynomial(test_data, degree)
        
    w_init = np.zeros(train_phi.shape[1])
    
    # Compute accuracy using cross validation
    k_indices = cv.build_k_indices(train_y, k_fold, seed)

    for k in range(k_fold):
        accuracy = cv.cross_validation(train_y, train_phi, k_indices, k, mean_squared_error_gd, initial_w = w_init, max_iters = max_iters, 
        gamma = gamma)
        accuracies.append(accuracy)

print('Accuracy for Gradient descent is:', np.mean(accuracies))
print('std: ', np.std(accuracies))

Accuracy for Gradient descent is: 0.6904112061048273
std:  0.03874337772585025


#### 4.  Logistic regression

In [8]:
y = y_train.copy()
tX_train, masks_train = cleaning(x_train)
tX_test, masks_test = cleaning(x_test)

# Parameters
max_iters = 50
gamma = 1e-10
degree = 7
k_fold = 10
seed = 10

for i in range(len(masks_train)):
    
    train_data = tX_train[masks_train[i]]
    train_y = y[masks_train[i]]
    test_data = tX_test[masks_test[i]]

    train_data = process_data(train_data)
    test_data = process_data(test_data)

    # Build poly
    train_phi = build_polynomial(train_data, degree)
    test_phi = build_polynomial(test_data, degree)

    #Logistic regression:
    w_init = np.zeros(train_phi.shape[1])

    # Compute accuracy using cross validation
    k_indices = cv.build_k_indices(train_y, k_fold, seed)

    for k in range(k_fold):
        accuracy = cv.cross_validation(train_y, train_phi, k_indices, k, logistic_regression, initial_w = w_init, max_iters = max_iters,
        gamma = gamma)
        accuracies.append(accuracy)
    
print('Accuracy for Logistic regression is:', np.mean(accuracies))
print('std: ', np.std(accuracies))

  sigmoid = 1.0 / (1 + np.exp(-t))
  loss = y.T.dot(np.log(logistic_function)) + (1 - y).T.dot(


Accuracy for Logistic regression is: 0.6167690151166128
std:  0.08127706941645539
