In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

from implementations import *
from evaluation import *
import datetime

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
DATA_TRAIN_PATH = 'data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

# Standardize data
tX = standardize(tX)

In [3]:
tX.shape

(250000, 30)

## Properties of dataset

In [73]:
from collections import Counter
Counter(y)

Counter({1.0: 85667, -1.0: 164333})

## Train/Test split

In [65]:
x_train, x_test, y_train, y_test = split_data(tX, y, 0.8)
print(x_train.shape)
print(x_test.shape)
evaluation_data = []

(200000, 30)
(50000, 30)


## 1. Linear regression using gradient descent

In [90]:
# Parameters
#max_iters = [10, 100, 500, 750, 1000, 1500]
#gammas = [0.01, 0.049, 0.05, 0.053, 0.07, 0.1]
max_iters = [100]
gammas = [0.1]


for max_iter in max_iters:
    for gamma in gammas:
        # execute training and loss calculation
        start_time = datetime.datetime.now()
        w_initial = np.array(np.zeros(30))
        w, loss_train = least_squares_GD(y_train, x_train, w_initial, max_iter, gamma)
        loss_test = compute_loss(y_test, x_test, w)
        end_time = datetime.datetime.now()
        
        # create dict with results for run
        evaluation_result = {}
        evaluation_result['method'] = 'least_squares_GD'
        evaluation_result['parameters'] = {}
        evaluation_result['parameters']['max_iter'] = max_iter
        evaluation_result['parameters']['gamma'] = gamma
        evaluation_result['training_and_loss_calc_time_in_sec'] = str((end_time - start_time).total_seconds())
        evaluation_result['train_loss'] = str(loss_train)
        evaluation_result['test_loss'] = str(loss_test)
        evaluation_result['accuracy'] = str(calculate_accuracy(w, x_test, y_test))
        f1_score, precision, recall, tp, fp, tn, fn = calculate_f1_score(w, x_test, y_test)
        evaluation_result['f1_score'] = str(f1_score)
        evaluation_result['precision'] = str(precision)
        evaluation_result['recall'] = str(recall)
        evaluation_result['confusion_matrix'] = {}
        evaluation_result['confusion_matrix']['tp'] = tp
        evaluation_result['confusion_matrix']['fp'] = fp
        evaluation_result['confusion_matrix']['tn'] = tn
        evaluation_result['confusion_matrix']['fn'] = fn
        
        evaluation_data.append(evaluation_result)
        print(evaluation_result)
        
        print()
        print('Confusion Matrix')
        print(str(evaluation_result['confusion_matrix']['tp']) + ' ' + str(evaluation_result['confusion_matrix']['fp']))
        print(str(evaluation_result['confusion_matrix']['tn']) + ' ' + str(evaluation_result['confusion_matrix']['fn']))

{'method': 'least_squares_GD', 'parameters': {'max_iter': 100, 'gamma': 0.1}, 'training_and_loss_calc_time_in_sec': '0.787098', 'train_loss': '0.3925930461858477', 'test_loss': '0.3932601383256776', 'accuracy': '0.71208', 'f1_score': '0.6624302396473292', 'precision': '0.5557522820270696', 'recall': '0.8197910621009866', 'confusion_matrix': {'tp': 14125, 'fp': 11291, 'tn': 21479, 'fn': 3105}}

Confusion Matrix
14125 11291
21479 3105


## 2. Linear regression using stochastic gradient descent

In [91]:
# Parameters
#max_iters = [10, 100, 500, 750, 1000, 1500]
#gammas = [0.01, 0.049, 0.05, 0.053, 0.07, 0.1]
max_iters = [100]
gammas = [0.1]

for max_iter in max_iters:
    for gamma in gammas:
        # execute training and loss calculation
        start_time = datetime.datetime.now()
        w_initial = np.array(np.zeros(30))
        w, loss_train = least_squares_SGD(y_train, x_train, w_initial, max_iter, gamma)
        loss_test = compute_loss(y_test, x_test, w)
        end_time = datetime.datetime.now()
        
        # create dict with results for run
        evaluation_result = {}
        evaluation_result['method'] = 'least_squares_SGD'
        evaluation_result['parameters'] = {}
        evaluation_result['parameters']['max_iter'] = max_iter
        evaluation_result['parameters']['gamma'] = gamma
        evaluation_result['training_and_loss_calc_time_in_sec'] = str((end_time - start_time).total_seconds())
        evaluation_result['train_loss'] = str(loss_train)
        evaluation_result['test_loss'] = str(loss_test)
        evaluation_result['accuracy'] = str(calculate_accuracy(w, x_test, y_test))
        f1_score, precision, recall, tp, fp, tn, fn = calculate_f1_score(w, x_test, y_test)
        evaluation_result['f1_score'] = str(f1_score)
        evaluation_result['precision'] = str(precision)
        evaluation_result['recall'] = str(recall)
        evaluation_result['confusion_matrix'] = {}
        evaluation_result['confusion_matrix']['tp'] = tp
        evaluation_result['confusion_matrix']['fp'] = fp
        evaluation_result['confusion_matrix']['tn'] = tn
        evaluation_result['confusion_matrix']['fn'] = fn
        
        evaluation_data.append(evaluation_result)
        print(evaluation_result)

{'method': 'least_squares_SGD', 'parameters': {'max_iter': 100, 'gamma': 0.1}, 'training_and_loss_calc_time_in_sec': '4.211122', 'train_loss': '77.66278943722936', 'test_loss': '77.87536604619282', 'accuracy': '0.50516', 'f1_score': '0.3945874522854067', 'precision': '0.341103308232507', 'recall': '0.46796285548461986', 'confusion_matrix': {'tp': 8063, 'fp': 15575, 'tn': 17195, 'fn': 9167}}


## 3. Least squares regression using normal equations

In [92]:
# execute training and loss calculation
start_time = datetime.datetime.now()
w_initial = np.array(np.zeros(30))
w, loss_train = least_squares(y_train, x_train)
loss_test = compute_loss(y_test, x_test, w)
end_time = datetime.datetime.now()

# create dict with results for run
evaluation_result = {}
evaluation_result['method'] = 'least_squares'
evaluation_result['parameters'] = {}
evaluation_result['training_and_loss_calc_time_in_sec'] = str((end_time - start_time).total_seconds())
evaluation_result['train_loss'] = str(loss_train)
evaluation_result['test_loss'] = str(loss_test)
evaluation_result['accuracy'] = str(calculate_accuracy(w, x_test, y_test))
f1_score, precision, recall, tp, fp, tn, fn = calculate_f1_score(w, x_test, y_test)
evaluation_result['f1_score'] = str(f1_score)
evaluation_result['precision'] = str(precision)
evaluation_result['recall'] = str(recall)
evaluation_result['confusion_matrix'] = {}
evaluation_result['confusion_matrix']['tp'] = tp
evaluation_result['confusion_matrix']['fp'] = fp
evaluation_result['confusion_matrix']['tn'] = tn
evaluation_result['confusion_matrix']['fn'] = fn

evaluation_data.append(evaluation_result)
print(evaluation_result)

{'method': 'least_squares', 'parameters': {}, 'training_and_loss_calc_time_in_sec': '0.024671', 'train_loss': '0.38885794352313174', 'test_loss': '0.38936695348103784', 'accuracy': '0.71676', 'f1_score': '0.6650425733207189', 'precision': '0.5612375249500998', 'recall': '0.8159605339524086', 'confusion_matrix': {'tp': 14059, 'fp': 10991, 'tn': 21779, 'fn': 3171}}


## 4. Ridge regression using normal equations

In [93]:
# Parameters
#lambda = [10, 100, 500, 750, 1000, 1500]
lambdas = [100]

for alambda in lambdas:
    # execute training and loss calculation
    start_time = datetime.datetime.now()
    w_initial = np.array(np.zeros(30))
    w, loss_train = ridge_regression(y_train, x_train, alambda)
    loss_test = compute_loss(y_test, x_test, w)
    end_time = datetime.datetime.now()

    # create dict with results for run
    evaluation_result = {}
    evaluation_result['method'] = 'ridge_regression'
    evaluation_result['parameters'] = {}
    evaluation_result['parameters']['lambda'] = alambda
    evaluation_result['training_and_loss_calc_time_in_sec'] = str((end_time - start_time).total_seconds())
    evaluation_result['train_loss'] = str(loss_train)
    evaluation_result['test_loss'] = str(loss_test)
    evaluation_result['accuracy'] = str(calculate_accuracy(w, x_test, y_test))
    f1_score, precision, recall, tp, fp, tn, fn = calculate_f1_score(w, x_test, y_test)
    evaluation_result['f1_score'] = str(f1_score)
    evaluation_result['precision'] = str(precision)
    evaluation_result['recall'] = str(recall)
    evaluation_result['confusion_matrix'] = {}
    evaluation_result['confusion_matrix']['tp'] = tp
    evaluation_result['confusion_matrix']['fp'] = fp
    evaluation_result['confusion_matrix']['tn'] = tn
    evaluation_result['confusion_matrix']['fn'] = fn

    evaluation_data.append(evaluation_result)
    print(evaluation_result)

{'method': 'ridge_regression', 'parameters': {'lambda': 100}, 'training_and_loss_calc_time_in_sec': '0.030448', 'train_loss': '0.4970710829640738', 'test_loss': '0.4970506235215197', 'accuracy': '0.63862', 'f1_score': '0.5369891095451634', 'precision': '0.48075246616196377', 'recall': '0.608125362739408', 'confusion_matrix': {'tp': 10478, 'fp': 11317, 'tn': 21453, 'fn': 6752}}


## 5. Logistic regression using gradient descent or SGD

In [84]:
# Parameters
#max_iters = [10, 100, 500, 750, 1000, 1500]
#gammas = [0.01, 0.049, 0.05, 0.053, 0.07, 0.1]
max_iters = [100]
gammas = [0.1]

for max_iter in max_iters:
    for gamma in gammas:
        # execute training and loss calculation
        start_time = datetime.datetime.now()
        w_initial = np.array(np.zeros(30))
        w, loss_train = logistic_regression(y_train, x_train, w_initial, max_iter, gamma)
        loss_test = compute_loss(y_test, x_test, w)
        end_time = datetime.datetime.now()
        
        # create dict with results for run
        evaluation_result = {}
        evaluation_result['method'] = 'logistic_regression'
        evaluation_result['parameters'] = {}
        evaluation_result['parameters']['max_iter'] = max_iter
        evaluation_result['parameters']['gamma'] = gamma
        evaluation_result['training_and_loss_calc_time_in_sec'] = str((end_time - start_time).total_seconds())
        evaluation_result['train_loss'] = str(loss_train)
        evaluation_result['test_loss'] = str(loss_test)
        evaluation_result['accuracy'] = str(calculate_accuracy(w, x_test, y_test))
        f1_score, precision, recall, tp, fp, tn, fn = calculate_f1_score(w, x_test, y_test)
        evaluation_result['f1_score'] = str(f1_score)
        evaluation_result['precision'] = str(precision)
        evaluation_result['recall'] = str(recall)
        evaluation_result['confusion_matrix'] = {}
        evaluation_result['confusion_matrix']['tp'] = tp
        evaluation_result['confusion_matrix']['fp'] = fp
        evaluation_result['confusion_matrix']['tn'] = tn
        evaluation_result['confusion_matrix']['fn'] = fn
        
        evaluation_data.append(evaluation_result)
        print(evaluation_result)

  def ridge_regression(y, tx, lambda_):


{'method': 'logistic_regression', 'parameters': {'max_iter': 100, 'gamma': 0.1}, 'training_and_loss_calc_time_in_sec': '1.315524', 'train_loss': 'nan', 'test_loss': '6886392392.59792', 'accuracy': '0.71078', 'f1_score': '0.6624337636266019', 'precision': '0.5540630247178726', 'recall': '0.8235055136390017'}


  print("max = ", max(t))


## Final overview

In [94]:
for el in evaluation_data:
    print(el)

{'method': 'least_squares_GD', 'parameters': {'max_iter': 100, 'gamma': 0.1}, 'training_and_loss_calc_time_in_sec': '0.759231', 'train_loss': '0.3925930461858477', 'test_loss': '0.3932601383256776', 'accuracy': '0.71208', 'f1_score': '0.6624302396473292', 'precision': '0.5557522820270696', 'recall': '0.8197910621009866'}
{'method': 'least_squares_GD', 'parameters': {'max_iter': 100, 'gamma': 0.05}, 'training_and_loss_calc_time_in_sec': '0.713638', 'train_loss': '0.3987838539784226', 'test_loss': '0.39948206088489485', 'accuracy': '0.70718', 'f1_score': '0.6579764991706964', 'precision': '0.550611877859014', 'recall': '0.8173534532791642'}
{'method': 'least_squares_GD', 'parameters': {'max_iter': 300, 'gamma': 0.1}, 'training_and_loss_calc_time_in_sec': '2.105392', 'train_loss': '0.3894785858650492', 'test_loss': '0.39012322881498535', 'accuracy': '0.71586', 'f1_score': '0.6649371477087804', 'precision': '0.5600492630408009', 'recall': '0.818165989553105'}
{'method': 'least_squares_GD',