# Machine learning basic methods

In this notebook we try out each of the basic machine learning methods we have seen in class, and we calculate their accuracy via 4-fold cross validation.
We sum up the results in a table at the end.

In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
import pandas as pd

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from helpers import *
DATA_TRAIN_PATH = 'data/train.csv' 
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

# Gradient descent

In [10]:
from implementations import least_squares_GD
from cross_validation import cross_validation

In [11]:
# MSE
initial_w = np.zeros(tX.shape[1])
gd_w, gd_loss = least_squares_GD(y, tX, initial_w, 100, 1e-7)
print (gd_loss, gd_w)

0.415607829341 [  3.80346724e-04  -1.05111647e-04  -4.01889967e-05   1.04035338e-05
   1.70150747e-05   1.82378837e-04   1.41987918e-05  -5.18640770e-07
  -2.19005185e-05  -6.90004830e-05  -2.14951239e-06   1.63099023e-06
   1.66426513e-05   2.34443344e-05  -2.47834061e-08  -7.33113014e-08
  -3.35914729e-05  -1.93731739e-08   5.42758416e-08  -2.10924320e-05
   9.69234347e-08  -9.87881549e-05  -1.06453411e-06   3.94465994e-05
   5.07333661e-05   5.07285001e-05  -7.45642182e-06   1.65052584e-05
   1.64630455e-05  -5.88533325e-05]


In [12]:
gd_accuracy = cross_validation(y, tX, 1e-7, method='least_squares_GD')
print(gd_accuracy)

(0.665876, [0.665664, 0.66816, 0.664336, 0.665344])


# Stochastic Gradient descent

In [13]:
from implementations import least_squares_SGD

In [14]:
# MSE unfiltered data
initial_w = np.zeros(tX.shape[1])
sgd_w, sgd_loss = least_squares_SGD(y, tX, initial_w, 10, 1e-7)
print(sgd_loss, sgd_w)

0.404678200185 [  4.78332140e-04  -4.39132721e-04  -1.75768269e-04   9.93507739e-05
  -3.84743387e-05   4.95538898e-04  -4.85378685e-05  -2.55916345e-06
  -8.63975764e-05  -2.03458230e-04  -9.78167912e-06   7.10082931e-06
  -3.94356324e-05   1.37168382e-04  -1.26248989e-07  -3.14509656e-07
  -1.37264810e-04  -1.26777917e-07   2.89749142e-07  -3.40684727e-05
   4.32899800e-07  -3.22579923e-04  -4.49920816e-06   4.51371667e-05
   5.42852151e-05   5.42757751e-05  -1.35812093e-04  -4.00271879e-05
  -4.02381008e-05  -2.03361739e-04]


In [15]:
sgd_accuracy = cross_validation(y, tX, 1e-7, method='least_squares_SGD')
print(sgd_accuracy)

(0.6988079999999999, [0.700896, 0.697248, 0.6996, 0.697488])


# Least squares

In [16]:
from implementations import least_squares

In [17]:
# MSE unfiltered data
ls_w, ls_loss = least_squares(y, tX)
print (ls_loss,ls_w)

0.339686809915 [  8.03911103e-05  -7.20111773e-03  -6.05471144e-03  -5.47536204e-04
  -1.93854515e-02   4.73443447e-04  -2.60377713e-02   3.25108467e-01
  -3.81085734e-05  -2.72787181e+00  -2.21219603e-01   9.50810773e-02
   6.40330195e-02   2.73613146e+00  -3.31801854e-04  -9.54327420e-04
   2.74088821e+00  -5.34165287e-04   9.73498892e-04   3.69225050e-03
   3.54487161e-04  -5.43344617e-04  -3.30448034e-01  -1.40800496e-03
   8.31432873e-04   1.02117271e-03  -1.68047418e-03  -5.83664769e-03
  -1.11088005e-02   2.72833175e+00]


In [18]:
ls_accuracy = cross_validation(y, tX, 1e-7, method='least_squares')
print(ls_accuracy)

(0.744256, [0.744272, 0.743616, 0.746464, 0.742672])


# Affine least squares regression

In [19]:
# MSE with filtered data, affine model
affine_tX = affine = np.hstack((np.ones((tX.shape[0],1)),tX))
ls_w_affine, ls_loss_affine = least_squares(y, affine_tX)
print (ls_loss_affine,ls_w_affine)

0.339445598528 [ -3.60647213e-01   7.22888164e-05  -7.14508760e-03  -6.24041586e-03
  -4.77105873e-04  -3.08442134e-03   4.49420586e-04  -2.38153006e-02
   3.42160433e-01  -1.09962265e-04  -2.84138770e+00  -2.16218152e-01
   9.55433766e-02   4.52049754e-02   2.84995415e+00  -2.62721028e-04
  -9.95639034e-04   2.85442834e+00  -3.54662741e-04   8.49821612e-04
   3.69247840e-03   2.18113350e-04  -5.00574858e-04  -2.11522352e-01
  -2.18882120e-04   2.01549536e-04   3.43135866e-04  -6.98404087e-05
  -6.58071021e-03  -1.18306604e-02   2.84063518e+00]


In [20]:
affine_ls_accuracy = cross_validation(y, affine_tX, 1e-7, method='least_squares')
print(affine_ls_accuracy)

(0.744952, [0.74504, 0.743856, 0.747216, 0.743696])


# Ridge Regression

In [21]:
from implementations import ridge_regression

In [22]:
lamb = 1e-1
rr_w, rr_loss = ridge_regression(y, tX, lamb)
print (rr_loss,rr_w)

0.364399812303 [  1.98628815e-04  -8.39656035e-03  -3.22311998e-03  -2.06213002e-03
  -1.13848439e-02   4.91720836e-04  -2.05135585e-02   1.01683862e-01
  -1.49557627e-05   3.83138841e-03  -8.60779329e-02   8.16367032e-02
   3.42443158e-02   5.15657721e-03  -4.37703313e-04  -1.27074718e-03
   4.15478660e-03  -5.41025171e-04   9.28858575e-04   4.71409380e-03
   4.53983949e-04  -7.26027320e-04  -9.49952316e-02   8.41627937e-04
  -4.41338632e-04  -2.00086360e-04  -2.04653783e-04   1.59679327e-04
  -2.54485619e-03  -5.47587190e-03]


In [23]:
rr_accuracy = cross_validation(y, tX, 1e-7, method='ridge_regression')
print(rr_accuracy)

(0.7380120000000001, [0.737792, 0.73696, 0.739904, 0.737392])


# Logistic regression

In [24]:
def clean_data(tX):
    N = tX.shape[0]
    D = tX.shape[1]
    tX_clean = np.copy(np.array(tX))
    for n in range(N):
        for d in range(D):
            if tX_clean[n,d] == -999.0:
                tX_clean[n,d] = 0
    return tX_clean

In [25]:
from implementations import logistic_regression

tX_norm, _, _ = standardize(clean_data(tX), None, None)
y_shifted = np.array([1 if i==1 else 0 for i in y])
initial_w = np.zeros((tX_norm.shape[1],1))

lr_w, lr_loss = logistic_regression(y_shifted, tX_norm, initial_w, max_iters=20, gamma=0.00008)
print(lr_loss, lr_w)

[ 5237.55233243] [[ -7.47263324e-01]
 [  1.49910434e-01]
 [ -5.82005431e-01]
 [ -2.15167209e-01]
 [  1.67060763e-01]
 [  8.90656448e-02]
 [  1.77274698e-01]
 [ -1.14337573e-01]
 [  2.85837484e-01]
 [ -7.99720638e-02]
 [  9.32590213e-03]
 [ -2.40694011e-01]
 [  2.78148015e-01]
 [  1.79845572e-01]
 [  3.28216928e-01]
 [ -2.93834075e-03]
 [ -1.57004055e-02]
 [  1.31843213e-01]
 [  2.94058746e-04]
 [  9.27588668e-03]
 [  9.31304875e-03]
 [  1.29900370e-03]
 [ -5.62660799e-02]
 [ -1.21997739e-01]
 [  3.17158097e-02]
 [ -1.86252913e-02]
 [ -1.86336943e-03]
 [ -1.08234565e-01]
 [ -2.25790370e-03]
 [ -3.77425955e-04]
 [ -9.37203852e-02]]


In [27]:
lr_accuracy = cross_validation(y_shifted, tX_norm, 0.00008, method='logistic_regression')
print(lr_accuracy)

(0.750348, [0.750848, 0.749296, 0.752576, 0.748672])


# Regularised logistic regression

In [6]:
from implementations import reg_logistic_regression

tX_norm, _, _ = standardize(clean_data(tX), None, None)
y_shifted = np.array([1 if i==1 else 0 for i in y])
initbial_w = np.zeros((tX_norm.shape[1],1))

rlr_w, rlr_loss = reg_logistic_regression(y_shifted, tX_norm, 0.1, initial_w, max_iters=20, gamma=0.00008)
print(rlr_loss, rlr_w)

  y_batch = np.zeros((batch_size,1))
  yield shuffled_y[start_index:end_index], shuffled_tx[start_index:end_index]


[ 15669.27741059] [[-0.92377853]
 [ 0.17800081]
 [-0.75130433]
 [-0.50996402]
 [ 0.23041456]
 [ 0.02995862]
 [ 0.36586601]
 [-0.03669818]
 [ 0.63282032]
 [-0.21779261]
 [-0.16676848]
 [-0.37493274]
 [ 0.2622428 ]
 [ 0.21495459]
 [ 0.58053175]
 [-0.00311884]
 [-0.00743836]
 [ 0.33647252]
 [ 0.00888818]
 [ 0.01650483]
 [ 0.0236441 ]
 [ 0.0019276 ]
 [-0.24297976]
 [-0.32840178]
 [-0.06904297]
 [-0.00311622]
 [-0.00565082]
 [-0.39965793]
 [ 0.01500818]
 [ 0.00245289]
 [-0.4053564 ]]


In [9]:

rlr_accuracy = cross_validation(y_shifted, tX_norm, 0.00008, method='reg_logistic_regression')
print(rlr_accuracy)

  y_batch = np.zeros((batch_size,1))
  yield shuffled_y[start_index:end_index], shuffled_tx[start_index:end_index]


(0.751792, [0.751808, 0.748576, 0.754512, 0.752272])


# Summary

In [29]:


df_results = pd.DataFrame([[gd_loss, sgd_loss, ls_loss, rr_loss, lr_loss[0], rlr_loss[0]],
                           [gd_accuracy[0], sgd_accuracy[0], ls_accuracy[0], rr_accuracy[0], lr_accuracy[0], rlr_accuracy[0]]
                          ])
df_results.columns = ['Gradient Descent', 'Stochastic Gradient Descent', 'Least Squares', 'Ridge Regression', 'Logistic Regression', 'Regularised Logistic Regression']
df_results.index = ['Loss mse','Accuracy']
df_results

Unnamed: 0,Gradient Descent,Stochastic Gradient Descent,Least Squares,Ridge Regression,Logistic Regression,Regularised Logistic Regression
Loss mse,0.415608,0.404678,0.339687,0.3644,5237.552332,15669.277411
Accuracy,0.665876,0.698808,0.744256,0.738012,0.750348,0.751792
