In [1]:
%run algebra.py
%run cache.py
%run costs.py
%run features.py
%run gradients.py
%run helpers.py
%run model.py
%run models.py
%run splits.py

import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import csv
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Run-wide switch: SUB_SAMPLE is forwarded to load_csv_data in the next cell,
# and it also redirects cache / submission files under "test/" so that
# sub-sampled runs do not share directories with full runs.
SUB_SAMPLE = False

if SUB_SAMPLE:
    CACHE_DIR = "test/cache/"
    SUBMISSIONS_DIR = "test/submissions/"
else:
    CACHE_DIR = "cache/"
    SUBMISSIONS_DIR = "submissions/"

In [3]:
# Load the training CSV into (y, x, ids). SUB_SAMPLE is forwarded as the second
# argument -- presumably it toggles sub-sampling inside load_csv_data; confirm
# against the helper's definition.
y, x, ids = load_csv_data('data/train.csv', SUB_SAMPLE)

# 1 - Only Using Clean Features

### Ridge Regression with Fixed Degree

##### On All Samples

In [None]:
class RidgeRegression_MSE_FixedDegree_Model(Model):
    """Ridge regression on fixed-degree polynomial features, scored by MSE.

    Hyper-parameters are read from the mapping ``h``:

    * ``h['degree']`` -- cast to ``int`` and passed to ``build_poly`` in ``prepare``.
    * ``h['lambda']`` -- cast to ``float`` and passed to ``ridge_regression`` in ``fit``.
    """

    def prepare(self, x, y, h):
        """Run the cleaning helpers on ``x`` and expand it with ``build_poly``.

        The helpers are applied in this exact order: ``remove_errors``,
        ``remove_outliers``, ``standardize_all``, ``remove_nan_features``,
        then ``build_poly(x, degree)``.

        Returns:
            The tuple ``(x, y)`` with the transformed ``x``; ``y`` is returned as given.
        """
        degree = int(h['degree'])

        x = remove_errors(x)
        x = remove_outliers(x)
        x = standardize_all(x)
        x = remove_nan_features(x)
        x = build_poly(x, degree)

        return x, y

    def fit(self, x, y, h):
        """Return the result of ``ridge_regression(y, x, lambda_)``.

        ``x`` is used as given (``prepare`` is where ``build_poly`` is applied),
        so only ``h['lambda']`` is read here.
        """
        lambda_ = float(h['lambda'])

        return ridge_regression(y, x, lambda_)

    def test(self, x, y, w, h):
        """Return ``{'mse': compute_mse(y, x, w)}``; ``h`` is not used."""
        return {'mse': compute_mse(y, x, w)}

In [None]:
# Grid search over polynomial degree and ridge penalty, evaluated on all samples.
myModel = RidgeRegression_MSE_FixedDegree_Model()

hs = {
    'degree': np.arange(4, 16),        # degrees 4..15
    'lambda': np.logspace(-8, -2, 7),  # 1e-8 .. 1e-2, one value per decade
}

# The string passed as `filename` is built from CACHE_DIR, so it follows the
# SUB_SAMPLE switch.
res = myModel.evaluate(x, y, hs, filename=CACHE_DIR+'RidgeRegression_MSE_FixedDegree')
# Pull the scalar 'mse' out of every result dict in the grid.
res_mse = np.vectorize(lambda entry: entry['mse'])(res)

plot_heatmap(res, hs, 'mse', 'degree', 'lambda')
find_arg_min(res, 'mse')

##### Using Cross-Validation

In [None]:
# Grid search with the ridge model wrapped in CrossValidationModel; the grid is
# scored on the 'avg_mse_te' entry of each result.
myModel = CrossValidationModel(RidgeRegression_MSE_FixedDegree_Model())

# 'degree' and 'lambda' are swept (degrees 4..15; lambda 1e-8..1e-2, one value
# per decade); 'k_fold' and 'seed' are single fixed values.
hs = { 
    'degree': np.arange(4, 16), 
    'lambda': np.logspace(-8, -2, 7),
    'k_fold': 4,
    'seed': 0
}

# The fourth positional argument is a name built from CACHE_DIR -- presumably
# the same `filename` parameter used by keyword elsewhere in this notebook;
# confirm against Model.evaluate.
res = myModel.evaluate(x, y, hs, CACHE_DIR+'RidgeRegression_MSE_FixedDegree_CrossValidation')

plot_heatmap(res, hs, 'avg_mse_te', 'degree', 'lambda')
best_h = find_arg_min(res, 'avg_mse_te')
# Bare last expression so the notebook displays the selected hyper-parameters.
best_h

### Ridge Regression with Fixed Degree & Cross-Validation (inside `fit`)

In [None]:
class RidgeRegression_MSE_FixedDegree_CrossValidation_Model(Model):
    """Ridge regression on fixed-degree polynomial features, scored by k-fold CV.

    Hyper-parameters are read from the mapping ``h``:

    * ``h['degree']`` -- cast to ``int``, used by ``prepare``.
    * ``h['lambda']`` -- cast to ``float``, ridge penalty used by ``fit``.
    * ``h['k_fold']`` -- cast to ``int``, number of folds.
    * ``h['seed']``   -- cast to ``int``, passed to ``build_k_indices``.
    """

    def prepare(self, x, y, h={}):
        """Run the cleaning helpers on ``x`` and expand it with ``build_poly``.

        The helpers are applied in this exact order: ``remove_errors``,
        ``remove_outliers``, ``standardize_all``, ``remove_nan_features``,
        then ``build_poly(x, degree)``.

        Returns:
            The tuple ``(x, y)`` with the transformed ``x``; ``y`` is returned as given.
        """
        # NOTE: the ``{}`` default is never mutated here, so sharing it is harmless.
        degree = int(h['degree'])

        x = remove_errors(x)
        x = remove_outliers(x)
        x = standardize_all(x)
        x = remove_nan_features(x)
        x = build_poly(x, degree)

        return x, y

    def fit(self, x, y, h={}):
        """Run k-fold cross-validation of ridge regression on ``(x, y)``.

        For each fold, ``ridge_regression`` is fitted on the training split and
        ``compute_mse`` is evaluated on both splits; each value is divided by
        ``k_fold`` and accumulated, giving the average over folds.

        Returns:
            A tuple ``(scores, w)`` where ``scores`` is
            ``{'avg_mse_tr': ..., 'avg_mse_te': ...}`` and ``w`` is the weight
            vector fitted on the LAST fold's training split (it is not refitted
            on the full data).

        Raises:
            ValueError: if ``h['k_fold']`` is smaller than 1.
        """
        lambda_ = float(h['lambda'])
        k_fold = int(h['k_fold'])
        seed = int(h['seed'])

        # Without at least one fold the loop below never runs and there would
        # be no weights to return.
        if k_fold < 1:
            raise ValueError(f"k_fold must be >= 1, got {k_fold}")

        # split data in k fold
        k_indices = build_k_indices(y, k_fold, seed)

        avg_mse_tr = 0
        avg_mse_te = 0
        w = None

        for k in range(k_fold):
            # get split data
            x_tr, x_te, y_tr, y_te = cross_data(y, x, k_indices, k)

            # ridge regression on the training split only
            w = ridge_regression(y_tr, x_tr, lambda_)

            # accumulate each fold's contribution to the averages
            avg_mse_tr += compute_mse(y_tr, x_tr, w) / k_fold
            avg_mse_te += compute_mse(y_te, x_te, w) / k_fold

        return {
            'avg_mse_tr': avg_mse_tr,
            'avg_mse_te': avg_mse_te
        }, w

In [None]:
# Grid search using the model whose fit() performs the k-fold cross-validation
# itself; the grid is scored on the 'avg_mse_te' entry of each result.
myModel = RidgeRegression_MSE_FixedDegree_CrossValidation_Model()

# 'degree' sweeps 4..15 and 'lambda' sweeps 1e-6..1e-2 (one value per decade);
# 'k_fold' and 'seed' are single fixed values.
# NOTE(review): the two commented-out entries below are inactive; they look
# like a single-point override of the sweep -- delete them if no longer needed.
hs = { 
    'degree': np.arange(4, 16), 
    'lambda': np.logspace(-6, -2, 5),
#     'degree': 6, 
#     'lambda': 0.0534,
    'k_fold': 4,
    'seed': 0
}

# The fourth positional argument is a name built from CACHE_DIR -- presumably
# the cache file for this grid; confirm against Model.evaluate.
res = myModel.evaluate(x, y, hs, CACHE_DIR+'RidgeRegression_MSE_FixedDegree_CrossValidation_Model_TEST5')

plot_heatmap(res, hs, 'avg_mse_te', 'degree', 'lambda')
best_h = find_arg_min(res, 'avg_mse_te')
# Bare last expression so the notebook displays the selected hyper-parameters.
best_h

In [None]:
# predict(myModel, best_h, x, y, SUBMISSIONS_DIR + 'RidgeRegression_MSE_FixedDegree_CrossValidation_Model')

In [4]:
class SGD_Lasso_MSE_FixedDegree_CrossValidation_Model(Model):
    """L1-penalised least squares trained by SGD, scored by k-fold cross-validation.

    Hyper-parameters are read from the mapping ``h`` in ``fit``:

    * ``h['degree']``    -- cast to ``int``, passed to ``build_poly`` per fold.
    * ``h['lambda']``    -- cast to ``float``, weight of the L1 penalty.
    * ``h['k_fold']``    -- cast to ``int``, number of folds.
    * ``h['seed']``      -- cast to ``int``, passed to ``build_k_indices``.
    * ``h['precision']`` -- cast to ``float``, minimum loss improvement to keep iterating.
    * ``h['gamma']``     -- cast to ``float``, SGD step size.
    """

    def prepare(self, x, y, h=None):
        """Return ``(clean_data(x), y)``.

        ``h`` is accepted and ignored so that this method can be called with
        the same arguments as the other models' ``prepare(x, y, h)``.
        """
        return clean_data(x), y

    def fit(self, x, y, h={}):
        """Cross-validate an SGD-trained lasso model on ``(x, y)``.

        For each fold the training and test splits are expanded with
        ``build_poly`` and the weights start at zero. Steps on batches of size
        one are taken until the training MSE stops improving by more than
        ``precision``. Each step is::

            w <- w - gamma * (grad_mse + lambda * sign(w))

        Returns:
            A tuple ``(scores, None)`` where ``scores`` is
            ``{'avg_mse_tr': ..., 'avg_mse_te': ...}``. No weight vector is
            returned by this model.
        """
        degree = int(h['degree'])
        lambda_ = float(h['lambda'])
        k_fold = int(h['k_fold'])
        seed = int(h['seed'])
        precision = float(h['precision'])
        gamma = float(h['gamma'])

        batch_size = 1

        avg_mse_tr = 0
        avg_mse_te = 0

        # split data in k fold
        k_indices = build_k_indices(y, k_fold, seed)

        for k in range(k_fold):

            # get split data
            x_tr, x_te, y_tr, y_te = cross_data(y, x, k_indices, k)

            # form data with polynomial degree:
            x_tr = build_poly(x_tr, degree)
            x_te = build_poly(x_te, degree)

            w = np.zeros(x_tr.shape[1])
            loss = float("inf")
            improvement = float("inf")

            # Keep stepping while the last step lowered the training MSE by
            # more than `precision`; a step that raises the loss also stops it.
            while improvement > precision:
                for y_batch, tx_batch in batch_iter(y_tr, x_tr, batch_size=batch_size, num_batches=1):

                    # stochastic gradient of the squared-error term
                    err = y_batch - tx_batch.dot(w)
                    grad = -tx_batch.T.dot(err) / len(err)

                    # Sub-gradient of the L1 penalty. It must be +sign(w):
                    # with the minus sign the update would push weights AWAY
                    # from zero instead of shrinking them.
                    l1_subgrad = np.sign(w)

                    # update w through the stochastic gradient update
                    w = w - gamma * (grad + lambda_ * l1_subgrad)

                # Compare full-training-set losses only, so the stopping test
                # is a plain scalar and is not polluted by one sample's residual.
                previous_loss = loss
                loss = compute_mse(y_tr, x_tr, w)
                improvement = previous_loss - loss

            # calculate the loss for train and test data + add it:
            avg_mse_tr += compute_mse(y_tr, x_tr, w) / k_fold
            avg_mse_te += compute_mse(y_te, x_te, w) / k_fold

        return {
            "avg_mse_tr": avg_mse_tr,
            "avg_mse_te": avg_mse_te
        }, None

In [5]:
class RidgeRegression_MSE_FixedDegree_Model(Model):
    """Ridge regression on fixed-degree polynomial features, scored by MSE.

    NOTE: a class with this same name is also defined in an earlier cell of
    this notebook; once both cells have run, this later definition is the one
    bound to the name.

    Hyper-parameters are read from the mapping ``h``:

    * ``h['degree']`` -- cast to ``int`` and passed to ``build_poly`` in ``prepare``.
    * ``h['lambda']`` -- cast to ``float`` and passed to ``ridge_regression`` in ``fit``.
    """

    def prepare(self, x, y, h):
        """Run the cleaning helpers on ``x`` and expand it with ``build_poly``.

        The helpers are applied in this exact order: ``remove_errors``,
        ``remove_outliers``, ``standardize_all``, ``remove_nan_features``,
        then ``build_poly(x, degree)``.

        Returns:
            The tuple ``(x, y)`` with the transformed ``x``; ``y`` is returned as given.
        """
        degree = int(h['degree'])

        x = remove_errors(x)
        x = remove_outliers(x)
        x = standardize_all(x)
        x = remove_nan_features(x)
        x = build_poly(x, degree)

        return x, y

    def fit(self, x, y, h):
        """Return the result of ``ridge_regression(y, x, lambda_)``.

        ``x`` is used as given (``prepare`` is where ``build_poly`` is applied),
        so only ``h['lambda']`` is read here.
        """
        lambda_ = float(h['lambda'])

        return ridge_regression(y, x, lambda_)

    def test(self, x, y, w, h):
        """Return ``{'mse': compute_mse(y, x, w)}``; ``h`` is not used."""
        return {'mse': compute_mse(y, x, w)}

test is [-7.47852280e-02 -3.68208870e-01 -1.04610391e-02  6.56991747e-02
 -1.02160497e-02 -2.21538949e-02 -1.26214080e-01 -1.42249136e-03
  3.39515710e-03  5.87767137e-03  7.77253992e-02 -4.20345670e-02
  6.18475762e-03  1.64108504e-01 -2.20843637e-02  4.83692141e-03
  2.16536987e-03 -4.20289992e-02 -2.17026899e-04  7.25936612e-03
 -6.98219319e-04 -1.65258259e-01  7.03553186e-02 -1.71424006e-02
  1.41645857e-03  1.59420446e-02  4.01499060e-02 -1.08902731e-02
  7.43158979e-04 -2.04417731e-02  1.84932836e-01  1.08135646e-01
 -1.13824470e-01  4.15664272e-01 -1.27848131e-01  1.18368623e-02
  2.70925028e-04 -5.30336144e-03 -1.65177923e-02  8.82202033e-04
 -4.98691794e-03 -1.09868919e-03 -5.48139137e-03  1.34151669e-03
  2.10337630e-03  8.72529285e-02 -5.86980080e-02  1.95720750e-02
 -1.79605881e-03  3.94467868e-03 -7.66279040e-02 -9.51642501e-04
  4.53836697e-03  5.26340517e-03 -2.87352087e-03 -1.93677820e-03
  4.90020363e-04  1.35944518e-01  4.95780156e-02 -3.47207307e-02
  4.46187460e-03 

[[ 1.00000000e+00  7.83626794e-01  6.14070952e-01 ...  5.94653648e-01
  -4.58559960e-01  3.53612961e-01]
 [ 1.00000000e+00  8.91105826e-01  7.94069594e-01 ...  5.94653648e-01
  -4.58559960e-01  3.53612961e-01]
 [ 1.00000000e+00  3.40297513e-01  1.15802397e-01 ...  3.59455507e-02
  -6.81502906e-03  1.29208261e-03]
 ...
 [ 1.00000000e+00 -6.96513023e-01  4.85130391e-01 ...  5.94653648e-01
  -4.58559960e-01  3.53612961e-01]
 [ 1.00000000e+00 -1.36181113e+00  1.85452956e+00 ...  2.73744627e+00
   4.52916775e+00  7.49361210e+00]
 [ 1.00000000e+00 -1.99762897e-01  3.99052150e-02 ...  5.94653648e-01
  -4.58559960e-01  3.53612961e-01]]
[ 1. -1. -1. ... -1. -1. -1.]
w is [-8.01957538e-02 -3.59444818e-01 -1.13185667e-02  6.20284988e-02
 -9.47475446e-03 -2.75097636e-02 -1.23014987e-01 -3.01570991e-03
  3.60851039e-03  1.46116102e-02  8.19980817e-02 -4.45482206e-02
  6.62717235e-03  1.68162369e-01 -2.32047907e-02  5.63532663e-03
  2.17057270e-03 -4.25467991e-02  3.23996698e-03  5.99450672e-03
 -5.

test is [-7.47852280e-02 -3.68208870e-01 -1.04610391e-02  6.56991747e-02
 -1.02160497e-02 -2.21538949e-02 -1.26214080e-01 -1.42249136e-03
  3.39515710e-03  5.87767137e-03  7.77253992e-02 -4.20345670e-02
  6.18475762e-03  1.64108504e-01 -2.20843637e-02  4.83692141e-03
  2.16536987e-03 -4.20289992e-02 -2.17026899e-04  7.25936612e-03
 -6.98219319e-04 -1.65258259e-01  7.03553186e-02 -1.71424006e-02
  1.41645857e-03  1.59420446e-02  4.01499060e-02 -1.08902731e-02
  7.43158979e-04 -2.04417731e-02  1.84932836e-01  1.08135646e-01
 -1.13824470e-01  4.15664272e-01 -1.27848131e-01  1.18368623e-02
  2.70925028e-04 -5.30336144e-03 -1.65177923e-02  8.82202033e-04
 -4.98691794e-03 -1.09868919e-03 -5.48139137e-03  1.34151669e-03
  2.10337630e-03  8.72529285e-02 -5.86980080e-02  1.95720750e-02
 -1.79605881e-03  3.94467868e-03 -7.66279040e-02 -9.51642501e-04
  4.53836697e-03  5.26340517e-03 -2.87352087e-03 -1.93677820e-03
  4.90020363e-04  1.35944518e-01  4.95780156e-02 -3.47207307e-02
  4.46187460e-03 

[[ 1.00000000e+00  7.83626794e-01  6.14070952e-01 ...  5.94653648e-01
  -4.58559960e-01  3.53612961e-01]
 [ 1.00000000e+00  8.91105826e-01  7.94069594e-01 ...  5.94653648e-01
  -4.58559960e-01  3.53612961e-01]
 [ 1.00000000e+00  3.40297513e-01  1.15802397e-01 ...  3.59455507e-02
  -6.81502906e-03  1.29208261e-03]
 ...
 [ 1.00000000e+00 -6.96513023e-01  4.85130391e-01 ...  5.94653648e-01
  -4.58559960e-01  3.53612961e-01]
 [ 1.00000000e+00 -1.36181113e+00  1.85452956e+00 ...  2.73744627e+00
   4.52916775e+00  7.49361210e+00]
 [ 1.00000000e+00 -1.99762897e-01  3.99052150e-02 ...  5.94653648e-01
  -4.58559960e-01  3.53612961e-01]]
[ 1. -1. -1. ... -1. -1. -1.]
w is [-8.01957538e-02 -3.59444818e-01 -1.13185667e-02  6.20284988e-02
 -9.47475446e-03 -2.75097636e-02 -1.23014987e-01 -3.01570991e-03
  3.60851039e-03  1.46116102e-02  8.19980817e-02 -4.45482206e-02
  6.62717235e-03  1.68162369e-01 -2.32047907e-02  5.63532663e-03
  2.17057270e-03 -4.25467991e-02  3.23996698e-03  5.99450672e-03
 -5.

test is [-7.54518912e-02 -3.68101031e-01 -1.04187845e-02  6.56570112e-02
 -1.02097165e-02 -2.19935268e-02 -1.26374748e-01 -1.37597279e-03
  3.39055152e-03  6.11529736e-03  7.77951068e-02 -4.20939334e-02
  6.19877967e-03  1.64015958e-01 -2.20599013e-02  4.82038884e-03
  2.17395206e-03 -4.20296701e-02 -2.36476458e-04  7.26895660e-03
 -6.99562595e-04 -1.58668481e-01  6.98964607e-02 -1.71490647e-02
  1.41717678e-03  1.38406663e-02  4.07878613e-02 -1.10399737e-02
  7.55437678e-04 -2.03134075e-02  1.84350189e-01  1.07959490e-01
 -1.13417484e-01  4.12369849e-01 -1.27174987e-01  1.17422427e-02
  2.74155692e-04 -5.30330717e-03 -1.65154238e-02  8.81854350e-04
 -4.98764158e-03 -1.10252478e-03 -5.48842710e-03  1.34322000e-03
  2.10475164e-03  8.75181903e-02 -5.89653026e-02  1.96423437e-02
 -1.80232318e-03  3.94317891e-03 -7.66174968e-02 -9.51039663e-04
  4.53430132e-03  5.26235231e-03 -2.87835704e-03 -1.93704082e-03
  4.90350864e-04  1.35843872e-01  4.95550330e-02 -3.47119044e-02
  4.46007834e-03 

[[ 1.00000000e+00  8.85717390e-01  7.84495295e-01 ...  7.44441201e-02
  -2.03116686e-02  5.54192701e-03]
 [ 1.00000000e+00  3.06895000e-01  9.41845409e-02 ...  5.94653648e-01
  -4.58559960e-01  3.53612961e-01]
 [ 1.00000000e+00  4.55859200e-01  2.07807611e-01 ...  8.73019351e-02
  -2.57950162e-02  7.62162788e-03]
 ...
 [ 1.00000000e+00 -6.96513023e-01  4.85130391e-01 ...  5.94653648e-01
  -4.58559960e-01  3.53612961e-01]
 [ 1.00000000e+00 -1.36181113e+00  1.85452956e+00 ...  2.73744627e+00
   4.52916775e+00  7.49361210e+00]
 [ 1.00000000e+00 -1.99762897e-01  3.99052150e-02 ...  5.94653648e-01
  -4.58559960e-01  3.53612961e-01]]
[ 1. -1. -1. ... -1. -1. -1.]
w is [-7.54518912e-02 -3.68101031e-01 -1.04187845e-02  6.56570112e-02
 -1.02097165e-02 -2.19935268e-02 -1.26374748e-01 -1.37597279e-03
  3.39055152e-03  6.11529736e-03  7.77951068e-02 -4.20939334e-02
  6.19877967e-03  1.64015958e-01 -2.20599013e-02  4.82038884e-03
  2.17395206e-03 -4.20296701e-02 -2.36476458e-04  7.26895660e-03
 -6.

[[ 1.00000000e+00  7.83626794e-01  6.14070952e-01 ...  5.94653648e-01
  -4.58559960e-01  3.53612961e-01]
 [ 1.00000000e+00  8.91105826e-01  7.94069594e-01 ...  5.94653648e-01
  -4.58559960e-01  3.53612961e-01]
 [ 1.00000000e+00  3.40297513e-01  1.15802397e-01 ...  3.59455507e-02
  -6.81502906e-03  1.29208261e-03]
 ...
 [ 1.00000000e+00 -1.16643685e+00  1.36057492e+00 ...  5.94653648e-01
  -4.58559960e-01  3.53612961e-01]
 [ 1.00000000e+00 -1.11284218e+00  1.23841772e+00 ...  5.94653648e-01
  -4.58559960e-01  3.53612961e-01]
 [ 1.00000000e+00  6.50277472e-01  4.22860790e-01 ...  2.72459003e+00
   4.49729882e+00  7.42339084e+00]]
[ 1. -1. -1. ...  1. -1. -1.]
w is [-6.96362112e-02 -3.65139385e-01 -1.33738857e-02  6.54346898e-02
 -1.00806452e-02 -2.53665225e-02 -1.21945007e-01 -4.36572133e-03
  3.80256081e-03 -7.10870844e-03  7.77725167e-02 -3.66364618e-02
  5.10754583e-03  1.65513546e-01 -2.30476997e-02  6.10777186e-03
  1.88413181e-03 -3.94082082e-02  6.86278588e-03  3.75025752e-03
 -2.

[[ 1.          0.78362679  0.61407095 ...  0.59465365 -0.45855996
   0.35361296]
 [ 1.          0.89110583  0.79406959 ...  0.59465365 -0.45855996
   0.35361296]
 [ 1.          0.34029751  0.1158024  ...  0.03594555 -0.00681503
   0.00129208]
 ...
 [ 1.         -0.49036187  0.24045476 ...  0.59465365 -0.45855996
   0.35361296]
 [ 1.          0.94542474  0.89382795 ...  0.59465365 -0.45855996
   0.35361296]
 [ 1.          0.16725917  0.02797563 ...  0.59465365 -0.45855996
   0.35361296]]
[ 1. -1. -1. ... -1. -1. -1.]
w is [-8.09262945e-02 -3.67105305e-01 -1.00152089e-02  6.52537389e-02
 -1.01486294e-02 -2.05560567e-02 -1.27719565e-01 -9.93404135e-04
  3.35317061e-03  8.14985921e-03  7.83455378e-02 -4.25760823e-02
  6.31283119e-03  1.63155156e-01 -2.18713338e-02  4.68800335e-03
  2.24399501e-03 -4.20426914e-02 -4.05657132e-04  7.34888121e-03
 -7.10056503e-04 -1.05851988e-01  6.59341266e-02 -1.71241809e-02
  1.41701607e-03 -3.58206678e-03  4.60764747e-02 -1.22836981e-02
  8.57755561e-04 -

[[ 1.00000000e+00  7.83626794e-01  6.14070952e-01 ...  5.94653648e-01
  -4.58559960e-01  3.53612961e-01]
 [ 1.00000000e+00  8.91105826e-01  7.94069594e-01 ...  5.94653648e-01
  -4.58559960e-01  3.53612961e-01]
 [ 1.00000000e+00  3.40297513e-01  1.15802397e-01 ...  3.59455507e-02
  -6.81502906e-03  1.29208261e-03]
 ...
 [ 1.00000000e+00 -1.16643685e+00  1.36057492e+00 ...  5.94653648e-01
  -4.58559960e-01  3.53612961e-01]
 [ 1.00000000e+00 -1.11284218e+00  1.23841772e+00 ...  5.94653648e-01
  -4.58559960e-01  3.53612961e-01]
 [ 1.00000000e+00  6.50277472e-01  4.22860790e-01 ...  2.72459003e+00
   4.49729882e+00  7.42339084e+00]]
[ 1. -1. -1. ...  1. -1. -1.]
w is [-7.49263734e-02 -3.64171398e-01 -1.29498748e-02  6.50350805e-02
 -1.00214391e-02 -2.38932218e-02 -1.23295047e-01 -3.98925049e-03
  3.76645642e-03 -4.96834592e-03  7.83026748e-02 -3.71326880e-02
  5.22396243e-03  1.64627631e-01 -2.28586235e-02  5.97467890e-03
  1.95465931e-03 -3.94527422e-02  6.66908387e-03  3.84956643e-03
 -3.

KeyboardInterrupt: 