In [3]:
%load_ext autoreload
%autoreload 2

from algebra import *
from cache import *
from costs import *
from features import *
from gradients import *
from helpers import *
from model import *
from splits import *

import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import csv
import warnings
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Tutorial

Here we will see how to run a basic model and how to do a grid search over the model's parameters.

First, we define the directory paths. You can specify whether to load a sub-sample of the dataset or the full dataset by setting the `SUB_SAMPLE` constant to `True` or `False`:

In [4]:
# Toggle between the sub-sampled dataset (True) and the full dataset (False).
SUB_SAMPLE = True

# Sub-sample runs write under test/; full runs write to the top-level folders.
if SUB_SAMPLE:
    CACHE_DIR = "test/cache/"
    SUBMISSIONS_DIR = "test/submissions/"
else:
    CACHE_DIR = "cache/"
    SUBMISSIONS_DIR = "submissions/"

Then, we load our data set:

In [5]:
# Read the training CSV; SUB_SAMPLE is forwarded as the loader's second argument.
# Presumably y = labels, x = feature matrix, ids = sample ids — confirm in helpers.
y, x, ids = load_csv_data('data/train.csv', SUB_SAMPLE)

### Pre-processing

We now define how we want to process our dataset before doing any training. For this dataset we do the following preprocessing steps:
1. we remove all the `-999` values
2. we remove the outliers with a clamping
3. we standardize our dataset
4. we do a polynomial expansion with the `degree` value passed as hyperparameter

The helper functions used in this preprocessing function are defined in the file `features.py`.

In [6]:
def clean_standardize_expand(y, x, h):
    """Pre-process the features, then expand them polynomially.

    Args:
        y: targets; returned untouched.
        x: raw features to clean.
        h: hyperparameter dict; `h['degree']` (cast to int) is the degree
           handed to `build_poly`.

    Returns:
        Tuple `(y, x)` where `x` is the cleaned and expanded feature set.
    """
    poly_degree = int(h['degree'])

    # Cleaning pipeline, applied in this exact order. Per the notebook text:
    # drop the -999 values, clamp outliers, standardize.
    # remove_nan_features presumably drops features left as NaN — confirm in features.py.
    cleaning_steps = (
        remove_errors,
        remove_outliers,
        standardize_all,
        remove_nan_features,
    )
    for step in cleaning_steps:
        x = step(x)

    return y, build_poly(x, poly_degree)

### Parameters exploration for a simple Least Squares model

Now we want to try some models with different parameters to see which one is the best for our problem.

We need to define our fitting function:

In [14]:
def least_squares_analytical(y, x, h):
    """Fit a least-squares model and report its error on the data it was fit on.

    Args:
        y: targets passed to `least_squares`.
        x: features passed to `least_squares` (the notebook feeds it the
           output of the cleaning function).
        h: hyperparameter dict. Not used by the fit itself; the parameter is
           kept because callers invoke fitting functions as `fit(y, x, h)`.

    Returns:
        dict with:
            'w'   -- the weights returned by `least_squares(y, x)`
            'mse' -- `compute_mse(y, x, w)` for those weights
    """
    # The polynomial degree is consumed by the pre-processing step, not here,
    # so no hyperparameter is read from `h`.
    w = least_squares(y, x)

    return {
        'w': w,
        'mse': compute_mse(y, x, w)
    }

Then we define the parameters we want to explore for our model. Since least squares is a simple model we only have the degree expansion to explore, let's see the results of this model with the degree varying from 10 to 13:

In [8]:
# Results of this exploration are stored through this cache.
cache = Cache(CACHE_DIR + 'Tutorial_Results_Least_Squares')

# Hyperparameter grid: polynomial degrees 10, 11, 12 and 13.
hs = dict(degree=np.arange(10, 14))

# Last expression of the cell, so the result list is displayed.
evaluate(
    x=x,
    y=y,
    hs=hs,
    clean=clean_standardize_expand,
    fit=fit_with_cache(least_squares_analytical, cache),
)

[{'degree': 10,
  'w': array([-3.15707300e+01, -2.55094915e-01, -2.69387668e-01, -4.40364511e-02,
          3.22180467e-01, -4.25471022e-02, -9.31028062e-02,  4.09865063e-02,
         -5.22361598e-03, -8.47476236e-05,  4.13828385e-05,  2.92309729e-01,
         -6.81009190e-01, -5.44916271e-01,  4.07911195e-01,  7.49438216e-02,
         -7.46001490e-02,  8.52726886e-04,  6.06098332e-03, -1.22163594e-03,
          7.27932852e-05, -4.16282739e-02,  1.61618890e-02,  1.53724144e-01,
         -2.73128116e-01,  7.61145556e-03,  2.82597669e-01, -2.29969077e-01,
          7.87416263e-02, -1.27013354e-02,  7.93674196e-04,  9.03241980e-03,
          1.27048370e-01,  2.51075006e-01, -1.32402432e-01, -7.52289259e-02,
          3.48158646e-02,  1.12305054e-02, -4.23170961e-03, -5.24067665e-04,
          1.74828831e-04, -3.02131177e-01,  2.78896368e-01,  6.33262018e-01,
         -9.01033279e-01,  3.06374995e-02,  5.56719476e-01, -4.10767097e-01,
          1.34858088e-01, -2.17252883e-02,  1.39451774e

Now if we take a look at `test/cache/Tutorial_Results_Least_Squares.csv` we can see that the best model is the one with the polynomial expansion of 13th degree. 

So far the only error we have is the training error. To get the error on a held-out test set we can use the `cross_validate` wrapper function. We first need to define the loss function that will be fed to our cross-validation. 

Here we will use the basic `compute_mse` function. The `mse` key is used by the cache to name the columns in the generated file: they will be called `avg_mse_tr` and `avg_mse_te`.

Note that if we change the key name to `loss` for example, in the file we would get `avg_loss_tr` and `avg_loss_te`.

In [17]:
def mse(y, x, w, h):
    """Loss callback for cross-validation: wrap `compute_mse` in a dict.

    Args:
        y: targets.
        x: features.
        w: model weights to evaluate.
        h: hyperparameter dict; not used by this loss.

    Returns:
        dict with a single 'mse' entry. Per the notebook text, the key name
        determines the `avg_<key>_tr` / `avg_<key>_te` column names.
    """
    loss = compute_mse(y, x, w)
    return {'mse': loss}

In [18]:
# Cross-validated results go to their own cache.
cache = Cache(CACHE_DIR + 'Tutorial_Results_Least_Squares_Cross_Validation')

# Hyperparameter grid: degrees 10..13, plus the cross-validation settings.
hs = dict(
    degree=np.arange(10, 14),
    k_fold=4,
    seed=1,
)

# Last expression of the cell, so the result list is displayed.
evaluate(
    x=x,
    y=y,
    hs=hs,
    clean=clean_standardize_expand,
    fit=fit_with_cache(cross_validate(least_squares_analytical, mse), cache),
)

[{'degree': 10,
  'k_fold': 4,
  'seed': 1,
  'avg_mse_te': 0.31388900798768843,
  'avg_mse_tr': 0.2697523006711879,
  'w': array([-1.48128653e+02, -2.61257657e-01, -2.66942945e-01, -3.29203930e-02,
          3.16035006e-01, -4.91547234e-02, -8.88551013e-02,  4.18793318e-02,
         -6.30928178e-03,  1.69675617e-04,  2.23309295e-05,  2.94002762e-01,
         -6.85472735e-01, -5.48943112e-01,  4.16272639e-01,  7.37544363e-02,
         -7.62833611e-02,  1.49654001e-03,  6.00838657e-03, -1.22810237e-03,
          7.36427162e-05, -3.94848517e-02,  9.24040236e-03,  1.52025390e-01,
         -2.62335213e-01,  3.86328754e-03,  2.77495102e-01, -2.24603393e-01,
          7.66395611e-02, -1.23204519e-02,  7.67189014e-04,  1.30223246e-02,
          1.29362702e-01,  2.44043743e-01, -1.36571652e-01, -7.00316332e-02,
          3.56929864e-02,  9.94032131e-03, -4.19422714e-03, -4.16391676e-04,
          1.58521317e-04, -3.01961094e-01,  2.64623657e-01,  6.33378047e-01,
         -8.76594848e-01,  2.04

Now we can see that the best `avg_mse_te` we have is with the model with the polynomial expansion of 10th degree.

Let's verify our results on the full dataset:

In [24]:
# Switch to the full dataset and point the output folders away from test/.
SUB_SAMPLE = False
if SUB_SAMPLE:
    CACHE_DIR = "test/cache/"
    SUBMISSIONS_DIR = "test/submissions/"
else:
    CACHE_DIR = "cache/"
    SUBMISSIONS_DIR = "submissions/"

y, x, ids = load_csv_data('data/train.csv', SUB_SAMPLE)

# Same cache name as the sub-sample run, now resolved under cache/.
cache = Cache(CACHE_DIR + 'Tutorial_Results_Least_Squares_Cross_Validation')

# `hs` is reused as defined by the previous cross-validation cell.
evaluate(
    x=x,
    y=y,
    hs=hs,
    clean=clean_standardize_expand,
    fit=fit_with_cache(cross_validate(least_squares_analytical, mse), cache),
)

[{'degree': 10,
  'k_fold': 4,
  'seed': 1,
  'avg_mse_tr': 0.2943150811051833,
  'avg_mse_te': 0.29488026234833575,
  'w': array([-5.65009703e+00, -4.04677515e-01, -1.05693508e-01,  1.78421688e-01,
          5.34318347e-02, -7.05823375e-02,  1.88976307e-03,  1.33636276e-02,
         -4.99876121e-03,  7.38192584e-04, -4.02441339e-05,  2.88204974e-01,
         -7.14967631e-01, -5.21046162e-01,  4.39928075e-01,  4.27450813e-02,
         -7.15586929e-02,  6.23373595e-03,  3.87713230e-03, -9.01908638e-04,
          5.63053510e-05,  1.70149164e-01, -1.13765091e-01, -3.02243520e-01,
          4.94600393e-01, -3.32083307e-02, -2.80076835e-01,  1.97504427e-01,
         -5.97815975e-02,  8.68781149e-03, -4.95497896e-04,  1.67004286e-01,
          1.28360997e-01,  4.16779534e-02, -1.15685550e-01,  2.01496684e-02,
          2.02029918e-02, -4.45340124e-03, -1.21824620e-03,  3.24360070e-04,
         -9.78088014e-06, -8.06083383e-02,  1.43324683e-01,  1.11324292e-01,
         -3.26210203e-01,  8.76



Now we want to submit our best model. We do the following steps:

In [33]:
# Single configuration: the degree-13 model with the same CV settings as before.
hs = dict(degree=13, k_fold=4, seed=1)

res = evaluate(
    x=x,
    y=y,
    hs=hs,
    clean=clean_standardize_expand,
    fit=fit_with_cache(cross_validate(least_squares_analytical, mse), cache),
)

# Only one configuration was evaluated, so its entry is the first result.
best_w = res[0]['w']

print(best_w)

[-4.18938332e+02 -4.04680293e-01 -8.17720960e-02  1.66092500e-01
 -1.23413333e-03 -2.70214013e-02  3.05723896e-02 -2.56665648e-02
  1.23447808e-03  8.89171513e-03 -5.02350097e-03  1.23043664e-03
 -1.46085547e-04  6.87354081e-06  4.20166760e-01 -6.65989252e-01
 -9.73659482e-01  3.95402262e-01  4.60613782e-01 -1.39379467e-01
 -1.27669956e-01  5.71928846e-02  6.24802938e-03 -8.30286476e-03
  2.02532229e-03 -2.14431036e-04  8.68138543e-06  1.49264903e-01
 -2.02184578e-01 -1.92580140e-01  7.29737756e-01 -2.97021832e-01
 -4.07239509e-01  4.33118535e-01 -1.21914825e-01 -2.67761074e-02
  2.65217096e-02 -7.22002001e-03  9.04440876e-04 -4.45107827e-05
  1.15017739e-01  2.41786834e-01  1.91956945e-01 -3.02170286e-01
 -9.67777772e-02  1.27281803e-01  2.67368422e-02 -2.85273663e-02
 -2.09682713e-03  3.15234506e-03 -1.27582066e-04 -1.33609473e-04
  1.62728121e-05 -1.18248641e-01  1.81987595e-01  3.69274003e-01
 -6.34989424e-01 -2.43850093e-01  7.85370793e-01 -2.77674283e-01
 -2.32855737e-01  2.48069

We fetched the weight of our best model from the cache, now we can do our predictions on the true test set:

In [36]:
# Load the test CSV with the same SUB_SAMPLE setting as the training data.
y_test, x_test, ids_test = load_csv_data('data/test.csv', SUB_SAMPLE)

#we need to preprocess the test set too
# `hs` supplies the polynomial degree, so the expansion matches the trained model.
# NOTE(review): this re-runs the whole cleaning function on the test set, so it is
# presumably standardized with its own statistics rather than the training ones — confirm.
y_test, x_test = clean_standardize_expand(y_test, x_test, hs)

In [38]:
# Predict on the pre-processed test features with the selected weights.
y_pred = predict_values(x_test, best_w)

We now create our submission file in the `submissions/` folder:

In [39]:
# Write the test ids and predictions to a submission file under SUBMISSIONS_DIR.
create_csv_submission(ids_test, y_pred, SUBMISSIONS_DIR + 'Tutorial_Least_Squares_Cross_Validation_Degree_13')

### Parameters exploration for a Stochastic Gradient Descent with Least Squares

In [None]:
# Back to the sub-sampled dataset for this exploration.
SUB_SAMPLE = True
if SUB_SAMPLE:
    CACHE_DIR = "test/cache/"
    SUBMISSIONS_DIR = "test/submissions/"
else:
    CACHE_DIR = "cache/"
    SUBMISSIONS_DIR = "submissions/"

# Reload the training data to match the new SUB_SAMPLE setting.
y, x, ids = load_csv_data('data/train.csv', SUB_SAMPLE)

Let's explore the parameters of a simple SGD model with Least Squares.

First we try without cross validation:

In [None]:
# NOTE(review): `cache` and `hs` are not redefined in this section, so this cell
# uses whatever the earlier cells left in them — confirm that is intended here.
evaluate(
    x=x,
    y=y,
    hs=hs,
    clean=map_logistic(clean_standardize_expand),
    fit=descent_with_cache(
        descent=stochastic_gradient_descent_e(logistic_gradient_ridge),
        round_size=100,
        cache=cache,
        log=True,  # print the loss of the steps each round_size iterations
    ),
)