In [None]:
%load_ext autoreload
%autoreload 2

from cache import *
from costs import *
from features import *
from helpers import *
from evaluate import *
from predict import *
from validate import *
from implementations import *

import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import csv
import warnings
warnings.filterwarnings('ignore')

# Tutorial

Here we will see how to run a basic model and how to run a grid search over the model's hyperparameters.

First, we define the directory paths. You can specify whether you want to load a sub-sample of the dataset or the full dataset by setting the `SUB_SAMPLE` constant to `True` or `False`:

In [None]:
# Toggle between the sub-sampled dataset (True) and the full dataset (False).
SUB_SAMPLE = True

# Sub-sampled runs use the folders under test/; full runs use the top-level ones.
if SUB_SAMPLE:
    CACHE_DIR = "test/cache/"
    SUBMISSIONS_DIR = "test/submissions/"
else:
    CACHE_DIR = "cache/"
    SUBMISSIONS_DIR = "submissions/"

Then, we load our data set:

In [None]:
# Load the training CSV; SUB_SAMPLE selects the sub-sample or the full dataset.
# Presumably y = labels, x = features, ids = sample identifiers -- TODO confirm.
y, x, ids = load_csv_data('data/train.csv', SUB_SAMPLE)

### Pre-processing

We now define how we want to process our dataset before doing any training. For this dataset we do the following preprocessing steps:
1. we remove all the `-999` values
2. we remove the outliers with a clamping
3. we standardize our dataset
4. we remove the NaN features with `remove_nan_features`
5. we do a polynomial expansion with the `degree` value passed as hyperparameter

The helper functions called by this function are in the file `features.py`:

In [None]:
def clean_standardize_expand(y, x, h):
    """Clean the features, standardize them and expand them polynomially.

    The helpers are applied to `x` in this order: `remove_errors`,
    `remove_outliers`, `standardize_all`, `remove_nan_features`, and finally
    `build_poly` with the requested degree. `y` is passed through untouched.

    Args:
        y: labels/targets; returned unchanged.
        x: features to pre-process.
        h: hyperparameter mapping; only `h['degree']` is read (cast to int).

    Returns:
        The tuple `(y, x_processed)`.
    """
    # Read the degree up front so a missing/invalid value fails before any work.
    poly_degree = int(h['degree'])

    cleaning_steps = (
        remove_errors,
        remove_outliers,
        standardize_all,
        remove_nan_features,
    )
    for step in cleaning_steps:
        x = step(x)

    return y, build_poly(x, poly_degree)

### Parameters exploration for a simple Least Squares model

Now we want to try some models with different parameters to see which one is the best for our problem.

Here we define the parameters we want to explore. Since least squares is a simple model, the only parameter to explore is the degree of the polynomial expansion. Let's see the results with the degree varying from 10 to 13:

In [None]:
# Results of this exploration are stored under this cache name.
cache = Cache(CACHE_DIR + 'Tutorial_Least_Squares')

# Hyperparameters to try: polynomial degrees 10, 11, 12 and 13.
hs = {'degree': np.arange(10, 14)}

# Compute the values for each model, cleaning the data before each fit.
evaluate(
    fit=clean_and_fit_with_cache(
        clean=clean_standardize_expand,
        fit=least_squares_weights,
        cache=cache,
    ),
    y=y,
    x=x,
    hs=hs,
)

Now, if we take a look at `test/cache/Tutorial_Least_Squares.csv`, we can see that the best model is the one with the polynomial expansion of 13th degree.

This workflow is very basic and only returns the weights of the different models we trained. To get the error on a test set we can use the `cross_validation` wrapper function. We first need to define the loss function that will feed our cross-validation.

Here we will use the basic `compute_mse` function. The `mse` key we add is used by the cache to name the columns of the generated file correctly. The columns will be named `avg_mse_tr` and `avg_mse_te`.

Note that if we change the key name to `loss` for example, in the file we would get `avg_loss_tr` and `avg_loss_te`.

In [None]:
def mse(y, x, w, h):
    """Validation callback that reports the mean squared error.

    Args:
        y: labels/targets forwarded to `compute_mse`.
        x: features forwarded to `compute_mse`.
        w: model weights forwarded to `compute_mse`.
        h: hyperparameters; accepted for interface compatibility but unused.

    Returns:
        A one-entry dict `{'mse': <value of compute_mse(y, x, w)>}`. The key
        is what names the result columns (e.g. `avg_mse_tr`, `avg_mse_te`).
    """
    error = compute_mse(y, x, w)
    return {'mse': error}

In [None]:
# Dedicated cache for the cross-validated exploration.
cache = Cache(CACHE_DIR + 'Tutorial_Least_Squares_Cross_Validation')

# Polynomial degrees 10..13 plus the cross-validation settings.
hs = {
    'degree': np.arange(10, 14),
    'k_fold': 4,
    'seed': 1,
}

# Wrap the least-squares fit in cross-validation, scored with `mse`.
res = evaluate(
    fit=clean_and_fit_with_cache(
        clean=clean_standardize_expand,
        fit=cross_validate(fit=least_squares_weights, validate=mse),
        cache=cache,
    ),
    y=y,
    x=x,
    hs=hs,
)

The big advantage of our cross-validation implementation is that we can easily find the best parameters with the function `find_arg_min`, based on a specific criterion.

Here we want the model that has the smallest `avg_mse_te`, since the test errors are more representative of the actual quality of the model:

In [None]:
# Find the best parameters in `res` according to the smallest average test MSE.
find_arg_min(res, 'avg_mse_te')

Now we can see that the best `avg_mse_te` we have is with the model with the polynomial expansion of 10th degree.

Let's verify our results on the full dataset:

In [None]:
# Switch to the full dataset and the top-level cache/submission folders.
SUB_SAMPLE = False
if SUB_SAMPLE:
    CACHE_DIR = "test/cache/"
    SUBMISSIONS_DIR = "test/submissions/"
else:
    CACHE_DIR = "cache/"
    SUBMISSIONS_DIR = "submissions/"

y, x, ids = load_csv_data('data/train.csv', SUB_SAMPLE)

# Same cache name as before, but it now lives under cache/.
cache = Cache(CACHE_DIR + 'Tutorial_Least_Squares_Cross_Validation')

# `hs` is not redefined in this cell: the previously defined grid is reused.
res = evaluate(
    fit=clean_and_fit_with_cache(
        clean=clean_standardize_expand,
        fit=cross_validate(fit=least_squares_weights, validate=mse),
        cache=cache,
    ),
    y=y,
    x=x,
    hs=hs,
)

find_arg_min(res, 'avg_mse_te')



Now we want to submit our best model. We do the following steps:

In [None]:
# A single configuration: degree 13 with the cross-validation settings.
hs = {'degree': 13, 'k_fold': 4, 'seed': 1}

# `cache` is not redefined in this cell; the previously created Cache is reused.
res = evaluate(
    clean_and_fit_with_cache(
        clean=clean_standardize_expand,
        fit=cross_validate(least_squares_weights, mse),
        cache=cache,
    ),
    y=y,
    x=x,
    hs=hs,
)

# Take the weights stored under 'w' in the first result entry
# (presumably the only entry, since a single configuration was evaluated).
best_w = res[0]['w']

print(best_w)

We fetched the weight of our best model from the cache, now we can do our predictions on the true test set:

In [None]:
# Load the test CSV with the same SUB_SAMPLE flag used for the training data.
y_test, x_test, ids_test = load_csv_data('data/test.csv', SUB_SAMPLE)

# The test set goes through the same pre-processing, with the same `hs['degree']`.
# NOTE(review): the pipeline is re-run on the test features alone, so any statistics
# computed inside the helpers presumably come from the test set rather than the
# training set -- confirm this is intended.
y_test, x_test = clean_standardize_expand(y_test, x_test, hs)

In [None]:
# Predict on the pre-processed test features using the selected weights.
y_pred = predict_values(x_test, best_w)

We now create our submission file in the `submissions/` folder:

In [None]:
# Create the submission file from the test ids and predictions, under SUBMISSIONS_DIR.
create_csv_submission(ids_test, y_pred, SUBMISSIONS_DIR + 'Tutorial_Least_Squares_Cross_Validation_Degree_13')

### Parameters exploration for a Stochastic Gradient Descent with Least Squares

In [None]:
# Back to the sub-sampled dataset and the folders under test/.
SUB_SAMPLE = True
if SUB_SAMPLE:
    CACHE_DIR = "test/cache/"
    SUBMISSIONS_DIR = "test/submissions/"
else:
    CACHE_DIR = "cache/"
    SUBMISSIONS_DIR = "submissions/"

y, x, ids = load_csv_data('data/train.csv', SUB_SAMPLE)

Let's explore the parameters of a simple gradient descent (GD) model with the Least Squares gradient. We do the same cleaning as before.

First we try without cross validation:

In [None]:
# Results of the plain (non cross-validated) gradient descent exploration.
cache = Cache(CACHE_DIR + 'Tutorial_Gradient_Descent_Least_Squares')

# Degrees 1..4 and five log-spaced gamma values between 1e-10 and 1e-5.
hs = {
    'degree': np.arange(1, 5),
    'gamma': np.logspace(-10, -5, 5),
    'max_iters': 100,
    'seed': 1,
}

# `descent_with_loss` wraps the descent so the `mse` loss is reported in the log.
evaluate(
    fit=clean_and_descent_with_cache(
        clean=clean_standardize_expand,
        descent=descent_with_loss(
            descent=gradient_descent(least_squares_gradient),
            loss=mse,
        ),
        round_size=50,
        cache=cache,
        multiple=False,
    ),
    y=y,
    x=x,
    hs=hs,
)

Note that we wrapped `gradient_descent` with the function `descent_with_loss` in order to print the loss in the log. Any loss function can be used.

We can also use a cross validation for the descent model by wrapping our descent model:

In [None]:
# Results of the cross-validated gradient descent exploration.
cache = Cache(CACHE_DIR + 'Tutorial_Gradient_Descent_Least_Squares_Cross_Validation')

# Degrees 1..4 and five log-spaced gamma values between 1e-10 and 1e-5,
# plus the descent and cross-validation settings.
hs = {
    'degree': np.arange(1, 5),
    'gamma': np.logspace(-10, -5, 5),
    'max_iters': 100,
    'k_fold': 4,
    'seed_cv': 1,
    'seed': 1,
}

# The loss-reporting descent is itself wrapped in cross-validation,
# and each fold is scored with `mse`.
res = evaluate(
    fit=clean_and_descent_with_cache(
        clean=clean_standardize_expand,
        descent=cross_validate_descent(
            descent=descent_with_loss(
                descent=gradient_descent(least_squares_gradient),
                loss=mse,
            ),
            validate=mse,
        ),
        round_size=50,
        cache=cache,
        log=False,
    ),
    y=y,
    x=x,
    hs=hs,
)

And finally we find the best parameters for this gradient descent:

In [None]:
# Find the best gradient descent parameters in `res` by smallest average test MSE.
find_arg_min(res, 'avg_mse_te')

### Plots for parameters exploration

We have a useful function, `plot_heatmap`, that takes the result of the `evaluate` function, the name of the loss we want to plot, and the parameters to use for the x and y axes.

Here we want to visualize the `avg_mse_te` as a function of `degree` and `gamma`:

In [None]:
# Heatmap of the average test MSE from `res`, with `degree` and `gamma` as the axes.
plot_heatmap(res, 'avg_mse_te', 'degree', 'gamma')