In [None]:
%run algebra.py
%run cache.py
%run costs.py
%run features.py
%run gradients.py
%run helpers.py
%run model.py
%run models.py
%run splits.py

import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import csv
import warnings
warnings.filterwarnings('ignore')

In [None]:
# When SUB_SAMPLE is on, cache files and submissions go under "test/" so they
# stay separate from the ones produced with SUB_SAMPLE off.
SUB_SAMPLE = False
if SUB_SAMPLE:
    CACHE_DIR = "test/cache/"
    SUBMISSIONS_DIR = "test/submissions/"
else:
    CACHE_DIR = "cache/"
    SUBMISSIONS_DIR = "submissions/"

In [None]:
# Load the training set; SUB_SAMPLE is forwarded to the loader.
# Presumably y = labels, x = feature matrix, ids = row identifiers — confirm in helpers.py.
y, x, ids = load_csv_data('data/train.csv', SUB_SAMPLE)

# 1 - Only Using Clean Features

### Ridge Regression with Fixed Degree

##### Without Validation

In [None]:
class RidgeRegression_MSE_FixedDegree_Model(Model):
    """Ridge regression on cleaned, polynomially expanded features, scored by MSE.

    Hyper-parameters are read from the dict ``h``:
    ``h['degree']`` (polynomial degree, used in ``prepare``) and
    ``h['lambda']`` (ridge penalty, used in ``fit``).
    """

    def prepare(self, x, y, h):
        """Clean ``x`` and expand it with ``build_poly`` to degree ``h['degree']``.

        The helpers are applied in this order: ``remove_errors``,
        ``remove_outliers``, ``standardize_all``, ``remove_nan_features``,
        then ``build_poly``. ``y`` is returned unchanged.
        """
        degree = int(h['degree'])

        x = remove_errors(x)
        x = remove_outliers(x)
        x = standardize_all(x)
        x = remove_nan_features(x)
        x = build_poly(x, degree)

        return x, y

    def fit(self, x, y, h):
        """Return the result of ``ridge_regression(y, x, h['lambda'])``.

        The polynomial degree is not needed here: it has already been applied
        to ``x`` by ``prepare``.
        """
        lambda_ = float(h['lambda'])

        return ridge_regression(y, x, lambda_)

    def test(self, x, y, w, h):
        """Return ``{'mse': compute_mse(y, x, w)}``; ``h`` is not used."""
        return {'mse': compute_mse(y, x, w)}

In [None]:
# Grid-search the ridge model over polynomial degree and penalty strength.
myModel = RidgeRegression_MSE_FixedDegree_Model()

hs = {}
hs['degree'] = np.arange(4, 16)        # degrees 4..15
hs['lambda'] = np.logspace(-8, -2, 7)  # 1e-8 .. 1e-2, one value per decade

res = myModel.evaluate(
    x, y, hs,
    filename=CACHE_DIR + 'RidgeRegression_MSE_FixedDegree',
)
# Pull the 'mse' entry out of every element of the result grid.
res_mse = np.vectorize(lambda entry: entry['mse'])(res)

plot_heatmap(res, hs, 'mse', 'degree', 'lambda')
find_arg_min(res, 'mse')

##### Using Cross-Validation

Here, we implement the same model with cross-validation.

In [None]:
# Wrap the ridge model in CrossValidationModel and search the degree/lambda
# grid again; 'k_fold' and 'seed' are passed along in the same dict.
myModel = CrossValidationModel(RidgeRegression_MSE_FixedDegree_Model())

hs = {}
hs['degree'] = np.arange(4, 16)        # degrees 4..15
hs['lambda'] = np.logspace(-8, -2, 7)  # 1e-8 .. 1e-2, one value per decade
hs['k_fold'] = 4
hs['seed'] = 0

res = myModel.evaluate(
    x, y, hs,
    CACHE_DIR + 'RidgeRegression_MSE_FixedDegree_CrossValidation',
)

plot_heatmap(res, hs, 'avg_mse_te', 'degree', 'lambda')
# Keep the hyper-parameters with the lowest 'avg_mse_te' and display them.
best_h = find_arg_min(res, 'avg_mse_te')
best_h

In [None]:
# Run predict with the best hyper-parameters found above; the last argument is a
# path prefix under SUBMISSIONS_DIR (presumably where the submission file is written — confirm in model.py).
myModel.predict(best_h, x, y, SUBMISSIONS_DIR + 'RidgeRegression_MSE_FixedDegree_CrossValidation_Model')

### SGD Lasso

In [None]:
class SGD_Lasso_MSE_FixedDegree_CrossValidation_Model(Model):
    """L1-penalised least squares trained by SGD, scored with k-fold cross-validation.

    ``fit`` reads these keys from the hyper-parameter dict ``h``:
    'degree', 'lambda', 'k_fold', 'seed', 'precision' and 'gamma'.
    """

    def prepare(self, x, y, h=None):
        """Return ``clean_data(x)`` together with ``y`` unchanged.

        ``h`` is accepted and ignored so that this method can be called either
        with two arguments or with the ``(x, y, h)`` form used by the ridge
        model's ``prepare`` in this notebook.
        """
        return clean_data(x), y

    def fit(self, x, y, h={}):
        """Train one SGD lasso model per fold and return the averaged MSEs.

        For every fold the data is split with ``cross_data``, expanded with
        ``build_poly(., degree)``, and weights starting at zero are updated one
        mini-batch at a time until the full-training-set MSE stops decreasing
        by more than ``precision`` between consecutive steps.

        Returns:
            A tuple ``(scores, None)`` where ``scores`` is
            ``{'avg_mse_tr': ..., 'avg_mse_te': ...}`` averaged over the folds.

        Raises:
            KeyError: if a required hyper-parameter is missing from ``h``
                (including when the default empty dict is used).
        """
        degree = int(h['degree'])
        lambda_ = float(h['lambda'])
        k_fold = int(h['k_fold'])
        seed = int(h['seed'])
        precision = float(h['precision'])
        gamma = float(h['gamma'])

        batch_size = 1

        avg_mse_tr = 0
        avg_mse_te = 0

        # split data in k fold
        k_indices = build_k_indices(y, k_fold, seed)

        for k in range(k_fold):

            # get split data
            x_tr, x_te, y_tr, y_te = cross_data(y, x, k_indices, k)

            # form data with polynomial degree
            x_tr = build_poly(x_tr, degree)
            x_te = build_poly(x_te, degree)

            w = np.zeros(x_tr.shape[1])
            loss = float("inf")
            diff = float("inf")

            while diff > precision:
                for y_batch, tx_batch in batch_iter(y_tr, x_tr, batch_size=batch_size, num_batches=1):

                    # stochastic gradient of the squared-error term
                    err = y_batch - tx_batch.dot(w)
                    grad = -tx_batch.T.dot(err) / len(err)

                    # subgradient of the L1 penalty |w|: sign(w), taken as 0 at w == 0
                    omega = np.sign(w)

                    # descent step: the penalty term must pull w towards zero,
                    # so it is added to the gradient that gets subtracted
                    w = w - gamma * (grad + lambda_ * omega)

                # improvement of the full training loss over the previous step;
                # a scalar, so the loop condition is valid for any batch size.
                # A negative value (loss went up) also ends the loop.
                previous_loss = loss
                loss = compute_mse(y_tr, x_tr, w)
                diff = previous_loss - loss

            # calculate the loss for train and test data + add it
            avg_mse_tr += compute_mse(y_tr, x_tr, w) / k_fold
            avg_mse_te += compute_mse(y_te, x_te, w) / k_fold

        return {
            "avg_mse_tr": avg_mse_tr,
            "avg_mse_te": avg_mse_te
        }, None

In [None]:
# NOTE(review): a class with this exact name is also defined in an earlier cell
# of this notebook; running this cell rebinds the name. Consider keeping a
# single definition.
class RidgeRegression_MSE_FixedDegree_Model(Model):
    """Ridge regression on cleaned, polynomially expanded features, scored by MSE.

    Hyper-parameters are read from the dict ``h``:
    ``h['degree']`` (polynomial degree, used in ``prepare``) and
    ``h['lambda']`` (ridge penalty, used in ``fit``).
    """

    def prepare(self, x, y, h):
        """Clean ``x`` and expand it with ``build_poly`` to degree ``h['degree']``.

        The helpers are applied in this order: ``remove_errors``,
        ``remove_outliers``, ``standardize_all``, ``remove_nan_features``,
        then ``build_poly``. ``y`` is returned unchanged.
        """
        degree = int(h['degree'])

        x = remove_errors(x)
        x = remove_outliers(x)
        x = standardize_all(x)
        x = remove_nan_features(x)
        x = build_poly(x, degree)

        return x, y

    def fit(self, x, y, h):
        """Return the result of ``ridge_regression(y, x, h['lambda'])``.

        The polynomial degree is not needed here: it has already been applied
        to ``x`` by ``prepare``.
        """
        lambda_ = float(h['lambda'])

        return ridge_regression(y, x, lambda_)

    def test(self, x, y, w, h):
        """Return ``{'mse': compute_mse(y, x, w)}``; ``h`` is not used."""
        return {'mse': compute_mse(y, x, w)}