In [None]:
%run algebra.py
%run cache.py
%run costs.py
%run features.py
%run gradients.py
%run helpers.py
%run model.py
%run models.py
%run splits.py

import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import csv
import warnings
warnings.filterwarnings('ignore')

In [None]:
def load_csv_data(data_path, sub_sample=True):
    """Load a labelled CSV file and return ``(yb, input_data, ids)``.

    The file is read with one header row skipped. Column 0 is used as the
    event id, column 1 as the class label (text) and every remaining column
    as a numeric feature.

    Args:
        data_path: path of the CSV file to read.
        sub_sample: when true, keep only every 50th row (rows 0, 50, 100, ...).

    Returns:
        yb: float array of labels, -1 where the label is ``'b'`` and 1 otherwise.
        input_data: 2-D float array holding the feature columns (column 2 onwards).
        ids: integer array with the event ids taken from column 0.
    """
    # The label column is text, so it is parsed in its own pass.
    # atleast_1d / atleast_2d keep the shapes stable when the file holds a
    # single data row (genfromtxt would otherwise drop a dimension).
    y = np.atleast_1d(
        np.genfromtxt(data_path, delimiter=",", skip_header=1, dtype=str, usecols=1)
    )
    # In this numeric pass the label column parses as NaN; it is dropped below.
    x = np.atleast_2d(np.genfromtxt(data_path, delimiter=",", skip_header=1))

    # The `np.int` alias no longer exists in current NumPy; the builtin `int`
    # is its documented replacement.
    ids = x[:, 0].astype(int)
    input_data = x[:, 2:]

    # convert class labels from strings to binary (-1, 1)
    yb = np.ones(len(y))
    yb[y == 'b'] = -1

    # sub-sample
    if sub_sample:
        yb = yb[::50]
        input_data = input_data[::50]
        ids = ids[::50]

    return yb, input_data, ids

In [None]:
# When True, the data set is loaded with sub_sample=True (every 50th row only).
SUB_SAMPLE = True
# Prefix for the `filename` handed to Model.evaluate; sub-sampled runs get
# their own directory so they do not share file names with full runs.
OUTPUT_DIR = "results_test/" if SUB_SAMPLE else "results/"

In [None]:
# Labels, raw features and event ids of the training file (sub-sampled when SUB_SAMPLE is set).
y, x, ids = load_csv_data('data/train.csv', sub_sample=SUB_SAMPLE)

# 1 - Only Using Clean Features

In [None]:
def plot_heatmap(res, hs, value, x, y):
    """Show a heat map of ``1 / value`` over two hyper-parameters.

    The ``value`` field is extracted from every entry of ``res``. Walking the
    keys of ``hs`` in sorted order, every axis whose key is neither ``x`` nor
    ``y`` is averaged away, and the reciprocal of what remains is drawn.

    NOTE(review): this relies on ``res`` having one axis per key of ``hs``,
    ordered like ``sorted(hs)`` -- confirm against ``Model.evaluate``.

    Args:
        res: array of mapping-like entries (each indexable by ``value``).
        hs: dict of hyper-parameters; only its keys are used here.
        value: name of the numeric field to plot.
        x, y: the two keys of ``hs`` to keep as image axes.
    """
    # otypes pins the result to float; without it np.vectorize infers the
    # dtype from the first entry only.
    val = np.vectorize(lambda entry: entry[value], otypes=[float])(res)

    axis = 0
    kept = []
    for key in sorted(hs.keys()):
        if key == x or key == y:
            # This axis stays; the next key refers to the following axis.
            kept.append(key)
            axis += 1
        else:
            # Averaging removes the axis, so `axis` already points at the next key.
            val = val.mean(axis=axis)

    fig, ax = plt.subplots()
    # The reciprocal is drawn, so larger image values mean a smaller `value`.
    image = ax.imshow(1 / val, cmap='hot', interpolation='none')
    if len(kept) == 2:
        # Rows follow the first kept key, columns the second; ticks are grid indices.
        ax.set_ylabel(kept[0])
        ax.set_xlabel(kept[1])
    ax.set_title("1 / mean " + str(value))
    fig.colorbar(image, ax=ax)
    plt.show()

def find_arg_min(res, value):
    """Print the entry of ``res`` whose ``value`` field is smallest.

    NaN values are ignored. When several entries share the minimum, the first
    one in row-major order is printed.

    Args:
        res: NumPy array of mapping-like entries (each indexable by ``value``).
        value: name of the numeric field to minimise.

    Raises:
        ValueError: if ``res`` is empty or every extracted value is NaN
            (raised by ``np.nanargmin``).
    """
    # otypes pins the result to float; without it np.vectorize infers the
    # dtype from the first entry, which truncates later floats when that
    # first value happens to be an int.
    val = np.vectorize(lambda entry: entry[value], otypes=[float])(res)

    # A plain min() returns NaN as soon as one value is NaN, and no element
    # compares equal to NaN, so the NaN-aware argmin is used instead.
    best = np.unravel_index(np.nanargmin(val), val.shape)

    print(res[best])

### Ridge Regression with Fixed Degree

In [None]:
class RidgeRegression_MSE_Degree_Model(Model):
    """Ridge regression on a polynomial expansion, scored by MSE on the fitted data.

    Hyper-parameters read from ``h``: ``'degree'`` and ``'lambda'``.
    """

    def prepare(self, x, y):
        """Return the cleaned version of ``self.raw_x`` together with ``y`` unchanged.

        The ``x`` argument is not used; the features come from ``self.raw_x``.
        """
        cleaned = clean_data(self.raw_x)
        return cleaned, y

    def fit(self, x, y, h={}):
        """Fit ridge regression for one hyper-parameter combination.

        Returns a ``({"mse": ...}, weights)`` pair, where the MSE is computed
        on the same data the weights were fitted on.
        """
        poly_degree = int(h['degree'])
        ridge_lambda = float(h['lambda'])

        # Polynomial expansion of the features up to `poly_degree`.
        design = build_poly(x, poly_degree)
        weights = ridge_regression(y, design, ridge_lambda)

        scores = {"mse": compute_mse(y, design, weights)}
        return scores, weights

In [None]:
# Grid search for the ridge model over polynomial degree and lambda.
myModel = RidgeRegression_MSE_Degree_Model(x, y)

# Degrees 0..19 and 20 log-spaced lambdas between 1e-9 and 1e-3.
hs = { 
    'degree': np.arange(20), 
    'lambda': np.logspace(-9, -3, 20)
}

res = myModel.evaluate(hs, filename=OUTPUT_DIR+'RidgeRegression_MSE_Degree_Model')
# Plain array of the 'mse' field of every result entry (not used again in this cell).
res_mse = np.vectorize(lambda x: x['mse'])(res)

plot_heatmap(res, hs, 'mse', 'degree', 'lambda')
find_arg_min(res, 'mse')

### Ridge Regression with Fixed Degree & Cross Validation

In [None]:
class RidgeRegression_MSE_FixedDegree_CrossValidation_Model(Model):
    """Ridge regression on a polynomial expansion, scored by k-fold cross-validation.

    Hyper-parameters read from ``h``: ``'degree'``, ``'lambda'``, ``'k_fold'``
    and ``'seed'``.
    """

    def prepare(self, x, y):
        """Return the cleaned version of ``self.raw_x`` together with ``y`` unchanged.

        The ``x`` argument is not used; the features come from ``self.raw_x``.
        """
        cleaned = clean_data(self.raw_x)
        return cleaned, y

    def fit(self, x, y, h={}):
        """Run k-fold cross-validation for one hyper-parameter combination.

        Returns ``({'avg_mse_tr': ..., 'avg_mse_te': ...}, w)``: the train and
        test MSE averaged over the folds, and the weights fitted on the LAST
        fold only.
        """
        degree = int(h['degree'])
        lambda_ = float(h['lambda'])
        n_folds = int(h['k_fold'])
        seed = int(h['seed'])

        mean_train_mse = 0
        mean_test_mse = 0

        # Row indices of each fold (helper defined outside this notebook).
        fold_indices = build_k_indices(y, n_folds, seed)

        for fold in range(n_folds):
            # Split for this fold -- presumably `fold` is the held-out part;
            # confirm in cross_data.
            x_train, x_test, y_train, y_test = cross_data(y, x, fold_indices, fold)

            # Same polynomial expansion on both parts.
            phi_train = build_poly(x_train, degree)
            phi_test = build_poly(x_test, degree)

            w = ridge_regression(y_train, phi_train, lambda_)

            # Each fold contributes 1/n_folds of its MSE to the running mean.
            mean_train_mse += compute_mse(y_train, phi_train, w) / n_folds
            mean_test_mse += compute_mse(y_test, phi_test, w) / n_folds

        scores = {
            'avg_mse_tr': mean_train_mse,
            'avg_mse_te': mean_test_mse
        }
        return scores, w

In [None]:
# NOTE(review): `sns` is not referenced in the visible cells. Some seaborn
# versions change matplotlib's default style on import -- confirm before
# moving or removing this import.
import seaborn as sns

# Grid search for the cross-validated ridge model.
myModel = RidgeRegression_MSE_FixedDegree_CrossValidation_Model(x, y)



# Degrees 0..19, 20 log-spaced lambdas between 1e-9 and 1e-3; k_fold and seed are single values.
hs={ 
    'degree': np.arange(20), 
    'lambda': np.logspace(-9, -3, 20),
    'k_fold': 5,
    'seed': 0
}

res = myModel.evaluate(hs=hs, filename=OUTPUT_DIR+'RidgeRegression_MSE_FixedDegree_CrossValidation_Model')

# Selection is made on the cross-validated test error.
plot_heatmap(res, hs, 'avg_mse_te', 'degree', 'lambda')
find_arg_min(res, 'avg_mse_te')

In [None]:
class SGD_Lasso_MSE_FixedDegree_CrossValidation_Model(Model):
    """Polynomial regression trained by SGD with an L1 (lasso) penalty, scored by k-fold CV.

    Hyper-parameters read from ``h``: ``'degree'``, ``'lambda'``, ``'k_fold'``,
    ``'seed'``, ``'precision'`` and ``'gamma'`` (the step size).
    """

    def prepare(self, x, y):
        """Return the cleaned version of ``self.raw_x`` together with ``y`` unchanged.

        The ``x`` argument is not used; the features come from ``self.raw_x``.
        """
        return clean_data(self.raw_x), y

    def fit(self, x, y, h={}):
        """Run k-fold cross-validation for one hyper-parameter combination.

        For every fold the weights start at zero and are updated one
        mini-batch (of size 1) at a time until the training MSE improves by
        no more than ``precision`` between two consecutive updates.

        Returns ``({'avg_mse_tr': ..., 'avg_mse_te': ...}, None)``; no weights
        are returned by this model.
        """
        degree = int(h['degree'])
        lambda_ = float(h['lambda'])
        k_fold = int(h['k_fold'])
        seed = int(h['seed'])
        precision = float(h['precision'])
        gamma = float(h['gamma'])

        batch_size = 1

        avg_mse_tr = 0
        avg_mse_te = 0

        # split data in k fold
        k_indices = build_k_indices(y, k_fold, seed)

        for k in range(k_fold):

            # get split data
            x_tr, x_te, y_tr, y_te = cross_data(y, x, k_indices, k)

            # form data with polynomial degree:
            x_tr = build_poly(x_tr, degree)
            x_te = build_poly(x_te, degree)

            w = np.zeros(x_tr.shape[1])
            loss = float("inf")
            diff = float("inf")

            # One-sided criterion: the loop also stops as soon as the loss
            # goes up (diff < 0), not only when the improvement gets small.
            while diff > precision:
                # NOTE(review): if batch_iter samples at random, results are
                # only reproducible when its RNG is seeded -- confirm.
                for y_batch, tx_batch in batch_iter(y_tr, x_tr, batch_size=batch_size, num_batches=1):

                    # stochastic gradient of the squared-error term
                    err = y_batch - tx_batch.dot(w)
                    grad = -tx_batch.T.dot(err) / len(err)

                    # Sub-gradient of lambda * ||w||_1 is lambda * sign(w).
                    # It is added to the gradient so that the descent step
                    # shrinks the weights toward zero.
                    omega = np.sign(w)

                    # update w through the stochastic gradient update
                    w = w - gamma * (grad + lambda_ * omega)

                # Improvement of the full training MSE since the previous
                # update; both terms are scalars computed by compute_mse
                # (the first one is +inf on the first pass).
                previous_loss = loss
                loss = compute_mse(y_tr, x_tr, w)
                diff = previous_loss - loss

            # calculate the loss for train and test data + add it:
            avg_mse_tr += compute_mse(y_tr, x_tr, w) / k_fold
            avg_mse_te += compute_mse(y_te, x_te, w) / k_fold

        return {
            "avg_mse_tr": avg_mse_tr,
            "avg_mse_te": avg_mse_te
        }, None

In [None]:
# Grid search for the SGD lasso model.
myModel = SGD_Lasso_MSE_FixedDegree_CrossValidation_Model(x, y)

# Degrees 0..9, 10 log-spaced lambdas and 5 log-spaced step sizes between
# 1e-10 and 1e-5; k_fold, seed and precision are single values.
hs = {
    'degree': np.arange(10),
    'lambda': np.logspace(-10, -5, 10),
    'k_fold': 5,
    'seed': 0,
    'precision': 0.001,
    'gamma': np.logspace(-10, -5, 5)
}

# Use OUTPUT_DIR like the other searches, so a sub-sampled run does not
# share its results file name with a full run.
res = myModel.evaluate(hs, filename=OUTPUT_DIR+'SGD_Lasso_MSE_FixedDegree_CrossValidation_Model')

# One heat map per pair of swept hyper-parameters; the third one is averaged out.
plot_heatmap(res, hs, 'avg_mse_te', 'degree', 'lambda')
plot_heatmap(res, hs, 'avg_mse_te', 'degree', 'gamma')
plot_heatmap(res, hs, 'avg_mse_te', 'lambda', 'gamma')

find_arg_min(res, 'avg_mse_te')