In [6]:
%load_ext autoreload
%autoreload 2

from algebra import *
from cache import *
from costs import *
from features import *
from gradients import *
from helpers import *
from model import *
from splits import *

import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import csv
import warnings
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
# Run-wide switch: when SUB_SAMPLE is true, cache files and submissions are
# written under "test/" instead of the top-level directories.
SUB_SAMPLE = False
CACHE_DIR, SUBMISSIONS_DIR = (
    ("test/cache/", "test/submissions/")
    if SUB_SAMPLE
    else ("cache/", "submissions/")
)

In [8]:
# Load the training set. SUB_SAMPLE is forwarded as the second argument of
# load_csv_data (presumably it selects a reduced dataset — confirm in helpers).
# The call is unpacked into three values: y, x and ids.
y, x, ids = load_csv_data('data/train.csv', SUB_SAMPLE)

# 1 - Analytical Results

### Ridge Regression with Fixed Degree

##### Without Validation

In [9]:
def clean_standardize_expand(y, x, h):
    """Run the feature-cleaning pipeline on `x` and expand it with `build_poly`.

    `y` is returned untouched. `h` must provide a 'degree' entry, which is
    coerced with int() before any cleaning starts and then handed to
    `build_poly` as its second argument.

    Returns the pair ``(y, transformed_x)``.
    """
    poly_degree = int(h['degree'])

    # Each stage takes the feature matrix and returns the next version of it;
    # the order below is the order in which the stages are applied.
    pipeline = (remove_errors, remove_outliers, standardize_all, remove_nan_features)
    features = x
    for stage in pipeline:
        features = stage(features)

    return y, build_poly(features, poly_degree)

In [None]:
def ridge_regression_analytical(y, x, h):
    """Fit weights with `ridge_regression` and report them with their MSE.

    Parameters
    ----------
    y, x : forwarded unchanged to `ridge_regression` and `compute_mse`.
    h : mapping of hyper-parameters. Only ``h['lambda']`` is read (coerced
        with float()). The previous version also parsed ``h['degree']`` into
        a local that was never used, which made the function fail with a
        KeyError for callers that had no reason to supply a degree.

    Returns
    -------
    dict with two entries: 'w' (the value returned by `ridge_regression`)
    and 'mse' (the value returned by `compute_mse` for those weights).
    """
    lambda_ = float(h['lambda'])

    w = ridge_regression(y, x, lambda_)

    return {
        'w': w,
        'mse': compute_mse(y, x, w),
    }

In [None]:
# Hyper-parameters for the fixed-degree ridge run: three candidate polynomial
# degrees and a single value of lambda.
hs = {'degree': [5, 6, 7], 'lambda': 1e-4}

# The result is bound to `_` so the notebook does not echo it under the cell.
_ = evaluate(
    x=x,
    y=y,
    hs=hs,
    clean=clean_standardize_expand,
    fit=ridge_regression_analytical,
    cache=CACHE_DIR + 'clean_standardize_expand_ridge_regression_analytical',
)

##### Using Cross-Validation

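Here, we fit the same ridge regression model, but score each hyper-parameter combination with k-fold cross-validation instead of on the data it was trained on.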

In [None]:
# Search grid for the cross-validated ridge regression. Key order is kept as
# degree, lambda, k_fold, seed.
hs = {
    'degree': np.arange(4, 16),            # integers 4 through 15
    'lambda': np.logspace(-8, -4, num=5),  # 1e-8, 1e-7, ..., 1e-4
    'k_fold': 4,
    'seed': 0,
}

def mse(y, x, w):
    """Wrap ``compute_mse(y, x, w)`` in a one-entry dict keyed by 'mse'."""
    error = compute_mse(y, x, w)
    return {'mse': error}

# Combine the analytical ridge fit with the `mse` scorer through
# `cross_validate`; the resulting callable is what `evaluate` receives as `fit`.
fit_function = cross_validate(ridge_regression_analytical, mse)

# Left as the cell's last expression, so the notebook displays the returned value.
evaluate(
    x=x,
    y=y,
    hs=hs,
    clean=clean_standardize_expand,
    fit=fit_function,
    cache=CACHE_DIR + 'clean_standardize_expand_cross_validate_ridge_regression_analytical_mse',
)

In [None]:
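# TODO: write the submission file once the best hyper-parameters have been selected.
# `myModel` and `best_h` do not appear to be defined in the cells above, so the call stays disabled:
# myModel.predict(best_h, x, y, SUBMISSIONS_DIR + 'RidgeRegression_MSE_FixedDegree_CrossValidation_Model')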

## Gradient Descent

#### Least Squares

#### Ridge Regression

#### Lasso

## Logistic Regression

In [10]:
def map_logistic(clean):
    """Decorate a cleaning function so that its labels become 0/1.

    `clean` is called as ``clean(y, x, h)`` and must return ``(y, x)``.
    The returned wrapper forwards its arguments to `clean`, then recodes the
    labels: entries equal to -1 become 1, every other entry becomes 0. The
    features are passed through as `clean` produced them.

    NOTE(review): -1 is mapped to 1 (not to 0). Confirm this orientation is
    what the downstream logistic code and prediction step expect.
    """

    def inner_function(y, x, h):
        cleaned_y, cleaned_x = clean(y, x, h)
        is_minus_one = cleaned_y == -1
        return np.where(is_minus_one, 1, 0), cleaned_x

    return inner_function

##### Stochastic Gradient Descent

In [None]:
def logistic_gradient(y, x, w, h):
    """Return ``compute_logistic_gradient(y, x, w)``.

    `h` is accepted but not used by this function.
    """
    gradient = compute_logistic_gradient(y, x, w)
    return gradient
            
def logistic_error(y, x, w):
    """Wrap ``compute_logistic_error(y, x, w)`` in a one-entry dict keyed by 'err'."""
    error = compute_logistic_error(y, x, w)
    return {'err': error}

# Hyper-parameter grid for the stochastic logistic descent run.
hs = {
    'batch_size': 2500,
    # np.concatenate is used instead of np.concat: the latter only exists as
    # an alias from NumPy 2.0 onward and raises AttributeError on older
    # releases, whereas np.concatenate is available everywhere.
    # Resulting candidates: -2 followed by 1..6.
    # NOTE(review): the meaning of a degree of -2 is decided by build_poly,
    # which is not visible here — confirm it is intended.
    'degree': np.concatenate([[-2], np.arange(1, 7)]),
    'gamma': [1e-2, 1e-3],
    'k_fold': 4,
    'lambda': 0,
    'max_iters': 3000,
    'num_batches': 1,
    'seed': 1,
}


# Build the fit function once and pass that same object to `evaluate`.
# Previously this name was bound to a variant built with
# `compute_logistic_error` that nothing used, while `evaluate` received a
# second, inline construction using `logistic_error`. The dict-returning
# `logistic_error` wrapper is the one kept, matching what `evaluate` was
# actually given. The third positional argument (True) is forwarded as
# before; its meaning is defined by stochastic_gradient_descent_e.
stochastic_logistic_descent = stochastic_gradient_descent_e(
    logistic_gradient,
    logistic_error,
    True,
)

# The result is bound to `_` so the notebook does not echo it under the cell.
_ = evaluate(
    clean = map_logistic(clean_standardize_expand),
    fit   = stochastic_logistic_descent,
    y     = y,
    x     = x,
    hs    = hs,
    cache = CACHE_DIR + 'clean_standardize_expand_stochastic_logistic_descent',
)

iteration 0 - err = {'err': 172990.08696124953}
iteration 50 - err = {'err': 161964.0305706002}
iteration 100 - err = {'err': 155461.3253420889}
iteration 150 - err = {'err': 151012.992179333}
iteration 200 - err = {'err': 147759.38347746438}
iteration 250 - err = {'err': 145274.78911853925}
iteration 300 - err = {'err': 143344.85159421063}
iteration 350 - err = {'err': 141795.15260002683}
iteration 400 - err = {'err': 140532.65878743728}
iteration 450 - err = {'err': 139463.93193769513}
iteration 500 - err = {'err': 138593.81965106502}
iteration 550 - err = {'err': 137843.37670337656}
iteration 600 - err = {'err': 137206.79834315073}
iteration 650 - err = {'err': 136650.53114688542}
iteration 700 - err = {'err': 136182.2422368654}
iteration 750 - err = {'err': 135761.27072757765}
iteration 800 - err = {'err': 135389.27112363384}
iteration 850 - err = {'err': 135065.385452786}
iteration 900 - err = {'err': 134771.28328439203}
iteration 950 - err = {'err': 134513.37056380778}
iteration 