In [20]:
%load_ext autoreload
%autoreload 2

from algebra import *
from cache import *
from costs import *
from features import *
from gradients import *
from helpers import *
from model import *
from splits import *

import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import csv
import warnings
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
# Run-mode switch: it is forwarded to load_csv_data for the training set and
# selects which cache/submission directories are used.
SUB_SAMPLE = False
# Sub-sampled runs use directories under test/ so they do not share files
# with full-data runs.
CACHE_DIR = "test/cache/" if SUB_SAMPLE else "cache/"
SUBMISSIONS_DIR = "test/submissions/" if SUB_SAMPLE else "submissions/"

In [22]:
# Load the training set; SUB_SAMPLE is forwarded as load_csv_data's second argument.
y, x, ids = load_csv_data('data/train.csv', SUB_SAMPLE)

# 1 - Analytical Results

### Ridge Regression with Fixed Degree

##### Without Validation

In [6]:
def clean_standardize_expand(y, x, h):
    """Run the feature-cleaning pipeline on x and expand it with build_poly.

    The steps are applied in this order: remove_errors, remove_outliers,
    standardize_all, remove_nan_features, then build_poly with the degree
    taken from h['degree'] (converted to int).

    y is returned untouched.
    NOTE(review): this assumes none of the steps drop rows of x — confirm.

    Returns the tuple (y, transformed_x).
    """
    poly_degree = int(h['degree'])

    pipeline = (remove_errors, remove_outliers, standardize_all, remove_nan_features)
    for step in pipeline:
        x = step(x)

    return y, build_poly(x, poly_degree)

In [None]:
def ridge_regression_analytical(y, x, h):
    """Fit weights with ridge_regression and report the resulting MSE.

    Parameters
    ----------
    y, x : targets and feature matrix, forwarded unchanged to
        ridge_regression and compute_mse.
    h : mapping of hyper-parameters. Only h['lambda'] is read here; the
        polynomial degree is consumed by clean_standardize_expand, not by
        the fit, so it is no longer parsed in this function.

    Returns
    -------
    dict with 'w' (the value returned by ridge_regression) and 'mse'
    (compute_mse evaluated on the same y and x used for fitting).
    """
    lambda_ = float(h['lambda'])

    w = ridge_regression(y, x, lambda_)

    return {
        'w': w,
        'mse': compute_mse(y, x, w)
    }

In [None]:
# Hyper-parameters for the run without validation: three polynomial degrees
# and one fixed lambda. 'degree' is read by clean_standardize_expand and
# 'lambda' by ridge_regression_analytical.
# NOTE(review): presumably evaluate runs once per value of list-valued entries — confirm.
hs = { 
    'degree': [5, 6, 7], 
    'lambda': 1e-4,
}

# The return value is discarded; results are stored under the cache path.
_ = evaluate(
    clean = clean_standardize_expand, 
    fit   = ridge_regression_analytical, 
    x     = x, 
    y     = y, 
    hs    = hs, 
    cache = CACHE_DIR + 'clean_standardize_expand_ridge_regression_analytical'
)

##### Using Cross-Validation

Here, we implement the same model with cross-validation.

In [None]:
# Hyper-parameters for the cross-validated ridge run: degrees 4..15 and five
# lambdas log-spaced from 1e-8 to 1e-4.
# NOTE(review): 'k_fold' and 'seed' are presumably read by cross_validate — confirm.
hs = { 
    'degree': np.arange(4, 16), 
    'lambda': np.logspace(-8, -4, 5),
    'k_fold': 4,
    'seed': 0
}

def mse(y, x, w):
    """Return compute_mse(y, x, w) wrapped in a dict under the key 'mse'."""
    value = compute_mse(y, x, w)
    return dict(mse=value)



# The cleaning step is clean_standardize_expand and the fit is the
# cross-validated ridge regression scored with mse, matching the cache name.
# Previously the cross-validated fit was passed as `clean` and `fit` pointed
# at fit_function, a name this notebook never defines.
evaluate(
    clean = clean_standardize_expand, 
    fit   = cross_validate(ridge_regression_analytical, mse), 
    x     = x,
    y     = y, 
    hs    = hs, 
    cache = CACHE_DIR + 'clean_standardize_expand_cross_validate_ridge_regression_analytical_mse'
)

In [None]:
# myModel.predict(best_h, x, y, SUBMISSIONS_DIR + 'RidgeRegression_MSE_FixedDegree_CrossValidation_Model')

## Gradient Descents

#### Least Squares

#### Ridge Regression

#### Lasso

## Logistic Regression

In [8]:
def map_logistic(clean):
    """Wrap a cleaning function so that its labels are remapped to 0/1.

    The returned function has the same (y, x, h) -> (y, x) shape as `clean`.
    It calls `clean` first, then replaces every label equal to 1 with 0 and
    every other label with 1; x is passed through as returned by `clean`.

    NOTE(review): label 1 becomes 0 here, so the positive class of the
    original labels is the 0 class afterwards — confirm this is intended.
    """
    def inner_function(y, x, h):
        cleaned_y, cleaned_x = clean(y, x, h)
        remapped_y = np.where(cleaned_y == 1, 0, 1)
        return remapped_y, cleaned_x

    return inner_function

def logistic_gradient(y, x, w, h):
    """Return compute_logistic_gradient(y, x, w); h is accepted but not used."""
    gradient = compute_logistic_gradient(y, x, w)
    return gradient
            
def logistic_error(y, x, w, h=None):
    """Report the error metrics for weights w on (y, x).

    h is optional and ignored. It is accepted so this function can be called
    with the same (y, x, w, h) arguments as logistic_error_and_lasso, while
    existing three-argument calls keep working.

    Returns a dict with 'logistic_err' (the value of compute_logistic_error)
    and 'n_err' (the value of compute_error_count).
    """
    return { 
        'logistic_err': compute_logistic_error(y, x, w),
        'n_err': compute_error_count(y, x, w)
    }

##### Stochastic Gradient Descent

In [None]:
# Hyper-parameters for logistic regression by stochastic gradient descent.
# 'degree' takes the values -2, 1, 2, ..., 6.
# NOTE(review): the meaning of degree -2 depends on build_poly — confirm.
# 'lambda' is 0 here; logistic_gradient does not read it.
hs = {
    'batch_size': 2500,
    'degree': np.concatenate([[-2], np.arange(1, 7)]),
    'gamma': [1e-2, 1e-3], 
    'k_fold': 4,
    'lambda': 0,
    'max_iters': 3000,
    'num_batches': 1,
    'seed': 1,
}

# Build the SGD fitter once and hand that same object to evaluate. It reports
# errors through logistic_error (logistic error plus error count), which is
# what the evaluate call was already using; the previously assigned variant
# built with compute_logistic_error was never used.
# The third argument (True) is forwarded as-is; its meaning is defined by
# stochastic_gradient_descent_e.
stochastic_logistic_descent = stochastic_gradient_descent_e(
    logistic_gradient, 
    logistic_error, 
    True
)

_ = evaluate(
    clean = map_logistic(clean_standardize_expand), 
    fit   = stochastic_logistic_descent, 
    y     = y, 
    x     = x, 
    hs    = hs, 
    cache = CACHE_DIR + 'clean_standardize_expand_stochastic_logistic_descent'
)

###### Stochastic Gradient Descent With Ridge (L2) Regularization

In [None]:
# Hyper-parameters for the ridge-penalised logistic run.
# np.arange(3, 4) holds the single degree 3; 'lambda' is read by
# logistic_gradient_ridge.
hs = {
    'batch_size': 2500,
    'degree': np.arange(3, 4),
    'gamma': [1e-2, 1e-3], 
    'lambda': [1e-2, 1e-3],
    'k_fold': 4,
    'max_iters': 1000,
    'num_batches': 1,
    'seed': 1,
}

def logistic_gradient_ridge(y, x, w, h):
    """Logistic gradient plus the ridge term h['lambda'] * w.

    The penalty is applied to every entry of w.
    """
    ridge_term = h['lambda'] * w
    return compute_logistic_gradient(y, x, w) + ridge_term


# Same setup as the plain logistic SGD run, but the gradient carries the
# ridge term. The return value is discarded; results go to the cache path.
_ = evaluate(
    clean = map_logistic(clean_standardize_expand), 
    fit   = stochastic_gradient_descent_e(
                logistic_gradient_ridge, 
                logistic_error, 
                True
            ), 
    y     = y, 
    x     = x, 
    hs    = hs, 
    cache = CACHE_DIR + 'clean_standardize_expand_stochastic_logistic_ridge_descent'
)

In [None]:
# A single hyper-parameter setting (scalars only) used for the final fit and
# for transforming the test set.
# NOTE(review): the meaning of degree -2 depends on build_poly — confirm.
h = {
    'batch_size': 2500,
    'degree': -2,
    'gamma': 1e-2, 
    'lambda': 1e-3,
    'k_fold': 4,
    'max_iters': 3000,
    'num_batches': 1,
    'seed': 1,
}

# Clean and relabel the training data with the same transform that is applied
# to the test set before prediction, so the fitted weights live in the same
# feature space as the matrix they are later multiplied with. Previously the
# raw (y, x) were passed straight to the fit.
y_tr, x_tr = map_logistic(clean_standardize_expand)(y, x, h)

res = fit_with_cache(
    stochastic_gradient_descent_e(
        logistic_gradient_ridge, 
        logistic_error, 
        True
    ),
    Cache(CACHE_DIR + 'clean_standardize_expand_stochastic_logistic_ridge_descent')
)(y_tr, x_tr, h)

In [None]:
# Load the test set; the second argument is False here regardless of SUB_SAMPLE.
# Note that `ids` is rebound: from here on it holds the test ids, not the training ids.
y_te, x_te, ids = load_csv_data('data/test.csv', False)

In [None]:
# Apply the cleaning and label mapping to the test set with the same h.
# NOTE(review): y_te/x_te are rebound from themselves, so re-running this cell
# transforms already-transformed data — reload the test set before re-running.
y_te, x_te = map_logistic(clean_standardize_expand)(y_te, x_te, h)

# Linear scores from the fitted weights.
y_pred = x_te @ res['w']
# Threshold the scores at 0.5 into -1/1 labels, in place. The <= branch runs
# first; the -1 it writes is itself <= 0.5, so the second assignment never
# touches those entries.
# NOTE(review): the score is x @ w with no sigmoid applied, and map_logistic
# sends label 1 to 0 — confirm that the 0.5 threshold and the -1/1 direction
# match the convention the model was trained with.
y_pred[np.where(y_pred <= 0.5)] = -1
y_pred[np.where(y_pred > 0.5)] = 1
y_pred

In [None]:
# Write under SUBMISSIONS_DIR rather than a hard-coded 'submissions/' so that
# sub-sampled runs land in test/submissions/, mirroring how CACHE_DIR is used.
create_csv_submission(ids, y_pred, SUBMISSIONS_DIR + 'clean_standardize_expand_stochastic_logistic_ridge_descent')

In [None]:
def logistic_gradient_lasso(y, x, w, h):
    """Logistic gradient plus the lasso term h['lambda'] * sign(w).

    np.sign(w) is 0 where w is 0, so zero weights receive no penalty term.
    """
    l1_term = h['lambda'] * np.sign(w)
    return compute_logistic_gradient(y, x, w) + l1_term

def logistic_error_and_lasso(y, x, w, h):
    """Report the logistic error, the lasso penalty and their sum.

    Returns a dict with:
      'logistic_err' - the value of compute_logistic_error(y, x, w)
      'lasso_norm'   - the 1-norm of w multiplied by h['lambda']
      'total_loss'   - the sum of the two entries above
    """
    penalty_weight = h['lambda']

    penalty = np.linalg.norm(w, ord=1) * penalty_weight
    data_term = compute_logistic_error(y, x, w)

    report = {}
    report['logistic_err'] = data_term
    report['lasso_norm'] = penalty
    report['total_loss'] = data_term + penalty
    return report
    

# Hyper-parameters for the lasso-penalised logistic run: seven degree values,
# three step sizes and three lambdas. 'lambda' is read by both
# logistic_gradient_lasso and logistic_error_and_lasso.
# NOTE(review): the meaning of degree -2 depends on build_poly — confirm.
hs = {
    'batch_size': 2500,
    'degree': [-2, 1, 2, 3, 4, 5, 6],
    'gamma': [1e-1, 1e-2, 1e-3], 
    'lambda': [1e-1, 1e-2, 1e-3],
    'k_fold': 4,
    'max_iters': 2000,
    'num_batches': 1,
    'seed': 0,
}

# Positional arguments are, in order: clean, fit, y, x, hs.
# Note that `res` is rebound here; it previously held the result of the
# ridge-penalised fit used for the submission.
res = evaluate(
    map_logistic(clean_standardize_expand),
    stochastic_gradient_descent_e(
        logistic_gradient_lasso, 
        logistic_error_and_lasso, 
        True
    ), y, x, hs, cache = CACHE_DIR + 'clean_standardize_expand_stochastic_logistic_lasso_descent')

iteration 0 - err = {'logistic_err': 0.6415968992649881, 'lasso_norm': 0.10010474185782553, 'total_loss': 0.7417016411228137}
iteration 50 - err = {'logistic_err': 0.6330129538213997, 'lasso_norm': 0.19836868375686806, 'total_loss': 0.8313816375782677}
iteration 100 - err = {'logistic_err': 0.630834572764508, 'lasso_norm': 0.20381396436213592, 'total_loss': 0.834648537126644}
iteration 150 - err = {'logistic_err': 0.6031273793050423, 'lasso_norm': 0.18642360716539957, 'total_loss': 0.789550986470442}
iteration 200 - err = {'logistic_err': 0.6605590398874823, 'lasso_norm': 0.22622182121892626, 'total_loss': 0.8867808611064085}
iteration 250 - err = {'logistic_err': 0.6880252490682705, 'lasso_norm': 0.24058821538591446, 'total_loss': 0.9286134644541849}
iteration 300 - err = {'logistic_err': 0.6284586648408684, 'lasso_norm': 0.22162952950439854, 'total_loss': 0.850088194345267}
iteration 350 - err = {'logistic_err': 0.6254755848978096, 'lasso_norm': 0.2006472454997247, 'total_loss': 0.82

iteration 1200 - err = {'logistic_err': 0.4542352165671203, 'lasso_norm': 0.05041039909123808, 'total_loss': 0.5046456156583584}
iteration 1250 - err = {'logistic_err': 0.4572168933205635, 'lasso_norm': 0.05023309431866192, 'total_loss': 0.5074499876392254}
iteration 1300 - err = {'logistic_err': 0.46835059682825664, 'lasso_norm': 0.051425483588248215, 'total_loss': 0.5197760804165048}
iteration 1350 - err = {'logistic_err': 0.45740934044055864, 'lasso_norm': 0.05179220642751012, 'total_loss': 0.5092015468680687}
iteration 1400 - err = {'logistic_err': 0.4550795919171246, 'lasso_norm': 0.05065710371065369, 'total_loss': 0.5057366956277783}
iteration 1450 - err = {'logistic_err': 0.45703681228890003, 'lasso_norm': 0.050302059188842535, 'total_loss': 0.5073388714777426}
iteration 1500 - err = {'logistic_err': 0.45217573183004905, 'lasso_norm': 0.050900920207250365, 'total_loss': 0.5030766520372995}
iteration 1550 - err = {'logistic_err': 0.45263603422110515, 'lasso_norm': 0.0503221711613

iteration 400 - err = {'logistic_err': 0.5958294537701954, 'lasso_norm': 0.0700218166528883, 'total_loss': 0.6658512704230837}
iteration 450 - err = {'logistic_err': 0.5942872506427292, 'lasso_norm': 0.0718208782267316, 'total_loss': 0.6661081288694608}
iteration 500 - err = {'logistic_err': 0.5936246986402013, 'lasso_norm': 0.07251627583060818, 'total_loss': 0.6661409744708094}
iteration 550 - err = {'logistic_err': 0.5922062490938933, 'lasso_norm': 0.07324935566256173, 'total_loss': 0.665455604756455}
iteration 600 - err = {'logistic_err': 0.591828509700423, 'lasso_norm': 0.07484210133284872, 'total_loss': 0.6666706110332717}
iteration 650 - err = {'logistic_err': 0.5904136374730659, 'lasso_norm': 0.07473805961959297, 'total_loss': 0.6651516970926589}
iteration 700 - err = {'logistic_err': 0.5905973576644816, 'lasso_norm': 0.07391599845545671, 'total_loss': 0.6645133561199383}
iteration 750 - err = {'logistic_err': 0.5896287425179432, 'lasso_norm': 0.07461013024534612, 'total_loss': 

iteration 1600 - err = {'logistic_err': 0.4609965765905285, 'lasso_norm': 0.043205706120032814, 'total_loss': 0.5042022827105613}
iteration 1650 - err = {'logistic_err': 0.46059763535491377, 'lasso_norm': 0.04338595673728371, 'total_loss': 0.5039835920921975}
iteration 1700 - err = {'logistic_err': 0.4603002783381497, 'lasso_norm': 0.043500582237755096, 'total_loss': 0.5038008605759048}
iteration 1750 - err = {'logistic_err': 0.45989872780782215, 'lasso_norm': 0.043725202747911106, 'total_loss': 0.5036239305557333}
iteration 1800 - err = {'logistic_err': 0.4595180077216633, 'lasso_norm': 0.04394142087238871, 'total_loss': 0.503459428594052}
iteration 1850 - err = {'logistic_err': 0.4591520602921259, 'lasso_norm': 0.044169981911727996, 'total_loss': 0.5033220422038539}
iteration 1900 - err = {'logistic_err': 0.4588856699698698, 'lasso_norm': 0.04427383588365078, 'total_loss': 0.5031595058535205}
iteration 1950 - err = {'logistic_err': 0.4585652776568025, 'lasso_norm': 0.0444704912492359

iteration 800 - err = {'logistic_err': 0.6253530792681616, 'lasso_norm': 0.03887110319118453, 'total_loss': 0.6642241824593462}
iteration 850 - err = {'logistic_err': 0.6241562576292841, 'lasso_norm': 0.03975737551435657, 'total_loss': 0.6639136331436406}
iteration 900 - err = {'logistic_err': 0.6231154513288852, 'lasso_norm': 0.04039817254545667, 'total_loss': 0.6635136238743419}
iteration 950 - err = {'logistic_err': 0.6220488406616482, 'lasso_norm': 0.0410655253170176, 'total_loss': 0.6631143659786658}
iteration 1000 - err = {'logistic_err': 0.6209870030022426, 'lasso_norm': 0.04183352306747903, 'total_loss': 0.6628205260697216}
iteration 1050 - err = {'logistic_err': 0.6200836482980222, 'lasso_norm': 0.04246548368124622, 'total_loss': 0.6625491319792685}
iteration 1100 - err = {'logistic_err': 0.6192000651075614, 'lasso_norm': 0.04308055174030613, 'total_loss': 0.6622806168478675}
iteration 1150 - err = {'logistic_err': 0.6183138403800373, 'lasso_norm': 0.04367375463735563, 'total_

iteration 0 - err = {'logistic_err': 0.6919963735703388, 'lasso_norm': 1.001047418578255e-05, 'total_loss': 0.6920063840445246}
iteration 50 - err = {'logistic_err': 0.6563057795417223, 'lasso_norm': 0.0003376889283477395, 'total_loss': 0.65664346847007}
iteration 100 - err = {'logistic_err': 0.6347468743283621, 'lasso_norm': 0.0005895215015911671, 'total_loss': 0.6353363958299533}
iteration 150 - err = {'logistic_err': 0.6190696505647099, 'lasso_norm': 0.0008060607297552645, 'total_loss': 0.6198757112944652}
iteration 200 - err = {'logistic_err': 0.6070676794909772, 'lasso_norm': 0.0009909622969439123, 'total_loss': 0.6080586417879211}
iteration 250 - err = {'logistic_err': 0.5975492842325992, 'lasso_norm': 0.0011496368791182978, 'total_loss': 0.5986989211117175}
iteration 300 - err = {'logistic_err': 0.5897451500088429, 'lasso_norm': 0.0012906168920632018, 'total_loss': 0.5910357669009061}
iteration 350 - err = {'logistic_err': 0.5831623928181143, 'lasso_norm': 0.0014164632662175501,