In [7]:
%load_ext autoreload
%autoreload 2

from algebra import *
from cache import *
from costs import *
from features import *
from gradients import *
from helpers import *
from model import *
from splits import *

import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import csv
import warnings
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
SUB_SAMPLE = True
CACHE_DIR = "test/cache/" if SUB_SAMPLE else "cache/"
SUBMISSIONS_DIR = "test/submissions/" if SUB_SAMPLE else "submissions/"

In [9]:
y, x, ids = load_csv_data('data/train.csv', SUB_SAMPLE)

### Logistic Regression

In [10]:
def clean_standardize_expand(y, x, h):
        
    degree = int(h['degree'])

    x = remove_errors(x)
    x = remove_outliers(x)
    x = standardize_all(x)
    x = remove_nan_features(x)
    x = build_poly(x, degree)

    return y, x

def clean_expand(y, x, h):
        
    degree = int(h['degree'])

    x = remove_errors(x)
    x = remove_outliers(x)
    x = remove_nan_features(x)
    x = build_poly(x, degree)

    return y, x

In [16]:
def map_logistic(clean):
    
    def inner_function(y, x, h):
        y, x = clean(y, x, h)
        y = np.where(y == 1, 0, 1)
        return y, x
    
    return inner_function

def logistic_gradient(y, x, w, h):
    
    return compute_logistic_gradient(y, x, w)

def logistic_gradient_ridge(y, x, w, h):
    
    lambda_ = h['lambda']
    
    return compute_logistic_gradient(y, x, w) + lambda_ * w
            
def logistic_error(y, x, w):
    
    return { 
        'logistic_err': compute_logistic_error(y, x, w),
        'n_err': compute_error_count(y, x, w)
    }

def mle(y, x, w):
    return { 'mle' : compute_logistic_error(y, x, w) }

In [22]:
hs = {
    'batch_size': 1,
    'degree': np.concatenate([[-2], np.arange(1, 7)]),
    'gamma': [1e-2, 1e-3], 
    'k_fold': 4,
    'lambda': 0,
    'max_iters': 5000,
    'num_batches': 1,
    'seed': 1,
}

#stochastic_logistic_descent = stochastic_gradient_descent_e(logistic_gradient, compute_logistic_error, True)
_ = evaluate(
    clean = map_logistic(clean_expand), 
    fit   = stochastic_gradient_descent_e(
            logistic_gradient, 
            logistic_error, 
            False
    ),  
    y     = y, 
    x     = x, 
    hs    = hs, 
    cache = CACHE_DIR + 'clean_expand_stochastic_logistic_descentTESTVINC2')

iteration 0 - err = {'logistic_err': 7.566294588457086, 'n_err': 1643}
iteration 50 - err = {'logistic_err': 7.566294588457086, 'n_err': 1643}
iteration 100 - err = {'logistic_err': 7.690634184073958, 'n_err': 1670}
iteration 150 - err = {'logistic_err': 7.589320456944458, 'n_err': 1648}
iteration 200 - err = {'logistic_err': 8.086678824320092, 'n_err': 1756}
iteration 250 - err = {'logistic_err': 7.423534313668417, 'n_err': 1612}
iteration 300 - err = {'logistic_err': 6.732758797704378, 'n_err': 1462}
iteration 350 - err = {'logistic_err': 15.381268421233315, 'n_err': 3340}
iteration 400 - err = {'logistic_err': 7.036700030474711, 'n_err': 1528}
iteration 450 - err = {'logistic_err': 9.196524856067073, 'n_err': 1997}
iteration 500 - err = {'logistic_err': 7.359061931610952, 'n_err': 1598}
iteration 550 - err = {'logistic_err': 7.566294588457086, 'n_err': 1643}
iteration 600 - err = {'logistic_err': 7.566294588457086, 'n_err': 1643}
iteration 650 - err = {'logistic_err': 14.81483248821

iteration 600 - err = {'logistic_err': 7.566294588457086, 'n_err': 1643}
iteration 650 - err = {'logistic_err': 14.813074912722303, 'n_err': 3217}
iteration 700 - err = {'logistic_err': 14.971408274665615, 'n_err': 3251}
iteration 750 - err = {'logistic_err': 7.566294588457086, 'n_err': 1643}
iteration 800 - err = {'logistic_err': 7.2991947313396635, 'n_err': 1585}
iteration 850 - err = {'logistic_err': 7.566294588457086, 'n_err': 1643}
iteration 900 - err = {'logistic_err': 7.128803423901163, 'n_err': 1548}
iteration 950 - err = {'logistic_err': 7.124198253566264, 'n_err': 1547}
iteration 1000 - err = {'logistic_err': 7.925497878401974, 'n_err': 1721}
iteration 1050 - err = {'logistic_err': 7.557084267793936, 'n_err': 1641}
iteration 1100 - err = {'logistic_err': 10.126769236363026, 'n_err': 2199}
iteration 1150 - err = {'logistic_err': 7.483401541227507, 'n_err': 1625}
iteration 1200 - err = {'logistic_err': 6.820257027216593, 'n_err': 1481}
iteration 1250 - err = {'logistic_err': 6.

iteration 1250 - err = {'logistic_err': 7.566294588457086, 'n_err': 1643}
iteration 1300 - err = {'logistic_err': 6.6692830148045195, 'n_err': 1464}
iteration 1350 - err = {'logistic_err': 6.467275710082573, 'n_err': 1446}
iteration 1400 - err = {'logistic_err': 11.84123108583471, 'n_err': 2622}
iteration 1450 - err = {'logistic_err': 6.475509775407207, 'n_err': 1433}
iteration 1500 - err = {'logistic_err': 7.253241725805958, 'n_err': 1620}
iteration 1550 - err = {'logistic_err': 6.97528159717359, 'n_err': 1541}
iteration 1600 - err = {'logistic_err': 7.263319140940861, 'n_err': 1624}
iteration 1650 - err = {'logistic_err': 6.652222684506934, 'n_err': 1465}
iteration 1700 - err = {'logistic_err': 6.936199940908773, 'n_err': 1537}
iteration 1750 - err = {'logistic_err': 6.598779210377772, 'n_err': 1472}
iteration 1800 - err = {'logistic_err': 6.602450144674334, 'n_err': 1462}
iteration 1850 - err = {'logistic_err': 8.560325586681575, 'n_err': 1900}
iteration 1900 - err = {'logistic_err'

iteration 1950 - err = {'logistic_err': 7.5118032962155326, 'n_err': 1640}
iteration 2000 - err = {'logistic_err': 15.45955631439477, 'n_err': 3357}
iteration 2050 - err = {'logistic_err': 7.292738439963064, 'n_err': 1623}
iteration 2100 - err = {'logistic_err': 6.779566760075186, 'n_err': 1717}
iteration 2150 - err = {'logistic_err': 4.723330722502107, 'n_err': 1370}
iteration 2200 - err = {'logistic_err': 5.8755240437064336, 'n_err': 1477}
iteration 2250 - err = {'logistic_err': 5.359028485106023, 'n_err': 1468}
iteration 2300 - err = {'logistic_err': 5.111157429030845, 'n_err': 1460}
iteration 2350 - err = {'logistic_err': 7.530301293337138, 'n_err': 1642}
iteration 2400 - err = {'logistic_err': 11.598778672854763, 'n_err': 2965}
iteration 2450 - err = {'logistic_err': 5.511118488358805, 'n_err': 1481}
iteration 2500 - err = {'logistic_err': 5.629565110721999, 'n_err': 1452}
iteration 2550 - err = {'logistic_err': 9.401717521681329, 'n_err': 2453}
iteration 2600 - err = {'logistic_e

iteration 2650 - err = {'logistic_err': 6.78802083492193, 'n_err': 1474}
iteration 2700 - err = {'logistic_err': 7.1103827432896765, 'n_err': 1544}
iteration 2750 - err = {'logistic_err': 7.3544567618717815, 'n_err': 1597}
iteration 2800 - err = {'logistic_err': 11.936601121119805, 'n_err': 2592}
iteration 2850 - err = {'logistic_err': 6.8570983932716, 'n_err': 1489}
iteration 2900 - err = {'logistic_err': 7.893261693156802, 'n_err': 1714}
iteration 2950 - err = {'logistic_err': 7.547873907812503, 'n_err': 1639}
iteration 3000 - err = {'logistic_err': 13.576041708118728, 'n_err': 2948}
iteration 3050 - err = {'logistic_err': 7.322220577387829, 'n_err': 1590}
iteration 3100 - err = {'logistic_err': 7.584715269200959, 'n_err': 1647}
iteration 3150 - err = {'logistic_err': 7.101172413922217, 'n_err': 1542}
iteration 3200 - err = {'logistic_err': 7.262353359145303, 'n_err': 1577}
iteration 3250 - err = {'logistic_err': 7.88405135316547, 'n_err': 1712}
iteration 3300 - err = {'logistic_err'

iteration 3400 - err = {'logistic_err': 6.7327587931205635, 'n_err': 1462}
iteration 3450 - err = {'logistic_err': 7.888656523682398, 'n_err': 1713}
iteration 3500 - err = {'logistic_err': 7.5570842481182465, 'n_err': 1641}
iteration 3550 - err = {'logistic_err': 7.009069005006766, 'n_err': 1522}
iteration 3600 - err = {'logistic_err': 7.2163016568388265, 'n_err': 1567}
iteration 3650 - err = {'logistic_err': 6.852493226544183, 'n_err': 1488}
iteration 3700 - err = {'logistic_err': 7.515637716643112, 'n_err': 1632}
iteration 3750 - err = {'logistic_err': 6.718943289181884, 'n_err': 1459}
iteration 3800 - err = {'logistic_err': 6.737363962925927, 'n_err': 1463}
iteration 3850 - err = {'logistic_err': 7.566294588457086, 'n_err': 1643}
iteration 3900 - err = {'logistic_err': 7.589320439386927, 'n_err': 1648}
iteration 3950 - err = {'logistic_err': 8.639299249101924, 'n_err': 1876}
iteration 4000 - err = {'logistic_err': 11.674106416597759, 'n_err': 2535}
iteration 4050 - err = {'logistic_

iteration 4050 - err = {'logistic_err': 14.128662130269408, 'n_err': 3068}
iteration 4100 - err = {'logistic_err': 10.573470736640912, 'n_err': 2296}
iteration 4150 - err = {'logistic_err': 11.996468332379031, 'n_err': 2605}
iteration 4200 - err = {'logistic_err': 7.870235831222479, 'n_err': 1709}
iteration 4250 - err = {'logistic_err': 11.549766825150984, 'n_err': 2508}
iteration 4300 - err = {'logistic_err': 7.566294588457086, 'n_err': 1643}
iteration 4350 - err = {'logistic_err': 7.561689418287667, 'n_err': 1642}
iteration 4400 - err = {'logistic_err': 8.708376808957473, 'n_err': 1891}
iteration 4450 - err = {'logistic_err': 7.561689418287667, 'n_err': 1642}
iteration 4500 - err = {'logistic_err': 7.322220575766118, 'n_err': 1590}
iteration 4550 - err = {'logistic_err': 7.741291066197402, 'n_err': 1681}
iteration 4600 - err = {'logistic_err': 7.317615404686552, 'n_err': 1589}
iteration 4650 - err = {'logistic_err': 9.878090039635365, 'n_err': 2145}
iteration 4700 - err = {'logistic_

iteration 4700 - err = {'logistic_err': 7.566294588457086, 'n_err': 1643}
iteration 4750 - err = {'logistic_err': 7.3866929527764045, 'n_err': 1604}
iteration 4800 - err = {'logistic_err': 7.566294588457086, 'n_err': 1643}
iteration 4850 - err = {'logistic_err': 9.744540110398145, 'n_err': 2116}
iteration 4900 - err = {'logistic_err': 7.566294588457086, 'n_err': 1643}
iteration 4950 - err = {'logistic_err': 7.566294588457086, 'n_err': 1643}
iteration 0 - err = {'logistic_err': 7.566294588457086, 'n_err': 1643}
iteration 50 - err = {'logistic_err': 14.883910041132223, 'n_err': 3232}
iteration 100 - err = {'logistic_err': 7.566294588457086, 'n_err': 1643}
iteration 150 - err = {'logistic_err': 8.243254608508298, 'n_err': 1790}
iteration 200 - err = {'logistic_err': 7.8702358225512725, 'n_err': 1709}
iteration 250 - err = {'logistic_err': 7.76431690815817, 'n_err': 1686}
iteration 300 - err = {'logistic_err': 7.584715270723385, 'n_err': 1647}
iteration 350 - err = {'logistic_err': 7.57550

KeyboardInterrupt: 

In [None]:
def compute_mle(y, x, w):
    res = 0
    for i in range(x.shape[0]):
        xnTW = x[i].T@w
        res += np.log(1+np.exp(xnTW))-y[i]*xnTW
    return res / y.shape[0]

def print_values(y, x, w):
    for i in range(y.shape[0]):
        print("y: " + str(y[i]))
        print("pred :  " + str(x[i].T@w))

def compute_mle2(y, x, w):
    y_pred = logistic_function(x @ w)
    return - (y @ np.log(y_pred) + (1 - y) @ np.log(1 - y_pred))

class First_Order_Logistic_Regression_Model2(Model):

    def prepare(self, x, y, h):
        
        degree = int(h['degree'])

        x = remove_errors(x)
        x = remove_outliers(x)
        #x = standardize_all(x)
        x = remove_nan_features(x)
        x = build_poly(x, degree)
        #print('avant')
        #print(y[:5])
        y = np.where(y == -1, 0, 1)
        #print('après')
        #print(y[:5])
        
        return x, y

    def fit(self, x, y, h={}):
        
        batch_size = int(h['batch_size'])
        n_iters = int(h['n_iters'])
        gamma = float(h['gamma'])
        
        initial_w = np.zeros(x.shape[1])
        return logistic_regression(y, x, initial_w, batch_size, n_iters, gamma)
    
    def test(self, x, y, w, h):
        #y = np.where(y == 0, -1, 1)
        mse = compute_mle(y, x, w)
        #print_values(y, x, w)
        if np.isnan(mse):
            mse = np.inf
        return { 'mse': mse }

In [None]:
myModel = CrossValidationModel(First_Order_Logistic_Regression_Model2())

n_iters = [100]
batch_size = [50]

degrees = np.arange(1,4)
gammas = np.logspace(-20, -10, 5)
gammas = gammas[:len(gammas)-1]

hs={
    'n_iters': n_iters,
    'batch_size': batch_size,
    'degree': degrees,
    'gamma': gammas,
    'k_fold': [4],
    'seed': [0]
}

res = myModel.evaluate(x, y, hs, filename=CACHE_DIR+'Logistic_Regression_ExploFullSampleNoStdProp')

#print(res)

plot_heatmap(res, hs, 'mse_te', 'degree', 'gamma')

#res_mse = np.vectorize(lambda x: x['mse'])(res)
#x_axis = np.unique(np.vectorize(lambda x: x['gamma'])(res))
#y_axis = np.unique(np.vectorize(lambda x: x['degree'])(res))

#plot_heatmap(res, hs, 'mse', 'degree', 'gamma')
find_arg_min(res, 'mse_te')

In [None]:
hs = {'batch_size': 1.0,
 'degree': 6.0,
 'gamma': 10**-20,
 'k_fold': 4.0,
 'n_iters': 10.0,
 'seed': 0.0,
 'mse_te': 0.3988701335328735,
 'mse_tr': 0.3988200000042866}

myModel.predict(hs, x, y, SUBMISSIONS_DIR + 'Logistic_Regression')

### Regularized Order Logistic Regression

In [None]:
class Regularized_Logistic_Regression_Model(Model):

    def prepare(self, x, y, h):
        
        degree = int(h['degree'])

        x = remove_errors(x)
        x = remove_outliers(x)
        #x = standardize_all(x)
        x = remove_nan_features(x)
        x = build_poly(x, degree)
        
        y = np.where(y == -1, 0, 1)
        
        return x, y

    def fit(self, x, y, h={}):
        
        batch_size = int(h['batch_size'])
        n_iters = int(h['n_iters'])
        gamma = float(h['gamma'])
        lambda_ = float(h['lambda'])
        
        initial_w = np.zeros(x.shape[1])
        return reg_logistic_regression(y, x, initial_w, batch_size, n_iters, gamma, lambda_)
    
    def test(self, x, y, w, h):
        mse = compute_mle(y, x, w)
        if np.isnan(mse):
            mse = np.inf
        return { 'mse': mse }
    
    def predict(self, h, x_tr, y_tr, name):

        x_tr, y_tr = self.prepare(x_tr, y_tr, h)
        w = self.fit(x_tr, y_tr, h)

        _, x_pred, ids = load_csv_data("data/test.csv", sub_sample=False)
        x_pred, _ = self.prepare(x_pred, None, h)
        y_pred = np.dot(data, weights)
        #modified for logistic regression
        y_pred[np.where(y_pred <= 0.5)] = -1
        y_pred[np.where(y_pred > 0.5)] = 1

        create_csv_submission(ids, y_pred, name)
        

In [None]:
myModel = CrossValidationModel(Regularized_Logistic_Regression_Model())

n_iters = [1000]
batch_size = [1]
degrees = 4#np.arange(1,8)
gammas = 10**-7 #np.logspace(-10, -5, 10)
lambdas = 10**-8#np.logspace(-8, -5, 3)

hs={
    'n_iters': n_iters,
    'batch_size': batch_size,
    'degree': degrees,
    'gamma': gammas,
    'lambda': lambdas,
    'k_fold': [4],
    'seed': [0]
}

res = myModel.evaluate(x, y, hs, filename=CACHE_DIR+'Regularized_Logistic_Regression_Explo')


#plot_heatmap(res, hs, 'mse_te', 'degree', 'gamma')

#res_mse = np.vectorize(lambda x: x['mse'])(res)
#x_axis = np.unique(np.vectorize(lambda x: x['gamma'])(res))
#y_axis = np.unique(np.vectorize(lambda x: x['degree'])(res))

plot_heatmap(res, hs, 'mse_te', 'degree', 'lambda')
plt.figure(2)
plot_heatmap(res, hs, 'mse_te', 'degree', 'gamma')
find_arg_min(res, 'mse_te')

In [None]:
hs = {'batch_size': 1,
 'degree': 4,
 'gamma': 1e-07,
 'k_fold': 4,
 'lambda': 1e-08,
 'n_iters': 10000,
 'seed': 0,
 'mse_tr': 0.6896604235218645,
 'mse_te': 0.6896430727585345}


myModel.predict(hs, x, y, SUBMISSIONS_DIR + 'Regularized_Logistic_RegressionNoStd')



In [None]:
import pandas
csv_file = SUBMISSIONS_DIR + 'Regularized_Logistic_RegressionNoStd'

data = pandas.read_csv(csv_file)

l = data.Prediction

counter = 0
for i in l:
    if i == 1:
        counter += 1

print(counter/568238)

### Ridge

In [None]:
class RidgeRegression_MSE_Degree_Model(Model):

    def prepare(self, x, y, h):
        
        degree = int(h['degree'])

        x = remove_errors(x)
        x = remove_outliers(x)
        x = standardize_all(x)
        x = remove_nan_features(x)
        x = build_poly(x, degree)
        
        return x, y

    def fit(self, x, y, h={}):

        lambda_ = float(h['lambda'])

        return ridge_regression(y, x, lambda_)
    
    def test(self, x, y, w, h):
        mse = compute_mse(y, x, w)
        if np.isnan(mse):
            mse = np.inf
        return { 'mse': mse }


In [None]:
myModel = CrossValidationModel(RidgeRegression_MSE_Degree_Model())

degrees = np.arange(10, 14)
lambdas = np.logspace(-6, -4,10)

hs={
    'degree': degrees,
    'lambda': lambdas,
    'k_fold': [4],
    'seed': [0]
}

res = myModel.evaluate(x, y, hs, filename=CACHE_DIR+'Ridge_Explo_Vinc')


plot_heatmap(res, hs, 'mse_te', 'degree', 'lambda')
find_arg_min(res, 'mse_te')

### Lasso

In [None]:
class Lasso_SGD_MSE_Degree_Model(Model):

    def prepare(self, x, y, h):
        
        degree = int(h['degree'])

        x = remove_errors(x)
        x = remove_outliers(x)
        x = standardize_all(x)
        x = remove_nan_features(x)
        x = build_poly(x, degree)
        
        return x, y

    def fit(self, x, y, h={}):
        
        batch_size = int(h['batch_size'])
        n_iters = int(h['n_iters'])
        lambda_ = float(h['lambda'])
        gamma = float(h['gamma'])

        initial_w = np.zeros(x.shape[1])

        return lasso_stochastic_gradient_descent(y, x, initial_w, batch_size, n_iters, gamma, lambda_)
    
    def test(self, x, y, w, h):
            mse = compute_mse(y, x, w)
            if np.isnan(mse):
                mse = np.inf
            return { 'mse': mse }

In [None]:
myModel = CrossValidationModel(Lasso_SGD_MSE_Degree_Model())

degrees = np.arange(4)
lambdas = np.logspace(-3, -1,5)
gammas = np.logspace(-15, -10, 3)

hs={
    'batch_size': 1,
    'n_iters': 1000,
    'degree': degrees,
    'lambda': lambdas,
    'gamma': gammas,
    'k_fold': [4],
    'seed': [0]
}

res = myModel.evaluate(x, y, hs, filename=CACHE_DIR+'Lasso_Explo_VincTEST')


plot_heatmap(res, hs, 'mse_te', 'degree', 'lambda')
plot_heatmap(res, hs, 'mse_te', 'degree', 'gamma')
find_arg_min(res, 'mse_te')