In [1]:
%load_ext autoreload
%autoreload 2

from algebra import *
from cache import *
from costs import *
from features import *
from gradients import *
from helpers import *
from model import *
from splits import *

import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import csv
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Run switch: the same flag is forwarded to load_csv_data when the training set is
# loaded and selects which cache / submission directories are used below.
SUB_SAMPLE = True
# Sub-sampled runs read and write under test/ instead of the full-run directories.
CACHE_DIR = "test/cache/" if SUB_SAMPLE else "cache/"
SUBMISSIONS_DIR = "test/submissions/" if SUB_SAMPLE else "submissions/"

In [3]:
# Load the training set. SUB_SAMPLE is forwarded as the second positional argument
# (presumably the sub-sampling switch -- confirm against load_csv_data).
y, x, ids = load_csv_data('data/train.csv', SUB_SAMPLE)

### Logistic Regression

In [4]:
def clean_standardize_expand(y, x, h):
    """Clean and standardize the inputs, then expand them polynomially.

    The steps applied to ``x`` are, in order: ``remove_errors``,
    ``remove_outliers``, ``standardize_all``, ``remove_nan_features`` and
    ``build_poly`` with the degree taken from ``h``.

    Args:
        y: targets; returned untouched.
        x: input array passed through the preprocessing steps.
        h: hyper-parameter mapping; ``h['degree']`` is cast to ``int``.

    Returns:
        Tuple ``(y, x_processed)``.
    """
    poly_degree = int(h['degree'])

    cleaned = remove_outliers(remove_errors(x))
    standardized = standardize_all(cleaned)
    expanded = build_poly(remove_nan_features(standardized), poly_degree)

    return y, expanded

def clean_expand(y, x, h):
    """Clean the inputs (without standardizing) and expand them polynomially.

    The steps applied to ``x`` are, in order: ``remove_errors``,
    ``remove_outliers``, ``remove_nan_features`` and ``build_poly`` with the
    degree taken from ``h``.

    Args:
        y: targets; returned untouched.
        x: input array passed through the preprocessing steps.
        h: hyper-parameter mapping; ``h['degree']`` is cast to ``int``.

    Returns:
        Tuple ``(y, x_processed)``.
    """
    poly_degree = int(h['degree'])

    cleaned = remove_errors(x)
    cleaned = remove_outliers(cleaned)
    cleaned = remove_nan_features(cleaned)

    return y, build_poly(cleaned, poly_degree)

In [5]:
def map_logistic(clean):
    """Wrap a cleaning function so that the targets it returns become 0/1 labels.

    Args:
        clean: callable ``clean(y, x, h) -> (y, x)``.

    Returns:
        A function with the same ``(y, x, h) -> (y, x)`` signature whose ``y``
        holds 0 where the cleaned target equals -1 and 1 everywhere else.
    """

    def inner_function(y, x, h):
        y, x = clean(y, x, h)
        # Same convention as the logistic model classes in this notebook:
        # -1 -> 0, everything else -> 1. The previous np.where(y == 1, 0, 1)
        # produced the inverted labelling (+1 -> 0).
        y = np.where(y == -1, 0, 1)
        return y, x

    return inner_function

def logistic_gradient(y, x, w, h):
    """Adapter giving ``compute_logistic_gradient`` the ``(y, x, w, h)`` signature.

    ``h`` is accepted but not used.
    """
    gradient = compute_logistic_gradient(y, x, w)
    return gradient

def logistic_gradient_ridge(y, x, w, h):
    """Logistic gradient plus the penalty term ``h['lambda'] * w``.

    Args:
        y, x, w: forwarded unchanged to ``compute_logistic_gradient``.
        h: hyper-parameter mapping; ``h['lambda']`` scales the ``w`` term.

    Returns:
        ``compute_logistic_gradient(y, x, w) + h['lambda'] * w``.
    """
    penalty_weight = h['lambda']
    data_term = compute_logistic_gradient(y, x, w)
    return data_term + penalty_weight * w
            
def logistic_error(y, x, w, h):
    """Report the logistic loss and the error count for weights ``w``.

    Args:
        y, x, w: forwarded to ``compute_logistic_error`` and to the function
            built by ``compute_error_count(predict_logistic)``.
        h: hyper-parameter mapping. It is accepted for signature compatibility
            and is not read: the previous version looked up ``h['lambda']``
            without using it, which raised ``KeyError`` for un-regularized
            runs that carry no ``'lambda'`` entry.

    Returns:
        dict with keys ``'logistic_err'`` and ``'n_err'``.
    """
    logistic_err = compute_logistic_error(y, x, w)
    n_err = compute_error_count(predict_logistic)(y, x, w)

    return {
        'logistic_err': logistic_err,
        'n_err': n_err
    }

In [12]:
# Single hyper-parameter configuration (every entry is a scalar) for the
# un-regularized logistic run below.
hs = {
    'batch_size': 1,
    'degree': 1,  # alternative sweep that was tried: np.concatenate([np.arange(1, 7)])
    'gamma': 1e-1, 
    'k_fold': 4,
    'lambda': 0,
    'max_iters': 1000,
    'num_batches': 1,
    'seed': 1
}

# Cache stored under CACHE_DIR. The file name looks like a hand-bumped key used to
# start from a fresh cache -- TODO confirm, and consider a descriptive name.
cache = Cache(CACHE_DIR + 'AAAAAAAABBCCDEF')

# NOTE(review): the output recorded for this cell alternates between two error values
# instead of decreasing. Possibly gamma is too large for the non-standardized features
# produced by clean_expand -- confirm before trusting these numbers.
_ = evaluate(
    clean = map_logistic(clean_expand), 
    fit   = descent_with_cache(
        descent    = stochastic_gradient_descent_e(logistic_gradient), 
        loss       = logistic_error, 
        round_size = 100,
        cache      = cache,
        log        = True
    ), 
    y     = y,
    x     = x,
    hs    = hs
)

iteration 100 - err = {'logistic_err': 7.566294588457086, 'n_err': 0.3286}
iteration 200 - err = {'logistic_err': 7.566294588457086, 'n_err': 0.3286}
iteration 300 - err = {'logistic_err': 7.566294588457086, 'n_err': 0.3286}
iteration 400 - err = {'logistic_err': 15.45955631439477, 'n_err': 0.6714}
iteration 500 - err = {'logistic_err': 7.566294588457086, 'n_err': 0.3286}
iteration 600 - err = {'logistic_err': 7.566294588457086, 'n_err': 0.3286}
iteration 700 - err = {'logistic_err': 15.45955631439477, 'n_err': 0.6714}
iteration 800 - err = {'logistic_err': 7.566294588457086, 'n_err': 0.3286}
iteration 900 - err = {'logistic_err': 7.566294588457086, 'n_err': 0.3286}
iteration 1000 - err = {'logistic_err': 7.566294588457086, 'n_err': 0.3286}


In [None]:
def compute_mle(y, x, w):
    """Mean negative log-likelihood of a logistic model with 0/1 targets.

    For every sample the term ``log(1 + exp(z)) - y * z`` with ``z = x[i] @ w``
    is accumulated, and the sum is divided by ``y.shape[0]``.

    Args:
        y: array of targets, one per row of ``x``.
        x: 2-D array of inputs, one sample per row.
        w: weight vector with one entry per column of ``x``.

    Returns:
        The averaged loss as a scalar.
    """
    logits = np.ravel(x @ w)
    targets = np.ravel(y)
    # logaddexp(0, z) == log(1 + exp(z)) but does not overflow for large z,
    # where the naive form evaluates to inf and poisons the whole sum.
    per_sample = np.logaddexp(0, logits) - targets * logits
    return np.sum(per_sample) / y.shape[0]

def print_values(y, x, w):
    """Debug helper: print each target followed by the linear output ``x[i] @ w``."""
    n_rows = y.shape[0]
    for row in range(n_rows):
        target_text = str(y[row])
        print("y: " + target_text)
        linear_output = x[row].T @ w
        print("pred :  " + str(linear_output))

def compute_mle2(y, x, w):
    """Summed (not averaged) cross-entropy between ``y`` and ``logistic_function(x @ w)``.

    Returns:
        ``-(y @ log(p) + (1 - y) @ log(1 - p))`` with ``p = logistic_function(x @ w)``.
    """
    probabilities = logistic_function(x @ w)
    positive_term = y @ np.log(probabilities)
    negative_term = (1 - y) @ np.log(1 - probabilities)
    return - (positive_term + negative_term)

class First_Order_Logistic_Regression_Model2(Model):
    """Logistic regression on cleaned, polynomially expanded features.

    ``standardize_all`` is not applied in this variant.
    """

    def prepare(self, x, y, h):
        """Clean and expand ``x`` and turn the targets into 0/1 labels.

        Args:
            x: input array to preprocess.
            y: targets; entries equal to -1 become 0, all others become 1.
            h: hyper-parameter mapping; ``h['degree']`` is cast to ``int``.

        Returns:
            Tuple ``(x_processed, y_mapped)``.
        """
        poly_degree = int(h['degree'])

        features = remove_nan_features(remove_outliers(remove_errors(x)))
        features = build_poly(features, poly_degree)
        labels = np.where(y == -1, 0, 1)

        return features, labels

    def fit(self, x, y, h={}):
        """Run ``logistic_regression`` from zero-initialized weights.

        Reads ``'batch_size'``, ``'n_iters'`` and ``'gamma'`` from ``h``.
        """
        batch_size, n_iters, gamma = (
            int(h['batch_size']),
            int(h['n_iters']),
            float(h['gamma']),
        )

        start_weights = np.zeros(x.shape[1])
        return logistic_regression(y, x, start_weights, batch_size, n_iters, gamma)

    def test(self, x, y, w, h):
        """Score ``w`` with ``compute_mle``; a NaN score is reported as ``inf``.

        The value is returned under the key ``'mse'`` even though it comes from
        the logistic loss.
        """
        score = compute_mle(y, x, w)
        return { 'mse': np.inf if np.isnan(score) else score }

In [None]:
# Cross-validated grid search over polynomial degree and step size for the
# un-regularized logistic model.
myModel = CrossValidationModel(First_Order_Logistic_Regression_Model2())

n_iters = [100]
batch_size = [50]

degrees = np.arange(1, 4)
# Five log-spaced step sizes between 1e-20 and 1e-10, with the largest one dropped.
gammas = np.logspace(-20, -10, 5)[:-1]

hs = dict(
    n_iters=n_iters,
    batch_size=batch_size,
    degree=degrees,
    gamma=gammas,
    k_fold=[4],
    seed=[0],
)

res = myModel.evaluate(
    x,
    y,
    hs,
    filename=CACHE_DIR + 'Logistic_Regression_ExploFullSampleNoStdProp',
)

plot_heatmap(res, hs, 'mse_te', 'degree', 'gamma')

# Last expression of the cell so that its value is displayed.
find_arg_min(res, 'mse_te')

In [None]:
# Hyper-parameters used for the submission. The mse_te / mse_tr entries look like
# scores pasted together with the parameters from a find_arg_min result -- TODO confirm.
# NOTE(review): degree=6, n_iters=10 and batch_size=1 are not part of the grid defined
# in the previous cell, so this configuration comes from a different run.
hs = {'batch_size': 1.0,
 'degree': 6.0,
 'gamma': 10**-20,
 'k_fold': 4.0,
 'n_iters': 10.0,
 'seed': 0.0,
 'mse_te': 0.3988701335328735,
 'mse_tr': 0.3988200000042866}

# Uses the myModel object bound by the grid-search cell above.
myModel.predict(hs, x, y, SUBMISSIONS_DIR + 'Logistic_Regression')

### Regularized Logistic Regression

In [None]:
class Regularized_Logistic_Regression_Model(Model):
    """Regularized logistic regression on cleaned, polynomially expanded features.

    ``standardize_all`` is not applied in this variant.
    """

    def prepare(self, x, y, h):
        """Clean and expand ``x`` and turn the targets into 0/1 labels.

        Args:
            x: input array to preprocess.
            y: targets; entries equal to -1 become 0, all others become 1.
            h: hyper-parameter mapping; ``h['degree']`` is cast to ``int``.

        Returns:
            Tuple ``(x_processed, y_mapped)``.
        """
        degree = int(h['degree'])

        x = remove_errors(x)
        x = remove_outliers(x)
        x = remove_nan_features(x)
        x = build_poly(x, degree)

        y = np.where(y == -1, 0, 1)

        return x, y

    def fit(self, x, y, h={}):
        """Run ``reg_logistic_regression`` from zero-initialized weights.

        Reads ``'batch_size'``, ``'n_iters'``, ``'gamma'`` and ``'lambda'`` from ``h``.
        """
        batch_size = int(h['batch_size'])
        n_iters = int(h['n_iters'])
        gamma = float(h['gamma'])
        lambda_ = float(h['lambda'])

        initial_w = np.zeros(x.shape[1])
        return reg_logistic_regression(y, x, initial_w, batch_size, n_iters, gamma, lambda_)

    def test(self, x, y, w, h):
        """Score ``w`` with ``compute_mle``; a NaN score is reported as ``inf``.

        The value is returned under the key ``'mse'`` even though it comes from
        the logistic loss.
        """
        mse = compute_mle(y, x, w)
        if np.isnan(mse):
            mse = np.inf
        return { 'mse': mse }

    def predict(self, h, x_tr, y_tr, name):
        """Fit on the training data and write -1/+1 predictions for data/test.csv.

        Args:
            h: hyper-parameter mapping used by ``prepare`` and ``fit``.
            x_tr, y_tr: raw training inputs and targets.
            name: path handed to ``create_csv_submission``.
        """
        x_tr, y_tr = self.prepare(x_tr, y_tr, h)
        w = self.fit(x_tr, y_tr, h)

        _, x_pred, ids = load_csv_data("data/test.csv", sub_sample=False)
        x_pred, _ = self.prepare(x_pred, None, h)

        # Use the prepared test inputs and the weights fitted above. The previous
        # code referenced `data` and `weights`, which are not defined in this method.
        logits = np.dot(x_pred, w)

        # x @ w is a logit, so P(y = 1) > 0.5 is the same as logit > 0.
        # Thresholding the raw logit at 0.5 shifted the decision boundary.
        y_pred = np.where(logits > 0, 1.0, -1.0)

        create_csv_submission(ids, y_pred, name)
        

In [None]:
# Cross-validated evaluation of the regularized logistic model.
# NOTE: this rebinds the notebook-level name myModel used by the cells below.
myModel = CrossValidationModel(Regularized_Logistic_Regression_Model())

n_iters = [1000]
batch_size = [1]
# degree / gamma / lambda are single scalars here (the trailing comments keep the sweeps
# that were used before), unlike the list-valued entries of the other keys.
degrees = 4#np.arange(1,8)
gammas = 10**-7 #np.logspace(-10, -5, 10)
lambdas = 10**-8#np.logspace(-8, -5, 3)

hs={
    'n_iters': n_iters,
    'batch_size': batch_size,
    'degree': degrees,
    'gamma': gammas,
    'lambda': lambdas,
    'k_fold': [4],
    'seed': [0]
}

res = myModel.evaluate(x, y, hs, filename=CACHE_DIR+'Regularized_Logistic_Regression_Explo')


# NOTE(review): with scalar degree / gamma / lambda the grid holds a single point, so
# these heatmaps are presumably only informative once the sweeps are restored -- confirm.
plot_heatmap(res, hs, 'mse_te', 'degree', 'lambda')
plt.figure(2)
plot_heatmap(res, hs, 'mse_te', 'degree', 'gamma')
find_arg_min(res, 'mse_te')

In [None]:
# Hyper-parameters used for the regularized-logistic submission. The mse_tr / mse_te
# entries look like scores pasted along with the parameters -- TODO confirm.
# NOTE(review): n_iters is 10000 here while the evaluation cell above used 1000.
hs = {'batch_size': 1,
 'degree': 4,
 'gamma': 1e-07,
 'k_fold': 4,
 'lambda': 1e-08,
 'n_iters': 10000,
 'seed': 0,
 'mse_tr': 0.6896604235218645,
 'mse_te': 0.6896430727585345}


# Uses the myModel object bound by the evaluation cell above.
myModel.predict(hs, x, y, SUBMISSIONS_DIR + 'Regularized_Logistic_RegressionNoStd')



In [None]:
import pandas

# Sanity check on the generated submission: which fraction of the rows is predicted +1?
csv_file = SUBMISSIONS_DIR + 'Regularized_Logistic_RegressionNoStd'

data = pandas.read_csv(csv_file)

l = data.Prediction

# Vectorized count of the rows whose prediction equals 1.
counter = int((l == 1).sum())

# Divide by the number of rows actually read rather than a hard-coded 568238, so the
# ratio stays correct if the submission file has a different length.
print(counter / len(l) if len(l) else 0.0)

### Ridge

In [None]:
class RidgeRegression_MSE_Degree_Model(Model):
    """Ridge regression on cleaned, standardized, polynomially expanded features."""

    def prepare(self, x, y, h):
        """Clean, standardize and expand ``x``; ``y`` is returned untouched.

        ``h['degree']`` (cast to ``int``) is the degree handed to ``build_poly``.
        """
        poly_degree = int(h['degree'])

        cleaned = remove_outliers(remove_errors(x))
        cleaned = remove_nan_features(standardize_all(cleaned))

        return build_poly(cleaned, poly_degree), y

    def fit(self, x, y, h={}):
        """Call ``ridge_regression`` with ``h['lambda']`` cast to ``float``."""
        penalty = float(h['lambda'])
        return ridge_regression(y, x, penalty)

    def test(self, x, y, w, h):
        """Score ``w`` with ``compute_mse``; a NaN score is reported as ``inf``."""
        score = compute_mse(y, x, w)
        return { 'mse': np.inf if np.isnan(score) else score }


In [None]:
# Cross-validated grid search for ridge regression over polynomial degree and lambda.
# NOTE: this rebinds the notebook-level names myModel, degrees, lambdas, hs and res.
myModel = CrossValidationModel(RidgeRegression_MSE_Degree_Model())

# Degrees 10 to 13 and ten log-spaced penalties between 1e-6 and 1e-4.
degrees = np.arange(10, 14)
lambdas = np.logspace(-6, -4,10)

hs={
    'degree': degrees,
    'lambda': lambdas,
    'k_fold': [4],
    'seed': [0]
}

# The filename argument points inside CACHE_DIR.
res = myModel.evaluate(x, y, hs, filename=CACHE_DIR+'Ridge_Explo_Vinc')


plot_heatmap(res, hs, 'mse_te', 'degree', 'lambda')
find_arg_min(res, 'mse_te')

### Lasso

In [None]:
class Lasso_SGD_MSE_Degree_Model(Model):
    """Lasso fitted by stochastic gradient descent on expanded, standardized features."""

    def prepare(self, x, y, h):
        """Clean, standardize and expand ``x``; ``y`` is returned untouched.

        ``h['degree']`` (cast to ``int``) is the degree handed to ``build_poly``.
        """
        poly_degree = int(h['degree'])

        cleaned = remove_outliers(remove_errors(x))
        cleaned = remove_nan_features(standardize_all(cleaned))

        return build_poly(cleaned, poly_degree), y

    def fit(self, x, y, h={}):
        """Run ``lasso_stochastic_gradient_descent`` from zero-initialized weights.

        Reads ``'batch_size'``, ``'n_iters'``, ``'lambda'`` and ``'gamma'`` from ``h``.
        """
        batch_size, n_iters, lambda_, gamma = (
            int(h['batch_size']),
            int(h['n_iters']),
            float(h['lambda']),
            float(h['gamma']),
        )

        start_weights = np.zeros(x.shape[1])

        return lasso_stochastic_gradient_descent(
            y, x, start_weights, batch_size, n_iters, gamma, lambda_
        )

    def test(self, x, y, w, h):
        """Score ``w`` with ``compute_mse``; a NaN score is reported as ``inf``."""
        score = compute_mse(y, x, w)
        return { 'mse': np.inf if np.isnan(score) else score }

In [None]:
# Cross-validated grid search for the lasso model over degree, lambda and gamma.
# NOTE: this rebinds the notebook-level names myModel, degrees, lambdas, gammas, hs and res.
myModel = CrossValidationModel(Lasso_SGD_MSE_Degree_Model())

# NOTE(review): np.arange(4) yields 0, 1, 2, 3, so degree 0 is part of the sweep, whereas
# the logistic sweep starts at 1 -- confirm that degree 0 is intended.
degrees = np.arange(4)
lambdas = np.logspace(-3, -1,5)
gammas = np.logspace(-15, -10, 3)

# batch_size and n_iters are plain scalars here, while k_fold and seed are lists.
hs={
    'batch_size': 1,
    'n_iters': 1000,
    'degree': degrees,
    'lambda': lambdas,
    'gamma': gammas,
    'k_fold': [4],
    'seed': [0]
}

res = myModel.evaluate(x, y, hs, filename=CACHE_DIR+'Lasso_Explo_VincTEST')


plot_heatmap(res, hs, 'mse_te', 'degree', 'lambda')
plot_heatmap(res, hs, 'mse_te', 'degree', 'gamma')
find_arg_min(res, 'mse_te')