In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

from preprocessing import preprocessing
from features_engineering import augment
from implementations import *
from cross_validation import *
from proj1_helpers import *
import pickle as pkl

In [2]:
# Locations of the training and test CSV files, relative to this notebook
DATA_TRAIN_PATH = '../data/train.csv' 
DATA_TEST_PATH = '../data/test.csv'
# load_csv_data returns three values per file; judging by the names they are
# the labels, the feature matrix and the sample ids (confirm in proj1_helpers)
y, tX_train, ids = load_csv_data(DATA_TRAIN_PATH)
# the first value returned for the test file is not used and is discarded
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [3]:
#making y a column of shape (n_samples, 1) for dimension compatibility with
#the (n_features, 1) weight vectors used by the cells below.
#reshape(-1,1) is idempotent: re-running this cell leaves y unchanged, whereas
#y[:,np.newaxis] would add one more axis on every run
y=y.reshape(-1,1)

#computing the size of the prediction to generate
pred_len=tX_test.shape[0]

#all available regression functions
#every wrapper is called with the same six positional arguments
#(labels, samples, lambda, initial weights, max iterations, gamma) and forwards
#only the ones its underlying implementation takes, so that models can be
#swapped freely in the lists handed to predictions / cross_validation_kfolds
lq=lambda a,b,c,d,e,f:least_squares(a,b)
lqgd=lambda a,b,c,d,e,f: least_squares_GD(a, b, d, e, f)
lqsgd=lambda a,b,c,d,e,f: least_squares_SGD(a, b, d, e, f)
r=lambda a,b,c,d,e,f: ridge_regression(a, b, c)
lgd=lambda a,b,c,d,e,f: logistic_regression(a,b,d,e,f)
lsgd=lambda a,b,c,d,e,f: logistic_regression_SGD(a,b,d,e,f)
#the regularized logistic regressions are used directly, without a wrapper
#(presumably they already accept all six arguments - confirm in implementations)
rlgd=reg_logistic_regression
rlsgd=reg_logistic_regression_SGD

In [4]:
def split(perc,y,x,seed=1):
    """Shuffle the samples and cut them into a (perc, 1-perc) pair of subsets.

    Parameters
    ----------
    perc : float
        Fraction of the samples placed in the first subset; the cut position
        is int(perc*len(y)).
    y : np.ndarray
        Labels, indexed along their first axis.
    x : np.ndarray
        Samples, in the same order as y along their first axis.
    seed : int, optional
        Seed given to numpy's global random generator before shuffling.
        The default (1) reproduces the split obtained before this parameter
        existed.

    Returns
    -------
    tuple
        (y_first, x_first, y_second, x_second) where the "first" arrays hold
        the leading int(perc*len(y)) shuffled samples and the "second" arrays
        hold the remaining ones.
    """
    # the global generator is seeded (as before) so that the shuffle, and any
    # later use of np.random, stays reproducible from run to run
    np.random.seed(seed=seed)
    sample_size=len(y)
    cut_ind=int(perc*sample_size)
    # one permutation shared by x and y keeps samples and labels aligned
    shuffle_indices = np.random.permutation(sample_size)
    shuffled_x=x[shuffle_indices]
    shuffled_y=y[shuffle_indices]
    return shuffled_y[:cut_ind], shuffled_x[:cut_ind],shuffled_y[cut_ind:], shuffled_x[cut_ind:]

In [5]:
def preprocess_and_expand(xtr,xte,degrees):
    """Preprocess the training and test samples, then expand every subset.

    Both inputs are passed through preprocessing, which returns a pair whose
    first element is an indexable collection of subsets and whose second
    element is returned untouched (the rest of this notebook uses it to index
    labels and predictions).
    Subset number k of each collection is then replaced in place by
    augment(subset, degrees[k]). Only the positions produced by zipping
    degrees with both collections are expanded.

    Returns the expanded training subsets, the second output of the training
    preprocessing, the expanded test subsets and the second output of the
    test preprocessing, in that order.
    """
    train_subsets, train_masks = preprocessing(xtr)
    test_subsets, test_masks = preprocessing(xte)
    paired = zip(degrees, train_subsets, test_subsets)
    for idx, (deg, train_part, test_part) in enumerate(paired):
        # overwrite each slot so the non-expanded subset is no longer referenced by the list
        train_subsets[idx] = augment(train_part, deg)
        test_subsets[idx] = augment(test_part, deg)
    return train_subsets, train_masks, test_subsets, test_masks

In [6]:
def predict_labels(tx,w,logistic):
    """Turn the model output for the samples tx into class labels -1 / 1.

    The raw score is tx.dot(w). When logistic is true the score is first
    passed through 2*sigmoid(score)-1. Scores that are <= 0 become -1 and
    scores that are > 0 become 1; the thresholded array is returned.
    """
    scores = tx.dot(w)
    if logistic:
        # presumably recentres the sigmoid output around 0 so the same threshold applies
        scores = 2*sigmoid(scores) - 1
    non_positive = scores <= 0
    positive = scores > 0
    scores[non_positive] = -1
    scores[positive] = 1
    return scores

def predictions(ys, tx, px, mask_t, mask_p, reg_fs, len_pred, lambdas, max_iters, gammas, logistics):
    """Train one model per subset and assemble the predictions for the inputs px.

    All list arguments are walked in parallel, one entry per subset:
      ys         labels to index with the training mask of the subset
      tx, px     training samples and samples to predict
      mask_t     index/mask selecting the subset's labels inside ys[k]
      mask_p     index/mask selecting the subset's rows inside the output
      reg_fs     regression function, called as
                 reg_f(labels, samples, lambda_, initial_w, max_iter, gamma)
                 and expected to return a pair whose first element is the weights
      lambdas, max_iters, gammas   hyperparameters forwarded to reg_f
      logistics  flag forwarded to predict_labels
    len_pred is the number of rows of the returned (len_pred, 1) array; rows
    not selected by any entry of mask_p stay at 0.
    """
    y_pred = np.zeros((len_pred,1))
    per_subset = zip(tx, mask_t, px, mask_p, lambdas, max_iters, gammas, logistics, reg_fs, ys)
    for sub_train, train_mask, sub_test, test_mask, lambda_, n_iter, gamma, is_logistic, fit, labels in per_subset:
        # labels of the training samples that belong to this subset
        sub_labels = labels[train_mask]
        # every model starts from an all-zero column of weights
        w0 = np.zeros((sub_train.shape[1], 1))
        weights, _ = fit(sub_labels, sub_train, lambda_, w0, n_iter, gamma)
        # write this subset's labels at the positions selected by its mask
        y_pred[test_mask] = predict_labels(sub_test, weights, is_logistic)
    return y_pred

In [7]:
def basic_cross(ytr, yte, xstr, mtr, xste, mte, reg_fs, lambdas, max_iters, gammas, logistics):
    """Score the models reg_fs on a single training / held-out split.

    The models are trained through predictions() on (ytr, xstr, mtr) and asked
    to predict the held-out inputs (xste, mte). The returned value is the
    number of entries where yte equals the prediction, divided by len(yte).
    """
    # models flagged as logistic are trained on a copy of the labels in which
    # -1 is replaced by 0 and every other value by 1
    labels_01 = np.where(ytr == -1, 0, 1)
    labels_per_model = []
    for is_logistic in logistics:
        labels_per_model.append(labels_01 if is_logistic else ytr)
    n_test = len(yte)
    predicted = predictions(labels_per_model, xstr, xste, mtr, mte, reg_fs, n_test, lambdas, max_iters, gammas, logistics)
    n_correct = np.sum(yte == predicted)
    return n_correct/n_test

In [8]:
def cross_validation_kfolds(y_train, x_train, num_folds, mtr, reg_fs, lambdas, max_iters, gammas, logistics):
    """Estimate the accuracy of each subset's model with k-fold cross-validation.

    The list arguments x_train, mtr, reg_fs, lambdas, max_iters, gammas and
    logistics are walked in parallel, one entry per subset. For every subset
    the labels y_train[mask] and the samples are cut into folds by
    k_fold_splits(labels, samples, num_folds); a model is trained on each
    training fold and scored on the matching validation fold.

    y_train is expected to hold -1 / 1 labels. Models flagged as logistic are
    trained on a copy in which -1 is replaced by 0 (the same convention as
    basic_cross), while the score is always computed against the original
    validation labels, which is the encoding predict_labels returns.

    Prints the mean fold accuracy of each subset and returns these means as a
    1-D array, one value per subset.
    """
    scores = []
    for x_sub,mask,lambda_, max_iter, gamma, logistic, reg_f in zip(x_train,mtr,lambdas,max_iters,gammas,logistics,reg_fs):
        y_correspond = y_train[mask]
        scores_sub = []
        for x_train_s, x_val_s, y_train_s, y_val_s in k_fold_splits(y_correspond, x_sub, num_folds):
            # logistic models fit on 0 / 1 labels, exactly as in basic_cross
            y_fit = np.where(y_train_s == -1, 0, 1) if logistic else y_train_s
            initial_w= np.zeros((x_train_s.shape[1], 1))
            w,_ = reg_f(y_fit, x_train_s, lambda_, initial_w, max_iter, gamma)
            y_pred = predict_labels(x_val_s,w,logistic)
            # fraction of validation samples whose predicted label is correct
            scores_sub.append(np.mean(y_pred == y_val_s))
        subset_score = np.array(scores_sub).mean()
        print("finished subset average is :",subset_score)
        scores.append(subset_score)
    return np.array(scores)

In [None]:
#fraction of the loaded training samples used to fit the models;
#the remaining 1-perc is held out to measure their performance
perc=0.8
#shuffling the loaded training data and cutting it into the part used for
#training (ytr, xtr) and the held-out part (yte, xte)
ytr,xtr,yte,xte=split(perc,y,tX_train)

In [None]:
#degrees of expansion for each of the subsets generated by the preprocessing
#(one entry per subset, handed to augment)
degrees=[3,3,3]

#preprocessing and expanding both the training part and the held-out part of the split;
#mtr / mte are the second outputs of the preprocessing, used later to index labels and predictions
xstr, mtr, xste, mte=preprocess_and_expand(xtr,xte,degrees)


In [None]:
# grid search of the best lambda for ridge regression, one subset at a time:
# while the lambda of one subset is being varied, the other subsets keep their
# current (initial or already selected) settings

# search grid: interval_size exponents evenly spaced in [-20, 0];
# the tested lambdas are 10**exponent
interval_size=10
interval=np.linspace(-20, 0, interval_size)

# candidate models and, for each of them, whether it is a logistic model
test_models=[r]
test_logistics=[False]

# model currently assigned to each of the subsets
logistics=[False,False,False]
reg_fs= [r,r,r]
datasets=3

# remaining model parameters (the ridge wrapper r only forwards lambda,
# so these two lists are ignored by it)
max_iters=[100000,100000,100000]
gammas=[10**-8,10**-8,10**-8]

# starting values of the hyperparameters
lda1=10**-14
lda2=10**-12
lda3=10**-16
lambdas=[lda1,lda2,lda3]

# results container: one row (lambda, model index, performance) per tested configuration
n_configs=interval_size*len(test_models)
results=np.zeros((n_configs,3))

# finding lambdas
for data_num in range(datasets):
    for i, m in enumerate(test_models):
        for j, v in enumerate(interval):
            row=interval_size*i+j
            lda1=10**v
            lambdas[data_num]=lda1
            reg_fs[data_num]=m
            logistics[data_num]=test_logistics[i]
            performance=basic_cross(ytr, yte, xstr, mtr, xste, mte, reg_fs, lambdas, max_iters, gammas, logistics)
            results[row]=[lda1,i,performance]
            print("Step ",data_num+1,": ", 100*(row+1)/n_configs, "% done")

    # rows reaching the best performance; the first of them is kept for this subset
    best_rows=results[results[:,2]==np.max(results[:,2])]
    print(best_rows)
    lambdas[data_num]=best_rows[0,0]
    index_mod=int(best_rows[0,1])
    reg_fs[data_num]=test_models[index_mod]
    logistics[data_num]= test_logistics[index_mod]
    # start from an empty container for the next subset
    results=np.zeros((n_configs,3))

# estimating the performance of the best overall model
performance=basic_cross(ytr, yte, xstr, mtr, xste, mte, reg_fs, lambdas, max_iters, gammas, logistics)
print("Best performance: ",performance*100, "%") 
print("Best lambdas: ", lambdas)
print("Best models: ", [("r" if m==r else "lsgd") for m in reg_fs])

In [None]:
# for degree 2
# NOTE(review): this cell scores whatever expansion is currently stored in
# xstr / xste; it presumably expects preprocess_and_expand to have been run
# with degrees=[2,2,2] beforehand - confirm before trusting the number
lda1=1.2*10**-15
lda2=1.2*10**-10
lda3=1.2*10**-14
reg_fs=[r,r,r]
logistics=[False,False,False]
lambdas=[lda1,lda2,lda3]
# the ridge wrapper r ignores both lists; they only fill the shared argument
# list. They are defined here (as gammas, the name used in the call below) so
# the cell does not depend on a value left over from another cell
gammas,max_iters= [0,0,0],[0,0,0]

performance=basic_cross(ytr, yte, xstr, mtr, xste, mte, reg_fs, lambdas, max_iters, gammas, logistics)
print("The best performance for the degree 2 espansion is: ", performance) 

In [None]:
# for degree 3
# scores the ridge models on the held-out part of the split, using the
# expansion currently stored in xstr / xste
reg_fs=[r,r,r]
logistics=[False,False,False]
lambdas=[3.5938136638046256e-05, 3.5938136638046256e-05, 0.005994842503189421]
# the ridge wrapper r ignores both lists; they only fill the shared argument
# list. They are defined here (as gammas, the name used in the call below) so
# the cell does not depend on a value left over from another cell
gammas,max_iters= [0,0,0],[0,0,0]

performance=basic_cross(ytr, yte, xstr, mtr, xste, mte, reg_fs, lambdas, max_iters, gammas, logistics)
print("The best performance for the degree 3 espansion is: ", performance) 

In [None]:
# expanding for the k-fold cross-validation:
# degree of expansion of each subset, one entry per subset
degrees=[3,3,3]
def preprocess_and_expandkcross(xtr,degrees):
    """Preprocess a single set of samples and expand each of its subsets.

    preprocessing(xtr) returns a pair: an indexable collection of subsets and
    a second value that is returned untouched. Subset number k is replaced in
    place by augment(subset, degrees[k]); only the positions produced by
    zipping degrees with the subsets are expanded.

    Returns the expanded subsets and the second output of preprocessing.
    """
    subsets, masks = preprocessing(xtr)
    for idx, (deg, part) in enumerate(zip(degrees, subsets)):
        subsets[idx] = augment(part, deg)
    return subsets, masks

#preprocessing and expanding the whole loaded training set (no test set is involved here);
#note that this rebinds mtr, which the hold-out cells also use
xtraink, mtr=preprocess_and_expandkcross(tX_train,degrees)

In [None]:
# for degree 3
# 5-fold cross-validation of the ridge models on the whole loaded training set
lda1=2.26*10**-15
lda2=2.42*10**-12
lda3=2.21*10**-16
reg_fs=[r,r,r]
logistics=[False,False,False]
lambdas=[lda1,lda2,lda3]
# the ridge wrapper r ignores both lists; they only fill the shared argument list
gammas,max_iters= [0,0,0],[0,0,0]

# performance is an array holding the mean fold accuracy of each subset
performance=cross_validation_kfolds(y, xtraink, 5, mtr, reg_fs, lambdas, max_iters, gammas, logistics)
print("The best performance for the degree 3 espansion is: ", performance) 

In [9]:
# for degree 3
# final submission: train on the whole loaded training set and predict the test set
reg_fs=[r,r,r]
logistics=[False,False,False]
lda1=2.26*10**-15
lda2=2.42*10**-12
lda3=2.21*10**-16
lambdas=[lda1,lda2,lda3]
# the ridge wrapper r ignores both lists; they only fill the shared argument
# list. They are defined here (as gammas, the name used in the call below) so
# the cell also runs on a fresh kernel
gammas,max_iters= [0,0,0],[0,0,0]
degrees=[3,3,3]
# note: this rebinds xstr, mtr, xste and mte, replacing the hold-out versions
xstr, mtr, xste, mte=preprocess_and_expand(tX_train,tX_test,degrees)
# one prediction per test sample: the output must be sized from the test set,
# not from the held-out part of the training split
size_pred=tX_test.shape[0]
# every subset is trained on the original labels (no logistic model is used)
ys=[y,y,y]
y_bar=predictions(ys, xstr, xste, mtr, mte, reg_fs, size_pred, lambdas, max_iters, gammas, logistics)
create_csv_submission(ids_test, y_bar, "predictions.csv")

Degree 2
Degree 3
Degree 2
Degree 3
Degree 2
Degree 3
Degree 2
Degree 3
Degree 2
Degree 3
Degree 2
Degree 3


MemoryError: 