In [1]:
import numpy as np

from implementations import *
from proj1_helpers import *

In [2]:
# Load data
# Path of the training CSV, relative to the notebook's working directory.
DATA_TRAIN_PATH = 'train.csv' 
# The four returned values are bound at notebook level; `ids` and `headers` are later
# read as globals inside cross_validation, so they must not be rebound before it runs.
y, tX, ids, headers = load_csv_data(DATA_TRAIN_PATH) # load_csv_data was modified to also return the headers

In [3]:
def build_k_indices(y, k_fold, seed):
    """Randomly partition the row indices of ``y`` into ``k_fold`` equal folds.

    The global NumPy random generator is seeded with ``seed`` and the row
    indices ``0 .. len(y) - 1`` are shuffled. Every fold receives
    ``len(y) // k_fold`` indices; when the row count is not divisible by
    ``k_fold`` the leftover shuffled indices are not assigned to any fold.

    Returns:
        Integer array of shape ``(k_fold, len(y) // k_fold)``; row ``i`` holds
        the indices that belong to fold ``i``.
    """
    n_samples = y.shape[0]
    fold_size = n_samples // k_fold
    np.random.seed(seed)
    shuffled = np.random.permutation(n_samples)
    # Keep only as many indices as fill whole folds, then cut them into rows.
    return shuffled[:k_fold * fold_size].reshape(k_fold, fold_size)

In [4]:
def final_model(tX, headers, y, degree, tX_test, ids_test, headers_test,
                max_iters=10000, gamma=0.01):
    """Train the three logistic-regression models and predict on a test set.

    The training features are processed with ``process_features_train``, which
    returns per-group lists of data and targets. A base model is trained first;
    its weights are then used as the starting point for the two jet-specific
    models. The test features are processed with ``process_features_test`` and
    the three weight vectors are handed to ``create_predictions``.

    Args:
        tX: Training feature matrix.
        headers: Column headers matching ``tX``.
        y: Training targets.
        degree: Polynomial degree forwarded to the feature-processing helpers.
        tX_test: Test feature matrix.
        ids_test: Identifiers of the test rows.
        headers_test: Column headers matching ``tX_test``.
        max_iters: Number of iterations for every training run (default 10000).
        gamma: Step size for every training run (default 0.01).

    Returns:
        Whatever ``create_predictions`` returns for the processed test set.
    """
    # Process the features used to train the models; the ids returned for the
    # training set are not needed here.
    train_data, train_targets, _ = process_features_train(tX, headers, y, degree)

    # Train the base model.
    w_1 = logistic_regression_demo(train_targets[0], train_data[0],
                                   max_iters=max_iters, gamma=gamma)

    # Train the jet=1 model using the base model weights as initial weights.
    w_2 = logistic_regression_demo_winit(train_targets[2], train_data[2], w_1,
                                         max_iters=max_iters, gamma=gamma)

    # Train the jet=2/3 model using the base model weights as initial weights.
    # NOTE(review): targets index 3 is paired with data index 4 here, unlike the
    # matching indices used for the other two models -- confirm this is the
    # intended layout of process_features_train's output.
    w_3 = logistic_regression_demo_winit(train_targets[3], train_data[4], w_1,
                                         max_iters=max_iters, gamma=gamma)

    # Process the test set with the headers that describe it (previously the
    # training headers were used and `headers_test` was silently ignored).
    test_data, test_ids = process_features_test(tX_test, headers_test, ids_test, degree)

    # Create predictions from the three trained weight vectors.
    weights = [w_1, w_2, w_3]
    return create_predictions(weights, test_data, test_ids)

In [5]:
def cross_validation(y, x, k_indices, k, gamma, degree):
    """Return the held-out accuracy of ``final_model`` for fold ``k``.

    Fold ``k`` of ``k_indices`` is used as the test split and all remaining
    folds, flattened together, form the training split. ``final_model`` is
    trained on the training split and its predictions on the test split are
    compared element-wise with the true targets.

    Args:
        y: Targets for every row of ``x``.
        x: Feature matrix.
        k_indices: 2-D integer array with one row of indices per fold.
        k: Index of the fold to hold out.
        gamma: Accepted for interface compatibility; not forwarded, because
            ``final_model`` is called without a step size and uses its own.
        degree: Polynomial degree forwarded to ``final_model``.

    Returns:
        Fraction of held-out rows whose prediction equals the target.

    Note:
        Reads the notebook-level globals ``ids`` and ``headers``; they must
        still hold the values produced when the data was loaded.
    """
    # Fold k is held out; every other fold is merged into one training index.
    test_idx = k_indices[k]
    train_idx = k_indices[np.arange(len(k_indices)) != k].ravel()

    x_tr, y_tr = x[train_idx], y[train_idx]
    x_te, y_te = x[test_idx], y[test_idx]
    ids_te = ids[test_idx]

    print("Fold number: {f}, polynomial degree: {d}".format(f=k+1, d=degree))

    # Train on the training folds and predict the held-out fold.
    y_pred_te = final_model(x_tr, headers, y_tr, degree, x_te, ids_te, headers)

    # Flatten both sides so that a column-vector / 1-D mix is compared
    # element-wise instead of broadcasting into an N x N matrix.
    return np.mean(np.ravel(y_pred_te) == np.ravel(y_te))


In [6]:
def cross_validation_demo(y, x, degrees, k_fold=4, seed=1, gamma=0.01):
    """Run k-fold cross-validation for polynomial degrees ``1 .. degrees``.

    Args:
        y: Targets for every row of ``x``.
        x: Feature matrix.
        degrees: Highest polynomial degree to evaluate (degrees start at 1).
        k_fold: Number of folds (default 4).
        seed: Seed passed to ``build_k_indices`` for the shuffle (default 1).
        gamma: Value forwarded to ``cross_validation`` (default 0.01).

    Returns:
        Array of shape ``(degrees, k_fold)``; entry ``[d - 1, k]`` is the
        accuracy reported by ``cross_validation`` for degree ``d`` and fold ``k``.
    """
    # Split the row indices into k folds once so every degree sees the same split.
    k_indices = build_k_indices(y, k_fold, seed)

    # One row per degree, one column per fold.
    accuracy = np.zeros((degrees, k_fold))
    for degree in range(1, degrees + 1):
        accuracy[degree - 1] = [
            cross_validation(y, x, k_indices, k, gamma, degree)
            for k in range(k_fold)
        ]

    return accuracy

In [7]:
# `degrees` is the highest polynomial degree to try: cross_validation_demo
# evaluates every degree from 1 up to and including this value.
degrees = 5
# Result has one row per degree and one column per fold.
accuracy = cross_validation_demo(y, tX, degrees)

Fold number: 1, polynomial degree: 1
Current iteration=9999, training loss=[78308.52783414]
Proportion test error:  [0.28384]
Current iteration=9999, training loss=[25497.1939226]
Proportion test error:  [0.31835786]
Current iteration=9999, training loss=[23254.18676019]
Proportion test error:  [0.28615977]
Fold number: 2, polynomial degree: 1
Current iteration=9999, training loss=[77755.43566982]
Proportion test error:  [0.27941333]
Current iteration=9999, training loss=[25783.45759898]
Proportion test error:  [0.32486264]
Current iteration=9999, training loss=[24145.39143166]
Proportion test error:  [0.28533358]
Fold number: 3, polynomial degree: 1
Current iteration=9999, training loss=[77940.25242795]
Proportion test error:  [0.28784]
Current iteration=9999, training loss=[25483.62175084]
Proportion test error:  [0.31592061]
Current iteration=9999, training loss=[23367.0346446]
Proportion test error:  [0.28758411]
Fold number: 4, polynomial degree: 1
Current iteration=9999, training

In [8]:
# Display the accuracy matrix: rows are polynomial degrees 1..degrees, columns are folds.
accuracy

array([[0.601104, 0.597664, 0.601792, 0.609648],
       [0.595088, 0.592384, 0.590128, 0.595392],
       [0.591184, 0.59016 , 0.58656 , 0.592848],
       [0.589744, 0.588432, 0.58704 , 0.591568],
       [0.587632, 0.586944, 0.582512, 0.587696]])

In [3]:
# Process the full training set with degree 3 for manual inspection.
# NOTE(review): this rebinds the notebook-level `ids` that cross_validation reads
# as a global; re-run the data-loading cell before cross-validating again.
data, targets, ids = process_features_train(tX, headers, y, 3)

In [4]:
# Number of rows in the target group at index 1.
targets[1].shape

(99913,)

In [5]:
# Rows and columns of the data group at index 2.
data[2].shape

(77544, 55)

In [6]:

# Warm-start experiment on the group at index 2, starting from all-zero weights.
# NOTE(review): w_init has 20 rows while data[2] was shown above to have 55
# columns -- confirm that logistic_regression_demo_winit really uses this vector.
w_init = np.zeros((20, 1))
# The positional 100 and 0.01 are presumably max_iters and gamma -- verify against the helper's signature.
logistic_regression_demo_winit(targets[2], data[2], w_init, 100, 0.01)

Current iteration=0, training loss=[42714.73023504]
Current iteration=99, training loss=[38571.33097913]
(15509,)
(15509, 1)
Proportion test error:  [0.33528919]


array([[-1.23516334e-01],
       [-3.44617687e-02],
       [-8.69974063e-02],
       [-6.81018520e-02],
       [-1.44760662e-01],
       [-1.29050699e-01],
       [-7.83263475e-02],
       [-5.32919207e-02],
       [-8.51555363e-02],
       [-5.48536118e-02],
       [ 3.12272461e-02],
       [ 3.16678886e-02],
       [ 2.15440850e-02],
       [ 7.35803185e-02],
       [ 4.61578998e-02],
       [ 1.51506142e-02],
       [-5.27320226e-02],
       [-2.37449040e-02],
       [-4.61177568e-03],
       [ 2.65442099e-02],
       [ 1.65072654e-02],
       [ 1.16433142e-02],
       [-4.52733902e-02],
       [-1.90874624e-02],
       [-2.74465241e-03],
       [ 6.44972287e-02],
       [ 6.08726149e-02],
       [ 6.55895915e-02],
       [ 3.74990347e-02],
       [ 6.58310295e-03],
       [-3.58567387e-03],
       [-6.92765470e-05],
       [-3.76602840e-02],
       [-5.70460444e-03],
       [-3.79688274e-02],
       [-3.58609962e-02],
       [-2.00335014e-02],
       [ 7.54517083e-03],
       [-1.4

In [None]:
# Scratch cell: a shorter (500-iteration) run of the first two models from final_model.
# NOTE(review): `degree` was not defined at notebook level; 3 matches the
# exploratory process_features_train call earlier in the notebook -- adjust as needed.
degree = 3
# NOTE(review): this rebinds the notebook-level `ids` that cross_validation reads.
data, targets, ids = process_features_train(tX, headers, y, degree)

# train base model
w_1 = logistic_regression_demo(targets[0], data[0], max_iters=500, gamma=0.01)

# train jet=1 model using base model weights as initial weights
# (targets[2] goes with data[2], as in final_model; targets[1] has a different row count)
w_2 = logistic_regression_demo_winit(targets[2], data[2], w_1, max_iters=500, gamma=0.01)