In [23]:
# Useful starting lines
%matplotlib inline
import csv
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

# Original import order is kept on purpose: with star imports, later names shadow earlier ones.
from proj1_helpers import *
from proj1_utils import *
from implementations_utils import *
from implementations import *
from proj1_visualization import *
from proj1_cross_validation import *
from sklearn.metrics import accuracy_score


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the data into feature matrix, class labels, and event ids:

In [6]:
# Locations of the raw CSV files, relative to this notebook.
DATA_TRAIN_PATH = '../data/train.csv'
DATA_TEST_PATH = '../data/test.csv'

# load_csv_data returns three values per file; they are unpacked here as
# class labels, feature matrix and event ids.
y_trainRaw, tX_trainRaw, ids_train = load_csv_data(DATA_TRAIN_PATH)
y_testRaw, tX_testRaw, ids_test = load_csv_data(DATA_TEST_PATH)


# Data Pre-processing

In [7]:
# Preprocessing of Training
# Numerical and categorical columns are treated separately and joined again at the end.

cat_cols = [22]  # column index handed to split_numerical_categorical as categorical
full_x_train_num, full_x_train_cat = split_numerical_categorical(tX_trainRaw, cat_cols)

# Treat numerical values: mark undefined entries as NaN, then standardize.
# train_mean / train_std are kept because the test set is transformed with them later.
full_x_train_num_nan = replace_undef_val_with_nan(full_x_train_num)
full_x_train_num_nan_std, train_mean, train_std = nan_standardize_fit(full_x_train_num_nan)

# Fill the remaining NaNs, then pass the result through the IQR outlier helper.
full_x_train_num_valid_std = replace_nan_val_with_median(full_x_train_num_nan_std)
full_x_train_num_valid_std = replace_iqr_outliers(full_x_train_num_valid_std)

# Treat categorical values
full_x_train_ohe_cat = one_hot_encode(full_x_train_cat)

# Expand the numerical features with build_poly (degree 3, the same value used for
# the test set), then append the one-hot columns.
x_train_poly = build_poly(full_x_train_num_valid_std, 3)
full_x_train = np.hstack((x_train_poly, full_x_train_ohe_cat))

# Treat labels: relabel and shape as a single column.
# (The original reshaped twice; one reshape(-1, 1) is sufficient.)
full_y_train = relabel_y_non_negative(y_trainRaw).reshape(-1, 1)


In [8]:
# Carve a validation set out of the preprocessed training data.
split_ratio = 0.8  # presumably the fraction of rows kept for training — confirm in split_data
x_train, y_train, x_val, y_val = split_data(full_x_train, full_y_train, split_ratio)


In [9]:
# Preprocessing of Test
# Follows the same sequence of helper calls as the training preprocessing.

cat_cols = [22]
x_test_num, x_test_cat = split_numerical_categorical(tX_testRaw, cat_cols)

# Numerical columns: standardize with the statistics fitted on the training set
# (train_mean / train_std) rather than refitting on the test data.
x_test_num_nan = replace_undef_val_with_nan(x_test_num)
x_test_num_nan_std = nan_standardize_transform(x_test_num_nan, train_mean, train_std)

# NOTE(review): the next two helpers only receive the test matrix, so the median and
# the outlier bounds are presumably derived from the test data itself — confirm this is intended.
x_test_num_valid_std = replace_nan_val_with_median(x_test_num_nan_std)
x_test_num_valid_std = replace_iqr_outliers(x_test_num_valid_std)

# Categorical column.
# NOTE(review): encoded independently of the training set; verify both encodings
# produce the same columns.
x_test_ohe_cat = one_hot_encode(x_test_cat)

# Same polynomial degree (3) as used for the training features.
x_test_poly = build_poly(x_test_num_valid_std, 3)
x_test = np.hstack([x_test_poly, x_test_ohe_cat])


# Training and Validation 

### Logistic Regression

In [17]:
# Fit plain logistic regression on the training split.
# Best Degree was polynomial 3
max_iters = 1000
gamma = 1e-6  # passed to logistic_regression as gamma (presumably the step size — confirm)

# Size the initial weights from x_train, the matrix actually passed to the optimizer
# (the original used full_x_train here, unlike the regularized cell further down).
w_initial = np.zeros((x_train.shape[1], 1))
weights, loss_tr, losses = logistic_regression(y_train, x_train, w_initial, max_iters, gamma)

#Iteration: 0, Loss: 138629.4361119856
#Iteration: 250, Loss: 84725.66497321552
#Iteration: 500, Loss: 83314.93088593178
#Iteration: 750, Loss: 82787.21469085384
#Iteration: 1000, Loss: 82494.51500940116


In [18]:
# Score the current weights on the validation split.
# The relabelled targets get their own name, and a copy is passed to the helper, so
# y_val is never overwritten and this cell gives the same result however often it is run.
y_val_signed = relabel_y_negative(np.copy(y_val))
y_pred = predict_labels(weights, x_val)
accuracy_score(y_val_signed, y_pred)

0.81084

### Regularized Logistic Regression

In [20]:
# Regularized fit: reuses max_iters and gamma from the logistic regression cell.
n_features = x_train.shape[1]
w_initial = np.zeros((n_features, 1))
reg_param = 0.1  # last argument of reg_logistic_regression (presumably the penalty weight — confirm)
weights, _, _ = reg_logistic_regression(y_train, x_train, w_initial, max_iters, gamma, reg_param)

#Iteration: 0, Loss: 138629.4361119856
#Iteration: 250, Loss: 84726.22275867395
#Iteration: 500, Loss: 83315.7173661692
#Iteration: 750, Loss: 82788.14487469665
#Iteration: 1000, Loss: 82495.55465048923


In [21]:
# Score the regularized weights on the validation split.
# The relabelled targets get their own name, and a copy is passed to the helper, so
# y_val is never overwritten and this cell gives the same result however often it is run.
y_val_signed = relabel_y_negative(np.copy(y_val))
y_pred = predict_labels(weights, x_val)
accuracy_score(y_val_signed, y_pred)

0.81084

### Cross Validation Verification

In [25]:
# Run cross_validation_log on the training split with the same gamma / max_iters as above;
# only the second and fourth returned values are kept.
cv_param = 0.1  # third argument (presumably the regularization weight — confirm)
cv_results = cross_validation_log(x_train, y_train, cv_param, gamma, max_iters)
_, loss_val, _, accuracy_val = cv_results

#Iteration: 0, Loss: 110903.5488895897
#Iteration: 250, Loss: 68376.50218850648
#Iteration: 500, Loss: 67007.34552271449
#Iteration: 750, Loss: 66502.40935577865
#Iteration: 1000, Loss: 66226.20247440969


  


#Iteration: 0, Loss: 110903.5488895897
#Iteration: 250, Loss: 68157.55667564325
#Iteration: 500, Loss: 66787.44746189806
#Iteration: 750, Loss: 66287.0267579032
#Iteration: 1000, Loss: 66014.99700464892
#Iteration: 0, Loss: 110903.54888958971
#Iteration: 250, Loss: 68334.05624165178
#Iteration: 500, Loss: 66972.92801224273
#Iteration: 750, Loss: 66475.3421855915
#Iteration: 1000, Loss: 66204.75499245182
#Iteration: 0, Loss: 110903.54888958971
#Iteration: 250, Loss: 68459.77208337637
#Iteration: 500, Loss: 67110.57056169314
#Iteration: 750, Loss: 66612.7682498139
#Iteration: 1000, Loss: 66340.26743276972
#Iteration: 0, Loss: 110903.54888958971
#Iteration: 250, Loss: 68208.93549907033
#Iteration: 500, Loss: 66834.76146407175
#Iteration: 750, Loss: 66330.25956338288
#Iteration: 1000, Loss: 66054.89953819114


In [26]:
# Display loss_val as returned by cross_validation_log.
# NOTE(review): the recorded output of this cell is nan although the losses printed
# during the cross-validation run are finite — check how this loss is computed.
loss_val

nan

In [27]:
# Display accuracy_val as returned by cross_validation_log.
accuracy_val

0.8115349999999999

## Generate predictions and save output in CSV format for submission:

In [28]:
OUTPUT_PATH = '../results/reg_log_reg.csv'

# Predict on the preprocessed test features. The y_pred left over from the validation
# cells holds predictions for x_val, so it does not correspond to ids_test.
y_pred_test = predict_labels(weights, x_test)
assert len(y_pred_test) == len(ids_test), "need exactly one prediction per test event id"

create_csv_submission(ids_test, y_pred_test, OUTPUT_PATH)

In [29]:
# Script to delete empty rows
# Copies the submission to a new file, skipping every row whose fields are all blank.
# Handles are named src/dst so the builtin `input` is not shadowed; both files are opened
# with newline='' as the csv module recommends.
raw_path = '../results/reg_log_reg.csv'
cleaned_path = '../results/reg_log_reg_cleaned.csv'
with open(raw_path, newline='') as src, open(cleaned_path, 'w', newline='') as dst:
    writer = csv.writer(dst)
    for row in csv.reader(src):
        # A row is kept as soon as one field contains non-whitespace text.
        if any(field.strip() for field in row):
            writer.writerow(row)