In [56]:
import math
%load_ext autoreload
%autoreload 2

import os
import csv
import numpy as np
from helpers import load_data, one_hot_encode, standardize

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [57]:
# Location of the training CSV, relative to the notebook's working directory.
data_directory = '../data'
train_dataset_path = os.path.join(data_directory, 'train.csv')

# load_data returns four values; the first one is discarded here.
(
    _,
    Y_train_public,
    feature_names,
    X_train_public,
) = load_data(train_dataset_path)

In [58]:
# Value that gets replaced with NaN below.
# NOTE(review): presumably the dataset's placeholder for missing
# measurements -- confirm against the data description.
MISSING_VALUE = -999
# Absolute tolerance for the float comparison against MISSING_VALUE.
EPSILON = 1E-4

# Boolean mask with the same shape as X_train_public: True where an entry lies
# within EPSILON of MISSING_VALUE.
mask_train = np.abs(X_train_public - MISSING_VALUE) <= EPSILON

# Overwrite the flagged entries in place. Re-running this cell is harmless:
# NaN entries never satisfy the comparison above.
X_train_public[mask_train] = np.nan

In [59]:
# Add polynomial features before standardization, so that the expanded
# columns are standardized together with everything else in the next step.
from helpers import build_poly

# Second argument handed to build_poly.
# NOTE(review): presumably the maximum polynomial degree -- confirm in helpers.
POLY_DEGREE = 3
X_train_poly = build_poly(X_train_public, POLY_DEGREE)

In [60]:
# Standardize both feature matrices (raw and polynomial).
#
# NOTE(review): standardize() only returns the per-column statistics, so it
# presumably rescales its argument in place -- confirm in helpers.
# NOTE: this cell rebinds X_train_public / X_train_poly, so running it twice
# standardizes already-standardized data. Re-run from the loading cell instead.

# Raw features: keep their statistics under their own names so they are not
# overwritten by the polynomial ones below.
column_means_public, column_stds_public = standardize(X_train_public)
# Remaining NaNs become 0.0 (nan_to_num also maps +/-inf to large finite values).
X_train_public = np.nan_to_num(X_train_public, nan=0.0)

# Polynomial features.
column_means_poly, column_stds_poly = standardize(X_train_poly)
X_train_poly = np.nan_to_num(X_train_poly, nan=0.0)

# Backward-compatible names: the original cell left these bound to the
# statistics of the polynomial matrix.
column_means, column_stds = column_means_poly, column_stds_poly

In [61]:
# Label strings as they appear in the loaded target array.
positive_sample = 's'
negative_sample = 'b'

# Encode the targets as int32 (1 where the label equals positive_sample,
# 0 otherwise) and insert a new axis at position 1.
# NOTE: this rebinds Y_train_public, so the cell must not be run twice
# without reloading the data first.
Y_train_public = (Y_train_public == positive_sample).astype(np.int32)[:, np.newaxis]

In [62]:
from helpers import build_poly, train_test_split
from implementations import reg_logistic_regression_AGDR
from metrics import LogisticRegressionLoss

print("Data with polynomial features", X_train_poly.shape)

# Decision threshold applied to the sigmoid output.
cutoff = 0.5


def fit_and_score(X, Y, label, split_ratio=0.5, cutoff=0.5,
                  lambda_=0, max_iters=100, gamma=0.01):
    """Split the data, fit logistic regression with AGDR, and report the error.

    The steps are: split (X, Y) with train_test_split, fit
    reg_logistic_regression_AGDR from an all-zero weight column, threshold the
    sigmoid of the held-out scores at `cutoff`, and print the fraction of
    held-out samples whose prediction differs from the label.

    Args:
        X: feature matrix; its second dimension sets the weight vector length.
        Y: 0/1 targets, shaped so that they align with the predictions
            (the caller passes a column vector).
        label: text inserted into the printed "Mean error <label>:" line.
        split_ratio: third positional argument of train_test_split.
            NOTE(review): whether this is the train or the test share is
            defined in helpers -- confirm there.
        cutoff: probability threshold above which a sample is predicted 1.
        lambda_, max_iters, gamma: forwarded unchanged to
            reg_logistic_regression_AGDR.

    Returns:
        Tuple (split, weights, losses, predictions, error) where split is
        (X_train, X_test, Y_train, Y_test) as returned by train_test_split,
        weights and losses are the first and third return values of
        reg_logistic_regression_AGDR (called with return_all_losses=True),
        predictions are the int32 0/1 predictions on X_test, and error is the
        misclassification rate on the held-out part.
    """
    split = train_test_split(X, Y, split_ratio)
    X_fit, X_eval, Y_fit, Y_eval = split

    weights, _, losses = reg_logistic_regression_AGDR(
        Y_fit, X_fit, lambda_=lambda_,
        initial_w=np.zeros(shape=(X_fit.shape[1], 1)),
        max_iters=max_iters, gamma=gamma, return_all_losses=True)

    scores = LogisticRegressionLoss.sigmoid(X_eval @ weights)
    predictions = (scores > cutoff).astype(np.int32)
    # For 0/1 arrays this equals sum(|Y - predictions|) / n.
    error = np.mean(Y_eval != predictions)
    print(f"Mean error {label}:", error)
    return split, weights, losses, predictions, error


# NOTE(review): each run calls train_test_split separately. If that helper
# shuffles without a fixed seed, the two errors are measured on different
# held-out sets -- confirm in helpers before comparing them closely.

# With polynomial features
poly_split, w_agdr_poly, loss_poly, predictions, error = fit_and_score(
    X_train_poly, Y_train_public, "poly", cutoff=cutoff)
X_poly_train, X_poly_test, Y_poly_train, Y_poly_test = poly_split

# Without polynomial features
nonpoly_split, w_agdr_nonpoly, loss_nonpoly, predictions, error = fit_and_score(
    X_train_public, Y_train_public, "nonpoly", cutoff=cutoff)
X_nonpoly_train, X_nonpoly_test, Y_nonpoly_train, Y_nonpoly_test = nonpoly_split

# Backward-compatible name: the original cell left `loss` (like `predictions`
# and `error`) bound to the results of the last, non-polynomial run.
loss = loss_nonpoly

Data with polynomial features (250000, 90)


100%|██████████| 100/100 [00:16<00:00,  6.12it/s]


Mean error poly: 0.26796


100%|██████████| 100/100 [00:06<00:00, 14.91it/s]


Mean error nonpoly: 0.284912
