## Imports

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from implementations import *
%load_ext autoreload
%autoreload 2

## Assignment

### Pre-processing

In [None]:
# Loading is slow, so it is kept in its own cell and does not need to be re-run every time.
(y, x) = load_data(train=True)  # first call: unpacked as (y, x)
(y_indexes, x_test) = load_data(train=False)  # second call: unpacked as (y_indexes, x_test)

In [None]:
# Hold out part of the labelled data for evaluation (0.8 is the split ratio passed to split_data).
# `np.random.seed()` returns None, so the previous call handed None to split_data and the
# split was never actually seeded; an explicit integer makes it reproducible across restarts.
SPLIT_SEED = 1
x_tr, x_te, y_tr, y_te = split_data(x, y, 0.8, SPLIT_SEED)

# Handle invalid values.
# NOTE(review): presumably each call imputes with the column means of the array it is given,
# so x_te is imputed from its own means - confirm against replace_min_999_by_col_mean.
x_tr = replace_min_999_by_col_mean(x_tr)
x_te = replace_min_999_by_col_mean(x_te)

# Estimate the scaling statistics on the training split only and apply those same statistics
# to the held-out split, so both are expressed in the feature space the weights are fit in.
# Assumes standardize() returns (scaled_x, column_mean, column_std) with
# scaled_x == (x - column_mean) / column_std - TODO confirm against implementations.standardize.
x_tr, mean_x_tr, std_x_tr = standardize(x_tr)
x_te = (x_te - mean_x_tr) / std_x_tr
mean_x_te, std_x_te = mean_x_tr, std_x_tr  # names kept for later cells: the statistics applied to x_te

tx_tr = build_poly(x_tr, 2)  # build polynomial expansion (with bias)
tx_te = build_poly(x_te, 2)

### Linear regression using gradient descent

In [None]:
# Each "epoch" runs `step` GD iterations, repeated for `epochs` epochs. This is the same as
# one run of epochs*step iterations, but lets us print and record intermediate losses.
epochs, step, gamma = 100, 100, 1e-4
# One weight per column of the expanded feature matrix, instead of a hard-coded 61 that
# silently breaks as soon as the polynomial degree or the feature count changes.
w_GD = np.zeros(tx_tr.shape[1])
loss_tr_GD = []
loss_te_GD = []
for i in range(epochs):
    # Continue from the weights reached at the end of the previous epoch.
    w_GD, loss_tr = mean_squared_error_gd(y_tr, tx_tr, w_GD, step, gamma)
    loss_te = compute_mse(y_te, tx_te, w_GD)
    loss_tr_GD.append(loss_tr)
    loss_te_GD.append(loss_te)
    print(f"Epoch {i} : Training loss: {loss_tr} Test loss: {loss_te}")

#### Plotting the resulting losses

In [None]:
# Training vs. held-out loss, one point per epoch of the GD loop.
# Labels and a legend are required for the figure to be readable on its own.
plt.plot(range(len(loss_tr_GD)), loss_tr_GD, c='red', label='Training loss')
plt.plot(range(len(loss_te_GD)), loss_te_GD, c='blue', label='Test loss')
plt.xlabel('Epoch')
plt.ylabel('MSE loss')
plt.title('Linear regression (GD)')
plt.legend()
plt.show()  # also suppresses the Line2D repr the bare plot call would echo

#### Calculating the accuracy on the test set (with predictions = -1 or 1)

In [None]:
# Score the GD weights on the held-out split; the last argument (0) is presumably the
# decision threshold.
# NOTE(review): the submission predictions for these weights are built with a 0.4 threshold -
# confirm which label encoding compute_accuracy expects and whether 0 is intended here.
accuracy = compute_accuracy(tx_te, y_te, w_GD, 0)
accuracy_pct = accuracy*100
print(f"Accuracy for these w: {accuracy_pct}%")

In [None]:
# Build the submission features under new names instead of rebinding `x_test`, so this cell
# can be re-run and the data loaded earlier stays available (assuming the helper returns a
# new array rather than modifying its argument - TODO confirm).
x_test_clean = replace_min_999_by_col_mean(x_test)  # Handle invalid values

# Scale with the *training* statistics: w_GD was fit on features standardized with
# mean_x_tr / std_x_tr, so the submission data must go through the same transform rather
# than being re-centred on its own mean/std.
# Assumes standardize() computes (x - mean) / std column-wise - TODO confirm.
x_test_std = (x_test_clean - mean_x_tr) / std_x_tr
mean_x_test, std_x_test = mean_x_tr, std_x_tr  # names kept for compatibility: the statistics applied

tx_test = build_poly(x_test_std, 2)  # build polynomial expansion (with bias)

# TODO: threshold of 0.4 found experimentally - mention in report
y_hat = build_prediction(tx_test, w_GD, 0.4)
write_to_csv(np.column_stack((y_indexes, y_hat)), "test_output.csv")

### Linear regression using stochastic gradient descent

In [None]:
# Same epoch scheme as the GD cell: `step` SGD iterations per epoch, for `epochs` epochs,
# printing the training and held-out loss after each epoch.
epochs, step, gamma = 100, 100, 1e-4
# Size the weight vector from the expanded feature matrix rather than hard-coding 61.
w_SGD = np.zeros(tx_tr.shape[1])
for i in range(epochs):
    # Continue from the weights reached at the end of the previous epoch.
    w_SGD, loss_tr = mean_squared_error_sgd(y_tr, tx_tr, w_SGD, step, gamma)
    loss_te = compute_mse(y_te, tx_te, w_SGD)
    print(f"Epoch {i} : Training loss: {loss_tr} Test loss: {loss_te}")

#### Calculating the accuracy on the test set (with predictions = 0 or 1)

In [None]:
# Continuous scores on the held-out split, thresholded into {0, 1} predictions.
y_hat_cont = tx_te@w_SGD
# Vectorized thresholding: replaces a per-element Python loop and keeps y_hat as an ndarray.
y_hat = np.where(y_hat_cont > 0.40, 1, 0)
# Fraction of matching labels; relies on y_te being encoded as 0/1 like y_hat.
accuracy = 1-np.abs(y_te-y_hat).mean()
print(f"Accuracy for these w: {accuracy*100}%")

### Least squares

In [None]:
# Fit on the training split; least_squares hands back the weights together with the training loss.
(w_LS, loss_tr) = least_squares(y_tr, tx_tr)
# Evaluate the same weights on the held-out split.
loss_te = compute_mse(y_te, tx_te, w_LS)
print(f"Training loss: {loss_tr}\nTest loss: {loss_te}")

In [None]:
# Continuous scores on the held-out split, thresholded into {0, 1} predictions.
y_hat_cont = tx_te@w_LS
# Vectorized thresholding: replaces a per-element Python loop and keeps y_hat as an ndarray.
y_hat = np.where(y_hat_cont > 0.45, 1, 0)
# Fraction of matching labels; relies on y_te being encoded as 0/1 like y_hat.
accuracy = 1-np.abs(y_te-y_hat).mean()
print(f"Accuracy for these w: {accuracy*100}%")

### Ridge regression

In [None]:
# Ridge regression with a fixed regularization strength.
lambda_ = 0.1
w_REG, loss_tr = ridge_regression(y_tr, tx_tr, lambda_)
# Report the held-out loss as well, in the same format as the least-squares cell, so the
# two fits can be compared directly.
loss_te = compute_mse(y_te, tx_te, w_REG)
print(f"Training loss: {loss_tr}\nTest loss: {loss_te}")

In [None]:
# Continuous scores on the held-out split, thresholded into {0, 1} predictions.
y_hat_cont = tx_te@w_REG
# Vectorized thresholding: replaces a per-element Python loop and keeps y_hat as an ndarray.
y_hat = np.where(y_hat_cont > 0.44, 1, 0)
# Fraction of matching labels; relies on y_te being encoded as 0/1 like y_hat.
accuracy = 1-np.abs(y_te-y_hat).mean()
print(f"Accuracy for these w: {accuracy*100}%")

### Logistic regression using gradient descent or SGD (y ∈ {0, 1})

In [None]:
# Each "epoch" runs `step` GD iterations, repeated for `epochs` epochs. This is the same as
# one run of epochs*step iterations, but lets us print and record intermediate losses.
epochs, step, gamma = 100, 50, 1e-2
# Size the weight vector from the expanded feature matrix rather than hard-coding 61.
w_GD_log = np.zeros(tx_tr.shape[1])
loss_tr_GD_log = []
loss_te_GD_log = []
for i in range(epochs):
    # Continue from the weights reached at the end of the previous epoch.
    w_GD_log, loss_tr = logistic_regression(y_tr, tx_tr, w_GD_log, step, gamma) # TODO this leads to some NaNs
    loss_te = compute_log_loss(y_te, tx_te, w_GD_log)
    loss_tr_GD_log.append(loss_tr)
    loss_te_GD_log.append(loss_te)
    print(f"Epoch {i} : Training loss: {loss_tr} Test loss: {loss_te}")

#### Plotting the resulting losses

In [None]:
# Training vs. held-out loss, one point per epoch of the logistic-regression loop.
# Labels and a legend are required for the figure to be readable on its own.
plt.plot(range(len(loss_tr_GD_log)), loss_tr_GD_log, c='red', label='Training loss')
plt.plot(range(len(loss_te_GD_log)), loss_te_GD_log, c='blue', label='Test loss')
plt.xlabel('Epoch')
plt.ylabel('Log loss')
plt.title('Logistic regression (GD)')
plt.legend()
plt.show()  # also suppresses the Line2D repr the bare plot call would echo

#### Calculating the accuracy on the test set (with predictions = 0 or 1)

In [None]:
# Score the logistic-regression weights on the held-out split with a 0.5 threshold.
accuracy = compute_accuracy_log(tx_te, y_te, w_GD_log, threshold=0.5)
accuracy_pct = accuracy*100
print(f"Accuracy for these w: {accuracy_pct}%")

In [None]:
# Clean, standardize and expand the submission features, then write thresholded predictions.
# NOTE(review): this cell rebinds x_test to its processed version and writes to the same
# "test_output.csv" path as the GD submission cell, so whichever runs last wins.
x_test, mean_x_test, std_x_test = standardize(
    replace_min_999_by_col_mean(x_test)  # Handle invalid values before standardizing
)

tx_test = build_poly(x_test, 2)

# minus_one=True presumably maps the negative class to -1 for the submission format - TODO confirm.
y_hat = build_prediction_log(tx_test, w_GD_log, threshold=0.5, minus_one = True)
write_to_csv(np.column_stack((y_indexes, y_hat)), "test_output.csv")

### Regularized logistic regression using gradient descent or SGD ($y \in \{0, 1\}$, with regularization term $\lambda \lVert w \rVert^2$)