In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [2]:
from proj1_helpers import *
DATA_TRAIN_PATH = '../data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [22]:
def standardize(x):
    """
    Standardize the original data set.

    :param x: Data set.
    :return: Standardized data set.
    """
    mean_x = np.mean(x, axis=0)
    x = x - mean_x
    std_x = np.std(x, axis=0)
    x = x / std_x
    return x, mean_x, std_x

In [23]:
data, tX_mean, tX_std = standardize(tX)

In [25]:
for i in range(data.shape[1]):
    data[:, i][np.where(data[:,i] == -999)] = tX_mean[i]

In [28]:
data

array([[ 5.13221613e-01,  6.83319669e-02,  4.07680272e-01, ...,
         1.13612250e+00, -2.52828975e+00,  4.12510497e-01],
       [ 8.40891468e-01,  5.52504823e-01,  5.40136414e-01, ...,
         3.14052005e-03,  4.69108779e-04, -2.73819964e-01],
       [-1.50629345e+00,  3.19515553e+00,  1.09655998e+00, ...,
         3.14052005e-03,  4.69108779e-04, -2.93969845e-01],
       ...,
       [ 3.17436708e-02,  3.19316447e-01, -1.30863670e-01, ...,
         3.14052005e-03,  4.69108779e-04, -3.17017229e-01],
       [-1.21481036e-01, -8.45323970e-01, -3.02973380e-01, ...,
         3.14052005e-03,  4.69108779e-04, -7.45439413e-01],
       [-1.50629345e+00,  6.65336083e-01, -2.53522760e-01, ...,
         3.14052005e-03,  4.69108779e-04, -7.45439413e-01]])

In [30]:
tX

array([[138.47 ,  51.655,  97.827, ...,   1.24 ,  -2.475, 113.497],
       [160.937,  68.768, 103.235, ...,   0.   ,   0.   ,  46.226],
       [  0.   , 162.172, 125.953, ...,   0.   ,   0.   ,  44.251],
       ...,
       [105.457,  60.526,  75.839, ...,   0.   ,   0.   ,  41.992],
       [ 94.951,  19.362,  68.812, ...,   0.   ,   0.   ,   0.   ],
       [  0.   ,  72.756,  70.831, ...,   0.   ,   0.   ,   0.   ]])

In [38]:
def batch_iter(y, tx, batch_size, num_batches=1, shuffle=True):
    """
    Generate a minibatch iterator for a dataset.
    Takes as input two iterables (here the output desired values 'y' and the input data 'tx')
    Outputs an iterator which gives mini-batches of `batch_size` matching elements from `y` and `tx`.
    Data can be randomly shuffled to avoid ordering in the original data messing with the randomness of the minibatches.
    Example of use :
    for minibatch_y, minibatch_tx in batch_iter(y, tx, 32):
        <DO-SOMETHING>

    :param y: Labels.
    :param tx: Features.
    :param batch_size: Size of the batch.
    :param num_batches: Number of batches.
    :param shuffle: Should the data be shuffled?
    :return: Batch iterator.
    """
    data_size = len(y)

    if shuffle:
        shuffle_indices = np.random.permutation(np.arange(data_size))
        shuffled_y = y[shuffle_indices]
        shuffled_tx = tx[shuffle_indices]
    else:
        shuffled_y = y
        shuffled_tx = tx
    for batch_num in range(num_batches):
        start_index = batch_num * batch_size
        end_index = min((batch_num + 1) * batch_size, data_size)
        if start_index != end_index:
            yield shuffled_y[start_index:end_index], shuffled_tx[start_index:end_index]


In [27]:
def gradient_descent(y, tx, initial_w, max_iters, gamma):
    """
    Gradient descent algorithm.

    :param y: Labels.
    :param tx: Features.
    :param initial_w: Initial weight vector.
    :param max_iters: Number of steps to run.
    :param gamma: Step-size.
    :return:`(w, loss)`, with `w` the last weight vector of the method, and `loss` the corresponding loss value (cost function).
    """
    # Define parameters to store w and loss
    w = initial_w
    prev_err = float('inf')
    for n_iter in range(max_iters):
        # Transpose the features
        y_pred = np.sign(tx.dot(w))
        
        loss = y_pred - y
        
        error = (loss ** 2).mean()
        
        if error >= prev_err:
            error = prev_err
            break
        else:
            prev_err = error
        
        grad = (1. / tx.shape[0]) * tx.T.dot(loss)
        
        w = w - gamma * grad
        
        print("Iter {} of {}. Error: {}".format(n_iter + 1, max_iters, error))
    return w, error

In [28]:
initial_w = np.random.random(tX.shape[1])

In [29]:
loss, w = gradient_descent(y, tX, initial_w, 150, .00001)

Iter 1 of 150. Error: 1.492208
Iter 2 of 150. Error: 1.492208
Iter 3 of 150. Error: 1.492192
Iter 4 of 150. Error: 1.492176
Iter 5 of 150. Error: 1.492176
Iter 6 of 150. Error: 1.49216
Iter 7 of 150. Error: 1.492144
Iter 8 of 150. Error: 1.492112
Iter 9 of 150. Error: 1.492064
Iter 10 of 150. Error: 1.492064
Iter 11 of 150. Error: 1.492048
Iter 12 of 150. Error: 1.492032
Iter 13 of 150. Error: 1.492016
Iter 14 of 150. Error: 1.492016
Iter 15 of 150. Error: 1.492016
Iter 16 of 150. Error: 1.492016
Iter 17 of 150. Error: 1.491968
Iter 18 of 150. Error: 1.491952
Iter 19 of 150. Error: 1.491936
Iter 20 of 150. Error: 1.49192
Iter 21 of 150. Error: 1.491904
Iter 22 of 150. Error: 1.491888
Iter 23 of 150. Error: 1.49184
Iter 24 of 150. Error: 1.491808
Iter 25 of 150. Error: 1.491808
Iter 26 of 150. Error: 1.491808
Iter 27 of 150. Error: 1.49176
Iter 28 of 150. Error: 1.491712
Iter 29 of 150. Error: 1.49168
Iter 30 of 150. Error: 1.491648
Iter 31 of 150. Error: 1.491648
Iter 32 of 150. Error:

In [30]:
loss

1.484144

### Validation 

In [66]:
y_pred = np.sign(tX.dot(w))

In [67]:
TP = np.sum(np.logical_and(y_pred == 1, y == 1))
TN = np.sum(np.logical_and(y_pred == -1, y == -1))
FP = np.sum(np.logical_and(y_pred == 1, y == -1))
FN = np.sum(np.logical_and(y_pred == -1, y == 1))
print(TP, TN, FP, FN)

32400 127131 37202 53267


In [68]:
precision = TP / (TP + FP)
precision

0.4655038648314704

In [69]:
recall = TP / (TP + FN)
recall

0.37820864510254826

In [70]:
FPR = FP / (FP + TN)
FPR

0.2263817979346814

In [71]:
accuracy = (TP + TN) / (TP + TN + FP + FN)
accuracy

0.638124

In [72]:
# Best value at 1
F1 = 2 * (precision * recall) / (precision + recall)
F1

0.4173402288930824

### Stoch Gradient Descent

In [52]:
def stochastic_gradient_descent(y, tx, initial_w, batch_size, max_iters, gamma):
    """
    Stochastic gradient descent algorithm.

    :param y: Predictions.
    :param tx: Target.
    :param initial_w: Initial weight vector.
    :param batch_size: Size of the batch.
    :param max_iters: Number of steps to run.
    :param gamma: Step-size.
    :return:`(w, loss)`, with `w` the last weight vector of the method, and `loss` the corresponding loss value (cost function).
    """
    w = initial_w
    prev_err = float('inf')
    for n_iter in range(max_iters):
        for minibatch_y, minibatch_tx in batch_iter(y, tx, batch_size=batch_size):
            y_pred = np.sign(minibatch_tx.dot(w))
        
            loss = y_pred - minibatch_y

            error = (loss ** 2).mean()

            if error >= prev_err:
                error = prev_err
                break
            else:
                prev_err = error

            grad = (1. / minibatch_tx.shape[0]) * minibatch_tx.T.dot(loss)

            w = w - gamma * grad
        
        print("Iter {} of {}. Error: {}".format(n_iter + 1, max_iters, error))
    return w, loss


In [53]:
initial_w = np.random.random(tX.shape[1])

In [65]:
w, loss = stochastic_gradient_descent(y, tX, initial_w, 256, 150, .0001)

Iter 1 of 150. Error: 1.609375
Iter 2 of 150. Error: 1.46875
Iter 3 of 150. Error: 1.390625
Iter 4 of 150. Error: 1.375
Iter 5 of 150. Error: 1.375
Iter 6 of 150. Error: 1.375
Iter 7 of 150. Error: 1.375
Iter 8 of 150. Error: 1.375
Iter 9 of 150. Error: 1.375
Iter 10 of 150. Error: 1.375
Iter 11 of 150. Error: 1.375
Iter 12 of 150. Error: 1.375
Iter 13 of 150. Error: 1.375
Iter 14 of 150. Error: 1.375
Iter 15 of 150. Error: 1.375
Iter 16 of 150. Error: 1.375
Iter 17 of 150. Error: 1.375
Iter 18 of 150. Error: 1.296875
Iter 19 of 150. Error: 1.25
Iter 20 of 150. Error: 1.25
Iter 21 of 150. Error: 1.25
Iter 22 of 150. Error: 1.25
Iter 23 of 150. Error: 1.25
Iter 24 of 150. Error: 1.25
Iter 25 of 150. Error: 1.25
Iter 26 of 150. Error: 1.25
Iter 27 of 150. Error: 1.25
Iter 28 of 150. Error: 1.25
Iter 29 of 150. Error: 1.25
Iter 30 of 150. Error: 1.25
Iter 31 of 150. Error: 1.25
Iter 32 of 150. Error: 1.25
Iter 33 of 150. Error: 1.171875
Iter 34 of 150. Error: 1.171875
Iter 35 of 150. Erro

In [74]:
DATA_TEST_PATH = '../data/test.csv'
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [75]:
OUTPUT_PATH = '../data/output.csv'
y_pred = predict_labels(w, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)