In [4]:
from proj1_helpers import *
import itertools
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load Data

In [5]:
(y_train, tx_train_raw, ids_train) = (np.array(x) for x in load_csv_data("data/train.csv"))
(y_test, tx_test_raw, ids_test) = (np.array(x) for x in load_csv_data("data/test.csv"))

## Clean Data

In [17]:
def remove_noisy_column(x):
    """Remove columns for which there are more -999 entries than normal entries"""
    return np.array([i for i in x.T if (i == -999).sum() < (i != -999).sum()]).copy()

In [18]:
def replace_noise_entries(x):
    """Replace -999 by mean value of the column (mean computed without any -999)"""
    col_means = [np.mean(col[col!= -999]) for col in x.T]
    x_local = x.copy()
    for i,col in enumerate(x_local.T):
        col[col == -999] = col_means[i]
    return x_local

## Test accuracy

In [20]:
def test_accuracy(w, x_test, y_test):
    """returns accuracy for a specific weight vector"""
    predictions = predict_labels(w, x_test)
    num_equal = (predictions == y_test).sum()
    return num_equals/y_test.shape[0]

## Feature augmentation

In [126]:
def augment(x, num_important, degree):

    for i, line in enumerate(x):
        ones = np.array([1])
        for val in itertools.combinations(line[:num_important], r=degree):
            print(val)
        comb = np.array([np.prod(val) for val in itertools.combinations(line[:num_important], r=degree)])
        not_imp = np.array([line**d for d in range(2, degree+1)])
        complete = np.concatenate((ones, line, comb, not_imp), axis=None)
        print(ones, line, comb, not_imp, complete)
        if i == 0:
            new = np.zeros((x.shape[0], complete.shape[0]))
        new[i] = complete
    return new

In [133]:
t = np.array([[1,2,3,4],[2,2,2,2],[5,5,5,5], [3,2,3,2], [2,2,4,4]])
# np.hstack()
augment(t, 3, 3)
# t2 = np.array([1,2,3,4,5])
# print(np.power(t2, 2))


(1, 2, 3)
[1] [1 2 3 4] [6] [[ 1  4  9 16]
 [ 1  8 27 64]] [ 1  1  2  3  4  6  1  4  9 16  1  8 27 64]
(2, 2, 2)
[1] [2 2 2 2] [8] [[4 4 4 4]
 [8 8 8 8]] [1 2 2 2 2 8 4 4 4 4 8 8 8 8]
(5, 5, 5)
[1] [5 5 5 5] [125] [[ 25  25  25  25]
 [125 125 125 125]] [  1   5   5   5   5 125  25  25  25  25 125 125 125 125]
(3, 2, 3)
[1] [3 2 3 2] [18] [[ 9  4  9  4]
 [27  8 27  8]] [ 1  3  2  3  2 18  9  4  9  4 27  8 27  8]
(2, 2, 4)
[1] [2 2 4 4] [16] [[ 4  4 16 16]
 [ 8  8 64 64]] [ 1  2  2  4  4 16  4  4 16 16  8  8 64 64]


array([[  1.,   1.,   2.,   3.,   4.,   6.,   1.,   4.,   9.,  16.,   1.,
          8.,  27.,  64.],
       [  1.,   2.,   2.,   2.,   2.,   8.,   4.,   4.,   4.,   4.,   8.,
          8.,   8.,   8.],
       [  1.,   5.,   5.,   5.,   5., 125.,  25.,  25.,  25.,  25., 125.,
        125., 125., 125.],
       [  1.,   3.,   2.,   3.,   2.,  18.,   9.,   4.,   9.,   4.,  27.,
          8.,  27.,   8.],
       [  1.,   2.,   2.,   4.,   4.,  16.,   4.,   4.,  16.,  16.,   8.,
          8.,  64.,  64.]])

In [None]:
def MSE(y, tx, w):
    return np.sum(np.power(y - np.dot(tx, w), 2)/(2*len(y)))  # MSE


def MAE(y, tx, w):
    return np.sum(np.abs(y - np.dot(tx, w)))/len(y)  # MAE


def RMSE(y, tx, w):
    return np.sqrt(2*MSE(y, tx, w))


def compute_gradient(y, tx, w):
    """Compute the gradient."""
    e = y - tx.dot(w)
    grad = -tx.T.dot(e)/len(y)
    return grad


def calculate_gradient_log(y, tx, w):
    """compute the gradient of loss."""
    pred = sigmoid(tx.dot(w))
    grad = tx.T.dot(pred - y)
    return grad


def sigmoid(t):
    """apply sigmoid function on t."""
    return 1/(1+np.exp(-t))


def NLL(y, tx, w):
    """compute the cost by negative log likelihood."""
    pred = sigmoid(tx.dot(w))
    loss = y.T.dot(np.log(pred)) + (1 - y).T.dot(np.log(1 - pred))
    return np.squeeze(- loss)
########################
###### ASSIGNMENT ######
########################


def least_squares_GD(y, tx, initial_w, max_iters, gamma):
    w = initial_w
    for n_iter in range(max_iters):
        grad = compute_gradient(y, tx, w)
        loss = MSE(y, tx, w)
        if n_iter % 100 == 0:
            print(loss)
        w = w - gamma*grad
        # print("Step {}, loss is   {}".format(n_iter, loss))
    return (w, loss)


def least_squares_SGD(y, tx, initial_w, max_iters, gamma):
    weights = initial_w
    for _ in range(max_iters):
        rand_index = np.random.randint(y.shape)
        y_batch, tx_batch = y[rand_index], tx[rand_index]
        grad = compute_gradient(y_batch, tx_batch, weights)
        weights = weights - gamma*grad
    loss = MSE(y, tx, weights)
    return (weights, loss)


def least_squares(y, tx):
    a = tx.T.dot(tx)
    b = tx.T.dot(y)
    w = np.linalg.solve(a, b)
    loss = MSE(y, tx, w)
    return (w, loss)


def ridge_regression(y, tx, lambda_):
    aI = 2 * tx.shape[0] * lambda_ * np.identity(tx.shape[1])
    a = tx.T.dot(tx) + aI
    b = tx.T.dot(y)
    w = np.linalg.solve(a, b)
    loss = MSE(y, tx, w)
    return (w, loss)


def reg_logistic_regression(y, tx, lambda_, initial_w, max_iters, gamma):
    pass    


In [None]:
def logistic_regression(y, tx, initial_w, max_iters, gamma):
    def sigmoid(t):
        """apply sigmoid function on t."""
        return 1/(1+np.exp(-t))

    def calculate_gradient(gradient_y, gradient_tx, gradient_w):
        """compute the gradient of loss."""
        pred = sigmoid(gradient_tx.dot(gradient_w))
        grad = gradient_tx.T.dot(pred - gradient_y)
        return grad

    def calculate_loss(y, loss_tx, loss_w):
        """compute the cost by negative log likelihood."""
        pred = sigmoid(loss_tx.dot(loss_w))
        loss = y.T.dot(np.log(pred)) + (1 - y).T.dot(np.log(1 - pred))
        return np.squeeze(- loss)

    weights = initial_w
    for _ in range(max_iters):
        # get loss and update w.
        loss = calculate_loss(y, tx, weights)
        print(loss)
        grad = calculate_gradient(y, tx, weights)
        weights = weights - gamma * grad
    return (weights, loss)

## Feature augmentation

In [None]:
tx_train = augment(tx_train_raw)
tx_test = augment(tx_test_raw)

In [None]:
initial_w = np.zeros(tx_train.shape[1])