In [None]:
from proj1_helpers import *
import itertools
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load Data

In [None]:
# Read both CSV files and convert each returned item to a NumPy array.
# NOTE(review): the three items are presumably (labels, features, ids), as the
# variable names suggest - confirm against `load_csv_data` in proj1_helpers.
y_train, tx_train_raw, ids_train = map(np.array, load_csv_data("data/train.csv"))
y_test, tx_test_raw, ids_test = map(np.array, load_csv_data("data/test.csv"))

## Clean Data

In [None]:
def remove_noisy_column(x):
    """Keep only the columns of `x` in which -999 entries are a strict minority.

    A column is dropped when it holds at least as many -999 entries as other
    entries (ties are dropped).

    Note: the kept columns are returned as the ROWS of the result, i.e. the
    output is the transpose of the filtered matrix. When no column is kept the
    result is an empty 1-D array.
    """
    kept_columns = []
    for column in x.T:
        n_missing = np.count_nonzero(column == -999)
        n_present = column.size - n_missing
        if n_missing < n_present:
            kept_columns.append(column)
    return np.array(kept_columns).copy()

In [None]:
def replace_noise_entries(x):
    """Return a copy of `x` where every -999 is replaced by its column mean.

    The mean of a column is computed over the entries of that column that are
    not -999. The input array itself is left untouched.
    """
    filled = x.copy()
    # `filled.T` yields views, so writing into `target_col` updates `filled`.
    for source_col, target_col in zip(x.T, filled.T):
        known_mean = np.mean(source_col[source_col != -999])
        target_col[target_col == -999] = known_mean
    return filled

In [None]:
def get_outliers(x, mean, std):
    """Return the indices of the rows of `x` that contain at least one outlier.

    An entry is an outlier when it lies more than three standard deviations
    away from the mean of its feature.

    Args:
        x: 2-D array, one row per sample and one column per feature.
        mean: per-feature means, indexable by column number.
        std: per-feature standard deviations, indexable by column number.

    Returns:
        1-D integer array of unique row indices, ordered by the first feature
        in which each row was found to be an outlier. Empty when there is none.
    """
    # Start from an empty integer array (the original `np.zeros()` call was invalid).
    outlier_indices = np.empty(0, dtype=int)
    for feature in range(x.shape[1]):
        deviation = np.absolute(x[:, feature] - mean[feature])
        row_indices = np.where(deviation > 3 * std[feature])[0]
        # Only append rows that were not already flagged by a previous feature.
        already_seen = np.isin(row_indices, outlier_indices)
        outlier_indices = np.hstack((outlier_indices, row_indices[~already_seen]))

    return outlier_indices.astype(int)

## Test accuracy

In [None]:
def test_accuracy(w, x_test, y_test):
    """Return the fraction of samples whose predicted label equals `y_test`.

    Args:
        w: weight vector passed to `predict_labels`.
        x_test: feature matrix passed to `predict_labels`.
        y_test: array of expected labels; its first dimension is the sample count.

    Returns:
        Number of matching predictions divided by `y_test.shape[0]`.
    """
    # NOTE(review): `predict_labels` comes from proj1_helpers; it presumably
    # returns one label per row of `x_test` - confirm.
    predictions = predict_labels(w, x_test)
    num_equal = (predictions == y_test).sum()
    # The original returned the undefined name `num_equals` (NameError).
    return num_equal / y_test.shape[0]

## Feature augmentation

In [None]:
def augment(x, num_important, degree):
    """Build an augmented feature matrix from `x`.

    For every row the output contains, in this order:
      1. a constant 1,
      2. the original features,
      3. the product of every `degree`-sized combination of the first
         `num_important` features,
      4. every feature raised to the powers 2, 3, ..., `degree`
         (all squares first, then all cubes, and so on).

    Args:
        x: 2-D array, one row per sample.
        num_important: number of leading columns used for the products.
        degree: size of the combinations and highest power.

    Returns:
        2-D float array with one row per row of `x` (also for an empty `x`).
    """
    x = np.asarray(x)
    important = x[:, :num_important]

    blocks = [np.ones((x.shape[0], 1)), x]

    # Interaction terms: one column per combination of important features.
    interactions = [
        np.prod(important[:, list(cols)], axis=1)
        for cols in itertools.combinations(range(important.shape[1]), degree)
    ]
    if interactions:
        blocks.append(np.column_stack(interactions))

    # Pure powers of every feature.
    blocks.extend(x ** d for d in range(2, degree + 1))

    return np.hstack(blocks).astype(float)

In [None]:
# Small hand-made matrix used to eyeball the output of `augment`.
t = np.array([
    [1, 2, 3, 4],
    [2, 2, 2, 2],
    [5, 5, 5, 5],
    [3, 2, 3, 2],
    [2, 2, 4, 4],
])
augment(t, 3, 3)


In [None]:
def MSE(y, tx, w):
    """Halved mean squared error of the linear model `tx @ w` against `y`.

    Computes sum((y - tx.w)^2 / (2 * len(y))).
    """
    residual = y - np.dot(tx, w)
    scaled_squares = np.power(residual, 2) / (2 * len(y))
    return np.sum(scaled_squares)


def MAE(y, tx, w):
    """Mean absolute error of the linear model `tx @ w` against `y`."""
    absolute_errors = np.abs(y - np.dot(tx, w))
    return np.sum(absolute_errors) / len(y)


def RMSE(y, tx, w):
    """Root mean squared error, derived from `MSE` as sqrt(2 * MSE)."""
    # MSE carries a 1/2 factor, hence the multiplication by 2 before the root.
    doubled_mse = 2 * MSE(y, tx, w)
    return np.sqrt(doubled_mse)


def compute_gradient(y, tx, w):
    """Gradient of the halved mean squared error with respect to `w`.

    Returns -tx^T (y - tx.w) / len(y).
    """
    error = y - tx.dot(w)
    return -tx.T.dot(error) / len(y)


def calculate_gradient_log(y, tx, w):
    """Gradient of the logistic negative log-likelihood with respect to `w`.

    Returns tx^T (sigmoid(tx.w) - y), summed (not averaged) over the samples.
    """
    prediction_error = sigmoid(tx.dot(w)) - y
    return tx.T.dot(prediction_error)


def sigmoid(t):
    """Logistic function 1 / (1 + exp(-t)), applied element-wise to `t`."""
    denominator = 1 + np.exp(-t)
    return 1 / denominator


def NLL(y, tx, w):
    """Negative log-likelihood of the logistic model, summed over the samples.

    Computes -[y^T log(s) + (1 - y)^T log(1 - s)] with s = sigmoid(tx.w).

    The two logarithms are evaluated through `np.logaddexp` instead of
    `log(sigmoid(.))`: the direct form produces log(0) = -inf (and nan once
    multiplied by 0) as soon as the sigmoid saturates to exactly 0 or 1.

    Args:
        y: labels (the formula treats them as 0/1 targets).
        tx: feature matrix.
        w: weight vector.

    Returns:
        The loss with singleton dimensions squeezed out.
    """
    z = tx.dot(w)
    log_pred = -np.logaddexp(0, -z)            # log(sigmoid(z))
    log_one_minus_pred = -np.logaddexp(0, z)   # log(1 - sigmoid(z))
    loss = y.T.dot(log_pred) + (1 - y).T.dot(log_one_minus_pred)
    return np.squeeze(- loss)
########################
###### ASSIGNMENT ######
########################


def least_squares_GD(y, tx, initial_w, max_iters, gamma):
    """Linear regression by full-batch gradient descent on the `MSE` loss.

    Args:
        y: target values.
        tx: feature matrix.
        initial_w: starting weight vector.
        max_iters: number of gradient steps (0 returns `initial_w` unchanged).
        gamma: step size.

    Returns:
        Tuple `(w, loss)` where `loss` is the MSE of the returned `w`.
    """
    w = initial_w
    for n_iter in range(max_iters):
        if n_iter % 100 == 0:
            # Progress trace: loss before this iteration's update.
            print(MSE(y, tx, w))
        w = w - gamma * compute_gradient(y, tx, w)
    # Evaluate the loss after the loop so that it matches the returned weights
    # (previously it described the weights before the last step) and is
    # defined even when no iteration ran.
    loss = MSE(y, tx, w)
    return (w, loss)


def least_squares_SGD(y, tx, initial_w, max_iters, gamma):
    """Linear regression by stochastic gradient descent with a batch of one sample.

    Args:
        y: target values; the first dimension indexes the samples.
        tx: feature matrix, one row per sample.
        initial_w: starting weight vector.
        max_iters: number of stochastic steps.
        gamma: step size.

    Returns:
        Tuple `(weights, loss)` where `loss` is the `MSE` over the full data
        set for the returned weights.
    """
    weights = initial_w
    n_samples = y.shape[0]
    for _ in range(max_iters):
        # Draw a single row index as a length-1 array so that the batch keeps
        # its leading dimension. The original passed the whole shape tuple as
        # the bound, which fails on older NumPy and, for a column-vector `y`,
        # always adds row 0 to the batch.
        rand_index = np.random.randint(n_samples, size=1)
        y_batch, tx_batch = y[rand_index], tx[rand_index]
        grad = compute_gradient(y_batch, tx_batch, weights)
        weights = weights - gamma*grad
    loss = MSE(y, tx, weights)
    return (weights, loss)


def least_squares(y, tx):
    """Linear regression through the normal equations (tx^T tx) w = tx^T y.

    Returns the tuple `(w, loss)` with `loss` the `MSE` of the solution.
    """
    gram = tx.T.dot(tx)
    moment = tx.T.dot(y)
    w = np.linalg.solve(gram, moment)
    return (w, MSE(y, tx, w))


def ridge_regression(y, tx, lambda_):
    """Ridge regression through the regularised normal equations.

    Solves (tx^T tx + 2 * N * lambda_ * I) w = tx^T y, with N the number of
    rows of `tx`. Returns `(w, loss)`; the loss is the plain `MSE`, without
    the penalty term.
    """
    n_samples, n_features = tx.shape[0], tx.shape[1]
    penalty = 2 * n_samples * lambda_ * np.identity(n_features)
    w = np.linalg.solve(tx.T.dot(tx) + penalty, tx.T.dot(y))
    return (w, MSE(y, tx, w))


def reg_logistic_regression(y, tx, lambda_, initial_w, max_iters, gamma):
    """L2-regularised logistic regression by full-batch gradient descent.

    Minimises NLL(w) + lambda_ * ||w||^2, reusing `calculate_gradient_log`
    and `NLL` (both summed over the samples).

    Args:
        y: labels (`NLL` treats them as 0/1 targets).
        tx: feature matrix.
        lambda_: L2 penalty strength.
        initial_w: starting weight vector.
        max_iters: number of gradient steps.
        gamma: step size.

    Returns:
        Tuple `(w, loss)`. As in `ridge_regression`, the returned loss is the
        unpenalised data term (here the NLL) of the returned weights.
    """
    w = np.asarray(initial_w, dtype=float)
    for _ in range(max_iters):
        # Gradient of the data term plus gradient of lambda_ * ||w||^2.
        grad = calculate_gradient_log(y, tx, w) + 2 * lambda_ * w
        w = w - gamma * grad
    loss = NLL(y, tx, w)
    return (w, loss)


In [None]:
def logistic_regression(y, tx, initial_w, max_iters, gamma):
    """Logistic regression by full-batch gradient descent.

    Prints the negative log-likelihood before every update.

    Args:
        y: labels (the loss formula treats them as 0/1 targets).
        tx: feature matrix.
        initial_w: starting weight vector.
        max_iters: number of gradient steps (0 returns `initial_w` unchanged).
        gamma: step size.

    Returns:
        Tuple `(weights, loss)` where `loss` is the negative log-likelihood of
        the returned weights.
    """
    def sigmoid(t):
        """apply sigmoid function on t."""
        # exp(-log(1 + exp(-t))) equals 1 / (1 + exp(-t)) but does not
        # overflow for large negative t.
        return np.exp(-np.logaddexp(0, -t))

    def calculate_gradient(gradient_y, gradient_tx, gradient_w):
        """compute the gradient of loss."""
        pred = sigmoid(gradient_tx.dot(gradient_w))
        return gradient_tx.T.dot(pred - gradient_y)

    def calculate_loss(loss_y, loss_tx, loss_w):
        """compute the cost by negative log likelihood."""
        z = loss_tx.dot(loss_w)
        # log(sigmoid(z)) and log(1 - sigmoid(z)) written with logaddexp so a
        # saturated sigmoid never yields log(0).
        log_pred = -np.logaddexp(0, -z)
        log_one_minus_pred = -np.logaddexp(0, z)
        loss = loss_y.T.dot(log_pred) + (1 - loss_y).T.dot(log_one_minus_pred)
        return np.squeeze(- loss)

    weights = initial_w
    for _ in range(max_iters):
        # Trace the loss, then take one gradient step.
        print(calculate_loss(y, tx, weights))
        weights = weights - gamma * calculate_gradient(y, tx, weights)
    # Computed after the loop so that it matches the returned weights and is
    # defined even when max_iters is 0.
    loss = calculate_loss(y, tx, weights)
    return (weights, loss)

# MAIN RUN

In [1]:
from proj1_helpers import *
import itertools
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [2]:
# Read the training CSV and convert each returned item to a NumPy array.
y, tx, ids = map(np.array, load_csv_data("data/train.csv"))

We quickly saw that the data falls into 4 categories, according to the value of the `PRI_jet_num` column. We therefore split the rows according to these categories.

In [3]:
# Index of the column whose value (0..3) defines the category of a row.
CAT_COL = 22
NUM_CATEGORIES = 4
# One array of row indices per category. The arrays have different lengths, so
# they are kept in a plain list: wrapping them in np.array(...) builds a ragged
# array, which raises a ValueError on recent NumPy versions.
rows_per_cat = [np.where(tx[:, CAT_COL] == c)[0] for c in range(NUM_CATEGORIES)]

In [4]:
# For each category: the set of column indices that contain at least one
# unknown (-999) value among the rows of that category.
unknown_cols = [
    set(np.nonzero(tx[tx[:, CAT_COL] == c] == -999)[1])
    for c in range(NUM_CATEGORIES)
]

In [5]:
# Report, per category, the share of unknown (-999) entries in every column
# that holds at least one unknown value.
for i, (rows, cols) in enumerate(zip(rows_per_cat, unknown_cols)):
    print("\nCATEGORY {}:".format(i))
    for col in cols:
        unknown_count = np.count_nonzero(tx[rows, col] == -999)
        perc = np.float64(unknown_count / len(rows)) * 100
        print("Col {} has {}% of unknown".format(col, perc))


CATEGORY 0:
Col 0 has 26.145746799715752% of unknown
Col 4 has 100.0% of unknown
Col 5 has 100.0% of unknown
Col 6 has 100.0% of unknown
Col 12 has 100.0% of unknown
Col 23 has 100.0% of unknown
Col 24 has 100.0% of unknown
Col 25 has 100.0% of unknown
Col 26 has 100.0% of unknown
Col 27 has 100.0% of unknown
Col 28 has 100.0% of unknown

CATEGORY 1:
Col 0 has 9.751882802022077% of unknown
Col 4 has 100.0% of unknown
Col 5 has 100.0% of unknown
Col 6 has 100.0% of unknown
Col 12 has 100.0% of unknown
Col 26 has 100.0% of unknown
Col 27 has 100.0% of unknown
Col 28 has 100.0% of unknown

CATEGORY 2:
Col 0 has 5.859584350622283% of unknown

CATEGORY 3:
Col 0 has 6.663959574084101% of unknown


Thus, in every category, any column other than column 0 that contains an unknown value contains only unknown values. This can be explained by the fact that some fields are not relevant to a given category and are therefore filled with "NaN", i.e. -999.

From now on, for all categories, we only keep columns with values different from -999.

In [6]:
# Column 0 is only partially unknown, so it is taken out of the sets of
# columns to drop. `discard` (unlike `remove`) does not raise when 0 is
# already absent, which keeps this cell safe to re-run.
for cat in unknown_cols:
    cat.discard(0)

We remove the PHI columns, as they are identically distributed and provide no real meaning.
We also remove column 22, as the data is now split according to it.
As categories 2 and 3 are basically the same, we treat them as a single category.

In [7]:
# Column indices dropped for every category.
# NOTE(review): hard-coded; presumably the PHI features mentioned in the text above - confirm against the data set header.
phi_cols = [15, 18, 20, 25, 28]
# Per category: sorted, de-duplicated union of that category's `unknown_cols`, the category column and `phi_cols`.
# The index arrays may come out as floats when `unknown` is empty; the later np.delete call casts them with dtype=int.
columns_to_remove = [np.unique(np.concatenate((list(unknown), [CAT_COL], phi_cols))) for unknown in unknown_cols]
# Drop the entry of the last category, which is merged into category 2 below.
columns_to_remove = columns_to_remove[:-1]
NUM_CATEGORIES = 3
# Merge the row indices of categories 2 and 3, then drop the now-redundant last entry.
# CAUTION: these reassignments are not idempotent - re-running this cell without first re-running the cells that
# build `rows_per_cat` and `unknown_cols` merges/drops one more category.
rows_per_cat[2] = np.unique(np.concatenate((rows_per_cat[2],rows_per_cat[3])))
rows_per_cat = rows_per_cat[:-1]

In [49]:
# Slice the feature matrix and the labels into one array per category.
tx_by_cat = [tx[row_idx] for row_idx in rows_per_cat]
y_by_cat = [y[row_idx] for row_idx in rows_per_cat]

## Data cleaning

In [50]:
# Drop, for each category, the columns listed in `columns_to_remove`.
# The rows are taken from the untouched `tx` rather than from the previous
# `tx_by_cat`, so re-running this cell does not delete further columns.
tx_by_cat = [
    np.delete(tx[rows], np.array(cols, dtype=int), axis=1)
    for rows, cols in zip(rows_per_cat, columns_to_remove)
]

In [51]:
# Impute the unknown (-999) entries left in the first column with the mean of
# the known values of that column, separately for each category.
# The per-category arrays are modified in place.
for cat in tx_by_cat:
    unknown_mask = cat[:, 0] == -999
    cat[unknown_mask, 0] = cat[~unknown_mask, 0].mean()

26123
7562
4429


## Features engineering