In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [292]:
from proj1_helpers import *
from implementations import *

In [68]:
# Location of the training CSV, relative to the notebook's working directory.
DATA_TRAIN_PATH = '../data/train.csv'
# load_csv_data returns three values, bound here as class labels (y),
# feature matrix (tX) and event ids (ids).
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [375]:
# Work on a copy so that later changes to X do not affect the loaded tX.
X = tX.copy()

In [402]:
# Constants shared by the cells that follow

# number of features, i.e. number of columns of X
M = X.shape[1] # number of features
# value marking an undefined entry; remove_undef_rows drops every row containing it
UNDEF = -999.
# column of X holding PRI_jet_num, the feature the samples are grouped by
JET_NUM_INDEX = 22 # the index of the PRI_jet_num feature

## Do your crazy machine learning thing here :) ...

In [403]:
# group data by PRI_jet_num
def separate_data(X, y, val):
    """Return the samples whose PRI_jet_num column equals ``val``.

    Parameters
    ----------
    X : 2-D array of samples; column ``JET_NUM_INDEX`` is compared to ``val``.
    y : array indexed in parallel with the rows of ``X``.
    val : value the PRI_jet_num column must equal for a row to be kept.

    Returns
    -------
    tuple ``(X_group, y_group)`` with the matching rows of ``X`` and the
    matching entries of ``y``.
    """
    in_group = np.equal(X[:, JET_NUM_INDEX], val)
    group_x = X[in_group]
    group_y = y[in_group]
    return group_x, group_y

In [489]:
# remove rows that still have undefined values
def remove_undef_rows(X,y,info=False):
    """Drop every sample that contains the ``UNDEF`` marker in any column.

    Parameters
    ----------
    X : 2-D array of samples (one row per sample).
    y : array indexed in parallel with the rows of ``X``.
    info : when true, print how many samples were removed.

    Returns
    -------
    tuple ``(x_clean, y_clean)``: new arrays holding only the rows of ``X``
    without any ``UNDEF`` entry and the corresponding entries of ``y``.
    The inputs are not modified.
    """
    # One vectorised comparison instead of a Python-level loop over rows.
    has_undef = np.any(X == UNDEF, axis=1)
    keep = np.logical_not(has_undef)
    x_clean = X[keep]
    # Indexing along the first axis keeps y aligned with X whatever its shape.
    y_clean = y[keep]
    if info:
        print('Removed',int(np.count_nonzero(has_undef)),'samples containing',UNDEF)
    return x_clean, y_clean

In [426]:
# compute weights for columns we selected
def regress(X, y, mask):
    """Fit weights on the columns of ``X`` listed in ``mask``.

    The columns are selected first and rows are cleaned afterwards, so a
    sample is only discarded when one of the *selected* columns holds an
    undefined value.

    Parameters
    ----------
    X : 2-D array of samples.
    y : array indexed in parallel with the rows of ``X``.
    mask : sequence of column indices to keep.

    Returns
    -------
    The first of the two values returned by ``least_squares(y, x)``, i.e. one
    weight per selected column.
    """
    kept_x, kept_y = remove_undef_rows(np.take(X, indices=mask, axis=1), y)
    weights, _unused = least_squares(kept_y, kept_x)
    return weights

In [427]:
# put 0 weights where the removed columns are, so we can simply multiply with test set without removing columns again
def rescale(w, mask, M=M):
    """Scatter the weights ``w`` into a zero vector of length ``M``.

    ``w[k]`` is written at position ``mask[k]``; every position not listed in
    ``mask`` stays 0, so the result can be multiplied with samples that still
    contain all ``M`` columns.

    Parameters
    ----------
    w : weights, one per entry of ``mask``.
    mask : sequence of column indices the weights belong to.
    M : length of the returned vector. The default is the value the global
        ``M`` had when this function was defined; re-run this cell after
        changing ``M``.

    Returns
    -------
    1-D float array of length ``M``.

    Raises
    ------
    ValueError
        If ``w`` and ``mask`` do not contain the same number of entries
        (the previous loop silently ignored surplus weights).
    """
    columns = np.asarray(mask, dtype=int)
    weights = np.ravel(w)
    if weights.size != columns.size:
        raise ValueError(
            'rescale expects one weight per mask entry, got %d weights for %d columns'
            % (weights.size, columns.size)
        )
    out = np.zeros((M,))
    out[columns] = weights
    return out

In [428]:
def predict(ws, X):
    """Label every sample with the weight vector of its PRI_jet_num group.

    Parameters
    ----------
    ws : sequence of weight vectors, indexed by the integer value found in
        column ``JET_NUM_INDEX`` of each sample.
    X : 2-D array of samples containing all feature columns.

    Returns
    -------
    1-D float array with one label per sample: ``1`` where the score
    ``sample @ ws[group]`` is strictly positive, ``-1`` where it is zero or
    negative.
    """
    X = np.asarray(X)
    if X.size == 0:
        return np.array([])

    # Same truncation as int(sample[JET_NUM_INDEX]) applied per sample.
    groups = X[:, JET_NUM_INDEX].astype(int)
    labels = np.empty(X.shape[0], dtype=float)
    # One matrix-vector product per group instead of one dot product per row.
    for group in np.unique(groups):
        rows = groups == group
        labels[rows] = X[rows] @ ws[group]

    labels[labels > 0] = 1
    labels[labels <= 0] = -1
    return labels

In [467]:
def accuracy(predictions, actual):
    """Return the fraction of entries of ``predictions`` equal to ``actual``.

    Both inputs are converted to flat arrays first. Plain lists are therefore
    compared element by element, and a ``(n,)`` vector compared with a
    ``(n, 1)`` column is not broadcast into an ``(n, n)`` matrix.

    Parameters
    ----------
    predictions : predicted labels.
    actual : reference labels, same number of entries as ``predictions``.

    Returns
    -------
    Fraction of matching entries, or ``nan`` when there are no entries.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of entries.
    """
    predicted = np.ravel(predictions)
    expected = np.ravel(actual)
    if predicted.size != expected.size:
        raise ValueError(
            'accuracy expects inputs of equal size, got %d and %d'
            % (predicted.size, expected.size)
        )
    if expected.size == 0:
        # Nothing to score; mirror the 0/0 result without a runtime warning.
        return np.nan
    return np.mean(predicted == expected)

In [725]:
# Feature columns used by the model of each PRI_jet_num group (0 to 3).
mask0 = [0, 1, 2, 3, 7, 10, 11] + list(range(13, 22))
mask1 = [0, 1, 2, 3] + list(range(7, 12)) + list(range(13, 22)) + [23, 24, 25, 29]
# Groups 2 and 3 use every column except index 22 (the PRI_jet_num column itself).
mask2 = list(range(22)) + list(range(23, 30))
mask3 = list(mask2)

In [726]:
# Split X and y into a training part and a held-out test part.
# NOTE(review): the recorded output of the accuracy cell reports 225000 test
# samples, so 0.1 looks like the *training* fraction - confirm against split_data.
# NOTE(review): no seed is passed here (split_data accepts seed=...), so the
# split presumably depends on split_data's default - verify for reproducibility.
train_x, train_y, test_x, test_y = split_data(X, y, 0.1) # split into train and test

In [727]:
# One (features, labels) pair per PRI_jet_num value 0..3, taken from the training split.
(x0, y0), (x1, y1), (x2, y2), (x3, y3) = (
    separate_data(train_x, train_y, jet_num) for jet_num in range(4)
)

In [728]:
# Fit one weight vector per group on its own column selection, then expand it
# back to full length so it can be applied to samples with all columns.
ws = [
    rescale(regress(group_x, group_y, columns), columns)
    for group_x, group_y, columns in (
        (x0, y0, mask0),
        (x1, y1, mask1),
        (x2, y2, mask2),
        (x3, y3, mask3),
    )
]
w0, w1, w2, w3 = ws

In [729]:
# Check accuracy on the held-out test split.
# `predictions` is also read by the per-group error analysis cell, so keep the name.
predictions = predict(ws, test_x)
print('Accuracy on',len(test_y),'test samples is',accuracy(predictions,test_y))

Accuracy on 225000 test samples is 0.7529644444444444


In [730]:
# Check accuracy on the training split, for comparison with the test accuracy.
predictions_0 = predict(ws, train_x)
print('Accuracy on train samples is',accuracy(predictions_0,train_y))

Accuracy on train samples is 0.75168


In [731]:
# Split the held-out samples into misclassified and correctly classified ones.
misclassified = predictions != test_y
vals = test_x[misclassified]
correct = test_x[predictions == test_y]

# For each PRI_jet_num value 0..3: number of misclassified samples and total
# number of test samples. The column index comes from JET_NUM_INDEX so it
# cannot drift from the one used for grouping.
errors = np.array([np.sum(vals[:, JET_NUM_INDEX] == jet_num) for jet_num in range(4)])
total_in_group = np.array([np.sum(test_x[:, JET_NUM_INDEX] == jet_num) for jet_num in range(4)])

In [732]:
# Misclassification rate of each PRI_jet_num group (0..3), in percent.
errors / total_in_group * 100

array([18.04029711, 30.4902003 , 27.66871571, 27.76662988])

# Generate predictions and save output in CSV format for submission:

In [46]:
# Location of the test CSV, relative to the notebook's working directory.
DATA_TEST_PATH = '../data/test.csv'
# Same loader as for the training file; its first return value is discarded here.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [399]:
# Destination of the submission file.
OUTPUT_PATH = '../output/predictions_2.csv'
# Predict with whatever per-group weights are currently bound to `ws`.
# NOTE(review): this cell's recorded execution count (399) is lower than the
# training cell's (728), so the saved CSV may predate the current `ws` -
# re-run this cell after training before submitting.
y_pred = predict(ws, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

### Unused stuff, keep it here just in case

In [319]:
def replace_undefined_with_mean(X, wrong_value):
    """Replace every ``wrong_value`` entry by the mean of its column.

    The column mean is computed over the entries that are not ``wrong_value``
    (and not NaN). A column made up entirely of ``wrong_value`` has no such
    entries, so it ends up filled with NaN.

    Parameters
    ----------
    X : 2-D array-like of samples; it is not modified.
    wrong_value : marker of an undefined entry.

    Returns
    -------
    New float array of the same shape as ``X``.
    """
    X = np.asarray(X, dtype=float)
    undefined = X == wrong_value
    means = np.nanmean(np.where(undefined, np.nan, X), axis=0)
    # Selecting with where (rather than mask*means + ~mask*row) keeps valid
    # entries untouched even when a column mean is infinite or NaN.
    return np.where(undefined, means, X)

In [320]:
def normalize_data(X):
    """Standardise each column of ``X`` to zero mean and unit standard deviation.

    Columns whose standard deviation is 0 (constant columns) are only
    centred, so they become all zeros instead of NaN from a 0/0 division.

    Parameters
    ----------
    X : array-like of samples, one row per sample.

    Returns
    -------
    New float array of the same shape as ``X``.
    """
    X = np.asarray(X, dtype=float)
    centered = X - np.mean(X, axis=0)
    spread = np.std(X, axis=0)
    # Dividing a constant (already zero-centred) column by 1 leaves it at 0.
    safe_spread = np.where(spread == 0, 1.0, spread)
    return centered / safe_spread

In [None]:
# Repeat the split / fit / score pipeline for seeds 0..99 and record the test
# accuracy of each run; the final expression is the index (= seed) of the best run.
# NOTE(review): running this cell rebinds train_x, train_y, test_x, test_y,
# x0..x3, y0..y3, w0..w3, ws and predictions, which other cells read.
accs = []
for i in range(100):
    train_x, train_y, test_x, test_y = split_data(X, y, 0.6, seed=i) # split into train and test
    (x0, y0), (x1, y1), (x2, y2), (x3, y3) = (
        separate_data(train_x, train_y, jet_num) for jet_num in range(4)
    )
    ws = [
        rescale(regress(group_x, group_y, columns), columns)
        for group_x, group_y, columns in (
            (x0, y0, mask0),
            (x1, y1, mask1),
            (x2, y2, mask2),
            (x3, y3, mask3),
        )
    ]
    w0, w1, w2, w3 = ws
    predictions = predict(ws, test_x)
    accs.append(accuracy(predictions, test_y))
    print('\r%d' % i,end='')
np.argmax(accs)