In [2]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [3]:
# Wildcard import of the project helper module; presumably the source of
# load_csv_data used below — TODO confirm and consider importing names explicitly.
from proj1_helpers import *
# Location of the training CSV, relative to the notebook's working directory.
DATA_TRAIN_PATH = '../data/train.csv'
# Unpack the loader's three return values: class labels (y), feature matrix (tX)
# and event ids (ids).
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Do your crazy machine learning thing here :) ...

### Analysis

In [4]:
# tX is a 2-D matrix: one row per sample, one column per feature.
num_data = tX.shape[0]
num_features = tX.shape[1]
print(
    "We have {} data with {} features.".format(num_data, num_features)
)

We have 250000 data with 30 features.


Here is an example data point:

In [5]:
# Show the first sample: its event id, its class label and its raw feature vector.
print(
    "ID: {}\nClass label: {}\nFeatures: {}".format(
        ids[0],
        y[0],
        tX[0],
    )
)

ID: 100000
Class label: 1.0
Features: [ 1.38470e+02  5.16550e+01  9.78270e+01  2.79800e+01  9.10000e-01
  1.24711e+02  2.66600e+00  3.06400e+00  4.19280e+01  1.97760e+02
  1.58200e+00  1.39600e+00  2.00000e-01  3.26380e+01  1.01700e+00
  3.81000e-01  5.16260e+01  2.27300e+00 -2.41400e+00  1.68240e+01
 -2.77000e-01  2.58733e+02  2.00000e+00  6.74350e+01  2.15000e+00
  4.44000e-01  4.60620e+01  1.24000e+00 -2.47500e+00  1.13497e+02]


In [6]:
# Count samples per class: negative labels are background, positive labels are bosons.
num_background = np.sum(y < 0)
num_boson = np.sum(y > 0)
# Percentages are truncated (not rounded) by int(), so they may not add up to 100.
print(
    "We have about {} ({}%) background data for {} ({}%) bosons in the training data.".format(
        num_background,
        int(100 * num_background / num_data),
        num_boson,
        int(100 * num_boson / num_data),
    )
)

We have about 164333 (65%) background data for 85667 (34%) bosons in the training data.


In [7]:
# Fixed seed so the shuffled 80/20 split is identical on every Restart & Run All.
# A dedicated RandomState is used so the global NumPy RNG state is left untouched.
SPLIT_SEED = 42
indices = np.random.RandomState(SPLIT_SEED).permutation(num_data)
# The first 80% of the shuffled indices is the fitting portion; the remaining 20%
# is held out for validation.
train_indices, validation_indices = np.split(indices, [int(0.8 * num_data)])
# Backward-compatible alias: the 80% portion was originally (misleadingly) named
# test_indices, so keep that name available for any cell that still refers to it.
test_indices = train_indices

In [8]:
# Column-wise (per-feature) mean and standard deviation of the raw feature matrix.
features_mean = tX.mean(axis=0)
features_std = tX.std(axis=0)
# One array per line, exactly as two consecutive print() calls would show them.
print(features_mean, features_std, sep="\n")

[-4.90230794e+01  4.92398193e+01  8.11819816e+01  5.78959617e+01
 -7.08420675e+02 -6.01237051e+02 -7.09356603e+02  2.37309984e+00
  1.89173324e+01  1.58432217e+02  1.43760943e+00 -1.28304708e-01
 -7.08985189e+02  3.87074191e+01 -1.09730480e-02 -8.17107200e-03
  4.66602072e+01 -1.95074680e-02  4.35429640e-02  4.17172345e+01
 -1.01191920e-02  2.09797178e+02  9.79176000e-01 -3.48329567e+02
 -3.99254314e+02 -3.99259788e+02 -6.92381204e+02 -7.09121609e+02
 -7.09118631e+02  7.30645914e+01]
[406.34483401  35.34481492  40.82860887  63.65555431 454.47965615
 657.97098617 453.01897051   0.78290955  22.2734492  115.70588372
   0.84474126   1.19358245 453.59581401  22.41203584   1.21407622
   1.81675941  22.06487828   1.26497962   1.81660763  32.8946274
   1.81221908 126.49925272   0.97742435 532.96172343 489.33730734
 489.33290465 479.87453609 453.38371728 453.3881105   98.01546598]


### Data preparation

In [9]:
# Standardize the features. The helper returns the transformed matrix plus two
# further values kept as tX_mean / tX_std (presumably the per-feature mean and
# standard deviation it applied — TODO confirm against the helper's definition).
# NOTE(review): `data` is not consumed by the model cells visible further down,
# which fit and predict on the raw tX — confirm whether that is intended.
data, tX_mean, tX_std = standardize(tX)

In [10]:
# Sanity check on the standardized matrix: recompute the column-wise statistics.
# NOTE: this overwrites features_mean / features_std computed earlier on the raw tX.
features_mean = data.mean(axis=0)
features_std = data.std(axis=0)
# One array per line, exactly as two consecutive print() calls would show them.
print(features_mean, features_std, sep="\n")

[-2.63465694e-15  4.50019089e-15 -3.48448848e-15  7.19675786e-15
 -2.72244716e-14 -3.48856766e-15  1.33668259e-14  2.16429719e-14
  6.39742126e-15  2.86409207e-15 -7.00447966e-15  4.45924897e-15
 -1.36393998e-14 -5.96492045e-15  1.35646161e-16  7.13136217e-17
  2.58030370e-14 -1.06327391e-16 -1.87188487e-16  8.24369382e-15
  1.41040513e-16 -9.00283004e-15 -6.01698247e-16 -5.68357095e-15
  3.38428841e-15 -1.72635239e-15 -1.00691633e-14  2.10324860e-14
 -5.81535886e-15 -8.76751116e-16]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1.]


### First model

In [13]:
from implementations import *

In [14]:
# Fit the first model: the helper returns a weight vector (w) and a loss value.
# NOTE(review): the fit uses the raw tX, not the standardized `data` prepared in
# the "Data preparation" section — confirm this is intended.
w, loss = least_squares(y, tX)

  w, res, rank, s = np.linalg.lstsq(tx, y)


(250000,)
(30, 250000)
(30,)


In [15]:
# Continuous model output (one score per sample) before any thresholding.
temp = np.dot(tX, w)

In [16]:
# Turn the continuous scores into class labels.
# The targets are encoded as -1 / +1, so the decision boundary of the regression
# output is 0. The previous cut at 1 pushed almost every sample into the -1 class.
# Scores below 0 become -1 (background); everything else becomes +1 (boson).
result = np.where(temp < 0, -1.0, 1.0)

In [17]:
# Display the thresholded predictions (NumPy abbreviates long arrays).
result

array([-1., -1., -1., ..., -1., -1., -1.])

In [18]:
# Display the true labels for a visual comparison with the predictions.
y

array([ 1., -1., -1., ...,  1., -1., -1.])

In [19]:
# Mean squared error between the labels and the continuous model output.
np.mean(np.square(y - np.dot(tX, w)))

0.6793736189540696

In [None]:
# Predicted negatives / predicted positives (total number of predictions).
print(
    "{}/{}({})".format(
        np.sum(result < 0),
        np.sum(result > 0),
        len(result),
    )
)

### Validation

In [None]:
# Confusion-matrix counts, with +1 as the positive class and -1 as the negative class.
TP = np.sum((result == 1) & (y == 1))    # predicted +1, truly +1
TN = np.sum((result == -1) & (y == -1))  # predicted -1, truly -1
FP = np.sum((result == 1) & (y == -1))   # predicted +1, truly -1
FN = np.sum((result == -1) & (y == 1))   # predicted -1, truly +1
print(TP, TN, FP, FN)

In [None]:
# Precision: fraction of predicted positives that are truly positive.
# NOTE(review): if nothing is predicted positive (TP + FP == 0) this is 0/0 —
# with NumPy integer counts that presumably yields nan plus a RuntimeWarning.
precision = TP / (TP + FP)
precision

In [None]:
# Recall (true positive rate): fraction of true positives that were detected.
recall = TP / (TP + FN)
recall

In [None]:
# False positive rate: fraction of true negatives wrongly predicted as positive.
FPR = FP / (FP + TN)
FPR

In [None]:
# Accuracy: fraction of all samples whose predicted label matches the true label.
accuracy = (TP + TN) / (TP + TN + FP + FN)
accuracy

In [None]:
# F1 score: harmonic mean of precision and recall.
# Its best possible value is 1 (perfect precision and recall); its worst is 0.
F1 = 2 * (precision * recall) / (precision + recall)
F1

#### ROC curve

In [None]:
# A ROC curve needs one (FPR, TPR) point per decision threshold. Plotting the two
# scalar metrics of a single threshold draws nothing, so sweep the threshold over
# the range of the continuous scores instead.
roc_thresholds = np.linspace(temp.min(), temp.max(), 101)
roc_signal_scores = temp[y == 1]        # scores of the truly positive samples
roc_background_scores = temp[y == -1]   # scores of the truly negative samples
# Fraction of each class scored at or above every threshold.
roc_tpr = np.array([np.mean(roc_signal_scores >= t) for t in roc_thresholds])
roc_fpr = np.array([np.mean(roc_background_scores >= t) for t in roc_thresholds])

# Convention: false positive rate on x, true positive rate (recall) on y.
plt.plot(roc_fpr, roc_tpr, label='model')
plt.plot([0, 1], [0, 1], linestyle='--', color='grey', label='chance')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate (recall)')
plt.title('ROC curve')
plt.legend()
plt.show()

### Finally, give predictions

In [None]:
# Predict labels for the training set with the fitted weight vector.
# The fitted weights are stored in `w`; no variable named `weights` is defined
# anywhere in this notebook, so referring to it raised a NameError.
y_pred = predict_labels(w, tX)

## Generate predictions and save output in csv format for submission:

In [None]:
# Location of the test CSV, relative to the notebook's working directory.
# NOTE(review): the training file is read from '../data/train.csv', while this path
# has no 'data/' component — confirm the test file really lives one level higher.
DATA_TEST_PATH = '../test.csv'
# The first return value (labels) is discarded; only the test features and ids are kept.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
# Destination of the submission file, relative to the notebook's working directory.
OUTPUT_PATH = '../output.csv'
# Predict with the fitted weight vector `w`; no variable named `weights` is defined
# anywhere in this notebook, so referring to it raised a NameError.
y_pred = predict_labels(w, tX_test)
# Write the (id, prediction) pairs to the submission CSV.
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)