In [None]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [None]:
# for visualization: import datetime
import datetime

In [None]:
def standardize(tx):
    """Standardize every column of `tx` to zero mean and unit standard deviation.

    Parameters
    ----------
    tx : array-like
        Data whose statistics are computed along axis 0 (one mean and one
        standard deviation per column for a 2-D input).

    Returns
    -------
    numpy.ndarray
        `(tx - mean) / std`. Columns whose standard deviation is exactly zero
        (constant columns) are only centered, so they come out as zeros instead
        of NaN/inf produced by a division by zero.
    """
    mean = np.mean(tx, axis=0)
    std = np.std(tx, axis=0)
    # A constant column has std == 0; dividing by 1 instead keeps the result finite.
    safe_std = np.where(std == 0, 1.0, std)
    return (tx - mean) / safe_std

# First : exploring the data

We'll need to have a look at what the data is, how it is distributed for the different features, and start to get an intuition about what methods might work better for analysis and prediction later.

## Load the training data into feature matrix, class labels, and event ids:

In [None]:
from proj1_helpers import *

# Read the training set: class labels, feature matrix and event ids
DATA_TRAIN_PATH = 'data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [None]:
# Keep only the samples (rows) in which no feature takes the error value -999
idx_c = np.all(tX != -999, axis=1)
y_c, tX_c = y[idx_c], tX[idx_c]

# Standardize each column: subtract its mean, divide by its standard deviation
# (tX_c holds the data without the samples that contain errors)
mean, std = np.mean(tX_c, axis=0), np.std(tX_c, axis=0)
tX_c = (tX_c - mean) / std

# Per-feature flag over the full data set: True when the column never equals -999
f = np.all(tX != -999, axis=0)

# Drop the features that contain errors (-999)
tX_fc = tX_c[:, f]

# Prepend a column of ones to tX_fc
tX_fc = np.hstack([np.ones((tX_fc.shape[0], 1), dtype=tX_fc.dtype), tX_fc])

In [None]:
# Per-feature flags computed over the full training matrix:
# True  -> the column contains no -999 value
# False -> the column contains at least one -999 value
f

In [None]:
# First value: True when column 22 (number of jets) contains no -999 value.
# Second value: True when some entry of that column is greater than 3.
# np.any evaluates the whole array at once instead of iterating element by element.
print(f[22], np.any(tX[:, 22] > 3))
# "True False" shows that the number-of-jets feature has no errors and never exceeds 3,
# consistent with it taking only the four values 0, 1, 2, 3.

In [None]:
# Class counts, then one row of three histograms per feature of the error-free samples
print("Overall: s: ",np.sum(y==1),", b: ",np.sum(y==-1)," ,total:",len(y))
print("NoErrors: s: ",np.sum(y_c==1),", b: ",np.sum(y_c==-1)," ,total:",len(y_c))
for n in range(tX_c.shape[1]):
    signal_values = tX_c[y_c == 1, n]
    background_values = tX_c[y_c == -1, n]
    fig, (ax_both, ax_s, ax_b) = plt.subplots(1, 3, figsize=(20, 4))

    # Left panel: both classes stacked in the same histogram
    ax_both.hist([signal_values, background_values], 20, density=True, histtype='bar', stacked=True)
    ax_both.legend(['s', 'b'])
    ax_both.set_title('Feature ' + str(n))

    # Middle panel: signal only
    ax_s.set_title('s histogram feature ' + str(n))
    ax_s.hist(signal_values, 20, density=True, histtype='bar', stacked=True)

    # Right panel: background only
    ax_b.set_title('b histogram feature ' + str(n))
    ax_b.hist(background_values, 20, density=True, histtype='bar', stacked=True)
    plt.show()

In [None]:
# Indices of the features (columns) that never take the error value -999
idx_gf = np.flatnonzero(np.all(tX != -999, axis=0))
y_gf = y
tX_gf = tX[:, idx_gf]
# Standardize each kept column: subtract its mean, divide by its standard deviation
mean, std = np.mean(tX_gf, axis=0), np.std(tX_gf, axis=0)
tX_gf = (tX_gf - mean) / std

In [None]:
# Class counts, then one row of three histograms per error-free feature (all samples)
print("Overall: s: ",np.sum(y==1),", b: ",np.sum(y==-1)," ,total:",len(y))
for n in range(tX_gf.shape[1]):
    # idx_gf[n] is the index of this column in the original feature matrix
    feature_label = str(idx_gf[n])
    signal_values = tX_gf[y_gf == 1, n]
    background_values = tX_gf[y_gf == -1, n]
    fig, (ax_both, ax_s, ax_b) = plt.subplots(1, 3, figsize=(20, 4))

    # Left panel: both classes stacked in the same histogram
    ax_both.hist([signal_values, background_values], 20, density=True, histtype='bar', stacked=True)
    ax_both.legend(['s', 'b'])
    ax_both.set_title('Feature ' + feature_label)

    # Middle panel: signal only
    ax_s.set_title('s histogram feature ' + feature_label)
    ax_s.hist(signal_values, 20, density=True, histtype='bar', stacked=True)

    # Right panel: background only
    ax_b.set_title('b histogram feature ' + feature_label)
    ax_b.hist(background_values, 20, density=True, histtype='bar', stacked=True)
    plt.show()

# Actual predictions start from here

After having looked at the data, we will now make some actual predictions using different models and parameters.
Features 15, 18 and 20 do not seem to impact the result.

In [None]:
# Split the samples by number of jets (feature 23, i.e. column index 22).
# The feature takes only four values (3 also stands for more jets).
jet_0, jet_1, jet_2, jet_3 = (tX[:, 22] == n_jets for n_jets in range(4))

# Partition tX and y with the four boolean masks
tX_0, tX_1, tX_2, tX_3 = (tX[mask, :] for mask in (jet_0, jet_1, jet_2, jet_3))
y_0, y_1, y_2, y_3 = (y[mask] for mask in (jet_0, jet_1, jet_2, jet_3))

# Drop the jet-number column from every partition
tX_0, tX_1, tX_2, tX_3 = (np.delete(part, 22, 1) for part in (tX_0, tX_1, tX_2, tX_3))

# For every partition: flag the features without errors (-999) and keep only those
f_0, f_1, f_2, f_3 = (np.all(part != -999, axis=0) for part in (tX_0, tX_1, tX_2, tX_3))
tX_0_gf, tX_1_gf, tX_2_gf, tX_3_gf = (
    part[:, keep]
    for part, keep in zip((tX_0, tX_1, tX_2, tX_3), (f_0, f_1, f_2, f_3))
)

# Total number of rows over the four partitions
sum(part.shape[0] for part in (tX_3_gf, tX_2_gf, tX_1_gf, tX_0_gf))

In [None]:
# Drop the last column of tX_0_gf.
# NOTE(review): presumably that column is constant in the 0-jet partition - TODO confirm.
# This cell overwrites its own input: re-running it drops one more column each time.
tX_0_gf = tX_0_gf[:, :-1]

In [None]:
# Standardize every jet partition independently (each with its own column statistics)
tX_0_gf, tX_1_gf, tX_2_gf, tX_3_gf = map(
    standardize, (tX_0_gf, tX_1_gf, tX_2_gf, tX_3_gf)
)
tX_0_gf.shape

In [None]:
# Feature expansion: adds the element-wise powers of the input columns
def build_poly(x, degree):
    """Build a polynomial basis of the 2-D input `x` for powers 1 up to `degree`.

    The result has `degree * x.shape[1]` columns: the first `x.shape[1]` columns
    hold `x**1`, the next ones `x**2`, and so on up to `x**degree`. No constant
    (power 0) column is included.
    """
    n_rows, n_cols = x.shape[0], x.shape[1]
    # Every entry is overwritten below, so the buffer needs no initial value
    phi = np.empty((n_rows, degree * n_cols))
    for power in range(1, degree + 1):
        first_col = (power - 1) * n_cols
        phi[:, first_col:first_col + n_cols] = x ** power
    return phi

In [None]:
# Polynomial expansion of every partition up to power `degree`.
# The partitions are overwritten in place, so re-running this cell expands them again.
degree = 7
tX_0_gf, tX_1_gf, tX_2_gf, tX_3_gf = (
    build_poly(part, degree) for part in (tX_0_gf, tX_1_gf, tX_2_gf, tX_3_gf)
)

In [None]:
# Prepend a column of ones to each training partition
tX_0_gf, tX_1_gf, tX_2_gf, tX_3_gf = (
    np.hstack([np.ones((part.shape[0], 1), dtype=part.dtype), part])
    for part in (tX_0_gf, tX_1_gf, tX_2_gf, tX_3_gf)
)


In [None]:
from implementations_2 import *

# Event ids of the samples that contain no error value (-999) in any feature
ids_ = ids[idx_c]

In [None]:
# Least-squares fit, one model per jet partition; each call yields (weights, loss)
(w_ls_0, loss_ls_0), (w_ls_1, loss_ls_1), (w_ls_2, loss_ls_2), (w_ls_3, loss_ls_3) = (
    least_squares(labels, features)
    for labels, features in (
        (y_0, tX_0_gf),
        (y_1, tX_1_gf),
        (y_2, tX_2_gf),
        (y_3, tX_3_gf),
    )
)
loss_ls_0, loss_ls_1

In [None]:
# Least squares fitted with stochastic gradient descent (SGD)

# Initial weights: one zero per column of each partition's feature matrix
w_in_0, w_in_1, w_in_2, w_in_3 = (
    np.zeros(part.shape[1]) for part in (tX_0_gf, tX_1_gf, tX_2_gf, tX_3_gf)
)

# Number of SGD iterations
max_iters = 1000

# Step size of the method
gamma = 0.01

# Fit the 0-jet partition; the second return value is discarded
w_ls_SGD_0, _ = least_squares_SGD(y_0, tX_0_gf, w_in_0, max_iters, gamma)


In [None]:
# SGD least-squares fit for the 1-jet partition; the second return value is discarded
w_ls_SGD_1, _ = least_squares_SGD(y_1, tX_1_gf, w_in_1, max_iters, gamma)

In [None]:
# SGD least-squares fit for the 2-jet partition; the second return value is discarded
w_ls_SGD_2, _ = least_squares_SGD(y_2, tX_2_gf, w_in_2, max_iters, gamma)

In [None]:
# SGD least-squares fit for the 3-jet partition; the second return value is discarded
w_ls_SGD_3, _ = least_squares_SGD(y_3, tX_3_gf, w_in_3, max_iters, gamma)

In [None]:
###### logistic regression (called with mode='GD') ######

# Map the labels from {-1, 1} to {0, 1}
y_l = (y_gf + 1) / 2

# Initial weights: one zero per column of tX_gf.
# The length was previously hard-coded to 19 and would break as soon as the
# set of error-free features changes.
w_initial = np.zeros(tX_gf.shape[1])

# Number of iterations of the method
max_iters = 100

# Step size of the method
gamma = 0.00001

# NOTE(review): the weights are taken from the SECOND return value here, while the
# least_squares / least_squares_SGD calls in this notebook unpack the weights first.
# Confirm the return order of logistic_regression.
_, w_l = logistic_regression(y_l, tX_gf, w_initial, max_iters, gamma, mode='GD')
w_l.shape

In [None]:
# Regularized logistic regression

# Starting weights: warm start from a least-squares solution.
# `w_ls` was referenced here without being defined anywhere in the notebook
# (only the per-jet w_ls_0..w_ls_3 exist, and their length does not match tX_gf),
# so it is fitted on the same feature matrix that is passed to the model below.
# NOTE(review): confirm that a least-squares warm start on (y_gf, tX_gf) is what was intended.
w_ls, _ = least_squares(y_gf, tX_gf)
w_initial_rl = w_ls

# Number of iterations of the method
max_iters = 100

# Step size and regularization strength
gamma = 0.1
lambda_ = 1

# Last expression of the cell: the returned value is displayed, not stored
reg_logistic_regression(y_l, tX_gf, lambda_, w_initial_rl, max_iters, gamma, mode='SGD')

# Save prediction output in CSV format for submission:

In [None]:
# Read the test set; the first returned value is not used
DATA_TEST_PATH = 'data/test.csv'
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
# Split the test samples by number of jets (feature 23, i.e. column index 22).
# The feature takes only four values (3 also stands for more jets).
jet_0_test, jet_1_test, jet_2_test, jet_3_test = (
    tX_test[:, 22] == n_jets for n_jets in range(4)
)

# Partition the feature matrix with the four boolean masks
tX_0_test, tX_1_test, tX_2_test, tX_3_test = (
    tX_test[mask, :] for mask in (jet_0_test, jet_1_test, jet_2_test, jet_3_test)
)

# Partition the event ids the same way
ids_test_0, ids_test_1, ids_test_2, ids_test_3 = (
    ids_test[mask] for mask in (jet_0_test, jet_1_test, jet_2_test, jet_3_test)
)

# Drop the jet-number column from every partition
tX_0_test, tX_1_test, tX_2_test, tX_3_test = (
    np.delete(part, 22, 1) for part in (tX_0_test, tX_1_test, tX_2_test, tX_3_test)
)

# Error-free feature flags computed on the test partitions (kept for inspection only)
f_0_test, f_1_test, f_2_test, f_3_test = (
    np.all(part != -999, axis=0)
    for part in (tX_0_test, tX_1_test, tX_2_test, tX_3_test)
)

# Select the columns with the TRAINING masks f_0..f_3 rather than the test masks:
# the weights were fitted on the training columns, so the test matrices must keep
# exactly the same columns in the same order. Masks derived from the test data
# could differ and silently misalign features and weights.
tX_0_gf_test = tX_0_test[:, f_0]
tX_1_gf_test = tX_1_test[:, f_1]
tX_2_gf_test = tX_2_test[:, f_2]
tX_3_gf_test = tX_3_test[:, f_3]

In [None]:
# Drop the last column of tX_0_gf_test.
# This cell overwrites its own input: re-running it drops one more column each time.
tX_0_gf_test = tX_0_gf_test[:, :-1]

In [None]:
# Standardize every test partition with its own column statistics.
# NOTE(review): the test data is scaled with test means/stds, not the training ones - confirm this is intended.
tX_0_gf_test, tX_1_gf_test, tX_2_gf_test, tX_3_gf_test = map(
    standardize, (tX_0_gf_test, tX_1_gf_test, tX_2_gf_test, tX_3_gf_test)
)

In [None]:
# Polynomial expansion of every test partition, using the same `degree` as for training
tX_0_gf_test, tX_1_gf_test, tX_2_gf_test, tX_3_gf_test = (
    build_poly(part, degree)
    for part in (tX_0_gf_test, tX_1_gf_test, tX_2_gf_test, tX_3_gf_test)
)

In [None]:
# Prepend a column of ones to each test partition
tX_0_gf_test, tX_1_gf_test, tX_2_gf_test, tX_3_gf_test = (
    np.hstack([np.ones((part.shape[0], 1), dtype=part.dtype), part])
    for part in (tX_0_gf_test, tX_1_gf_test, tX_2_gf_test, tX_3_gf_test)
)

In [None]:
# Predict each test partition with the least-squares weights of the matching jet count
y_pred_0, y_pred_1, y_pred_2, y_pred_3 = (
    predict_labels(weights, features)
    for weights, features in zip(
        (w_ls_0, w_ls_1, w_ls_2, w_ls_3),
        (tX_0_gf_test, tX_1_gf_test, tX_2_gf_test, tX_3_gf_test),
    )
)


In [None]:
# Merge the per-partition predictions back into the original test order
y_pred = np.zeros(ids_test.shape)
for jet_mask, jet_pred in zip(
    (jet_0_test, jet_1_test, jet_2_test, jet_3_test),
    (y_pred_0, y_pred_1, y_pred_2, y_pred_3),
):
    # Each boolean mask selects the rows that belong to one jet partition
    y_pred[jet_mask] = jet_pred
y_pred.shape

In [None]:
# Keep only the test features (columns) that never take the error value -999
f_test = np.all(tX_test != -999, axis=0)

tX_test_fc = tX_test[:, f_test]
# Standardize each kept column: subtract its mean, divide by its standard deviation
mean = np.mean(tX_test_fc, axis=0)
std = np.std(tX_test_fc, axis=0)
tX_test_fc = (tX_test_fc - mean) / std

# Prepend a column of ones. The dtype is taken from the test matrix itself;
# it previously came from the training array tX_fc, an unrelated variable.
tX_test_fc = np.hstack((np.ones((tX_test_fc.shape[0], 1), dtype=tX_test_fc.dtype), tX_test_fc))

In [None]:
# if we use logistic regression SGD and no error samples in test data
#weights=w_ls_GD
#y_pred = prediction(tX_test_fc, weights)

In [None]:
# Write the combined predictions, together with the test event ids, to the submission file
OUTPUT_PATH = 'sub_3'  # TODO: fill in desired name of output file for submission
#y_pred = predict_labels(weights, tX_test_fc)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [None]:
y_pred
# Combined predictions, one entry per test event id