<h1><center>Plan</center></h1>

| <h2>Classification</h2> | <h2>Regression</h2> | <h2>Clustering</h2> |
| :- | :- | :- |
| Neural network (MLP) DONE | Decision tree (LightGBM) DONE | PCA and k-means DONE |
| Decision tree (LightGBM) DONE | Neural network (MLP) DONE |  |
| kNN DONE| kNN DONE |  |



# Load the training data

In [1]:
import numpy as np
import pandas as pd
import h5py
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

In [2]:
def load_data(name):
    """Read one HDF5 dataset into a pandas DataFrame.

    Opens the file ``<name>.h5`` read-only, reads the complete dataset stored
    under the key ``name`` and wraps it in a DataFrame.

    Parameters
    ----------
    name : str
        Used both as the file stem (``f'{name}.h5'``) and as the dataset key
        inside that file.

    Returns
    -------
    pandas.DataFrame
        The full contents of the dataset.
    """
    path = f'{name}.h5'
    with h5py.File(path, 'r') as h5_file:
        # [:] copies the whole dataset into memory, so the result stays
        # usable after the file is closed.
        records = h5_file[name][:]
    return pd.DataFrame(records)

# Read the 'train' dataset from train.h5 into a DataFrame (see load_data).
train = load_data('train')

# Restrict the inputs to the 25 best features according to SHAP on the
# decision tree. The order below is preserved as the column order of the
# feature frames built from this list.
best_features = [
    'p_sigmad0',
    'p_ethad',
    'p_Rhad',
    'p_Rphi',
    'p_Reta',
    'p_deltaEta1',
    'p_nTracks',
    'p_ambiguityType',
    'p_deltaPhiRescaled2',
    'p_d0',
    'p_Rhad1',
    'p_ptconecoreTrackPtrCorrection',
    'p_E7x11_Lr3',
    'p_d0Sig',
    'p_Eratio',
    'p_numberOfInnermostPixelHits',
    'p_numberOfPixelHits',
    'p_ehad1',
    'p_ethad1',
    'p_TRTPID',
    'p_weta2',
    'p_EptRatio',
    'p_numberOfSCTHits',
    'p_deltaPhi2',
    'p_deltaPhiFromLastMeasurement',
]

# Pull the two target columns and the selected input features out of the
# training table.
train_energy = train['p_truth_E']
train_class = train['Truth']
train_variables = train[best_features]

# Robust-scale the inputs. The fitted scaler stays available as `transformer`.
# NOTE(review): the scaler is fitted on the full training frame, i.e. before
# the later train/validation split — confirm this is intended.
transformer = RobustScaler()
transformer.fit(train_variables)
train_variables = pd.DataFrame(
    transformer.transform(train_variables),
    columns=train_variables.columns,
)

# Classification


### Neural network (MLP)

In [3]:
# Baseline MLP classifier; this instance is the estimator handed to the
# hyperparameter search.
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(max_iter=200, random_state=42)

In [4]:
# Hold out 30% of the scaled training data for validation; the fixed seed
# makes the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    train_variables,
    train_class,
    test_size=0.3,
    random_state=42,
)

In [5]:
# Hyperparameter search over network layout, activation function and solver.
from sklearn.model_selection import RandomizedSearchCV

search_params = {
    'hidden_layer_sizes': [(15, 30, 15), (30, 30), (40, 60), (10, 10, 10), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
}

# n_iter matches the 5 x 2 x 2 = 20 combinations available in search_params.
# Each candidate is scored with 3-fold cross-validation on the training split,
# using all available cores (n_jobs=-1).
random_search = RandomizedSearchCV(
    clf,
    search_params,
    n_iter=20,
    cv=3,
    return_train_score=True,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(x_train, y_train)

print(
    'Best parameters: ', random_search.best_params_,
    'Best score: ', random_search.best_score_,
)

In [6]:
# Train the tuned MLP on the training split and report validation accuracy.
from sklearn.metrics import accuracy_score

# NOTE(review): the hyperparameters are written out by hand instead of being
# read from random_search.best_params_ — presumably copied from the search
# output; confirm they still match after re-running the search.
clf_opt = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation='tanh',
    solver='sgd',
    max_iter=200,
    random_state=42,
)
clf_opt.fit(x_train, y_train)

# Hard class labels are used for the accuracy; the per-class probabilities on
# the validation split are kept in y_pred_prob.
y_pred = clf_opt.predict(x_val)
y_pred_prob = clf_opt.predict_proba(x_val)

acc = accuracy_score(y_val, y_pred)
print('MLPClassifier accuracy ' + str(acc))

MLPClassifier accuracy 0.9369025641025641


# Test data

In [7]:
# Load the test set and keep the same feature columns, in the same order, as
# were used for training.
test = load_data('test')
test_variables = test[best_features]

# Scale the test data with the scaler that was fitted on the TRAINING data
# (`transformer`, created in the data-loading cell). Fitting a fresh
# RobustScaler on the test set would centre and scale each feature with
# different statistics than the ones the model was trained on, so the model
# would see inputs on a different scale; it would also overwrite the training
# scaler. The scaler must therefore only be applied here, never re-fitted.
test_variables = pd.DataFrame(
    transformer.transform(test_variables),
    columns=test_variables.columns,
)

In [8]:
# Apply the trained classifier to the scaled test features. predict_proba
# returns one column per class; [:, 1] keeps the second column, i.e. the
# probability of clf_opt.classes_[1].
# NOTE(review): presumably that is the Truth == 1 class — confirm.
y_pred_prob = pd.DataFrame(
    clf_opt.predict_proba(test_variables)[:, 1]
)

# Export to csv (disabled; uncomment to write the predictions to disk).
# y_pred_prob.to_csv('predicted_classes_MLPClassifier.csv', header=False)