<h1><center>Plan</center></h1>

| <h2>Classification</h2> | <h2>Regression</h2> | <h2>Clustering</h2> |
| :- | :- | :- |
| Neural network (MLP) DONE | Decision tree (LightGBM) DONE | PCA and k-means DONE |
| Decision tree (LightGBM) DONE | Neural network (MLP) DONE |  |
| kNN DONE | kNN DONE |  |



# Load the training data

In [20]:
import numpy as np
import pandas as pd
import h5py
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

In [21]:
def load_data(name):
    """Read the dataset called `name` from the HDF5 file `<name>.h5`.

    The file is expected to sit in the current working directory and to
    contain a dataset whose key equals `name`. The full dataset is read
    into memory and returned as a pandas DataFrame.
    """
    h5_path = f'{name}.h5'
    with h5py.File(h5_path, 'r') as h5_file:
        # [:] materialises the whole dataset while the file is still open
        records = h5_file[name][:]
    return pd.DataFrame(records)

train = load_data('train')

#only consider actual electrons
electrons = train[train['Truth'] == True]

#only use 15 best features (from SHAP on decision tree)
best_features = [
    'p_eCluster',
    'p_eAccCluster',
    'p_ecore',
    'p_E3x5_Lr1',
    'p_rawECluster',
    'p_nCells_Lr1_HiG',
    'p_eClusterLr1',
    'p_deltaEta2',
    'p_nTracks',
    'p_EptRatio',
    'p_d0',
    'p_pt_track',
    'p_deltaPhi2',
    'p_nCells_Lr2_HiG',
    'p_deltaEta1',
]

#regression inputs, regression target and class label for the selected rows
train_variables = electrons[best_features]
train_energy = electrons['p_truth_E']
train_class = electrons['Truth']

#scale data
#the fitted scaler is kept in `transformer` so the same scaling can be applied to other data later
transformer = RobustScaler().fit(train_variables)
#transform() returns a bare array, so pass the original index back in: otherwise the
#features get a fresh 0..n-1 index while train_energy keeps the filtered row labels,
#and any label-aligned pandas operation between the two would silently mismatch rows
train_variables = pd.DataFrame(
    transformer.transform(train_variables),
    columns=train_variables.columns,
    index=train_variables.index,
)

# Regression

### Neural network (MLP)

In [22]:
from sklearn.neural_network import MLPRegressor

#define model
#baseline MLP: only solver, iteration cap and seed are set here, everything else is left at the library defaults
reg = MLPRegressor(
    solver='adam',
    max_iter=400,
    random_state=42,
)

In [23]:
#split data into training and validation
#a quarter of the rows is held out; the fixed seed makes the split repeatable
x_train, x_val, y_train, y_val = train_test_split(
    train_variables,
    train_energy,
    random_state=42,
    test_size=0.25,
)

In [24]:
#hyperparameter search
from sklearn.model_selection import RandomizedSearchCV

#`learning_rate` is deliberately not searched: scikit-learn only uses that schedule
#with solver='sgd', and `reg` is built with solver='adam', so varying it would just
#refit identical models and double the cost of the search
search_params = {
    'hidden_layer_sizes': [(15,30,15), (30,30), (40,60), (10,10,10), (100,)],
    'activation': ['tanh', 'relu'],
}

#all parameters are lists, so sampling is without replacement; asking for exactly
#as many iterations as there are combinations evaluates each candidate once
n_candidates = len(search_params['hidden_layer_sizes']) * len(search_params['activation'])

random_search = RandomizedSearchCV(
    reg,
    search_params,
    n_iter=n_candidates,
    cv=3,
    return_train_score=True,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(x_train, y_train)

print('Best parameters: ', random_search.best_params_ , 'Best score: ', random_search.best_score_)

In [25]:
#define error function
def mape(pred, true):
    """Mean absolute percentage error, returned as a fraction (0.1 means 10 %).

    Parameters
    ----------
    pred : array-like
        Predicted values (list, NumPy array or pandas Series).
    true : array-like
        Reference values, same shape as `pred`.

    Returns
    -------
    numpy.float64
        mean(|true - pred| / |true|) over all samples.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.

    Notes
    -----
    Entries where `true` is zero produce inf/nan, as there is no finite
    percentage error for them.
    """
    # Convert to plain arrays so the comparison is strictly positional: two
    # pandas objects with different indexes would otherwise be aligned by
    # label and yield NaN instead of an error value.
    pred_values = np.asarray(pred, dtype=float)
    true_values = np.asarray(true, dtype=float)

    if pred_values.shape != true_values.shape:
        raise ValueError(
            f'pred and true must have the same shape, got {pred_values.shape} and {true_values.shape}'
        )
    if pred_values.size == 0:
        raise ValueError('mape() needs at least one sample')

    relative_errors = np.abs((true_values - pred_values) / true_values)
    return np.mean(relative_errors)

In [26]:
#train optimized model
#NOTE(review): architecture/activation are hard-coded here, presumably copied from the search output - confirm they still match
reg_opt = MLPRegressor(
    hidden_layer_sizes=(40,60),
    activation='relu',
    learning_rate='constant',
    solver='adam',
    max_iter=400,
    random_state=42,
).fit(x_train, y_train)

#evaluate on the held-out validation split
y_pred = reg_opt.predict(x_val)
error = mape(y_pred, y_val)
print('MLPRegressor error '+str(error))

MLPRegressor error percentage 0.07483129291254882


# Test data

In [27]:
#load test data
test = load_data('test')
test_variables = test[best_features]

#scale data
#reuse `transformer`, the scaler already fitted on the training features: fitting a
#new scaler on the test set would centre/scale it with different statistics, so the
#model would receive inputs on a different scale than the one it was trained on
test_variables = pd.DataFrame(
    transformer.transform(test_variables),
    columns=test_variables.columns,
)

In [29]:
#apply model to test data
predicted_energy = reg_opt.predict(test_variables)
#wrap the predictions in a single-column DataFrame (default integer index and column label)
y_test_pred = pd.DataFrame(predicted_energy)

#export as csv
# y_test_pred.to_csv('predicted_energies_MLPRegressor.csv', header=False)