In [None]:
"""

- The training and test files are a collection of 50,000 showers per particle type. They were constructed from
the dataset available at: https://data.mendeley.com/datasets/pvn3xc3wy5/1

Reference: B. Nachman, L. de Oliveira, M. Paganini (2017), “Electromagnetic Calorimeter Shower Images”, Mendeley Data, v1,
DOI: 10.17632/pvn3xc3wy5.1

"""

## Import packages

In [1]:
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import numpy as np
import h5py

  from ._conv import register_converters as _register_converters


# 1. Read dataset

In [2]:
# Build the feature tensor X and the label matrix y from 'training.h5'.
#
# For every particle type, each shower is turned into one flat feature vector:
# the three calorimeter layer images are flattened row by row and joined,
# followed by the overflow values. The integer label of a shower is the index
# of its particle type in `particles`.
particles = ['gamma', 'piplus', 'eplus']

X = []
y = []

# Open read-only ('r' instead of 'r+'): nothing here writes to the dataset, so
# there is no reason to risk modifying it or to require write permission.
# The context manager closes the HDF5 handle as soon as the arrays are in memory.
with h5py.File('training.h5', 'r') as h5File:

    for particleId, particleFile in enumerate(particles):

        # A single pre-formatted argument prints the same text on Python 2 and 3.
        print('{} {}'.format(particleId, particleFile))

        # Indexing (rather than .get) raises a KeyError naming the missing group
        # instead of failing later on a None value.
        events_ = h5File[particleFile]

        # [:] reads each whole dataset from disk into a NumPy array.
        energyL0 = np.asarray(events_['layer_0'][:])
        energyL1 = np.asarray(events_['layer_1'][:])
        energyL2 = np.asarray(events_['layer_2'][:])
        energyOverFlow = np.asarray(events_['overflow'][:])

        nShowers = len(energyL0)

        # Vectorised equivalent of concatenating the rows of every layer image
        # shower by shower: one row per shower, columns ordered as
        # layer_0 | layer_1 | layer_2 | overflow.
        eventsCPT = np.hstack([
            energyL0.reshape(nShowers, -1),
            energyL1.reshape(nShowers, -1),
            energyL2.reshape(nShowers, -1),
            energyOverFlow.reshape(nShowers, -1),
        ])

        X.append(eventsCPT)
        y.append([particleId] * nShowers)

# Stacked per particle type: X is (n_particles, n_showers, n_features) and
# y is (n_particles, n_showers). This needs the same number of showers for
# every particle type.
X = np.array(X)
y = np.array(y)

0 gamma
1 piplus
2 eplus


In [3]:
# Display the shapes of the feature tensor and of the label matrix
tuple(arr.shape for arr in (X, y))

((3, 50000, 507), (3, 50000))

In [4]:
# Merge the leading per-particle axis into the shower axis so that X becomes
# a 2-D (n_samples, n_features) matrix.
# Letting reshape infer the row count (-1) gives the same result as multiplying
# the first two axes, and keeps the cell safe to re-run: an already-2-D X is
# left unchanged instead of raising IndexError on X.shape[2].
X = X.reshape(-1, X.shape[-1])
# print() with a single argument behaves identically on Python 2 and 3.
print(X.shape)

(150000, 507)


In [5]:
# Flatten the label matrix into a 1-D vector, one label per sample.
# reshape(-1) matches the original product of the two axes and keeps the cell
# safe to re-run: an already-1-D y is left unchanged instead of raising
# IndexError on y.shape[1].
y = y.reshape(-1)
# print() with a single argument behaves identically on Python 2 and 3.
print(y.shape)

(150000,)


# 2. SVM for event classification in the calorimeter

In [6]:
# Data preprocessing : divide by the global maximum so the largest value is 1

nmax = np.max(X)
if nmax == 0:
    # Dividing by zero would silently fill X with NaN and break the classifier.
    raise ValueError('Cannot normalise X: its maximum value is 0')

# In-place division avoids allocating a second copy of the feature matrix.
X /= nmax

# Sanity check of the resulting range; a single pre-formatted argument prints
# the same text on Python 2 and 3.
print('{} {}'.format(np.min(X), np.max(X)))

0.0 1.0


In [7]:
# Randomly permute the samples; X and y are shuffled together so that every
# row keeps its own label

from sklearn.utils import shuffle

SHUFFLE_SEED = 0  # fixed seed for a reproducible permutation
X, y = shuffle(X, y, random_state=SHUFFLE_SEED)

In [10]:
# Hold out part of the samples as a test set

from sklearn.model_selection import train_test_split

TEST_FRACTION = 0.3  # share of the samples reserved for testing
SPLIT_SEED = 42      # fixed seed for a reproducible split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED
)

In [11]:
# Number of samples in the training and in the test set
X_train.shape[0], X_test.shape[0]

(105000, 45000)

In [28]:
# Supervised classification with a linear-kernel support vector machine

from sklearn import svm

# Only the first N_FIT_SAMPLES training rows are used for the fit
# (presumably to keep the fit time manageable -- TODO confirm).
N_FIT_SAMPLES = 1000

# Build the classifier
clf = svm.SVC(kernel='linear')

# Fit on the training subset; the fitted estimator is the cell's output
clf.fit(X_train[:N_FIT_SAMPLES], y_train[:N_FIT_SAMPLES])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [29]:
# Predict a class label for every sample of the held-out test set

y_pred = clf.predict(X=X_test)

In [30]:
# Overall accuracy: fraction of test samples whose predicted label is correct

from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=y_pred)


0.6241555555555556

In [31]:
# Per-class evaluation: confusion matrix, then precision / recall / f1 report

from sklearn.metrics import classification_report, confusion_matrix

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

print(cm)
print(report)

[[ 8043  3317  3510]
 [ 1221 13595   121]
 [ 5757  2987  6449]]
              precision    recall  f1-score   support

           0       0.54      0.54      0.54     14870
           1       0.68      0.91      0.78     14937
           2       0.64      0.42      0.51     15193

   micro avg       0.62      0.62      0.62     45000
   macro avg       0.62      0.63      0.61     45000
weighted avg       0.62      0.62      0.61     45000

