In [None]:
import os, glob, pickle

import numpy as np

import tensorflow as tf

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import matthews_corrcoef, confusion_matrix, ConfusionMatrixDisplay

import matplotlib.pyplot as plt

## Load Data

In [None]:
# Directory holding the simulated event data created by "ExtractEvents.ipynb"
directory = 'INSERT_DIRECTORY'

# glob makes no promise about ordering (it depends on the file system), and the
# position of each file in this list is later used as its integer class label.
# Sorting keeps the label <-> molecule mapping stable across runs and machines.
# NOTE(review): the pretrained FFN/CNN models loaded further down must have been
# trained with the same class order - confirm against the training notebooks.
files = sorted(glob.glob(os.path.join(directory, '*.pkl')))
files

In [None]:
# Extract class names: the second-to-last underscore-separated field of the
# *file name*. Splitting the full path instead would let a file name without an
# underscore silently borrow a fragment of the directory path as its class name.
molecules = [os.path.basename(path).split('_')[-2] for path in files]

# One entry per file (i.e. per class), in the same order as `files`
evt_data = []
for file in files:
    # NOTE: pickle.load can execute arbitrary code - only load files you trust
    with open(file, 'rb') as f:
        data = pickle.load(f)
    evt_data.append(data)
molecules

## Preprocess Data
Split the loaded data, class by class, into training and testing datasets so that both keep the original class balance.

In [None]:
def split_data(data, split_ratio, random_state=None):
    """Split every class separately into a training part and a testing part.

    Parameters
    ----------
    data : sequence
        One collection of samples per class.
    split_ratio : float or int
        Forwarded to ``train_test_split`` as ``train_size``.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``train_test_split`` for every class. ``None`` keeps the
        unseeded behaviour, so existing two-argument calls are unaffected.

    Returns
    -------
    train_evt_data, test_evt_data : list, list
        Per-class training and testing parts, in the same order as ``data``.
    """
    train_evt_data = []
    test_evt_data = []
    for molecule in data:
        train_data, test_data = train_test_split(
            molecule, train_size=split_ratio, random_state=random_state
        )
        train_evt_data.append(train_data)
        test_evt_data.append(test_data)
    return train_evt_data, test_evt_data

SEED = 0  # fixed so the train/test partition (and every metric below) is reproducible
split_ratio = 0.95
training_evt_data, testing_evt_data = split_data(evt_data, split_ratio, random_state=SEED)
len(testing_evt_data)

In [None]:
# Join the per-class splits into one feature array per partition
X_train, X_test = (
    np.concatenate(parts) for parts in (training_evt_data, testing_evt_data)
)
for features in (X_train, X_test):
    print(features.shape)

In [None]:
# Create class labels: every event of the idx-th molecule gets the (float) label idx
all_labels = [
    np.full(len(evt_data[idx]), float(idx)) for idx in range(len(molecules))
]

# Split the labels with the same ratio as the events; within a class all labels
# are identical, so only the resulting lengths matter
all_train_labels, all_test_labels = split_data(all_labels, split_ratio)

# Join class labels
Y_train, Y_test = (
    np.concatenate(parts) for parts in (all_train_labels, all_test_labels)
)
for targets in (Y_train, Y_test):
    print(targets.shape)

## Classifiers

###  Naive Bayesian Classifier

In [None]:
# Train the Naive Bayes classifier, then predict the held-out events
gnb = GaussianNB().fit(X_train, Y_train)
pred = gnb.predict(X_test)

# Accuracy = fraction of test events whose predicted label matches the true one
accuracy = np.mean(Y_test == pred)
# Matthews correlation coefficient between predictions and true labels
mcc = matthews_corrcoef(pred, Y_test)

print(f"Accuracy: {accuracy}")
print(f"MCC: {mcc}")

In [None]:
# Confusion matrix of the NBC predictions, normalised over the true labels
nbc_conf_mat = confusion_matrix(Y_test, pred, normalize='true')

# Draw it on a dedicated figure, with the molecule names as tick labels
fig, ax = plt.subplots()
disp = ConfusionMatrixDisplay(
    confusion_matrix=nbc_conf_mat,
    display_labels=molecules,
)
disp.plot(ax=ax, values_format='.2f')

### k-Nearest Neighbour

In [None]:
len(X_train) ** 0.5  # Rule-of-thumb k-value: square root of the number of training samples

In [None]:
# Optimise for best k-value
# NOTE(review): k is selected on the test set itself, so the reported score is
# optimistic - consider a separate validation split.

# kNN cannot query more neighbours than there are training samples, so cap the
# sweep instead of failing part-way through on small datasets.
max_k = min(1000, len(X_train))
ks = np.arange(1, max_k + 1)
accuracies = []
mccs = []
best_mcc = -np.inf  # below any attainable MCC, so the first model is always recorded
for k in ks:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, Y_train) # Train KNN classifier
    pred = knn.predict(X_test) # Generate Predictions

    accuracy = (Y_test == pred).sum() / Y_test.shape[0] # Calculate Accuracy
    mcc = matthews_corrcoef(pred, Y_test) # Calculate MCC
    accuracies.append(accuracy)
    mccs.append(mcc)

    if mcc > best_mcc: # Update best found k-value
        best_mcc = mcc
        best_knn_agent = knn
        # Keep the confusion matrix of the best model so far; this reuses the
        # predictions already made instead of predicting again after the loop
        knn_conf_mat = confusion_matrix(Y_test, pred, normalize='true')

# First k that reached the highest MCC (the same model as best_knn_agent)
max_idx = np.argmax(mccs)

print(ks[max_idx])
print("MCC: {}".format(mccs[max_idx]))
print("Accuracy: {}".format(accuracies[max_idx]))

In [None]:
# Show the confusion matrix of the best kNN model on its own figure
fig, ax = plt.subplots()
disp = ConfusionMatrixDisplay(
    confusion_matrix=knn_conf_mat,
    display_labels=molecules,
)
disp.plot(ax=ax, values_format='.2f')

In [None]:
# Visualise accuracy and MCC vs k-value
fig, ax = plt.subplots()
ax.plot(ks, accuracies, label='Accuracy')
ax.plot(ks, mccs, label='MCC')
# Label the axes and curves so the figure can be read on its own
ax.set(xlabel='k (number of neighbours)', ylabel='Score')
ax.legend();

### Random Forest

In [None]:
# Optimise for best forest size (number of trees)
# NOTE(review): the forest size is selected on the test set itself, so the
# reported score is optimistic - consider a separate validation split.
RF_SEED = 0  # fixed so the sweep, and therefore the selected forest size, is reproducible
num_trees = np.arange(1, 201)
accuracies = []
mccs = []
best_mcc = -np.inf  # below any attainable MCC, so the first model is always recorded
for num in num_trees:
    # Single self-overwriting progress line instead of one output line per forest
    print("\rTraining forest {}/{}".format(num, num_trees[-1]), end="")
    rfc = RandomForestClassifier(n_estimators=num, random_state=RF_SEED)
    rfc.fit(X_train, Y_train) # Train RFC
    pred = rfc.predict(X_test) # Generate Predictions

    accuracy = (Y_test == pred).sum() / Y_test.shape[0] # Calculate Accuracy
    mcc = matthews_corrcoef(pred, Y_test) # Calculate MCC
    accuracies.append(accuracy)
    mccs.append(mcc)

    if mcc > best_mcc: # Update best found tree number
        best_mcc = mcc
        best_rf_agent = rfc
print()  # finish the progress line



In [None]:
# Position of the highest MCC in the sweep
max_idx = np.argmax(mccs)

# Calculate the confusion matrix of the best forest found
best_pred = best_rf_agent.predict(X_test)
rfc_conf_mat = confusion_matrix(Y_test, best_pred, normalize='true')

print(num_trees[max_idx])
print(f"MCC: {mccs[max_idx]}")
print(f"Accuracy: {accuracies[max_idx]}")

In [None]:
# Show the confusion matrix of the best random forest on its own figure
fig, ax = plt.subplots()
disp = ConfusionMatrixDisplay(
    confusion_matrix=rfc_conf_mat,
    display_labels=molecules,
)
disp.plot(ax=ax, values_format='.2f')

In [None]:
# Visualise accuracy and MCC vs tree number
fig, ax = plt.subplots()
ax.plot(num_trees, accuracies, label='Accuracy')
ax.plot(num_trees, mccs, label='MCC')
# Label the axes and curves so the figure can be read on its own
ax.set(xlabel='Number of trees', ylabel='Score')
ax.legend();

### Support-Vector Machine

In [None]:
# Train the SVM classifier (default settings), then predict the held-out events
sup = SVC().fit(X_train, Y_train)
pred = sup.predict(X_test)

# Score the predictions: accuracy, MCC and true-label-normalised confusion matrix
accuracy = np.mean(Y_test == pred)
mcc = matthews_corrcoef(pred, Y_test)
svm_conf_mat = confusion_matrix(Y_test, pred, normalize='true')

print(f"Accuracy: {accuracy}")
print(f"MCC: {mcc}")

In [None]:
# Show the SVM confusion matrix on its own figure
fig, ax = plt.subplots()
disp = ConfusionMatrixDisplay(
    confusion_matrix=svm_conf_mat,
    display_labels=molecules,
)
disp.plot(ax=ax, values_format='.2f')

## FFNN

In [None]:
# Load pretrained FFNN from 'FFN Classification.ipynb'
# (the path is relative to the notebook's working directory)
ffn_model_path = 'FFN'
ffn = tf.keras.models.load_model(ffn_model_path)
ffn.summary()

In [None]:
# Generate predictions: for every test sample take the index of the largest
# network output (each sample's outputs are flattened first, as np.argmax does)
ffn_outputs = np.asarray(ffn.predict(X_test))
pred = ffn_outputs.reshape(len(ffn_outputs), -1).argmax(axis=1)

# Score the predictions: accuracy, MCC and true-label-normalised confusion matrix
accuracy = np.mean(Y_test == pred)
mcc = matthews_corrcoef(pred, Y_test)
ffnn_conf_mat = confusion_matrix(Y_test, pred, normalize='true')

print(f"Accuracy: {accuracy}")
print(f"MCC: {mcc}")

In [None]:
# Show the FFNN confusion matrix on its own figure
fig, ax = plt.subplots()
disp = ConfusionMatrixDisplay(
    confusion_matrix=ffnn_conf_mat,
    display_labels=molecules,
)
disp.plot(ax=ax, values_format='.2f')

## CNN

In [None]:
# Load pretrained CNN from 'CNN Classification.ipynb'
# (the path is relative to the notebook's working directory)
cnn_model_path = 'CNN GAP 2'
cnn = tf.keras.models.load_model(cnn_model_path)
cnn.summary()

In [None]:
# Generate predictions: for every test sample take the index of the largest
# network output (each sample's outputs are flattened first, as np.argmax does)
cnn_outputs = np.asarray(cnn.predict(X_test))
pred = cnn_outputs.reshape(len(cnn_outputs), -1).argmax(axis=1)

# Score the predictions: accuracy, MCC and true-label-normalised confusion matrix
accuracy = np.mean(Y_test == pred)
mcc = matthews_corrcoef(pred, Y_test)
cnn_conf_mat = confusion_matrix(Y_test, pred, normalize='true')

print(f"Accuracy: {accuracy}")
print(f"MCC: {mcc}")

In [None]:
# Show the CNN confusion matrix on its own figure
fig, ax = plt.subplots()
disp = ConfusionMatrixDisplay(
    confusion_matrix=cnn_conf_mat,
    display_labels=molecules,
)
disp.plot(ax=ax, values_format='.2f')

## All Confusion Matrices
Plot the best confusion matrix of each classifier above in a single figure.

In [None]:
# Use a small default font for the dense multi-panel figure below
plt.rcParams['font.size'] = 8

In [None]:
fig, axs = plt.subplots(2, 3, sharex=True, sharey=True, figsize=(6, 4), dpi=600)

# Panel titles and matrices, laid out exactly like the 2x3 grid of axes
methods = [['NBC', 'KNN', 'RFC'], ['SVM', 'FFNN', 'CNN']]
conf_mats = [[nbc_conf_mat, knn_conf_mat, rfc_conf_mat], [svm_conf_mat, ffnn_conf_mat, cnn_conf_mat]]

# One colour scale shared by all panels. Without pinning it, every image
# autoscales to its own min/max and the single colorbar would not describe them.
cmap_name = 'viridis'
norm = plt.Normalize(0, 1)

for i, row in enumerate(axs):
    for j, ax in enumerate(row):
        disp = ConfusionMatrixDisplay(confusion_matrix=conf_mats[i][j], display_labels=molecules)
        disp.plot(ax=ax, values_format='.1f', cmap=cmap_name, colorbar=False)
        disp.im_.set_clim(norm.vmin, norm.vmax)  # same limits as the shared colorbar
        ax.set(title=methods[i][j])

# Implement Colorbar: a stand-alone mappable carrying the shared colormap and limits
sm = plt.cm.ScalarMappable(cmap=cmap_name, norm=norm)
sm.set_array([])

# Colorbar axes placed to the right of the top-right panel and stretched downwards
# (coordinates are in that panel's axes units)
cbar_ax = axs[0, 2].inset_axes([1.20, -1.5, 0.05, 2.5], transform=axs[0, 2].transAxes)
fig.colorbar(sm, ax=axs[0, 2], cax=cbar_ax)

# Top row: no x-axis label, and a y-axis label on the first panel only
for i, ax in enumerate(axs[0, :]):
    if i != 0:
        ax.set_ylabel('')
    ax.set_xlabel('')

# Bottom row: rotated tick labels and a bold x-axis label; y-axis label on the first panel only
for i, ax in enumerate(axs[1, :]):
    if i != 0:
        ax.set_ylabel('')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, fontsize=10)
    ax.set_xlabel(ax.get_xlabel(), fontsize=10, weight='bold')

# First column: larger tick labels and a bold y-axis label
for ax in axs[:, 0]:
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=10)
    ax.set_ylabel(ax.get_ylabel(), fontsize=10, weight='bold')

# Thicker ticks and frame on every panel
for ax in axs.flatten():
    ax.tick_params(width=1.5)
    for axis in ['top', 'right', 'bottom', 'left']:
        ax.spines[axis].set_linewidth(1.5)

# Leave room for the rotated x tick labels and the panel titles
fig.subplots_adjust(bottom=0.2, hspace=0.3)

# Uncomment to export the figure:
# fig.savefig('ClassResults.png')