In [None]:
import os, glob, pickle

import numpy as np
import tensorflow as tf

from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Flatten, Dropout, Input, Conv1D, MaxPooling1D, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import train_test_split

from sklearn.metrics import matthews_corrcoef, confusion_matrix, ConfusionMatrixDisplay

import matplotlib.pyplot as plt

## Load Data

In [None]:
directory = 'INSERT_DIRECTORY' # Load simulated event data created by "ExtractEvents.ipynb"
# glob returns matches in arbitrary, OS-dependent order. The position of a file
# in this list is later used as its integer class label, so the paths are sorted
# to keep the label <-> molecule mapping identical across runs and machines
# (important when a saved model is reloaded in a new session).
files = sorted(glob.glob(os.path.join(directory, '*.pkl')))
files

In [None]:
# Load every event file and derive its class name from the file name.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# point `directory` at files you generated yourself.
evt_data = []
molecules = []
for file in files:
    # Parse the base name only: splitting the whole path lets the directory
    # prefix (or underscores inside it) leak into the class name.
    # Presumably file names look like <...>_<molecule>_<suffix>.pkl -- confirm
    # against the naming used by the extraction notebook.
    molecules.append(os.path.basename(file).split('_')[-2]) # Extract class names
    with open(file, 'rb') as f:
        evt_data.append(pickle.load(f))
molecules

## Preprocess Data
Split each class's events into training and testing subsets using the same ratio, so that both sets keep the class proportions of the loaded data.

In [None]:
def split_data(data, split_ratio, random_state=None):
    """Split every class's samples into a training part and a testing part.

    Each entry of ``data`` is split on its own, so both returned lists keep
    one entry per class, in the same order as ``data``.

    Parameters
    ----------
    data : sequence of array-like
        One collection of samples per class.
    split_ratio : float or int
        Forwarded to ``train_test_split`` as ``train_size``.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        previous unseeded behaviour; pass an int for a reproducible split.

    Returns
    -------
    tuple of (list, list)
        ``(train_parts, test_parts)``, each with one element per class.
    """
    train_evt_data = []
    test_evt_data = []
    for molecule in data:
        train_data, test_data = train_test_split(
            molecule, train_size=split_ratio, random_state=random_state
        )
        train_evt_data.append(train_data)
        test_evt_data.append(test_data)
    return train_evt_data, test_evt_data

split_ratio = 0.95
split_seed = 0 # Fixed seed so the train/test partition is the same on every run
training_evt_data, testing_evt_data = split_data(evt_data, split_ratio, random_state=split_seed)

In [None]:
# Stack the per-class event arrays and insert a new axis at position 2, so the
# network input has 3 axes (assumes the stacked events form a 2-D array -- TODO confirm)
X_train = np.concatenate(training_evt_data)[:, :, np.newaxis]
X_test = np.concatenate(testing_evt_data)[:, :, np.newaxis]
print(X_train.shape, X_test.shape, sep='\n')

In [None]:
# Create class labels
# Class i (the i-th entry of the loaded data) gets the integer label i. The
# number of labels per class is taken from the already-split event lists, so
# the labels line up with X_train / X_test by construction instead of relying
# on a second, independent random split producing partitions of the same size.
all_train_labels = [
    np.full(len(events), class_idx, dtype=np.int64)
    for class_idx, events in enumerate(training_evt_data)
]
all_test_labels = [
    np.full(len(events), class_idx, dtype=np.int64)
    for class_idx, events in enumerate(testing_evt_data)
]

# Join labels to data
Y_train = np.concatenate(all_train_labels)
Y_test = np.concatenate(all_test_labels)

# Every sample must have exactly one label
assert len(Y_train) == len(X_train), "training labels and samples differ in length"
assert len(Y_test) == len(X_test), "testing labels and samples differ in length"

print(Y_train.shape)
print(Y_test.shape)

## Construct Network

In [None]:
# Size the output layer from the loaded classes instead of a hard-coded 5, so
# the network stays consistent with the labels when the set of input files changes.
n_classes = len(molecules)

# Four identical 1-D convolutions followed by global average pooling and a softmax classifier
inp = Input(shape=X_train.shape[1:])
x = Conv1D(64, 3, activation='relu', padding='same')(inp)
x = Conv1D(64, 3, activation='relu', padding='same')(x)
x = Conv1D(64, 3, activation='relu', padding='same')(x)
x = Conv1D(64, 3, activation='relu', padding='same', name='ConvOut')(x)

x = GlobalAveragePooling1D()(x)

out = Dense(n_classes, activation='softmax')(x)

model = Model(inputs=inp, outputs=out)

# The `lr` and `decay` keyword arguments are legacy names that newer Keras
# optimizers reject. InverseTimeDecay with decay_steps=1 yields
# lr / (1 + decay_rate * step), i.e. the same per-step decay as `decay=1e-6`.
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=5e-5, decay_steps=1, decay_rate=1e-6
)
opt = Adam(learning_rate=lr_schedule)

model.compile(loss='sparse_categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
model.summary()

## Train, Save, Load

In [None]:
# Callback object that collects the per-epoch metrics during fitting
history = tf.keras.callbacks.History()

# Train the CNN; the test split is also passed as the validation data
model.fit(
    x=X_train,
    y=Y_train,
    epochs=150,
    validation_data=(X_test, Y_test),
    callbacks=[history],
)

# Saves the CNN
model.save('CNN GAP 2')

In [None]:
# Persist the recorded per-epoch metrics (the callback's `history` attribute) to disk
with open('CNN_GAP_2_training_history.pkl', mode='wb') as history_file:
    pickle.dump(history.history, history_file)

In [None]:
# Reloads the CNN from the location it was saved to, then prints its layer summary
model = tf.keras.models.load_model(filepath='CNN GAP 2')
model.summary()

In [None]:
# Reloads the training history; from here on `history` holds the unpickled
# metrics object rather than the Keras callback
with open('CNN_GAP_2_training_history.pkl', mode='rb') as history_file:
    history = pickle.load(history_file)

## Training Graphs

In [None]:
# Plot the validation accuracy vs epoch
# `history` is the Keras History callback straight after training, but the
# plain metrics dict once it has been reloaded from the pickle; accept both.
history_dict = getattr(history, 'history', history)
fig, ax = plt.subplots()
ax.plot(history_dict['val_accuracy'])
ax.set(xlabel='Epoch', ylabel='Validation accuracy', title='Validation accuracy vs epoch');

In [None]:
# Plot the training accuracy vs epoch
# `history` is the Keras History callback straight after training, but the
# plain metrics dict once it has been reloaded from the pickle; accept both.
history_dict = getattr(history, 'history', history)
fig, ax = plt.subplots()
ax.plot(history_dict['accuracy'])
ax.set(xlabel='Epoch', ylabel='Training accuracy', title='Training accuracy vs epoch');

## Plot Confusion Matrix

In [None]:
# Generate test predictions: one row of class scores per test sample, reduced
# to the index of the highest score with a single vectorised argmax
class_scores = model.predict(X_test)
pred = np.argmax(class_scores, axis=1)

accuracy = (Y_test == pred).sum() / Y_test.shape[0] # Calculate Accuracy
# Arguments follow the (y_true, y_pred) order used by the other metric calls
mcc = matthews_corrcoef(Y_test, pred) # Calculate MCC
# normalize='true' scales each row (true class) to sum to 1
cnn_conf_mat = confusion_matrix(Y_test, pred, normalize='true') # Calculate Confusion
print("Accuracy: {}".format(accuracy))
print("MCC: {}".format(mcc))

In [None]:
# Draw the normalised confusion matrix on its own figure, with the class names
# as tick labels and cell values printed to two decimals
fig, ax = plt.subplots()

disp = ConfusionMatrixDisplay(
    confusion_matrix=cnn_conf_mat,
    display_labels=molecules,
)
disp.plot(ax=ax, values_format='.2f')