In [None]:
# === Imports ===
import gc
import sys
import os
import warnings
import time

# Must be assigned BEFORE the first Keras/TensorFlow import below: the native
# TensorFlow logger picks this variable up when the library loads, so setting
# it after the import does not silence the import-time messages.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd
from keras.callbacks import EarlyStopping
from tensorflow.python.framework.random_seed import set_random_seed
from deep_models import CNN
from metrics import measure_performance
import matplotlib.pyplot as plt

# Silence Python-level warnings for the rest of the notebook.
warnings.filterwarnings("ignore")

In [None]:
# function to sample N synthetic values and augment training data
def augment_synth_data(run_file_num, synth_amount):
    """Load the synthetic training file of one run and return the rows to add.

    The file read is
    ``../data/sars/<file_name>/<file_name>_run_<run_file_num>_train.csv``,
    where ``file_name`` is a notebook-level global (not a parameter).

    Parameters
    ----------
    run_file_num : int
        Run number used to build the file name.
    synth_amount : int or "all"
        * ``"all"`` - return every row of the synthetic file.
        * ``N > 0`` - return N rows sampled without replacement:
          ``round(N * 0.51)`` rows with ``classLabel == 1`` and the
          remaining rows with ``classLabel == 0``.
        * ``0``     - return an empty frame that keeps the file's columns.

    Returns
    -------
    pandas.DataFrame
        The selected synthetic rows (positives first, then negatives, when
        sampling).

    Raises
    ------
    ValueError
        If ``synth_amount`` is negative, or (from ``DataFrame.sample``) if more
        rows of a class are requested than the file contains.
    """
    synth_data = pd.read_csv("../data/sars/" + file_name + "/" + file_name + "_run_" + str(run_file_num) + "_train.csv")

    if synth_amount == "all":
        return synth_data

    # Nothing requested: hand back an empty frame (same columns) instead of
    # falling through and implicitly returning None.
    if synth_amount == 0:
        return synth_data.iloc[0:0]

    if synth_amount < 0:
        raise ValueError("synth_amount must be 'all' or a non-negative number, got %r" % (synth_amount,))

    # 51% of the requested rows are positives, the remainder negatives
    pos_sample_count = round(synth_amount * 0.51)
    neg_sample_count = synth_amount - pos_sample_count

    # sample for augmentation
    positive_samples = synth_data.loc[synth_data["classLabel"] == 1] \
        .sample(n=pos_sample_count, replace=False)

    negative_samples = synth_data.loc[synth_data["classLabel"] == 0] \
        .sample(n=neg_sample_count, replace=False)

    # vertical concat -> row-wise
    return pd.concat([positive_samples, negative_samples], axis=0)

In [None]:
# === HYPERPARAMETERS ===
# batch size, epoch count and Keras verbosity handed to model.fit
_BATCH, _EPOCHS, _VERBOSITY = 1, 100, 0
# number of data splits (runs) and of random seeds trained per run
NUM_RUN, NUM_SEED = 3, 5
# feature count; samples are reshaped to (-1, 1, input_dim, 1) before training
input_dim = 891
# early-stopping callback on the training loss
criterion = EarlyStopping(patience=5, monitor="loss")

# bookkeeping: one entry per trained (run, seed) pair
run_list, seed_list = [], []

# === Train Results Lists ===
train_loss_curves, train_acc_list = [], []

# === Test Results Lists ===
test_acc_list, precision_list, recall_list, f1_list = [], [], [], []

# synth data controllers, only 1 should be True at any time
is_blended, is_vae = False, False

# synths file names: "blended" takes precedence over "vae"; neither -> "orig"
file_name = "blended" if is_blended else ("vae" if is_vae else "orig")

# 0, amount wanted or "all"
synth_amount = 0

In [None]:
# Train NUM_SEED models on each of the NUM_RUN data splits and collect the
# per-epoch training curves plus the hold-out metrics of every model.

# track training time
start_time = time.time()

for run in range(1, NUM_RUN + 1):
    print("######################### RUN %d #########################" % run)
    
    ################
    # DATA #
    ################
    
    # === DATA SOURCES ===
    run_training_file = "../data/sars/orig/orig_run_" + str(run) + "_train.csv"
    run_holdout_file = "../data/sars/orig/orig_run_" + str(run) + "_holdout.csv"
    
    training_data = pd.read_csv(run_training_file, header=0)
    
    # concat training data with current augmentation
    if is_blended or is_vae:
        synth_data = augment_synth_data(run, synth_amount)

        # Guard against a None / empty result (nothing to add, e.g. when
        # synth_amount == 0) instead of crashing on `.shape`.
        synth_rows = 0 if synth_data is None else synth_data.shape[0]
        if synth_rows > 0:
            training_data = pd.concat([training_data, synth_data])
        
        print(f'Synth samples added: {synth_rows}')
        del synth_data
    
    # split features and labels: the last column is the label, the rest are features
    X_train = training_data.iloc[:, :-1].to_numpy()
    X_train = X_train.reshape(-1, 1, input_dim, 1)
    
    y_train = training_data.iloc[:, -1].to_numpy()
    
    # holdout file
    test_data = pd.read_csv(run_holdout_file, header=0)
    X_test = test_data.iloc[:, :-1].to_numpy()
    X_test = X_test.reshape(-1, 1, input_dim, 1)
    
    y_test = test_data.iloc[:, -1].to_numpy()
    
    print("Total training samples = %d" % X_train.shape[0])
    print("Total holdout samples = %d" % y_test.shape[0])

    # clean memory
    del training_data
    del test_data
    del run_training_file
    del run_holdout_file
    gc.collect()
    
    ################
    # TRAINING # 
    ################
    
    print('Run', '\t', 'Seed', '\t', 'Test Accuracy')

    for seed in range(0, NUM_SEED):
        # reproducibility
        set_random_seed(seed)
        np.random.seed(seed)
        
        run_list.append(run)
        seed_list.append(seed)

        # import and fit model
        model = CNN(input_dim)

        # train model
        # NOTE(review): `criterion` (EarlyStopping) is defined in the config
        # cell but is not passed via `callbacks=` here, so presumably every
        # fit runs the full _EPOCHS - confirm this is intended.
        fit = model.fit(X_train, y_train, epochs=_EPOCHS, batch_size=_BATCH,
                        shuffle=True, verbose=_VERBOSITY)
        
        # one list per (run, seed) fit; each list holds one value per epoch
        history = fit.history
        train_loss_curves.append(history['loss'])
        # Keras reports training accuracy under the name the metric was
        # compiled with: 'acc' or 'accuracy'. Accept either.
        acc_key = 'acc' if 'acc' in history else 'accuracy'
        train_acc_list.append(history[acc_key])

        # get test metrics
        accuracy, precision, recall, f1 = measure_performance(model, X_test, y_test)
        
        # update metric lists
        test_acc_list.append(accuracy)
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)
        
        print(run, '\t', seed, '\t', accuracy)
    
    # clear data to load fresh (cache issue in notebook)
    del X_train 
    del y_train
    del X_test
    del y_test
    del model
    del fit
    gc.collect()
        
    print()
    
print("CNN took approx: %s minutes" % round((time.time() - start_time) / 60, 4))

In [None]:
# === SAVE RESULTS ===
# Builds the per-model results table and a per-run summary, then writes both,
# together with the raw training curves, as CSV files under `results_folder`.
results_folder = "../results/sars/" + file_name

# make sure the output folder exists before anything is written into it
os.makedirs(results_folder, exist_ok=True)

# converts results lists to dataframe (one row per trained model)
results = pd.DataFrame({'run': run_list, 'seed': seed_list, 'accuracy': test_acc_list, 
                        "precision": precision_list, "recall": recall_list, "f1": f1_list})

# calculates the mean metrics and the associated standard deviation per run.
# The columns are selected with a LIST: selecting several columns from a
# groupby with a bare tuple is deprecated and raises on pandas >= 2.0.
metric_cols = ["accuracy", "precision", "recall", "f1"]
per_run = results.groupby(['run'])[metric_cols]

means = per_run.mean().add_prefix("mean_")
standard_deviations = per_run.std().add_prefix("std_")

# creates dataframe of mean and standard deviation aggregate values
results_summary = pd.concat([means.round(3), standard_deviations.round(3)], axis=1)

# extra row holding the column-wise mean over all runs
results_summary.loc["run_mean"] = results_summary.mean().round(3)

# Save results to File
results.to_csv(results_folder + "/cnn_results_" + str(synth_amount) + ".csv", index=False)
results_summary.to_csv(results_folder + "/cnn_results_summary_" + str(synth_amount) + ".csv")

# training curves: one row per trained model, one column per epoch
df_train_acc = pd.DataFrame(train_acc_list)
df_train_acc.to_csv(results_folder + "/cnn_train_acc_" + str(synth_amount) + ".csv", index=False)

df_train_loss = pd.DataFrame(train_loss_curves)
df_train_loss.to_csv(results_folder + "/cnn_train_loss_" + str(synth_amount) + ".csv", index=False)

In [None]:
# Average the training curves over the seeds of each run. Rows were appended
# run by run with NUM_SEED rows per run, so integer-dividing the row position
# by NUM_SEED yields the (0-based) run each row belongs to.
train_acc_means = df_train_acc.groupby(np.arange(len(df_train_acc)) // NUM_SEED).mean()
train_loss_means = df_train_loss.groupby(np.arange(len(df_train_loss)) // NUM_SEED).mean()

In [None]:
# plot training loss curves: one line per run (mean over that run's seeds)
# A dedicated figure keeps this plot from being drawn onto another cell's axes.
fig, ax = plt.subplots()
ax.set_title("Training Loss")
ax.set_xlabel("Epochs")
ax.set_ylabel("Training Loss")
ax.set_xticks(range(0, _EPOCHS + 1, 10))

# iterate over however many runs were aggregated instead of hard-coding three
for run_idx in range(len(train_loss_means)):
    ax.plot(train_loss_means.iloc[run_idx, :], label="Run %d" % (run_idx + 1))

ax.legend()
fig.savefig(results_folder + "/cnn_sars_" + str(synth_amount) + "_training_loss.png")

train_loss_means

In [None]:
# plot training acc: one line per run (mean over that run's seeds)
# A dedicated figure keeps this plot from being drawn onto another cell's axes.
fig, ax = plt.subplots()
ax.set_title("Training Accuracy")
ax.set_xlabel("Epochs")
ax.set_ylabel("Training Accuracy")
ax.set_xticks(range(0, _EPOCHS + 1, 10))

# iterate over however many runs were aggregated instead of hard-coding three
for run_idx in range(len(train_acc_means)):
    ax.plot(train_acc_means.iloc[run_idx, :], label="Run %d" % (run_idx + 1))

ax.legend()
fig.savefig(results_folder + "/cnn_sars_" + str(synth_amount) + "_training_acc.png")

train_acc_means

In [None]:
# plot acc curves: hold-out accuracy per seed, one line per run
# A dedicated figure keeps this plot from being drawn onto another cell's axes.
fig, ax = plt.subplots()
ax.set_title("Test Accuracy")
ax.set_xlabel("Seeds")
ax.set_ylabel("Test Accuracy")
ax.set_xticks(range(0, NUM_SEED))

# test_acc_list is filled run by run with NUM_SEED entries per run, so each
# run owns one consecutive slice of length NUM_SEED
for run_idx in range(NUM_RUN):
    first = run_idx * NUM_SEED
    ax.plot(test_acc_list[first:first + NUM_SEED], "o-", label="Run %d" % (run_idx + 1))

ax.legend()
fig.savefig(results_folder + "/cnn_sars_" + str(synth_amount) + "_test_acc.png")

pd.DataFrame(test_acc_list).T