## LSTM - preliminary results

#### Import libraries

In [1]:
%%capture
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report, precision_recall_curve, roc_curve, PrecisionRecallDisplay, RocCurveDisplay, auc
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.metrics import AUC

import os
import glob
import fileinput
import matplotlib.pyplot as plt
import seaborn as sns


### Import data and split into train/validation/test sets
For this initial report, we used a smaller subset of the data - scenarios 5, 6, 7, 11, 12

n.b. couldn't get 06 to work, so omitted

In [15]:
# list of scenarios
scenarios = ['05.txt', '07.txt', '11.txt', '12.txt']

# empty list to capture statistics for each scenario
results = [] 

# iterate over scenarios to build, train, and evaluate LSTM model
for filename in scenarios:

    scenario_name = filename.replace(".txt", "")
    # load the dataset
    scenario = np.loadtxt('/content/' + filename, delimiter=',')

    # split into input (X) and output (y) variables
    X = scenario[:,1:]
    y = scenario[:,0]

    # split into training (80%), validation (10%), and test (10%) sets
    rand_seed = 123
    X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size = 0.8,
                                                      random_state = rand_seed,
                                                      stratify = y)
    X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, 
                                                        train_size = 0.5,
                                                        random_state = rand_seed,
                                                        stratify = y_rem)
    
    # build LSTM model

    # Input for variable-length sequences of integers
    inputs = keras.Input(shape = (None, ), dtype = "int32")
    # Embed each integer in a 28,128-dimensional vector
    x = layers.Embedding(X.shape[1], 128)(inputs)
    # Add 2 bidirectional LSTMs
    x = layers.Bidirectional(layers.LSTM(64, return_sequences = True))(x)
    x = layers.Bidirectional(layers.LSTM(64))(x)
    # Add a classifier
    outputs = layers.Dense(1, activation = "sigmoid")(x)
    model = keras.Model(inputs, outputs)
    model.summary()

    # compile and train model
    model.compile("adam", "binary_crossentropy", metrics = ["accuracy"])
    model.fit(X_train, y_train, batch_size = 32, epochs = 2, 
              validation_data = (X_valid, y_valid))
    
    # make predictions with model and print classification report
    predictions = (model.predict(X_test) > 0.5).astype(int)
    print(filename)
    print(classification_report(y_test, predictions))

    # calculate metrics for each model
    precision, recall, fscore, support = score(y_test, predictions, 
                                               average = 'binary')
    auc = roc_auc_score(y_test, predictions)

    # append metrics to results
    results.append(
        {
          'Scenario': scenario_name,
          'Precision': precision,
          'Recall': recall,
          'F1 Score': fscore,
          'AUC': auc
        })

    print("scenario", filename, "complete")

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     (None, None, 128)         3584      
                                                                 
 bidirectional_2 (Bidirectio  (None, None, 128)        98816     
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 201,345
Trainable params: 201,345
Non-trainab

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_3 (Embedding)     (None, None, 128)         3584      
                                                                 
 bidirectional_6 (Bidirectio  (None, None, 128)        98816     
 nal)                                                            
                                                                 
 bidirectional_7 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense_3 (Dense)             (None, 1)                 129       
                                                                 
Total params: 201,345
Trainable params: 201,345
Non-trainab

In [16]:
results_df = pd.DataFrame(results).sort_values('Scenario')
results_df

Unnamed: 0,Scenario,Precision,Recall,F1 Score,AUC
0,5,0.963855,0.888889,0.924855,0.944328
1,7,0.0,0.0,0.0,0.5
2,11,0.998775,0.997552,0.998163,0.998726
3,12,0.599119,0.626728,0.612613,0.811957


In [17]:
from google.colab import files
results_df.to_csv('output.csv', encoding = 'utf-8-sig') 
files.download('output.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
### add to results - 