In [1]:
# Absolute project folder; the next cell changes into it and LSTMClassifier.__init__
# builds its dataset path from it.
# NOTE(review): presumably a Colab-mounted Google Drive location — hard-coded and user-specific.
path = "/content/drive/MyDrive/Graduate/Semester 1 (Fall 2024)/INFO-H423 Data Mining/Project/SNCB_IncidentClassifier/models/anomaly_detection/classification"

In [2]:
# Make `path` the working directory (the classifier uses relative '../data/...' paths)
# and list its contents.
%cd $path
%ls

/content/drive/MyDrive/Graduate/Semester 1 (Fall 2024)/INFO-H423 Data Mining/Project/SNCB_IncidentClassifier/models/anomaly_detection/classification
'1201 - LSTM eval.ipynb'   [0m[01;34mdata_all[0m/    lstm_classifier.py   [01;34m__pycache__[0m/
'1206 - LSTM eval.ipynb'   [01;34mdata_norm[0m/   main.py              [01;34mresults[0m/


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
import keras
from keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report

from ast import literal_eval
from datetime import datetime
import os, sys, json

In [19]:
class LSTMClassifier:
    """LSTM classifier that predicts an incident type from embedded event sequences.

    The methods are meant to be called in this order, each one storing its
    results on the instance for the next step:
    ``load_data`` -> ``encode_seqs`` -> ``prepare_train_data`` ->
    ``create_model`` -> ``train_model`` -> ``evaluate``.

    The constructor relies on the module-level variable ``path`` being defined.
    """

    def __init__(self):
        """Set the input/output paths and check that the dataset file can be opened.

        Prints the error and terminates via ``sys.exit(1)`` when the dataset
        file cannot be opened.
        """
        print('[__init__] start')
        self.vocab_path = '../data/metadata.tsv'
        self.vectors_path = '../data/vectors.tsv'
        self.data_path = f'{path}/../data/df_50.csv'
        self.results_path = './results'

        try:
            # Probe readability only; the context manager closes the handle again.
            with open(self.data_path):
                pass
        except Exception as err:
            print(f'Error opening input file: {err}')
            sys.exit(1)
        print('[__init__] end')

    def load_data(self):
        """Read the dataset CSV into ``self.df`` and the sequence columns into ``self.df_seq``.

        ``self.df_seq`` is ``self.df`` without the bookkeeping/label columns, so
        every remaining column is treated as one step of the sequence.
        """
        print('[load_data] start')
        self.df = pd.read_csv(self.data_path, index_col=0)
        self.df_seq = self.df.drop(
            columns=['anom_count', 'incident_id', 'num', 'class', 'mse', 'incident_type'])
        # NOTE: 'incident_type' is read straight from the CSV. An earlier version
        # derived it by mapping 'incident_id' through
        # '../../../data/time_sorted_table.csv' (delimiter ';').
        print('[load_data] end')

    def encode_seqs(self):
        """Replace every sequence step with its embedding vector.

        Loads the vocabulary (one token per line) and the vectors file
        (tab-separated, row ``i`` belongs to token ``i``), then looks up
        ``str(step)`` for each cell of ``self.df_seq``. Tokens missing from the
        vocabulary are reported and mapped to the ``'[UNK]'`` vector.

        Stores the result in ``self.df_encoded`` as a NumPy array.

        Raises:
            KeyError: if an unknown token is met and the vocabulary has no
                ``'[UNK]'`` entry.
        """
        print('[encode_seqs] start')
        self.vocab = pd.read_csv(self.vocab_path, delimiter='\t', header=None)
        self.vocab.columns = ['word']
        self.vectors = pd.read_csv(self.vectors_path, delimiter='\t', header=None)

        # Convert once to an array instead of doing a positional lookup per token.
        vector_rows = self.vectors.to_numpy()
        vocab_lookup = {
            word: vector_rows[position]
            for position, word in enumerate(self.vocab['word'])
        }

        encoded = []
        for row_values in self.df_seq.values:
            seq = []
            for step in row_values:
                try:
                    seq.append(vocab_lookup[str(step)])
                except KeyError:
                    # Only a missing token is expected here; anything else should surface.
                    print(f'Unknown: {step}')
                    seq.append(vocab_lookup['[UNK]'])
            encoded.append(seq)
        print(len(encoded), len(encoded[0]) if encoded else 0)

        self.df_encoded = np.array(encoded)
        print('[encode_seqs] end')

    def _scale_sequences(self, X, fit=False):
        """Standardize ``X`` per feature (last axis) with ``self.X_scaler``.

        The array is flattened to 2-D so that the scaler sees one row per
        sequence step, then restored to its original shape.

        Args:
            X: array whose last axis holds the features.
            fit: when True the scaler is fitted on ``X`` before transforming.

        Returns:
            The scaled array, same shape as ``X``.
        """
        flat = X.reshape(-1, X.shape[-1])
        if fit:
            scaled = self.X_scaler.fit_transform(flat)
        else:
            scaled = self.X_scaler.transform(flat)
        return scaled.reshape(X.shape)

    def prepare_train_data(self):
        """Encode the labels, split into train/test sets and standardize the features.

        The scaler is fitted on the training split only and then applied to
        the test split.
        """
        print('[prepare_train_data] start')
        self.y = np.array(self.df['incident_type'].to_list())
        self.y_encoder = LabelEncoder()
        self.y_encoded = self.y_encoder.fit_transform(self.y)

        self.X = self.df_encoded
        # NOTE(review): the split is row-wise, so rows sharing an 'incident_id'
        # can end up on both sides of the split — confirm this is intended.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y_encoded, test_size=0.3, random_state=42)

        self.X_scaler = StandardScaler()
        self.X_train_scaled = self._scale_sequences(self.X_train, fit=True)
        self.X_test_scaled = self._scale_sequences(self.X_test)
        print('[prepare_train_data] end')

    def create_model(self):
        """Build and compile the network: LSTM(100) -> Dense(64, relu) -> softmax.

        The input shape is taken from ``self.X`` (axis 1 = sequence length,
        axis 2 = embedding size) and the output width from the number of
        distinct labels in ``self.y``.
        """
        print('[create_model] start')
        seq_length = self.X.shape[1]
        embedding_dim = self.X.shape[2]
        num_classes = len(np.unique(self.y))

        event_input = layers.Input(shape=(seq_length, embedding_dim), name='event_input')
        lstm = layers.LSTM(units=100, dropout=0.3,
                           kernel_initializer='glorot_uniform')(event_input)
        x = layers.Dense(64, activation='relu')(lstm)
        output = layers.Dense(num_classes, activation='softmax')(x)

        self.model = keras.models.Model(inputs=[event_input], outputs=output)
        # NOTE(review): `decay` is a legacy optimizer argument; newer Keras
        # releases may ignore or reject it — confirm against the installed version.
        opt = keras.optimizers.Adam(learning_rate=0.001, decay=1e-6, clipvalue=1.0)
        # Labels are integer-encoded, hence the sparse loss.
        self.model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        print('[create_model] end')

    def visualize_model(self):
        """Return the Keras plot of ``self.model`` (top-to-bottom, with shapes and layer names)."""
        return keras.utils.plot_model(self.model, show_shapes=True, show_layer_names=True, rankdir='TB', expand_nested=True)

    def train_model(self, epochs=100, batch_size=36, validation_split=0.2, callbacks=None):
        """Fit ``self.model`` on the scaled training split.

        Args:
            epochs: number of passes over the training data.
            batch_size: samples per gradient update.
            validation_split: fraction of the training data held out for validation.
            callbacks: optional list of Keras callbacks forwarded to ``fit``
                (e.g. early stopping, so long runs need not be interrupted by hand).

        The returned Keras history object is stored in ``self.history``.
        """
        print('[train_model] start')
        self.history = self.model.fit({"event_input": self.X_train_scaled},
                                      self.y_train,
                                      batch_size=batch_size, verbose=1, epochs=epochs,
                                      validation_split=validation_split,
                                      callbacks=callbacks)
        print('[train_model] end')

    def evaluate(self):
        """Print and store classification reports at row level and at incident level.

        Predictions are made for every row of ``self.X`` (training and test
        rows alike). The incident-level prediction is the most frequent row
        prediction among the rows sharing an ``incident_id``; ties go to the
        label that appears first within the incident.

        Results are stored in ``self.clf_report_sub`` / ``self.clf_report_inc``
        and a ``'prediction'`` column is added to ``self.df``.
        """
        from collections import Counter

        print('[evaluate] start')
        self.X_scaled = self._scale_sequences(self.X)

        self.preds_full = self.model.predict(self.X_scaled)
        self.pred_labels_full = np.argmax(self.preds_full, axis=1)
        self.preds_og = self.y_encoder.inverse_transform(self.pred_labels_full)
        print('Evaluation on subsequence level')
        self.clf_report_sub = classification_report(self.y, self.preds_og, output_dict=True)
        print(classification_report(self.y, self.preds_og))

        self.df['prediction'] = self.preds_og
        if not hasattr(self, 'incident_type_dict'):
            # Nothing else populates this mapping, so derive the true label of
            # each incident from the columns already present in the dataset.
            self.incident_type_dict = dict(zip(self.df['incident_id'], self.df['incident_type']))

        self.incident_labels = []
        self.incident_preds = []
        # sort=False keeps incidents in order of first appearance.
        for incident_id, incident_rows in self.df.groupby('incident_id', sort=False)['prediction']:
            self.incident_labels.append(self.incident_type_dict[incident_id])
            self.incident_preds.append(Counter(incident_rows).most_common(1)[0][0])
        print('\nEvaluation on incident level')
        self.clf_report_inc = classification_report(self.incident_labels, self.incident_preds, output_dict=True)
        print(classification_report(self.incident_labels, self.incident_preds))
        print('[evaluate] end')

In [20]:
# Instantiate the classifier; the constructor also checks that the dataset CSV can be opened.
LC = LSTMClassifier()

[__init__] start
[__init__] end


In [21]:
# Read the dataset CSV and separate the sequence columns from the bookkeeping/label columns.
LC.load_data()

[load_data] start
[load_data] end


In [22]:
# Map every sequence step to its embedding vector; tokens missing from the
# vocabulary are printed as "Unknown" and replaced by the '[UNK]' vector.
LC.encode_seqs()

[encode_seqs] start
Unknown: 1006
Unknown: 1024
Unknown: 1006
Unknown: 1024
Unknown: 1006
Unknown: 1024
Unknown: 1006
Unknown: 1024
Unknown: 1006
Unknown: 1024
Unknown: 1006
Unknown: 1024
Unknown: 1006
Unknown: 1024
Unknown: 1006
Unknown: 1024
Unknown: 1006
Unknown: 1024
Unknown: 1006
Unknown: 1024
14229 150
[encode_seqs] end


In [23]:
# Encode labels, split 70/30 into train/test and standardize the features.
LC.prepare_train_data()

[prepare_train_data] start
[prepare_train_data] end


In [24]:
# Sanity check: shape of the scaled training array.
LC.X_train_scaled.shape

(9960, 150, 50)

In [25]:
# Peek at the first scaled training sequence.
LC.X_train_scaled[0]

array([[ 2.08686241e-01, -1.19115545e+00,  8.72303560e-01, ...,
        -7.48386693e-01, -9.00308626e-01,  4.93355294e-01],
       [ 3.77950902e-01, -6.52964170e-01, -5.50418020e-01, ...,
        -1.70296631e+00,  1.06599592e+00,  4.67337251e-01],
       [-1.21656672e+00, -2.58980429e+00,  9.47751983e-01, ...,
        -6.27827128e-02, -3.03827924e+00,  1.48399725e+00],
       ...,
       [ 1.07308073e-01,  1.82551723e-01, -4.59675259e-02, ...,
         1.65830665e-01,  5.15852831e-02, -1.51263848e-01],
       [ 6.73383381e-02,  1.95092985e-01, -5.60012973e-03, ...,
         1.85403156e-01, -2.77860468e-03, -2.60189670e-02],
       [ 3.09309289e-01, -1.44936013e-01, -6.08580108e-01, ...,
         3.92809827e-01,  1.11837496e+00,  4.36502864e-01]])

In [26]:
# Build and compile the LSTM network.
LC.create_model()

[create_model] start
[create_model] end




In [18]:
# NOTE(review): this cell's execution count (18) is lower than the cells before it and
# its run was interrupted; it duplicates the training cell that follows (In [27]).
# Consider removing it so the notebook survives Restart & Run All.
LC.train_model()

[train_model] start
Epoch 1/100
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 146ms/step - accuracy: 0.3060 - loss: 2.0431 - val_accuracy: 0.4127 - val_loss: 1.7097
Epoch 2/100
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 136ms/step - accuracy: 0.4095 - loss: 1.7233 - val_accuracy: 0.4362 - val_loss: 1.6395
Epoch 3/100
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 141ms/step - accuracy: 0.4306 - loss: 1.6652 - val_accuracy: 0.4513 - val_loss: 1.5824
Epoch 4/100
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 140ms/step - accuracy: 0.4345 - loss: 1.6343 - val_accuracy: 0.4829 - val_loss: 1.5568
Epoch 5/100
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 137ms/step - accuracy: 0.4495 - loss: 1.6130 - val_accuracy: 0.4869 - val_loss: 1.5316
Epoch 6/100
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 136ms/step - accuracy: 0.4723 - loss: 1.5476 - val_accuracy: 0.5271 - v

KeyboardInterrupt: 

In [27]:
# Train the model (train_model is configured for 100 epochs); the recorded run below
# was stopped manually with a KeyboardInterrupt.
LC.train_model()

[train_model] start
Epoch 1/100
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 139ms/step - accuracy: 0.2767 - loss: 2.1084 - val_accuracy: 0.3057 - val_loss: 1.9474
Epoch 2/100
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 136ms/step - accuracy: 0.3391 - loss: 1.8809 - val_accuracy: 0.3715 - val_loss: 1.7493
Epoch 3/100
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 130ms/step - accuracy: 0.3828 - loss: 1.7607 - val_accuracy: 0.3509 - val_loss: 1.7369
Epoch 4/100
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 130ms/step - accuracy: 0.3972 - loss: 1.6819 - val_accuracy: 0.4327 - val_loss: 1.6151
Epoch 5/100
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 134ms/step - accuracy: 0.4391 - loss: 1.6186 - val_accuracy: 0.4428 - val_loss: 1.5897
Epoch 6/100
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 137ms/step - accuracy: 0.4756 - loss: 1.5431 - val_accuracy: 0.4935 - v

KeyboardInterrupt: 