# Lab 7: Time-series Anomaly Detection



Pandas was updated to version 2.0 on 03/04/2023, which is not compatible with tsfel. Downgrade to 1.5.3.

In [1]:
# !pip install pandas==1.5.3
# !pip install tsfel

## Import and settings

In [2]:
# !pip install keras_tuner

In [3]:
import os

# TensorFlow picks up TF_ENABLE_ONEDNN_OPTS while it is being imported, so the
# flag has to be in the environment *before* `import tensorflow` runs below.
# '0' asks TensorFlow not to use the oneDNN optimised kernels.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

import time
import tsfel
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import VarianceThreshold

# Project-local helpers (star imports: the names used further down such as
# get_dataframes, get_collisions, seaborn_cm or Confidence come from here).
from plots import *
from dataset import *
from metrics import *
from models_funtions import *

# Set style for matplotlib
plt.style.use("Solarize_Light2")

In [4]:
# Root directories of the dataset, relative to the notebook.
# (Google Colab equivalents: '/content/drive/MyDrive/dataset/normal' and
#  '/content/drive/MyDrive/dataset/collisions'.)
ROOTDIR_DATASET_ANOMALY = "../dataset/collisions"
ROOTDIR_DATASET_NORMAL = "../dataset/normal"

# TF_ENABLE_ONEDNN_OPTS=0 tells TensorFlow not to use the oneDNN library.
# NOTE(review): TensorFlow appears to read this flag at import time, so it
# probably needs to be set before `import tensorflow` to have any effect - confirm.
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

#### Various parameters

In [5]:
# Sampling period tag of the recordings to load.
# Other values that were tried: '1.0', '0.01', '0.005'.
freq = '0.1'

# Recording indices: which ones are collision-free and which contain collisions.
recording_normal = [0, 2, 3, 4]
recording_collisions = [1, 5]

# File-name fragments passed to the dataset loader together with the indices above.
file_name_normal = "_20220811_rbtc_"
file_name_collisions = "_collision_20220811_rbtc_"

# Folders passed to the loader as `features_folder` for each data kind.
features_folder_normal = "../features/normal/"
features_folder_collisions = "../features/collisions/"

#### Data

In [6]:
# Normal recordings; the third return value is discarded.
df_features_normal, df_normal_raw, _, action2int_normal_raw = get_dataframes(
    ROOTDIR_DATASET_NORMAL,
    file_name_normal,
    recording_normal,
    freq,
    features_folder=f"{features_folder_normal}",
)

# Collision recordings 1 and 5 together.
(
    df_features_collisions,
    df_collisions_raw,
    df_collisions_raw_action,
    action2int_collisions_raw,
) = get_dataframes(
    ROOTDIR_DATASET_ANOMALY,
    file_name_collisions,
    recording_collisions,
    freq,
    features_folder=f"{features_folder_collisions}1_5/",
)

# Collision recording 1 on its own.
(
    df_features_collisions_1,
    df_collisions_raw_1,
    df_collisions_raw_action_1,
    action2int_collisions_raw_action_1,
) = get_dataframes(
    ROOTDIR_DATASET_ANOMALY,
    file_name_collisions,
    [1],
    freq,
    features_folder=f"{features_folder_collisions}1/",
)

# Collision recording 5 on its own.
(
    df_features_collisions_5,
    df_collisions_raw_5,
    df_collisions_raw_action_5,
    action2int_collisions_raw_action_5,
) = get_dataframes(
    ROOTDIR_DATASET_ANOMALY,
    file_name_collisions,
    [5],
    freq,
    features_folder=f"{features_folder_collisions}5/",
)

Loading data.
Found 31 different actions.
Loading data done.

Computing features.


In [None]:
# Build one train/test split per collision feature set (recordings 1+5, 1 only,
# 5 only); every split uses the same normal features with full_normal=True.
(
    (X_train, y_train, X_test, y_test, df_test),
    (X_train_1, y_train_1, X_test_1, y_test_1, df_test_1),
    (X_train_5, y_train_5, X_test_5, y_test_5, df_test_5),
) = [
    get_train_test_data(df_features_normal, collision_features, full_normal=True)
    for collision_features in (
        df_features_collisions,
        df_features_collisions_1,
        df_features_collisions_5,
    )
]

In [None]:
# Shape of the test split obtained with the combined collision features (recordings 1 and 5).
X_test.shape

In [None]:
# Shape of the test split obtained with the collision features of recording 1 only.
X_test_1.shape

In [None]:
# Shape of the test split obtained with the collision features of recording 5 only.
X_test_5.shape

### Collisions

In [None]:
# Load the collision information of recordings 1 and 5 (same helper, same root).
(collisions_rec1, collisions_init1), (collisions_rec5, collisions_init5) = [
    get_collisions(recording_id, ROOTDIR_DATASET_ANOMALY)
    for recording_id in ('1', '5')
]

# Stack the two recordings (recording 1 first) into one object per kind.
collisions_rec = pd.concat([collisions_rec1, collisions_rec5])
collisions_init = pd.concat([collisions_init1, collisions_init5])

In [None]:
# Collision zones and labels for: both recordings, recording 1, recording 5.
(
    (collisions_zones, y_collisions),
    (collisions_zones_1, y_collisions_1),
    (collisions_zones_5, y_collisions_5),
) = [
    get_collisions_zones_and_labels(rec_frame, init_frame, features_frame)
    for rec_frame, init_frame, features_frame in (
        (collisions_rec, collisions_init, df_features_collisions),
        (collisions_rec1, collisions_init1, df_features_collisions_1),
        (collisions_rec5, collisions_init5, df_features_collisions_5),
    )
]

### Random forest classifier

In [None]:
# Candidate hyper-parameter values. They are not consumed by the code below;
# the classifier is built from the single hand-picked combination that follows.
num_estims = [10, 100, 1000]
crits = ['gini', 'entropy', 'log_loss']
max_dept = [None, 50, 100, 1000]
min_s_splits = [2, 3]
max_features = ['sqrt', 'log2', None]

# Fixed seed: a random forest bootstraps samples and sub-samples features, so
# without it every Restart & Run All yields a different model and different
# downstream uncertainty / ROC figures.
RF_RANDOM_STATE = 42

classifier = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=1000,
    min_samples_split=2,
    max_features='sqrt',
    random_state=RF_RANDOM_STATE,
)

# Train the RandomForestClassifier on normal data
# NOTE(review): the model is fitted on X_train as-is while later cells predict
# on `X_test.values`; if X_train is a DataFrame, scikit-learn will warn about
# missing feature names at prediction time - confirm whether that is intended.
classifier.fit(X_train, y_train)
print("Random Forest training completed.")

In [None]:
# Per-class probability estimates of the fitted forest for every row of X_test.
y_test_predict = classifier.predict_proba(X_test.values)

In [None]:
# Column index of the highest probability for each test row.
# NOTE(review): this is an index into the probability columns; it equals the
# class label only if the classifier's classes are exactly 0..n-1 - confirm.
y_test_predict.argmax(axis=1)

Check the model performance

In [None]:
# Class ids in the order of the normal-data action mapping; they fix both the
# row/column order of the confusion matrix and the order of the tick labels.
class_ids = list(action2int_normal_raw.values())
int2action = {v: k for k, v in action2int_normal_raw.items()}

# The columns of predict_proba follow classifier.classes_, so the argmax column
# index must be translated back to the actual class label before it is compared
# with y_test (the two coincide only when classes_ is exactly 0..n-1).
y_test_predicted_labels = classifier.classes_[y_test_predict.argmax(axis=1)]

# Get confusion matrix
cm = confusion_matrix(y_test, y_test_predicted_labels, labels=class_ids)

fig, ax = plt.subplots(1, 1, figsize=(8, 8))
seaborn_cm(
    cm,
    ax,
    [int2action[class_id] for class_id in class_ids],
    fontsize=8,
    xrotation=90,
)
plt.tight_layout()

# **TODO** Compute uncertainty

In [None]:
# Number of repeated prediction passes that are stacked and averaged.
n_mc = 10

# NOTE(review): predict_proba of an already fitted forest looks deterministic,
# so the n_mc passes are presumably identical and the mean equals one pass - confirm.
prediction_passes = [classifier.predict_proba(X_test.values) for _ in range(n_mc)]
preds_array = np.array(prediction_passes)      # stacked along a new leading axis
preds_bayes_mean = preds_array.mean(axis=0)    # average over the passes

In [None]:
# Mean class probabilities over the repeated passes (one row per test sample).
mean_probs = preds_array.mean(axis=0)

# predict_proba columns follow classifier.classes_; map the argmax column index
# back to the class label so the comparison with y_test is label-vs-label
# (identical to the raw index only when classes_ is exactly 0..n-1).
predicted_labels = classifier.classes_[preds_bayes_mean.argmax(axis=1)]
correct_mask = predicted_labels == y_test
wrong_mask = predicted_labels != y_test

# Uncertainty metrics computed separately for correctly classified samples,
# misclassified samples, and the whole test set.
uncertainties_bayes = {
    "correct": Confidence(mean_probs[correct_mask, :]).compute_uncertainty_metrics(),
    "wrong": Confidence(mean_probs[wrong_mask, :]).compute_uncertainty_metrics(),
    "all": Confidence(mean_probs).compute_uncertainty_metrics(),
}

In [None]:
# Inspect the uncertainty metrics computed for the correctly classified samples.
uncertainties_bayes["correct"]

In [None]:
# One row of histograms per uncertainty measure; columns are
# misclassified / correctly classified / all samples.
measures = list(uncertainties_bayes['correct'].keys())
hist_style = dict(log=False, bins=25, edgecolor='black', linewidth=1.2, alpha=0.5)

# squeeze=False keeps `axes` two-dimensional even with a single measure, so the
# row-wise iteration below always yields three axes per measure.
fig, axes = plt.subplots(len(measures), 3, figsize=(15, 9), squeeze=False)
for (ax_wrong, ax_correct, ax_all), measure in zip(axes, measures):
    ax_wrong.set_title(f"Wrong - {measure}")
    ax_wrong.hist(uncertainties_bayes['wrong'][measure], color="red", **hist_style)
    ax_correct.set_title(f"Correct - {measure}")
    ax_correct.hist(uncertainties_bayes['correct'][measure], color="green", **hist_style)
    ax_all.set_title(f"All - {measure}")
    ax_all.hist(uncertainties_bayes['all'][measure], color="blue", **hist_style)

# The model evaluated here is the RandomForestClassifier trained above, so the
# figure is titled accordingly (the former "Bayes MLP" title was wrong).
fig.suptitle("Random forest", fontsize=20)
plt.tight_layout()

# Load collisions and extract features

In [None]:
# Read the collision timestamp sheet that sits next to the collision recordings.
collisions_xlsx = os.path.join(ROOTDIR_DATASET_ANOMALY, "20220811_collisions_timestamp.xlsx")
collisions = pd.read_excel(collisions_xlsx)

# Keep the rows flagged "i" in the 'Inizio/fine' column and shift their
# timestamps back by two hours.
# NOTE(review): "i" presumably marks the start ("inizio") of a collision and the
# 2 h shift presumably aligns time zones with the recordings - confirm.
# This rebinds `collisions_init`, which an earlier cell built with pd.concat.
collision_start_times = collisions[collisions['Inizio/fine'] == "i"].Timestamp
collisions_init = collision_start_times - pd.to_timedelta([2] * len(collision_start_times), 'h')

In [None]:
# CSV and metadata paths of collision recordings 1 and 5 (0.1 s files).
filepath_csv, filepath_meta = [], []
for r in [1, 5]:
    filepath_csv.append(os.path.join(ROOTDIR_DATASET_ANOMALY, f"rec{r}_collision_20220811_rbtc_0.1s.csv"))
    filepath_meta.append(os.path.join(ROOTDIR_DATASET_ANOMALY, f"rec{r}_collision_20220811_rbtc_0.1s.metadata"))

# Load both recordings together through the project helper.
df_action, df, df_meta, action2int = get_df_action(filepath_csv, filepath_meta)

In [None]:
# CSV and metadata paths of collision recording 1 only (0.1 s files).
filepath_csv_1, filepath_meta_1 = [], []
for r in [1]:
    filepath_csv_1.append(os.path.join(ROOTDIR_DATASET_ANOMALY, f"rec{r}_collision_20220811_rbtc_0.1s.csv"))
    filepath_meta_1.append(os.path.join(ROOTDIR_DATASET_ANOMALY, f"rec{r}_collision_20220811_rbtc_0.1s.metadata"))

# Load recording 1 through the project helper.
df_action_1, df_1, df_meta_1, action2int_1 = get_df_action(filepath_csv_1, filepath_meta_1)

In [None]:
# CSV and metadata paths of collision recording 5 only (0.1 s files).
filepath_csv_5, filepath_meta_5 = [], []
for r in [5]:
    filepath_csv_5.append(os.path.join(ROOTDIR_DATASET_ANOMALY, f"rec{r}_collision_20220811_rbtc_0.1s.csv"))
    filepath_meta_5.append(os.path.join(ROOTDIR_DATASET_ANOMALY, f"rec{r}_collision_20220811_rbtc_0.1s.metadata"))

# Load recording 5 through the project helper.
df_action_5, df_5, df_meta_5, action2int_5 = get_df_action(filepath_csv_5, filepath_meta_5)

In [None]:
# Time the extraction of the "statistical" feature set for the three collision
# selections (recordings 1+5, recording 1, recording 5), in that order.
start_time = time.time()
df_features_collision, df_features_collision_1, df_features_collision_5 = [
    get_features_ts("statistical", actions, meta, 10, mapping, None)
    for actions, meta, mapping in (
        (df_action, df_meta, action2int),
        (df_action_1, df_meta_1, action2int_1),
        (df_action_5, df_meta_5, action2int_5),
    )
]
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

In [None]:
# Shape of the feature table extracted for collision recording 1.
df_features_collision_1.shape

Preprocessing

In [None]:
# df_features_collision.isnull().values.any()

In [None]:
# Replace missing feature values with 0 in each collision feature table;
# fillna returns new frames, the originals are left untouched.
(
    df_features_collision_nonan,
    df_features_collision_nonan_1,
    df_features_collision_nonan_5,
) = [
    features.fillna(0)
    for features in (
        df_features_collision,
        df_features_collision_1,
        df_features_collision_5,
    )
]

In [None]:
# Shape of the recording-1 feature table after missing values were filled with 0.
df_features_collision_nonan_1.shape

In [None]:
# X_collision = df_features_collision_nonan.drop(["label", "start", "end"], axis=1)
# y_collision = df_features_collision_nonan["label"]
# X_collision.shape

In [None]:
# X_collision_1 = df_features_collision_nonan_1.drop(["label", "start", "end"], axis=1)
# y_collision_1 = df_features_collision_nonan_1["label"]

In [None]:
# X_collision_5 = df_features_collision_nonan_5.drop(["label", "start", "end"], axis=1)
# y_collision_5 = df_features_collision_nonan_5["label"]
# X_collision_5.shape

# **TODO** Compute uncertainty

In [None]:
# Repeat the forest's probability prediction n_mc times on the combined
# collision test split, stack the passes and average them.
collision_passes = [classifier.predict_proba(X_test.values) for _ in range(n_mc)]
preds_array_collisions = np.array(collision_passes)
preds_bayes_mean_collision = preds_array_collisions.mean(axis=0)

In [None]:
# Same repeated prediction as for the combined split, on the recording-1 test split.
collision_passes_1 = [classifier.predict_proba(X_test_1.values) for _ in range(n_mc)]
preds_array_collisions_1 = np.array(collision_passes_1)
preds_bayes_mean_collision_1 = preds_array_collisions_1.mean(axis=0)

In [None]:
# Same repeated prediction as for the combined split, on the recording-5 test split.
collision_passes_5 = [classifier.predict_proba(X_test_5.values) for _ in range(n_mc)]
preds_array_collisions_5 = np.array(collision_passes_5)
preds_bayes_mean_collision_5 = preds_array_collisions_5.mean(axis=0)

In [None]:
# Uncertainty metrics on the combined collision split, grouped by whether the
# argmax of the mean probabilities matches y_collisions.
# This rebinds `uncertainties_bayes`, replacing the test-set results computed earlier.
mean_probs_collisions = preds_array_collisions.mean(axis=0)
argmax_collisions = preds_bayes_mean_collision.argmax(axis=1)
uncertainties_bayes = {
    "correct": Confidence(mean_probs_collisions[(argmax_collisions == y_collisions), :]).compute_uncertainty_metrics(),
    "wrong": Confidence(mean_probs_collisions[(argmax_collisions != y_collisions), :]).compute_uncertainty_metrics(),
    "all": Confidence(mean_probs_collisions).compute_uncertainty_metrics(),
}

In [None]:
# Uncertainty metrics on the recording-1 split, grouped by whether the argmax
# of the mean probabilities matches y_collisions_1.
mean_probs_collisions_1 = preds_array_collisions_1.mean(axis=0)
argmax_collisions_1 = preds_bayes_mean_collision_1.argmax(axis=1)
uncertainties_bayes_1 = {
    "correct": Confidence(mean_probs_collisions_1[(argmax_collisions_1 == y_collisions_1), :]).compute_uncertainty_metrics(),
    "wrong": Confidence(mean_probs_collisions_1[(argmax_collisions_1 != y_collisions_1), :]).compute_uncertainty_metrics(),
    "all": Confidence(mean_probs_collisions_1).compute_uncertainty_metrics(),
}

In [None]:
# Uncertainty metrics on the recording-5 split, grouped by whether the argmax
# of the mean probabilities matches y_collisions_5.
mean_probs_collisions_5 = preds_array_collisions_5.mean(axis=0)
argmax_collisions_5 = preds_bayes_mean_collision_5.argmax(axis=1)
uncertainties_bayes_5 = {
    "correct": Confidence(mean_probs_collisions_5[(argmax_collisions_5 == y_collisions_5), :]).compute_uncertainty_metrics(),
    "wrong": Confidence(mean_probs_collisions_5[(argmax_collisions_5 != y_collisions_5), :]).compute_uncertainty_metrics(),
    "all": Confidence(mean_probs_collisions_5).compute_uncertainty_metrics(),
}

In [None]:
# Histogram grid for the combined collision split: one row per uncertainty
# measure, columns = wrong / correct / all samples.
measures = uncertainties_bayes['correct'].keys()
hist_style = dict(log=False, bins=25, edgecolor='black', linewidth=1.2, alpha=0.5)

fig, axes = plt.subplots(len(measures), 3, figsize=(15, 9))
for (ax_wrong, ax_correct, ax_all), measure in zip(axes, measures):
    ax_wrong.set_title(f"Wrong - {measure}")
    ax_wrong.hist(uncertainties_bayes['wrong'][measure], color="red", **hist_style)
    ax_correct.set_title(f"Correct - {measure}")
    ax_correct.hist(uncertainties_bayes['correct'][measure], color="green", **hist_style)
    ax_all.set_title(f"All - {measure}")
    ax_all.hist(uncertainties_bayes['all'][measure], color="blue", **hist_style)
fig.suptitle("Random forest", fontsize=20)
plt.tight_layout()

In [None]:
# Histogram grid for the recording-1 split: one row per uncertainty measure,
# columns = wrong / correct / all samples.
measures_1 = uncertainties_bayes_1['correct'].keys()
hist_style = dict(log=False, bins=25, edgecolor='black', linewidth=1.2, alpha=0.5)

fig, axes = plt.subplots(len(measures_1), 3, figsize=(15, 9))
for (ax_wrong, ax_correct, ax_all), measure in zip(axes, measures_1):
    ax_wrong.set_title(f"Wrong - {measure}")
    ax_wrong.hist(uncertainties_bayes_1['wrong'][measure], color="red", **hist_style)
    ax_correct.set_title(f"Correct - {measure}")
    ax_correct.hist(uncertainties_bayes_1['correct'][measure], color="green", **hist_style)
    ax_all.set_title(f"All - {measure}")
    ax_all.hist(uncertainties_bayes_1['all'][measure], color="blue", **hist_style)
fig.suptitle("Random forest", fontsize=20)
plt.tight_layout()

In [None]:
# Histogram grid for the recording-5 split: one row per uncertainty measure,
# columns = wrong / correct / all samples.
measures_5 = uncertainties_bayes_5['correct'].keys()
hist_style = dict(log=False, bins=25, edgecolor='black', linewidth=1.2, alpha=0.5)

fig, axes = plt.subplots(len(measures_5), 3, figsize=(15, 9))
for (ax_wrong, ax_correct, ax_all), measure in zip(axes, measures_5):
    ax_wrong.set_title(f"Wrong - {measure}")
    ax_wrong.hist(uncertainties_bayes_5['wrong'][measure], color="red", **hist_style)
    ax_correct.set_title(f"Correct - {measure}")
    ax_correct.hist(uncertainties_bayes_5['correct'][measure], color="green", **hist_style)
    ax_all.set_title(f"All - {measure}")
    ax_all.hist(uncertainties_bayes_5['all'][measure], color="blue", **hist_style)
fig.suptitle("Random forest", fontsize=20)
plt.tight_layout()

# Assess TAD algorithm performance via ROC curves

In [None]:
# ROC points per confidence metric for the combined collision recordings.
# A feature window is flagged as anomalous when its confidence is <= threshold;
# a flagged window counts as a true positive for every collision start time
# that lies inside its [start, end] interval.
roc_dict = {}
for confidence_metric in uncertainties_bayes['correct']:
    confidence = uncertainties_bayes['all'][confidence_metric]
    if confidence_metric == "entropy":
        # Flip entropy so that, like the other metrics, low values get flagged.
        confidence = 1 - confidence

    sens, fpr = [], []
    for threshold in np.arange(0, 1, 0.1):
        df_not_confident = df_features_collision_nonan[confidence <= threshold]

        # Index of every flagged window that contains a collision start.
        anomaly_indexes = [
            index
            for anomaly in collisions_init
            for index, row in df_not_confident.iterrows()
            if anomaly >= row['start'] and anomaly <= row['end']
        ]
        tp = len(anomaly_indexes)

        n_samples = len(df_features_collision_nonan)
        n_not_collisions = n_samples - len(collisions_init)
        n_detected = len(df_not_confident)

        fp = n_detected - tp
        fn = len(collisions_init) - tp
        tn = n_not_collisions - fp

        # 2x2 confusion matrix laid out as [[tn, fp], [fn, tp]].
        cm_anomaly = np.array([[tn, fp], [fn, tp]], dtype=float)

        sens.append(tp / (tp + fn))
        fpr.append(1 - tn / (fp + tn))
    roc_dict[confidence_metric] = (fpr, sens)

In [None]:
# ROC curves (one per confidence metric) for collision recordings 1 and 5.
# roc_dict maps each metric to (false positive rates, sensitivities).
fig, ax = plt.subplots(1, 1)
for confidence_metric in uncertainties_bayes['correct'].keys():
    fpr_values, sens_values = roc_dict[confidence_metric]
    ax.plot(fpr_values, sens_values, label=confidence_metric)
# Label the figure so it stands on its own and can be told apart from the
# per-recording ROC plots.
ax.set_xlabel("False positive rate")
ax.set_ylabel("Sensitivity")
ax.set_title("ROC - collision recordings 1 and 5")
ax.legend();

In [None]:
# ROC points per confidence metric for collision recording 1.
#
# `collisions_init` holds the collision start times of *all* recordings. Using it
# unfiltered would count the collisions of the other recording as false
# negatives and cap the sensitivity below 1, so only the start times inside the
# time span covered by recording 1's feature windows are kept.
rec1_first_start = df_features_collision_nonan_1['start'].min()
rec1_last_end = df_features_collision_nonan_1['end'].max()
collisions_init_1 = collisions_init[
    (collisions_init >= rec1_first_start) & (collisions_init <= rec1_last_end)
]
if len(collisions_init_1) == 0:
    raise ValueError("No collision start timestamp falls inside the time span of recording 1.")

roc_dict = dict()
for confidence_metric in uncertainties_bayes_1['correct'].keys():
    confidence = uncertainties_bayes_1['all'][confidence_metric]
    if confidence_metric == "entropy":
        # Flip entropy so that, like the other metrics, low values get flagged.
        confidence = 1 - confidence
    sens = list()
    fpr = list()
    for threshold in np.arange(0, 1, 0.1):
        # Windows whose confidence is at or below the threshold are flagged.
        df_not_confident = df_features_collision_nonan_1[confidence <= threshold]
        anomaly_indexes = list()
        tp = 0
        # A flagged window is a true positive for each collision start it contains.
        for anomaly in collisions_init_1:
            for index, row in df_not_confident.iterrows():
                if anomaly >= row['start'] and anomaly <= row['end']:
                    anomaly_indexes.append(index)
                    tp += 1

        cm_anomaly = np.zeros((2, 2))
        n_samples = len(df_features_collision_nonan_1)
        n_not_collisions = n_samples - len(collisions_init_1)
        n_detected = len(df_not_confident)

        fp = n_detected - tp
        fn = len(collisions_init_1) - tp
        tn = n_not_collisions - fp
        # 2x2 confusion matrix laid out as [[tn, fp], [fn, tp]].
        cm_anomaly[0][0] = tn
        cm_anomaly[1][1] = tp
        cm_anomaly[0][1] = fp
        cm_anomaly[1][0] = fn
        sens.append(tp / (tp + fn))
        fpr.append(1 - tn / (fp + tn))
    roc_dict[confidence_metric] = (fpr, sens)

In [None]:
# Debug check: shape of the `confidence` vector left over from the last
# iteration of the ROC loop in the previous cell.
confidence.shape

In [None]:
# Debug check: shape of the recording-1 feature table that `confidence` is used
# to mask in the ROC loop (the boolean mask must line up with its rows).
df_features_collision_nonan_1.shape

In [None]:
# ROC curves (one per confidence metric) for collision recording 1.
# roc_dict maps each metric to (false positive rates, sensitivities).
fig, ax = plt.subplots(1, 1)
for confidence_metric in uncertainties_bayes_1['correct'].keys():
    fpr_values, sens_values = roc_dict[confidence_metric]
    ax.plot(fpr_values, sens_values, label=confidence_metric)
# Label the figure so it stands on its own and can be told apart from the
# other ROC plots.
ax.set_xlabel("False positive rate")
ax.set_ylabel("Sensitivity")
ax.set_title("ROC - collision recording 1")
ax.legend();

In [None]:
# ROC points per confidence metric for collision recording 5.
#
# `collisions_init` holds the collision start times of *all* recordings. Using it
# unfiltered would count the collisions of the other recording as false
# negatives and cap the sensitivity below 1, so only the start times inside the
# time span covered by recording 5's feature windows are kept.
rec5_first_start = df_features_collision_nonan_5['start'].min()
rec5_last_end = df_features_collision_nonan_5['end'].max()
collisions_init_5 = collisions_init[
    (collisions_init >= rec5_first_start) & (collisions_init <= rec5_last_end)
]
if len(collisions_init_5) == 0:
    raise ValueError("No collision start timestamp falls inside the time span of recording 5.")

roc_dict = dict()
for confidence_metric in uncertainties_bayes_5['correct'].keys():
    confidence = uncertainties_bayes_5['all'][confidence_metric]
    if confidence_metric == "entropy":
        # Flip entropy so that, like the other metrics, low values get flagged.
        confidence = 1 - confidence
    sens = list()
    fpr = list()
    for threshold in np.arange(0, 1, 0.1):
        # Windows whose confidence is at or below the threshold are flagged.
        df_not_confident = df_features_collision_nonan_5[confidence <= threshold]
        anomaly_indexes = list()
        tp = 0
        # A flagged window is a true positive for each collision start it contains.
        for anomaly in collisions_init_5:
            for index, row in df_not_confident.iterrows():
                if anomaly >= row['start'] and anomaly <= row['end']:
                    anomaly_indexes.append(index)
                    tp += 1

        cm_anomaly = np.zeros((2, 2))
        n_samples = len(df_features_collision_nonan_5)
        n_not_collisions = n_samples - len(collisions_init_5)
        n_detected = len(df_not_confident)

        fp = n_detected - tp
        fn = len(collisions_init_5) - tp
        tn = n_not_collisions - fp
        # 2x2 confusion matrix laid out as [[tn, fp], [fn, tp]].
        cm_anomaly[0][0] = tn
        cm_anomaly[1][1] = tp
        cm_anomaly[0][1] = fp
        cm_anomaly[1][0] = fn
        sens.append(tp / (tp + fn))
        fpr.append(1 - tn / (fp + tn))
    roc_dict[confidence_metric] = (fpr, sens)

In [None]:
# ROC curves (one per confidence metric) for collision recording 5.
# roc_dict maps each metric to (false positive rates, sensitivities).
fig, ax = plt.subplots(1, 1)
for confidence_metric in uncertainties_bayes_5['correct'].keys():
    fpr_values, sens_values = roc_dict[confidence_metric]
    ax.plot(fpr_values, sens_values, label=confidence_metric)
# Label the figure so it stands on its own and can be told apart from the
# other ROC plots.
ax.set_xlabel("False positive rate")
ax.set_ylabel("Sensitivity")
ax.set_title("ROC - collision recording 5")
ax.legend();