# Testing

##### Libraries

In [None]:
!pip install pandas==1.5.3
!pip install tsfel
!pip3 install --upgrade --no-cache-dir gdown       # support for download a large file from Google Drive
!pip install numpy>=1.19.5
!pip install scikit-learn>=0.24.1
!pip install tadpak

##### Download dataset

In [None]:
# Mount Google Drive into the Colab VM; the dataset archive is unzipped from it in the next cell
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Extract the dataset archive from the mounted Drive into /content/
# (presumably this creates /content/csv_20220811, the folder read by the following cells — verify)
!unzip /content/drive/MyDrive/Colab_MLA/MLA_Project/csv_20220811.zip -d /content/

In [None]:
import os, sys
import time
import tsfel
import warnings
import datetime
import numpy as np
import pandas as pd
import pickle
import matplotlib.cm as cm
import logging
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
from sklearn import metrics
from google.colab import files
from sklearn.metrics import roc_auc_score, average_precision_score, top_k_accuracy_score, f1_score, roc_curve, auc, precision_recall_curve
from matplotlib.gridspec import GridSpec

In [None]:
# Folder from which the recordings, their metadata and the collision timestamps are read
ROOTDIR_DATASET_NORMAL = "/content/csv_20220811"
plt.style.use("Solarize_Light2") # Set style for matplotlib

In [None]:
def get_metadata(filepaths_csv, filepaths_meta, action2int=None, delimiter=";"):
  """Load recordings and their action metadata, and label every sample with an action.

  Args:
    filepaths_csv: paths of the recording CSV files; they are always read with ';' as separator.
    filepaths_meta: paths of the metadata files; they are read with `delimiter`.
    action2int: optional mapping action name -> integer. When None it is built here with
      ids starting at 1. The entry 'idle' -> 0 is always written into it, so a dict
      passed by the caller is modified in place.
    delimiter: column separator used for the metadata files only.

  Returns:
    Tuple (df_action, df, df_meta, action2int):
      df_action: samples falling inside an action interval (with 'action' and 'duration'
        columns) followed by the remaining samples labelled 'idle'.
      df: the concatenated recordings, time-indexed, without the dropped columns and NaN rows.
      df_meta: the concatenated metadata, indexed by init_timestamp.
      action2int: the action -> integer mapping, including 'idle' -> 0.
  """
  dfs_meta = list()
  for filepath in filepaths_meta:                                   # one metadata file per recording
        df_m = pd.read_csv(filepath, sep=delimiter)                 # read the metadata file
        df_m.str_repr = df_m.str_repr.str.replace('True', 'true')   # normalise 'True' to 'true' inside the action string
        df_m['filepath'] = filepath                                 # remember which file each row came from
        dfs_meta.append(df_m)                                       # collect the per-file dataframe
  df_meta = pd.concat(dfs_meta)                                     # concatenate all metadata dataframes

  df_meta.index = pd.to_datetime(df_meta.init_timestamp.astype('datetime64[ms]'), format="%Y-%m-%dT%H:%M:%S.%f")                        # index the metadata by the action start time
  df_meta['completed_timestamp'] = pd.to_datetime(df_meta.completed_timestamp.astype('datetime64[ms]'), format="%Y-%m-%dT%H:%M:%S.%f")  # convert completed_timestamp to datetime
  df_meta['init_timestamp'] = pd.to_datetime(df_meta.init_timestamp.astype('datetime64[ms]'), format="%Y-%m-%dT%H:%M:%S.%f")            # convert init_timestamp to datetime

  actions = df_meta.str_repr.unique()                                             # distinct action strings across all concatenated metadata files

  dfs = [pd.read_csv(filepath_csv, sep=";") for filepath_csv in filepaths_csv]    # read the recordings (separator is hard-coded, `delimiter` is not used here)
  df = pd.concat(dfs)                                                             # concatenate the recordings
  df = df.sort_index(axis=1)                                                      # sort columns by name

  df.index = pd.to_datetime(df.time.astype('datetime64[ms]'), format="%Y-%m-%dT%H:%M:%S.%f")        # set timestamp as index
  columns_to_drop = [column for column in df.columns if "Abb" in column or "Temperature" in column] # columns whose name contains "Abb" or "Temperature" are discarded
  df.drop(["machine_nameKuka Robot_export_active_energy", "machine_nameKuka Robot_import_reactive_energy"] + columns_to_drop, axis=1, inplace=True)

  df_action = list()        # one dataframe per executed action occurrence
  for action in actions:    # loop over the distinct actions (the author noted 30)
      for index, row in df_meta[df_meta.str_repr == action].iterrows(): # every occurrence of this action in the metadata
          start = row['init_timestamp']         # start of the occurrence
          end = row['completed_timestamp']      # end of the occurrence
          df_tmp = df.loc[start: end].copy()    # samples recorded between start and end (label-based slice, both bounds included)
          df_tmp['action'] = action             # label the samples with the action
          df_tmp['duration'] = str((row['completed_timestamp'] - row['init_timestamp']).total_seconds())    # duration of the occurrence in seconds, stored as a string (the author notes it is not a feature)
          df_action.append(df_tmp)
  df_action = pd.concat(df_action, ignore_index=True)     # concatenate the actions
  df_action.index = pd.to_datetime(df_action.time.astype('datetime64[ms]'), format="%Y-%m-%dT%H:%M:%S.%f")   # set the time as index
  df_action = df_action[~df_action.index.duplicated(keep='first')]     # drop rows with a repeated timestamp, keeping only the first occurrence

  df = df.dropna(axis=0)                  # remove rows containing NaNs; shape noted by the author: (34275, 56)
  df_action = df_action.dropna(axis=0)    # shape noted by the author: (33063, 58)

  if action2int is None:        # build the action -> integer mapping, ids starting at 1 in order of appearance
      action2int = dict()
      j = 1
      for label in df_action.action.unique():
          action2int[label] = j
          j += 1

  df_merged = df.merge(df_action[['action']], left_index=True, right_index=True, how="left")  # left join on the time index; shape noted by the author: (34275, 57)
  df_idle = df_merged[df_merged['action'].isna()].copy()    # samples that belong to no action; shape noted by the author: (1212, 57)
  df_idle['action'] = 'idle'
  df_idle['duration'] = df_action.duration.values.astype(float).mean().astype(str)    # idle samples get the mean action duration, as a string
  df_action = pd.concat([df_action, df_idle]) # idle rows are appended after the action rows; shape noted by the author: (34275, 58)
  action2int['idle'] = 0      # written even when the caller supplied the mapping
  return df_action, df, df_meta, action2int

##### Get dataset

In [None]:
frequency = 1    # sampling frequency in Hz; author's note: 1 10 100 200 Hz - life {1: 10, 10: 1, 100: 0.1, 200: 0.05}
# NOTE(review): the score files loaded further down are named "*_f10*" and the Precision@K cell
# mentions 10 Hz — confirm that frequency = 1 is the intended value here.

# File names embed the sampling period 1/frequency in seconds (formatted as a float, e.g. "1.0s" for frequency = 1)
filepath_csv_test = [os.path.join(ROOTDIR_DATASET_NORMAL, f"rec{r}_collision_20220811_rbtc_{1/frequency}s.csv") for r in [1, 5]]        # recordings with collisions (recordings 1 and 5)
filepath_meta_test = [os.path.join(ROOTDIR_DATASET_NORMAL, f"rec{r}_collision_20220811_rbtc_{1/frequency}s.metadata") for r in[1, 5]]

filepath_csv_train = [os.path.join(ROOTDIR_DATASET_NORMAL, f"rec{r}_20220811_rbtc_{1/frequency}s.csv") for r in [0, 2, 3, 4]]           # recordings without collisions (recordings 0, 2, 3, 4)
filepath_meta_train = [os.path.join(ROOTDIR_DATASET_NORMAL, f"rec{r}_20220811_rbtc_{1/frequency}s.metadata") for r in [0, 2, 3, 4]]

df_action_train, df_train, df_meta_train, action2int_train = get_metadata(filepath_csv_train, filepath_meta_train)    # load the training recordings together with their metadata
df_action_test, df_test, df_meta_test, action2int_test = get_metadata(filepath_csv_test, filepath_meta_test)          # load the collision recordings together with their metadata

# Convert the 'time' column of the test frame to datetime so it can be compared with the collision timestamps
df_test['time'] = pd.to_datetime(df_test.time.astype('datetime64[ms]'), format ="%Y-%m-%dT%H:%M:%S.%f")

X_train = df_train.drop(['time'], axis=1, inplace=False)     # feature frame: every column except 'time'
X_collisions = df_test.drop(['time'], axis=1, inplace=False)

##### Get collisions

In [None]:
# Read the collision log: each row has a 'Timestamp' and an 'Inizio/fine' flag ("i" = start, "f" = end)
timestamps_collisions = pd.read_excel(os.path.join(ROOTDIR_DATASET_NORMAL, "20220811_collisions_timestamp.xlsx"))
timestamps_collisions['Timestamp'] = timestamps_collisions['Timestamp'] - pd.to_timedelta(2, 'h')
# due to a time discrepancy, the collision timestamps are shifted two hours earlier
start_col = timestamps_collisions[timestamps_collisions['Inizio/fine'] == "i"][['Timestamp']].rename(columns={'Timestamp': 'start'}) # rows flagged as start (the author notes they sit at even indexes)
end_col = timestamps_collisions[timestamps_collisions['Inizio/fine'] == "f"][['Timestamp']].rename(columns={'Timestamp': 'end'})     # rows flagged as end (the author notes they sit at odd indexes)

start_col.reset_index(drop=True, inplace=True)  # renumber from 0 so that start i and end i line up
end_col.reset_index(drop=True, inplace=True)

df_collision = pd.concat([start_col, end_col], axis=1)  # side-by-side concat: one row per collision with columns (start, end); the author noted 51 — presumably rows, TODO confirm

### Plotting functions

In [None]:
# Plot a fixed window of an anomaly-score series and save it as a JPEG
def plotting_anomaly(scores, title, samples=2500):
  """Plot scores[315:samples] against points numbered from 1000, save and show the figure.

  Args:
    scores: sliceable sequence of anomaly scores.
    title: figure title; also used as the file name under /content/.
    samples: exclusive end of the plotted window.
  """
  window = scores[315:samples]
  first_point = 1000
  points = np.arange(first_point, first_point + len(window))

  plt.plot(points, window, label='Anomaly Scores', color='salmon')
  plt.title(title)
  plt.ylabel('Score')
  plt.xlabel('Point')

  plt.savefig(f'/content/{title}.jpg')
  plt.show()

# plot distribution and return the true_labels
def plot_hist(anomaly_scores, df_collision, df, title):
    """Plot the score histograms of normal vs. collision samples and return the labels.

    A row of `df` is labelled as an anomaly when its 'time' lies inside at least one
    [start, end] interval of `df_collision` (both bounds included). Labels are assigned
    by row position, so row i of `df` must correspond to anomaly_scores[i].

    Args:
        anomaly_scores: numpy array of scores.
        df_collision: frame with 'start' and 'end' timestamp columns, one row per collision.
        df: frame with a 'time' column.
        title: figure title; also used as the file name under /content/.

    Returns:
        Array shaped and typed like `anomaly_scores`, 1 for collision samples and 0 otherwise.
    """
    # One vectorised comparison per collision interval instead of a Python loop over
    # every (row, collision) pair.
    times = df['time']
    in_collision = np.zeros(len(df), dtype=bool)
    for start, end in zip(df_collision['start'], df_collision['end']):
        in_collision |= ((times >= start) & (times <= end)).to_numpy()
    # Positional indices; each sample appears once even when intervals overlap
    index_anomaly = np.flatnonzero(in_collision)

    true_labels = np.zeros_like(anomaly_scores)
    true_labels[index_anomaly] = 1
    logging.info(f"Anomalies detected: {int(true_labels.sum())}")
    anomaly_values = anomaly_scores[index_anomaly]
    normal_values = np.delete(anomaly_scores, index_anomaly)

    plt.hist(normal_values, bins=30, color="tab:blue", ec="dodgerblue", alpha=0.5, label='Normal')
    plt.hist(anomaly_values, bins=30, color='tab:red', ec="darkred", alpha=0.7, label='Anomalies')

    plt.xlabel('Values')
    plt.ylabel('Occurrencies')
    plt.legend(loc='upper right')
    plt.title(title)
    plt.savefig(f'/content/{title}.jpg')  # Modify the path and filename as needed
    plt.show()
    return true_labels

# Split scores and test frame into a leading test part and a trailing validation part
def dataset_div(X_collisions, anomaly_scores_norm, df_test):
  """Cut the scores and the test frame at 90% of len(X_collisions).

  The first 90% is the test ("col") part and the remainder the validation part.
  Each frame part is then trimmed to its last rows so that it has as many rows
  as the matching score part.

  Returns:
    Tuple (df_val, df_col, asn_val, asn_col).
  """
  test_fraction = 0.9
  cut = int(len(X_collisions) * test_fraction)

  # scores: head is the test part, tail is the validation part
  asn_col, asn_val = anomaly_scores_norm[:cut], anomaly_scores_norm[cut:]
  n_col, n_val = asn_col.shape[0], asn_val.shape[0]

  # same cut on the frame, then keep only the trailing rows matching the scores
  df_col = df_test.iloc[:cut][-n_col:]
  df_val = df_test.iloc[cut:][-n_val:]

  return df_val, df_col, asn_val, asn_col

In [None]:
# plotting a prediction segment
def plot_prediction(df_test, y_true, anomaly_scores_norm, X_collisions, threshold, title, samples=1815):
    """Plot a signal segment with predicted/true anomalies shaded, above its anomaly scores.

    All inputs are restricted to the window [315:samples]. The top panel shows the
    'machine_nameKuka Robot_apparent_power' column of `X_collisions`, shaded red where the
    score exceeds `threshold` and blue where `y_true` is set. The bottom panel shows the
    scores and the threshold line. The figure is saved to /content/<title>.jpg and shown.

    Args:
        df_test: frame with a 'time' column, row-aligned with the other inputs.
        y_true: ground-truth labels (non-zero = anomaly).
        anomaly_scores_norm: anomaly scores.
        X_collisions: feature frame containing the plotted column.
        threshold: score above which a sample is predicted anomalous.
        title: figure title; also used as the file name.
        samples: exclusive end of the plotted window.
    """
    # Restrict every input to the same window
    df_test = df_test[315:samples]
    X_collisions = X_collisions[315:samples]
    y_true = y_true[315:samples]
    anomaly_scores_norm = anomaly_scores_norm[315:samples]

    # Signal of the top panel. A 1-component PCA used to be fitted here, but its output was
    # overwritten by this column on the very next statement, so the fit was dead work
    # (and an extra failure point on non-numeric or NaN columns). It has been removed.
    signal = X_collisions['machine_nameKuka Robot_apparent_power']
    timestamps = df_test['time']

    fig = plt.figure(figsize=(10, 6))
    gs = GridSpec(2, 1, height_ratios=[2, 1])

    # Top panel: the time series
    ax1 = fig.add_subplot(gs[0])
    ax1.plot(timestamps, signal, label='Serie Temporale', color='darkgreen', zorder=1)

    ax1.set_ylabel('')
    ax1.yaxis.set_visible(False)

    # Hide the frame and the tick marks of the top panel
    for side in ('top', 'right', 'bottom', 'left'):
        ax1.spines[side].set_visible(False)
    ax1.xaxis.set_ticks_position('none')
    ax1.yaxis.set_ticks_position('none')

    predicted_mask = anomaly_scores_norm > threshold   # shaded red
    true_mask = y_true                                 # shaded blue

    ax1.fill_between(timestamps, signal, where=predicted_mask, color='red', alpha=0.5, zorder=2)
    ax1.fill_between(timestamps, signal, where=true_mask, color='blue', alpha=0.3, zorder=3)

    # Bottom panel: the anomaly scores, sharing the time axis with the top panel
    ax2 = fig.add_subplot(gs[1], sharex=ax1)
    ax2.plot(timestamps, anomaly_scores_norm, label='Score', color='orange')
    ax2.set_ylabel('Score')
    ax2.axhline(y=threshold, color='red', linestyle='--', label=f'Threshold ({threshold})')
    ax2.legend()

    plt.setp(ax1.get_xticklabels(), visible=False)
    plt.title(title)
    plt.xlabel('Timestamp')
    plt.tight_layout()
    plt.savefig(f'/content/{title}.jpg')  # Modify the path and filename as needed
    plt.show()

### Evaluate functions

In [None]:
# compute f1, fB score, auc-roc, auc-pr
def _safe_ratio(numerator, denominator, default):
    """Return numerator / denominator, or `default` when the denominator is zero."""
    return numerator / denominator if denominator else default


def _f_beta(tp, fp, fn, beta):
    """F-beta from confusion counts: (1+b^2)TP / ((1+b^2)TP + b^2*FN + FP); 0.0 if undefined."""
    beta_sq = beta ** 2
    weighted_tp = (1 + beta_sq) * tp
    return _safe_ratio(weighted_tp, weighted_tp + beta_sq * fn + fp, 0.0)


def _confusion_counts(anomalies_pred, is_anomaly, n_sample, tot_anomalies):
    """Return (tp, fp, fn, tn) for boolean predictions against boolean ground truth."""
    tp = int(np.count_nonzero(anomalies_pred & is_anomaly))
    fp = int(np.count_nonzero(anomalies_pred)) - tp
    fn = tot_anomalies - tp
    tn = (n_sample - tot_anomalies) - fp
    return tp, fp, fn, tn


def compute_metrics(anomaly_scores_norm, df_test, y_true, th=None):
    """Evaluate anomaly scores against ground-truth labels.

    With ``th=None`` the thresholds 0.00, 0.01, ..., 0.99 are swept: the best F1 and
    F0.1 (with their thresholds), AUC-PR and AUC-ROC are printed, and
    ``(sens, fpr, th_f1_max)`` is returned, where `sens` and `fpr` hold the recall and
    the false-positive rate at every threshold and `th_f1_max` is the F1-optimal threshold.

    With a numeric ``th`` only F1 and F0.1 at that threshold are printed and None is returned.

    A sample is predicted anomalous when its score is strictly greater than the threshold.

    Note: F0.1 follows the standard F-beta definition (beta^2 weights the false
    negatives, so beta < 1 favours precision). The previous code applied beta^2 to the
    false positives, which actually computed F10.

    Args:
        anomaly_scores_norm: array of anomaly scores.
        df_test: frame whose length is taken as the number of samples.
        y_true: array of ground-truth labels (non-zero = anomaly), aligned with the scores.
        th: optional fixed threshold.
    """
    scores_arr = np.asarray(anomaly_scores_norm)
    is_anomaly = np.asarray(y_true).astype(bool)
    tot_anomalies = np.asarray(y_true).sum()
    n_sample = len(df_test)
    beta = 0.1

    if th is not None:
        tp, fp, fn, _ = _confusion_counts(scores_arr > th, is_anomaly, n_sample, tot_anomalies)
        f1 = _f_beta(tp, fp, fn, 1.0)
        f0_1 = _f_beta(tp, fp, fn, beta)
        print(f"f1: {f1} at th: {th} for the test set")
        print(f"f0.1: {f0_1} at th: {th} for the test set")
        return None

    step = 0.01
    ths = np.arange(0, 1, step)
    sens = list()           # recall (true-positive rate) per threshold
    fpr = list()            # false-positive rate per threshold
    f1 = list()
    f0_1 = list()
    for threshold in tqdm(ths):
        tp, fp, fn, tn = _confusion_counts(scores_arr > threshold, is_anomaly, n_sample, tot_anomalies)
        sens.append(_safe_ratio(tp, tp + fn, float('nan')))
        fpr.append(1 - _safe_ratio(tn, tn + fp, float('nan')))
        f1.append(_f_beta(tp, fp, fn, 1.0))
        f0_1.append(_f_beta(tp, fp, fn, beta))

    f1_max = max(f1)
    f0_1_max = max(f0_1)
    # list.index returns the first (lowest) threshold reaching the maximum
    th_f1_max = f1.index(f1_max) * step
    th_f0_1_max = f0_1.index(f0_1_max) * step
    print(f"f1: {f1_max} at th: {th_f1_max}")
    print(f"f0.1: {f0_1_max} at th: {th_f0_1_max}")
    print(f"AUC-PR: {metrics.average_precision_score(y_true, anomaly_scores_norm)}")
    print(f"AUC-ROC: {metrics.roc_auc_score(y_true, anomaly_scores_norm)}")
    return sens, fpr, th_f1_max

# another way to compute true_labels
def create_true_labels(df_test, df_collision, scores):
    """Label each row of `df_test` as collision (1) or normal (0).

    A row is a collision sample when its 'time' lies inside at least one
    [start, end] interval of `df_collision` (both bounds included). Labels are assigned
    by row position, so row i of `df_test` corresponds to scores[i].

    Args:
        df_test: frame with a 'time' column.
        df_collision: frame with 'start' and 'end' timestamp columns, one row per collision.
        scores: array whose shape and dtype the returned labels copy.

    Returns:
        Array like `scores`, holding 1 at collision positions and 0 elsewhere.
    """
    # One vectorised comparison per collision interval instead of a Python loop over
    # every (row, collision) pair.
    times = df_test['time']
    in_collision = np.zeros(len(df_test), dtype=bool)
    for start, end in zip(df_collision['start'], df_collision['end']):
        in_collision |= ((times >= start) & (times <= end)).to_numpy()

    true_labels = np.zeros_like(scores)
    true_labels[np.flatnonzero(in_collision)] = 1   # positional indices of the collision rows
    logging.info(f"Anomalies detected: {int(true_labels.sum())}")
    return true_labels

# Print the two threshold-free summary metrics
def roc_pr(true_labels, anomaly_scores_norm):
  """Print AUC-ROC and AUC-PR (average precision) of the scores, two decimals each."""
  auc_pr = average_precision_score(true_labels, anomaly_scores_norm)   # area under the precision-recall curve
  auc_roc = roc_auc_score(true_labels, anomaly_scores_norm)            # area under the ROC curve
  print(f'AUC-ROC: {auc_roc:.2f}')
  print(f'AUC-PR: {auc_pr:.2f}')

### Testing on a single score

In [None]:
# Load the saved K-Means scores.
# NOTE(review): this path has no extension, while the multi-score cell below loads
# 'kmeans_f10_clusters15_w10.pkl' — confirm the file exists under this exact name.
# pickle.load can execute arbitrary code: only load files produced by a trusted run.
with open('/content/kmeans_f10_clusters15_w10', "rb") as file:
      scores = pickle.load(file)

In [None]:
# Label every test sample from the collision intervals and plot the two score histograms.
# NOTE(review): `scores` is passed exactly as unpickled, whereas the multi-score cells
# index ['anomaly_scores_norm'] — confirm this pickle holds a plain score array.
true_labels = plot_hist(scores, df_collision, df_test, title='KMeans_distribution_f=10Hz')

In [None]:
# Threshold sweep (th=None): prints best F1 / F0.1, AUC-PR and AUC-ROC for the single score file
compute_metrics(scores, df_test, true_labels)

### Testing on multiple scores

In [None]:
# Load the saved results of every model. The cells below read the keys
# 'anomaly_scores_norm' and 'true_labels' from each loaded object.
# pickle.load can execute arbitrary code: only load files produced by a trusted run.
with open('/content/enc_dec_f10.pickle', "rb") as file:
      scores_encdec = pickle.load(file)
with open('/content/lstm_f10.pickle', "rb") as file:
      scores_lstm = pickle.load(file)
with open('/content/hif_supervised_f10_trees1024_samples256.pkl', "rb") as file:
      scores_hif_supervised = pickle.load(file)
with open('/content/hif_unsupervised_f10_trees1024_samples_256.pkl', "rb") as file:
      scores_hif_unsupervised = pickle.load(file)
with open('/content/kmeans_f10_clusters15_w10.pkl', "rb") as file:
      scores_kmeans = pickle.load(file)

##### Distribution

In [None]:
# Score histograms per model. For HIF supervised, EncDec-AD and LSTM-AD the test frame is
# trimmed to its last rows so that it has as many rows as the model's scores; K-Means and
# HIF unsupervised are plotted against the full test frame. The returned labels are discarded.
_ = plot_hist(scores_hif_supervised['anomaly_scores_norm'], df_collision, df_test[-scores_hif_supervised['anomaly_scores_norm'].shape[0]:], title='HIF_supervised_Distribution')
_ = plot_hist(scores_encdec['anomaly_scores_norm'], df_collision, df_test[-scores_encdec['anomaly_scores_norm'].shape[0]:], title='EncDec-AD_Distribution')
_ = plot_hist(scores_kmeans['anomaly_scores_norm'], df_collision, df_test, title='KMeans_Distribution')
_ = plot_hist(scores_lstm['anomaly_scores_norm'], df_collision, df_test[-scores_lstm['anomaly_scores_norm'].shape[0]:], title='LSTM-AD_Distribution')
_ = plot_hist(scores_hif_unsupervised['anomaly_scores_norm'], df_collision, df_test, title='HIF_unsupervised_Distribution')

##### Metrics

In [None]:
# Threshold sweep per model, using the labels stored with each model's scores.
# The test frame is trimmed to the score length for the models whose scores are shorter.
compute_metrics(scores_hif_supervised['anomaly_scores_norm'], df_test[-scores_hif_supervised['anomaly_scores_norm'].shape[0]:], scores_hif_supervised['true_labels'])
compute_metrics(scores_encdec['anomaly_scores_norm'], df_test[-scores_encdec['anomaly_scores_norm'].shape[0]:], scores_encdec['true_labels'])
compute_metrics(scores_kmeans['anomaly_scores_norm'], df_test, scores_kmeans['true_labels'])
compute_metrics(scores_lstm['anomaly_scores_norm'], df_test[-scores_lstm['anomaly_scores_norm'].shape[0]:], scores_lstm['true_labels'])
compute_metrics(scores_hif_unsupervised['anomaly_scores_norm'], df_test, scores_hif_unsupervised['true_labels'])

##### Scores plot

In [None]:
# Plot the score series of every model, each restricted to its last n_tail samples,
# where n_tail is the length of the HIF-supervised scores (so all plots cover the same samples)
n_tail = scores_hif_supervised['anomaly_scores_norm'].shape[0]
score_plots = (
    (scores_encdec, 'Anomaly_scores_EncDec-AD'),
    (scores_lstm, 'Anomaly_scores_LSTM-AD'),
    (scores_hif_supervised, 'Anomaly_scores_HIF_supervised'),
    (scores_hif_unsupervised, 'Anomaly_scores_HIF_unsupervised'),
    (scores_kmeans, 'Anomaly_scores_KMeans'),
)
for model_scores, plot_title in score_plots:
    plotting_anomaly(model_scores['anomaly_scores_norm'][-n_tail:], title=plot_title)

##### Prediction segments

In [None]:
# Draw the prediction-segment figure of every model on the same samples: all inputs are
# restricted to their last rows, as many as the HIF-supervised scores (labels) contain.
n_scores = scores_hif_supervised['anomaly_scores_norm'].shape[0]
n_labels = scores_hif_supervised['true_labels'].shape[0]
segment_plots = (
    # (model results, decision threshold, figure title)
    (scores_encdec, 0.32, 'EncDec-AD_segment_prediction'),
    (scores_lstm, 0.07, 'LSTM-AD_segment_prediction'),
    (scores_kmeans, 0.31, 'KMeans_segment_prediction'),
    (scores_hif_unsupervised, 0.63, 'HIF_unsupervised_segment_prediction'),
    (scores_hif_supervised, 0.56, 'HIF_supervised_segment_prediction'),
)
for model_scores, model_threshold, plot_title in segment_plots:
    plot_prediction(df_test[-n_scores:],
                    model_scores['true_labels'][-n_labels:],
                    model_scores['anomaly_scores_norm'][-n_scores:],
                    X_collisions[-n_scores:],
                    threshold=model_threshold,
                    title=plot_title)

##### AUC-ROC and AUC-PR plot

In [None]:
# ROC curve and its area for each model (1 = HIF supervised, 2 = LSTM-AD, 3 = EncDec-AD, 4 = KMeans).
# NOTE(review): HIF unsupervised is loaded above but not included in these curves — confirm this is intended.
fpr1, tpr1, _ = roc_curve(scores_hif_supervised['true_labels'], scores_hif_supervised['anomaly_scores_norm'])
roc_auc1 = auc(fpr1, tpr1)

fpr2, tpr2, _ = roc_curve(scores_lstm['true_labels'], scores_lstm['anomaly_scores_norm'])
roc_auc2 = auc(fpr2, tpr2)

fpr3, tpr3, _ = roc_curve(scores_encdec['true_labels'], scores_encdec['anomaly_scores_norm'])
roc_auc3 = auc(fpr3, tpr3)

fpr4, tpr4, _ = roc_curve(scores_kmeans['true_labels'], scores_kmeans['anomaly_scores_norm'])
roc_auc4 = auc(fpr4, tpr4)

# Precision-recall curve for each model; the value reported as "AUC" in the legend is the average precision
precision1, recall1, _ = precision_recall_curve(scores_hif_supervised['true_labels'], scores_hif_supervised['anomaly_scores_norm'])
pr_auc1 = average_precision_score(scores_hif_supervised['true_labels'], scores_hif_supervised['anomaly_scores_norm'])

precision2, recall2, _ = precision_recall_curve(scores_lstm['true_labels'],  scores_lstm['anomaly_scores_norm'])
pr_auc2 = average_precision_score(scores_lstm['true_labels'],  scores_lstm['anomaly_scores_norm'])

precision3, recall3, _ = precision_recall_curve(scores_encdec['true_labels'], scores_encdec['anomaly_scores_norm'])
pr_auc3 = average_precision_score(scores_encdec['true_labels'], scores_encdec['anomaly_scores_norm'])

precision4, recall4, _ = precision_recall_curve(scores_kmeans['true_labels'], scores_kmeans['anomaly_scores_norm'])
pr_auc4 = average_precision_score(scores_kmeans['true_labels'], scores_kmeans['anomaly_scores_norm'])

# Plot ROC curves; the dashed diagonal is the chance-level reference
plt.figure(figsize=(10, 6))
plt.plot(fpr1, tpr1, label=f'HIF_supervised (AUC = {roc_auc1:.2f})')
plt.plot(fpr2, tpr2, label=f'LSTM-AD (AUC = {roc_auc2:.2f})')
plt.plot(fpr3, tpr3, label=f'EncDec-AD (AUC = {roc_auc3:.2f})')
plt.plot(fpr4, tpr4, label=f'KMeans (AUC = {roc_auc4:.2f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves')
plt.legend()
plt.savefig('/content/auc_roc_curves.jpg')  # Modify the path and filename as needed
plt.show()

# Plot PR curves; the x axis is clipped to recall in [0.1, 1.0]
plt.figure(figsize=(10, 6))
plt.plot(recall1, precision1, label=f'HIF_supervised (AUC = {pr_auc1:.2f})')
plt.plot(recall2, precision2, label=f'LSTM-AD (AUC = {pr_auc2:.2f})')
plt.plot(recall3, precision3, label=f'EncDec-AD (AUC = {pr_auc3:.2f})')
plt.plot(recall4, precision4, label=f'KMeans (AUC = {pr_auc4:.2f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curves')
plt.legend()
plt.xlim([0.1, 1.0])
plt.savefig('/content/auc_pr_curves.jpg')  # Modify the path and filename as needed
plt.show()


### Precision@K

In [None]:
# Precision@k: all previous measures require an anomaly-score threshold. An alternative is to
# measure the precision on the k samples with the highest anomaly score, which is equivalent
# to setting the threshold so that exactly the k highest-scoring samples are retrieved.

def precision_at_k(anomaly_scores, true_labels, k):
    """Return the fraction of true anomalies among the k highest-scoring samples.

    Precision@k = TP / (TP + FP), where TP + FP = k is the number of retrieved samples.
    (The previous implementation divided by the total number of true anomalies, which is
    recall, and its percentile threshold with a strict '>' did not retrieve exactly k samples.)

    Args:
        anomaly_scores: array-like of scores, higher = more anomalous.
        true_labels: array-like of 0/1 ground-truth labels aligned with the scores.
        k: number of samples to retrieve; values above the number of samples are clamped.

    Returns:
        Precision in [0, 1] as a float; 0.0 when k <= 0 or there are no samples.
    """
    anomaly_scores = np.asarray(anomaly_scores)
    true_labels = np.asarray(true_labels)

    k = min(int(k), len(anomaly_scores))
    if k <= 0:
        return 0.0

    # Stable descending order: ties are resolved by original position, deterministically
    top_k = np.argsort(-anomaly_scores, kind='stable')[:k]
    true_positives = true_labels[top_k].sum()
    return float(true_positives / k)

##### Precision@K plot

In [None]:
# K is expressed as a percentage of the number of anomalies
N = 90
precisions_K_kmeans = []
precisions_K_hif_supervised = []
precisions_K_lstm = []
precisions_K_encdec = []
precisions_K_hif_unsupervised = []

k_values = np.arange(1, N)   # percentages 1 .. N - 1

for i in k_values: # i is the percentage: from 1 % to 89 % of the anomalies
  k = int((1736 * i) / 100) # 1736 is, per the author, the total number of anomalies at frequency = 10Hz (hard-coded — TODO derive from the labels)
  precision_k_kmeans = precision_at_k(
                                scores_kmeans['anomaly_scores_norm'],
                                scores_kmeans['true_labels'], k)
  precision_k_hif_supervised = precision_at_k(
                                scores_hif_supervised['anomaly_scores_norm'],
                                scores_hif_supervised['true_labels'], k)
  precision_k_lstm = precision_at_k(
                                scores_lstm['anomaly_scores_norm'],
                                scores_lstm['true_labels'], k)
  precision_k_encdec = precision_at_k(
                                scores_encdec['anomaly_scores_norm'],
                                scores_encdec['true_labels'], k)
  precision_k_hif_unsupervised = precision_at_k(
                                scores_hif_unsupervised['anomaly_scores_norm'],
                                scores_hif_unsupervised['true_labels'], k)

  precisions_K_kmeans.append(precision_k_kmeans)
  precisions_K_hif_supervised.append(precision_k_hif_supervised)
  precisions_K_lstm.append(precision_k_lstm)
  precisions_K_encdec.append(precision_k_encdec)
  precisions_K_hif_unsupervised.append(precision_k_hif_unsupervised)

# One curve per model: Precision@K as a function of the percentage K
plt.figure(figsize=(10, 6))
plt.plot(k_values, precisions_K_kmeans, label='KMeans')
plt.plot(k_values, precisions_K_hif_supervised, label='HIF supervised')
plt.plot(k_values, precisions_K_hif_unsupervised, label='HIF unsupervised')
plt.plot(k_values, precisions_K_lstm, label='LSTM-AD')
plt.plot(k_values, precisions_K_encdec, label='EncDec-AD')
plt.xlabel('K % of anomalies')
plt.ylabel('Precision@K')
plt.title('Precision@K scores over different K values')
plt.legend()
plt.grid(True)
plt.savefig('/content/precision_at_K_scores_over_different_K_values.jpg')  # Modify the path and filename as needed
plt.show()

### PA@K

In [None]:
def pak(scores, targets, thres, k=20):
    """
    Threshold the scores and apply the PA%K point-adjustment to the predictions.

    :param scores: anomaly scores
    :param targets: target labels (array; values above 0.01 count as anomalous)
    :param thres: anomaly threshold
    :param k: PA%K ratio, 0 equals to conventional point adjust and 100 equals to original predictions
    :return: point_adjusted predictions
    """
    # Raw point-wise decisions and ground truth, both as boolean arrays
    predicts = np.array(scores) > np.array(thres)
    actuals = targets > 0.01

    # +1 marks the first index of an anomalous segment, -1 the first index after one
    edges = np.diff(actuals, prepend=0)
    seg_starts = np.where(edges == 1)[0]
    seg_ends = np.where(edges == -1)[0]

    n_starts, n_ends = len(seg_starts), len(seg_ends)
    assert n_starts == n_ends + 1 or n_starts == n_ends

    # A segment still open at the end of the series is closed at the series length
    if n_starts == n_ends + 1:
        seg_ends = np.append(seg_ends, len(predicts))

    # A ground-truth segment is flagged entirely when strictly more than k % of its
    # points were predicted anomalous
    ratio = k / 100
    for begin, end in zip(seg_starts, seg_ends):
        if predicts[begin:end].sum() > ratio * (end - begin):
            predicts[begin:end] = 1

    return predicts

##### PA@k plot

In [None]:
# Per-model scores and decision thresholds. All score series are restricted to their last
# n_tail samples (the length of the HIF-supervised scores) so that they align with the
# HIF-supervised labels used as ground truth below.
# Note: this rebinds the name `scores` that the single-score section used for its pickle.
n_tail = scores_hif_supervised['anomaly_scores_norm'].shape[0]
scores = {
    'ENC-DEC':{
        'anomaly_scores_norm' : scores_encdec['anomaly_scores_norm'][-n_tail:],
        'threshold' : 0.32
    },
    'HIF_supervised':{
        'anomaly_scores_norm' : scores_hif_supervised['anomaly_scores_norm'],
        'threshold' : 0.56
    },
    'LSTM-AD':{
        'anomaly_scores_norm' : scores_lstm['anomaly_scores_norm'][-n_tail:],
        'threshold' : 0.07
    },
    'HIF_unsupervised':{
        'anomaly_scores_norm' : scores_hif_unsupervised['anomaly_scores_norm'][-n_tail:],
        'threshold' : 0.63
    },
    'K-Means':{
        'anomaly_scores_norm' : scores_kmeans['anomaly_scores_norm'][-n_tail:],
        'threshold' : 0.31
    }
}

reference_labels = scores_hif_supervised['true_labels']

for key in scores.keys():
    threshold = scores[key]['threshold']
    # F1 of the point-adjusted predictions for every K in 0..100.
    # `f1_score` is the name imported from sklearn.metrics at the top of the notebook;
    # the previous `sklearn.metrics.f1_score` raised NameError because `sklearn` itself
    # is never bound by the imports.
    f1pa_k = [f1_score(reference_labels, pak(scores[key]['anomaly_scores_norm'], reference_labels, threshold, k=k)) for k in range(0, 101)]
    f1pa_k = np.array(f1pa_k)
    area_trapz = np.trapz(f1pa_k, dx=0.01)   # area under the F1-vs-K curve with K rescaled to [0, 1]
    plt.plot(range(len(f1pa_k)), f1pa_k, label=f'{key} (AUC: {area_trapz:.2f})')
    plt.fill_between(range(0, 101), f1pa_k, alpha=0.3)

plt.grid(True, linestyle='-')
plt.xlabel('K')
plt.ylabel(r'F1$PA_{\%K}$')   # raw string: '\%' is not a valid escape sequence in a normal string literal
plt.legend()
plt.show()