# Hybrid Isolation Forest

### Libraries

In [None]:
!pip install pandas==1.5.3
!pip install tsfel
!pip3 install --upgrade --no-cache-dir gdown       # support for download a large file from Google Drive
!pip install numpy>=1.19.5
!pip install scikit-learn>=0.24.1
!pip install tadpak

### Download dataset

In [None]:
# Mount Google Drive under /content/drive so files stored there can be read (Colab only).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Extract the dataset archive stored on the mounted Drive into /content/
!unzip /content/drive/MyDrive/Colab_MLA/MLA_Project/csv_20220811.zip -d /content/

In [None]:
# Alternative to the Drive mount: download the dataset archive from a shared link
import os, sys
# https://drive.google.com/file/d/1Fn_KVRpwLedTYU1QgfVCRtkvo1hf_9GB/view?usp=sharing first account
# https://drive.google.com/file/d/1P8pCKLI-64_HT91Oqid4RUGtZCUht2c-/view?usp=sharing second account

# Download (file id of the "second account" link) and extract the archive only
# when the zip is not already present in /content.
# NOTE(review): presumably the notebook's working directory is /content, so gdown
# saves the zip where the check above looks for it - confirm.
if not os.path.isfile('/content/csv_20220811.zip'):
  !gdown 1P8pCKLI-64_HT91Oqid4RUGtZCUht2c-
  !jar xvf  "/content/csv_20220811.zip"

# Report when the extracted dataset folder is still missing
if not os.path.isdir('/content/csv_20220811'):
  print("Dataset doesn't exist")

In [None]:
import os
import time
import warnings
import datetime
import torch
import numpy as np
import pandas as pd
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import logging
import pickle
import random as rn
from sklearn import preprocessing
from sklearn import metrics
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
from google.colab import files
from collections import Counter
from sklearn.metrics import roc_auc_score, average_precision_score, top_k_accuracy_score, f1_score, roc_curve, auc, precision_recall_curve

### Data Loading

In [None]:
# Folder holding the extracted recordings; passed to both read_folder_* loaders below
ROOTDIR_DATASET_NORMAL = "/content/csv_20220811"
plt.style.use("Solarize_Light2") # Set style for matplotlib

##### Loading metadata

In [None]:
def read_folder_normal(dataset_folder, frequency):
    """Load the collision-free recordings and return them as one feature frame.

    Reads the ``;``-separated files
    ``rec{0,2,3,4}_20220811_rbtc_{1/frequency}s.csv`` from ``dataset_folder``,
    concatenates them, indexes the rows by their ``time`` value and removes
    the columns that are not used as features.

    Args:
        dataset_folder: Directory containing the CSV recordings.
        frequency: Value whose reciprocal is the period embedded in the file
            names (e.g. ``10`` selects the ``0.1s`` files).

    Returns:
        DataFrame indexed by timestamp, columns sorted alphabetically, without
        the ``time`` column, the two energy counters and every column whose
        name contains ``"Abb"`` or ``"Temperature"``.
    """
    period = 1 / frequency
    frames = [
        pd.read_csv(os.path.join(dataset_folder, f"rec{r}_20220811_rbtc_{period}s.csv"), sep=";")
        for r in [0, 2, 3, 4]
    ]
    df = pd.concat(frames).sort_index(axis=1)
    df.index = pd.to_datetime(df.time.astype('datetime64[ms]'), format="%Y-%m-%dT%H:%M:%S.%f")

    # "time" is dropped as a column because it now lives in the index.
    unused = [
        "machine_nameKuka Robot_export_active_energy",
        "machine_nameKuka Robot_import_reactive_energy",
    ]
    unused += [column for column in df.columns if "Abb" in column or "Temperature" in column]
    unused.append("time")
    return df.drop(columns=unused)


def read_folder_collisions(dataset_folder, frequency):
    """Load the collision recordings together with their annotated intervals.

    Args:
        dataset_folder: Directory containing the collision CSV recordings and
            the ``20220811_collisions_timestamp.xlsx`` annotation sheet.
        frequency: Value whose reciprocal is the period embedded in the CSV
            file names (e.g. ``10`` selects the ``0.1s`` files).

    Returns:
        Tuple ``(df_collision, X_collisions, df_test)`` where ``df_collision``
        has one ``start``/``end`` pair per row, ``df_test`` is the
        concatenated recording (timestamp index plus a datetime ``time``
        column) and ``X_collisions`` is ``df_test`` without ``time``.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%S.%f"

    # Annotation sheet: every timestamp is moved back by two hours.
    # NOTE(review): presumably a time-zone alignment with the CSV clock - confirm.
    annotations = pd.read_excel(os.path.join(dataset_folder, "20220811_collisions_timestamp.xlsx"))
    annotations['Timestamp'] = annotations['Timestamp'] - pd.to_timedelta(2, 'h')

    # Rows flagged "i" open an interval, rows flagged "f" close it; the two
    # lists are paired by position after their indexes are reset.
    opening = annotations['Inizio/fine'] == "i"
    closing = annotations['Inizio/fine'] == "f"
    starts = annotations.loc[opening, ['Timestamp']].rename(columns={'Timestamp': 'start'}).reset_index(drop=True)
    ends = annotations.loc[closing, ['Timestamp']].rename(columns={'Timestamp': 'end'}).reset_index(drop=True)
    df_collision = pd.concat([starts, ends], axis=1)

    # Recordings 1 and 5 are the ones loaded as collision data.
    recordings = []
    for r in [1, 5]:
        csv_path = os.path.join(dataset_folder, f"rec{r}_collision_20220811_rbtc_{1/frequency}s.csv")
        recordings.append(pd.read_csv(csv_path, sep=";"))
    df_test = pd.concat(recordings).sort_index(axis=1)
    df_test.index = pd.to_datetime(df_test.time.astype('datetime64[ms]'), format=timestamp_format)

    discarded = ["machine_nameKuka Robot_export_active_energy", "machine_nameKuka Robot_import_reactive_energy"]
    discarded.extend(column for column in df_test.columns if "Abb" in column or "Temperature" in column)
    df_test.drop(discarded, axis=1, inplace=True)

    # Unlike the normal loader, "time" is kept (as datetime) in df_test so the
    # rows can later be matched against the collision intervals.
    df_test['time'] = pd.to_datetime(df_test['time'].astype('datetime64[ms]'), format=timestamp_format)

    X_collisions = df_test.drop(columns=['time'])
    return df_collision, X_collisions, df_test

def pre_processing(data, corr_features = None):
    """Clean, scale and prune the feature columns of ``data``.

    Steps, in order: drop columns containing NaN, min-max scale every column
    (the scaler is fitted on ``data`` itself at each call), drop zero-variance
    columns, then drop highly correlated columns.

    Args:
        data: DataFrame of numeric features.
        corr_features: Column names to drop in the last step. When ``None``
            they are computed with ``tsfel.correlated_features`` using a
            threshold of 0.95.

    Returns:
        Tuple ``(data, corr_features)``: the processed frame (rebuilt from
        arrays, so it carries a fresh default index) and the list of
        correlated columns that were dropped, to be reused on other splits.
    """
    # Remove columns that contain at least one NaN
    data = data.drop(columns=data.columns[data.isna().any()].tolist())

    # Normalise every feature to [0, 1]
    scaler = preprocessing.MinMaxScaler()
    scaler.fit(data)
    data = pd.DataFrame(scaler.transform(data), columns=data.columns)

    # Remove zero-variance features
    selector_variance = VarianceThreshold()
    selector_variance.fit(data)
    data = pd.DataFrame(selector_variance.transform(data), columns=data.columns.values[selector_variance.get_support()])

    # Remove highly correlated features
    if corr_features is None:
        # Imported here because the notebook installs tsfel but never imports
        # it; without this the call below raises NameError.
        import tsfel
        corr_features = tsfel.correlated_features(data, threshold=0.95)
    data.drop(corr_features, inplace=True, axis=1)

    return data, corr_features

### Forest

In [None]:
class hiForest(object):
    """Hybrid Isolation Forest: an ensemble of ``hiTree`` built on random sub-samples.

    Besides the classic isolation score, each query also yields the mean
    distance to the centroids of the leaves it falls into and, when labelled
    anomalies were added, that distance relative to the anomaly centroids.
    """

    def __init__(self, X_Train, ntrees, sample_size, limit = None):
        """Build ``ntrees`` trees, each on ``sample_size`` rows drawn without replacement.

        Args:
            X_Train: Training data, indexed as ``X_Train[list_of_row_positions]``.
            ntrees: Number of trees.
            sample_size: Number of rows drawn for each tree.
            limit: Maximum tree depth; defaults to ``ceil(1.2 * log2(sample_size))``.
        """
        self.ntrees = ntrees
        self.X = X_Train
        self.nobjs = len(X_Train)       # number of training rows
        self.sample = sample_size
        self.Trees = []
        if limit is None:
            self.limit = int(np.ceil(1.2 * np.log2(self.sample)))
        else:
            self.limit = limit
        self.c = c_factor(self.sample)  # normaliser for the isolation score

        for _ in range(self.ntrees):
            # Row positions of this tree's sub-sample
            subset_rows = rn.sample(range(self.nobjs), self.sample)
            self.Trees.append(hiTree(X_Train[subset_rows], 0, self.limit))

    def computeAggScore(self, x):
        """Score one data point against every tree.

        Returns:
            Tuple ``(isolation_score, label_counts, mean_dist, mean_dist_ratio)``:
            the mean over trees of ``2 ** (-path / c)``, the summed per-leaf
            label counters, the mean distance to the leaf centroids (0 when
            none is available) and that distance divided by the mean distance
            to the anomaly centroids (0 when the latter is missing or not
            positive).
        """
        per_tree_scores = np.zeros(self.ntrees)
        label_counts = Counter([])
        normal_dists = []       # per-tree mean distance to the leaf centroid
        anomaly_dists = []      # per-tree mean distance to the leaf anomaly centroid
        for j in range(self.ntrees):
            factor = PathFactor(x, self.Trees[j])
            path_length = factor.path * 1.0
            per_tree_scores[j] = 2.0**(-1.0 * path_length/self.c)
            label_counts = label_counts + factor.labs

            if len(factor.ldist) > 0:
                normal_dists.append(np.mean(factor.ldist))
            if len(factor.ldist_a) > 0:
                anomaly_dists.append(np.mean(factor.ldist_a, axis=0))

        meanDist = np.mean(normal_dists) if len(normal_dists) > 0 else 0

        meanDist_r = 0
        if len(anomaly_dists) > 0:
            meanDist_a = np.mean(anomaly_dists, axis=0)
            if meanDist_a > 0:
                meanDist_r = meanDist / (meanDist_a)

        return np.mean(per_tree_scores), label_counts, meanDist, meanDist_r

    def addAnomaly(self, x, lab):
        """Register the labelled anomaly ``x`` in the matching leaf of every tree."""
        for j in range(self.ntrees):
            tree = self.Trees[j]
            PathFactor(x, tree).addAnomaly(x, lab, tree.root)

    def computeAnomalyCentroid(self):
        """Recompute the anomaly centroid of every leaf in every tree."""
        for j in range(self.ntrees):
            self.Trees[j].root.computeAnomalyCentroid()


### Tree

In [None]:
class hiTree(object):
    """One randomly built isolation tree over a sub-sample ``X``.

    Every internal node splits on a randomly chosen feature at a random
    threshold drawn between that feature's minimum and maximum in the node.
    """

    def __init__(self, X, e, l):
        """Store the sample and build the tree.

        Args:
            X: 2-D array of samples (rows) by features (columns).
            e: Depth of the root (callers in this file pass 0).
            l: Depth limit at which growth stops.
        """
        self.e = e               # depth of the most recently visited node
        self.X = X
        self.size = len(X)
        self.Q = np.arange(np.shape(X)[1], dtype='int')  # candidate feature indexes
        self.l = l
        self.p = None            # threshold of the most recent split
        self.q = None            # feature index of the most recent split
        self.exnodes = 0         # number of external (leaf) nodes created
        self.labs = []
        self.root = self.make_tree(X, e, l)

    def make_tree(self, X, e, l):
        """Recursively build and return the node covering ``X`` at depth ``e``."""
        self.e = e

        # Stop at the depth limit or when at most one sample is left
        if e >= l or len(X) <= 1:
            self.exnodes += 1
            return Node(X, self.q, self.p, e, None, None, node_type = 'exNode')

        split_feature = rn.choice(self.Q)
        self.q = split_feature
        column = X[:, split_feature]
        split_value = rn.uniform(column.min(), column.max())
        self.p = split_value

        # Samples strictly below the threshold go left, the others go right.
        # The split is kept in locals because the recursive calls overwrite
        # self.q / self.p.
        goes_left = column < split_value
        left_child = self.make_tree(X[goes_left], e + 1, l)
        right_child = self.make_tree(X[~goes_left], e + 1, l)
        return Node(X, split_feature, split_value, e, left=left_child, right=right_child, node_type = 'inNode' )

### Node

In [None]:
class Node(object):
    """A tree node: an internal split ('inNode') or a leaf ('exNode')."""

    def __init__(self, X, q, p, e, left, right, node_type = '' ):
        """Create a node.

        Args:
            X: Samples reaching this node.
            q: Index of the feature the node splits on.
            p: Threshold of the split.
            e: Depth of the node.
            left: Child receiving the samples below the threshold.
            right: Child receiving the remaining samples.
            node_type: ``'exNode'`` for leaves, ``'inNode'`` for internal nodes.
        """
        self.e = e
        self.size = len(X)
        self.q = q
        self.p = p
        self.left = left
        self.right = right
        self.ntype = node_type
        self.Ca = None          # centroid of the anomalies stored in this leaf
        self.labs = []          # labels of the anomalies stored in this leaf
        self.Xanomaly = []      # anomalies stored in this leaf
        # Only a non-empty leaf has a centroid of its samples
        has_centroid = node_type == 'exNode' and self.size > 0
        self.C = np.mean(X, axis=0) if has_centroid else None

    def computeAnomalyCentroid(self):
        """Set ``Ca`` on every leaf below this node that stores at least one anomaly."""
        if self.ntype != 'exNode':
            self.left.computeAnomalyCentroid()
            self.right.computeAnomalyCentroid()
            return
        if len(self.Xanomaly) > 0:
            self.Ca = np.mean(self.Xanomaly, axis=0)

### Path Factor

In [None]:
class PathFactor(object):
    """Walk one data point down one tree and collect its path statistics.

    After construction ``path`` holds the (adjusted) path length,
    ``path_list`` the sequence of 'L'/'R' turns, ``labs`` the normalised label
    counts of the reached leaf, and ``ldist`` / ``ldist_a`` the distances to
    the leaf centroid and to the leaf anomaly centroid, when those exist.
    """

    def __init__(self, x, hitree):
        """Compute the path of data point ``x`` from the root of ``hitree``."""
        self.path_list = []
        self.labs = []
        self.ldist = []
        self.ldist_a = []
        self.x = x
        self.e = 0                  # depth reached so far
        self.path = self.find_path(hitree.root)

    def find_path(self, T):
        """Descend from node ``T`` to a leaf and return the resulting path length."""
        if T.ntype == 'exNode':
            self.labs = Counter(T.labs)
            if T.C is not None:
                self.ldist.append(EuclideanDist(self.x, T.C))
            if T.Ca is not None:
                self.ldist_a.append(EuclideanDist(self.x, T.Ca))

            # Normalise the label counts by the leaf size (an empty leaf counts as 1)
            sz = T.size
            if sz == 0:
                sz += 1
            for key in self.labs:
                self.labs[key] /= sz

            # A leaf that does not hold exactly one sample adds c_factor(size)
            # to the depth to account for the part of the tree not grown.
            if T.size == 1:
                return self.e
            self.e = self.e + c_factor(T.size)
            return self.e

        self.e += 1
        if self.x[T.q] < T.p:
            self.path_list.append('L')
            return self.find_path(T.left)
        self.path_list.append('R')
        return self.find_path(T.right)

    def addAnomaly(self, x, lab, T):
        """Store anomaly ``x`` with label ``lab`` in the leaf it reaches from node ``T``.

        The descent is driven by ``x`` itself. The previous code compared
        ``self.x`` instead, which only gave the right leaf when the instance
        had been built from the same point.
        """
        node = T
        while node.ntype != 'exNode':
            node = node.left if x[node.q] < node.p else node.right
        node.labs.append(lab)
        node.Xanomaly.append(x)


### Utils

In [None]:
def EuclideanDist(x, y):
    """Return the Euclidean distance between the arrays ``x`` and ``y``."""
    delta = x - y                   # element-wise difference
    squared_sum = np.sum(delta ** 2)
    return np.sqrt(squared_sum)

# example: if n = 1024 --> c = 13.017 / n = 512 --> 11.631
def c_factor(n):
    """Return c(n) = 2 * (ln(n - 1) + 0.5772156649) - 2 * (n - 1) / n.

    Used in this file to normalise path lengths in the isolation score and to
    adjust the path length at leaves. Inputs below 2 are treated as 2.
    """
    n = max(n, 2)
    harmonic_estimate = np.log(n - 1) + 0.5772156649   # ln(n - 1) + Euler's constant
    return 2.0 * harmonic_estimate - (2.0 * (n - 1.) / (n * 1.0))

# The average path length of an unsuccessful search in a binary search tree (BST) is:
#       c(n) = 2 * H(n - 1) - (2 * (n - 1) / n)
#       where H(i) is the harmonic number, which can be estimated by log(i) + 0.5772156649 (Euler's constant).
#       As c(n) is the average of h(x) given n, we use it to normalize h(x). The anomaly score s of an instance x is defined as:
#       s(x, n) = 2 ** (-E[h(x)] / c(n))

### Training forest function

In [None]:
def _collision_mask(times, df_collision):
    """Return a boolean array flagging the entries of ``times`` inside any [start, end] interval."""
    inside = np.zeros(len(times), dtype=bool)
    for start, end in zip(df_collision['start'], df_collision['end']):
        inside |= ((times >= start) & (times <= end)).to_numpy()
    return inside


def training_forest(X_train, n_trees, max_samples, forest_type, min_samples_size, X_collisions_add = None, df_test_add = None, df_collision = None):
    """Build a hybrid isolation forest and optionally seed it with labelled anomalies.

    Args:
        X_train: DataFrame of pre-processed training features.
        n_trees: Number of trees in the forest.
        max_samples: Fraction of the training rows used per tree; when falsy
            the per-tree sample size is ``min(min_samples_size, n_rows)``.
        forest_type: ``'supervised'`` adds the labelled anomalies; any other
            value builds the forest from ``X_train`` only.
        min_samples_size: Per-tree sample size used when ``max_samples`` is falsy.
        X_collisions_add: Pre-processed features, row-aligned with
            ``df_test_add``. When given, the anomalies inserted into the
            forest are taken from here so that they live in the same feature
            space as ``X_train``. It was previously accepted but ignored.
        df_test_add: Frame with a ``time`` column, used to decide which rows
            fall inside a collision interval. When ``X_collisions_add`` is
            ``None`` its non-``time`` columns are used as the anomaly features.
        df_collision: Frame with ``start`` and ``end`` columns.

    Returns:
        The trained ``hiForest``.

    Raises:
        ValueError: If ``forest_type`` is ``'supervised'`` but ``df_test_add``
            or ``df_collision`` is missing.
    """
    if max_samples:
        sample_size = int(max_samples * X_train.shape[0])
    else:
        sample_size = min(min_samples_size, X_train.shape[0])

    # Unsupervised part: trees built from the normal data only
    Forest = hiForest(X_train.values, n_trees, sample_size)

    if forest_type == 'supervised':
        if df_test_add is None or df_collision is None:
            raise ValueError("forest_type='supervised' requires df_test_add and df_collision")

        # Each row inside at least one collision interval is added once
        anomaly_positions = np.flatnonzero(_collision_mask(df_test_add['time'], df_collision))
        if X_collisions_add is not None:
            anomaly_features = X_collisions_add.values
        else:
            anomaly_features = df_test_add.drop(columns=['time']).values

        for position in anomaly_positions:
            # label 1 = anomaly (the only label used)
            Forest.addAnomaly(anomaly_features[position], lab=1)
        print(f"Anomalies detected: {len(anomaly_positions)}")

    Forest.computeAnomalyCentroid()
    print("Forest created with success.")
    return Forest

In [None]:
def aggregate_scores(s_if, s_unsupervised, s_supervised, alpha1, alpha2):
    """Blend the three score arrays into one.

    Each array is min-max normalised first (an array whose values are all
    equal becomes all zeros, which avoids dividing by zero). The result is
    ``alpha2 * (alpha1 * s_if + (1 - alpha1) * s_unsupervised) + (1 - alpha2) * s_supervised``
    computed on the normalised arrays.
    """
    def rescale(values):
        lowest, highest = np.min(values), np.max(values)
        if highest - lowest == 0:
            return np.zeros_like(values)
        return (values - lowest) / (highest - lowest)

    if_part = rescale(s_if)
    unsupervised_part = rescale(s_unsupervised)
    supervised_part = rescale(s_supervised)

    blended_unsupervised = alpha1 * if_part + (1 - alpha1) * unsupervised_part
    return alpha2 * blended_unsupervised + (1 - alpha2) * supervised_part

In [None]:
def score_hif(Forest, X_collisions, score_type, alpha1=0.3, alpha2=None):
    """Score every row of ``X_collisions`` with the forest and blend the three scores.

    Args:
        Forest: Trained forest exposing ``computeAggScore(point)``.
        X_collisions: DataFrame of pre-processed features, one point per row.
        score_type: ``'unsupervised'`` ignores the supervised score; any other
            value mixes it in.
        alpha1: Weight of the isolation score against the distance-based
            unsupervised score. The original author reports the best results
            in [0.2, 0.5].
        alpha2: Weight of the unsupervised blend against the supervised score.
            When ``None`` it is 1 for ``'unsupervised'`` and 0.7 otherwise
            (best results reported in [0.6, 0.9]).

    Returns:
        Array with one aggregated score per row.
    """
    if alpha2 is None:
        alpha2 = 1 if score_type == 'unsupervised' else 0.7

    points = X_collisions.values
    size = points.shape[0]

    scores_if = np.zeros(size)  # isolation (path-length) score
    scores_un = np.zeros(size)  # unsupervised distance score
    scores_s = np.zeros(size)   # supervised relative-distance score

    for i, point in enumerate(points):
        scores_if[i], _, scores_un[i], scores_s[i] = Forest.computeAggScore(point)

    scores = aggregate_scores(scores_if, scores_un, scores_s, alpha1, alpha2)
    print("Scores computed with success.")
    return scores

### Variables init

In [None]:
# Fix the seeds of both random sources for reproducibility. The stdlib module
# is imported as `rn`, so the bare name `random` is not bound in this notebook.
seed = 42
rn.seed(seed)
np.random.seed(seed)

In [None]:
# Experiment configuration and data loading.
# NOTE: the shapes quoted in the trailing comments are the original author's notes and are not verified here.
frequency = 10              # 1/frequency is the period embedded in the CSV file names (10 -> "0.1s")
max_samples = None          # falsy, so training_forest falls back to min_samples_size
n_trees = 1024              # number of trees in the forest
min_samples_size = 256      # per-tree sample size (capped by the number of training rows)
X_train = read_folder_normal(ROOTDIR_DATASET_NORMAL, frequency) # n_row, n_features (90k, 55), 56 with time
df_collision, X_collisions, df_test = read_folder_collisions(ROOTDIR_DATASET_NORMAL, frequency) # (95815, 55) test data extraction

# The correlated-feature list is derived from the training data and reused on the collision data
X_train, corr_features = pre_processing(X_train)               # (95815, 55)
X_collisions, _ = pre_processing(X_collisions, corr_features)
df_test.drop(corr_features, inplace=True, axis=1)

##### Split for adding anomalies and then testing (only for supervised hif)

In [None]:
# Reserve the first rows of the collision data for seeding the supervised forest.
# NOTE: the shapes quoted in the trailing comments are the original author's notes and are not verified here.
anomaly_split_number = 0.02 # 2%-3% of the dataset
split_at = int(len(X_collisions) * anomaly_split_number)  # row position of the split

X_collisions_add = X_collisions[:split_at]   # (685, 55)             # the anomalies in this range will be added to the forest
X_collisions = X_collisions[split_at:]  # (33590, 55)

df_test_add = df_test[:split_at]                  # (685, 56)
df_test = df_test[split_at:]                 # (33590, 56)

### Training Forest Unsupervised

In [None]:
# Build the forest from the normal data only (no labelled anomalies are added)
model_title = 'HIF_unsupervised'
Forest = training_forest(X_train, n_trees, max_samples, 'unsupervised', min_samples_size)

In [None]:
# Compute the scores with the unsupervised forest (no anomalies added)
scores = score_hif(Forest, X_collisions, 'unsupervised') ##### Computing Scores Forest Unsupervised (no anomalies added)

### Training Forest Supervised (with anomaly split)


In [None]:
# Build the forest and seed it with the labelled anomalies from the reserved split.
# Note that this rebinds `Forest`, replacing the unsupervised forest built earlier.
model_title = 'HIF_supervised'
Forest = training_forest(X_train, n_trees, max_samples, 'supervised', min_samples_size, X_collisions_add, df_test_add, df_collision)

In [None]:
# Compute the scores with the supervised forest (anomalies were added); this overwrites `scores`
scores = score_hif(Forest, X_collisions, 'supervised') ##### Computing Scores Forest Supervised (we added anomalies)

### Evaluate functions

In [None]:
# plot distribution and return the true_labels
def plot_hist(anomaly_scores, df_collision, df, title):
    """Plot the score histograms of normal and anomalous rows and return the labels.

    Args:
        anomaly_scores: NumPy array with one score per row of ``df``.
        df_collision: Frame with ``start`` and ``end`` columns.
        df: Frame with a ``time`` column, row-aligned with ``anomaly_scores``.
        title: Plot title, also used as the file name of the saved figure.

    Returns:
        Array shaped like ``anomaly_scores`` holding 1 for rows whose ``time``
        lies inside a collision interval and 0 elsewhere.
    """
    # Labelling is delegated to create_true_labels (which also logs the number
    # of anomalies) instead of repeating the row-by-interval loop here.
    true_labels = create_true_labels(df, df_collision, anomaly_scores)
    is_anomaly = true_labels.astype(bool)
    anomaly_values = anomaly_scores[is_anomaly]
    normal_values = anomaly_scores[~is_anomaly]

    plt.hist(normal_values, bins=30, color="tab:blue", ec="dodgerblue", alpha=0.5, label='Normal')
    plt.hist(anomaly_values, bins=30, color='tab:red', ec="darkred", alpha=0.7, label='Anomalies')

    plt.xlabel('Values')
    plt.ylabel('Occurrencies')
    plt.legend(loc='upper right')
    plt.title(title)
    plt.savefig(f'/content/{title}.jpg')  # Modify the path and filename as needed
    plt.show()
    return true_labels

# compute f1, fB score, auc-roc, auc-pr
def _confusion_counts(anomaly_scores_norm, y_true, threshold, n_sample, tot_anomalies):
    """Return ``(tn, fp, fn, tp)`` for the prediction ``anomaly_scores_norm > threshold``."""
    predicted = anomaly_scores_norm > threshold
    tp = np.count_nonzero(np.logical_and(np.asarray(y_true).astype(bool), predicted))
    fp = predicted.sum() - tp
    fn = tot_anomalies - tp
    tn = (n_sample - tot_anomalies) - fp
    return tn, fp, fn, tp


def _f_beta(tp, fp, fn, beta):
    """Return the F-beta score computed from confusion counts."""
    weight = beta ** 2
    return (1 + weight) * tp / ((1 + weight) * tp + weight * fp + fn)


def compute_metrics(anomaly_scores_norm, df_test, y_true, th=None):
    """Evaluate the scores against the true labels.

    With ``th=None`` the thresholds 0.00, 0.01, ..., 0.99 are swept: the best
    F1 and F0.1 with their thresholds are printed, followed by AUC-PR and
    AUC-ROC. With a given ``th`` only F1 and F0.1 at that threshold are
    printed.

    Args:
        anomaly_scores_norm: NumPy array of scores; a row is predicted
            anomalous when its score is strictly greater than the threshold.
        df_test: Frame row-aligned with the scores; only its length is used.
        y_true: NumPy array of 0/1 labels.
        th: Optional fixed threshold.

    Returns:
        ``(sens, fpr, th_f1_max)`` when ``th`` is ``None`` - the recall and
        false-positive rate per threshold and the threshold maximising F1 -
        otherwise ``None``.
    """
    tot_anomalies = y_true.sum()
    n_sample = len(df_test)

    if th is not None:
        _, fp, fn, tp = _confusion_counts(anomaly_scores_norm, y_true, th, n_sample, tot_anomalies)
        f1 = _f_beta(tp, fp, fn, 1)
        f0_1 = _f_beta(tp, fp, fn, 0.1)
        print(f"f1: {f1} at th: {th} for the test set")
        print(f"f0.1: {f0_1} at th: {th} for the test set")
        return None

    step = 0.01
    sens = list()           # recall (true-positive rate) per threshold
    fpr = list()            # false-positive rate per threshold
    f1 = list()
    f0_1 = list()
    for threshold in tqdm(np.arange(0, 1, step)):
        tn, fp, fn, tp = _confusion_counts(anomaly_scores_norm, y_true, threshold, n_sample, tot_anomalies)
        sens.append(tp / (tp + fn))
        fpr.append(1 - tn / (tn + fp))
        f1.append(_f_beta(tp, fp, fn, 1))
        f0_1.append(_f_beta(tp, fp, fn, 0.1))

    f1_max = max(f1)
    f0_1_max = max(f0_1)
    th_f1_max = f1.index(f1_max) * step
    th_f0_1_max = f0_1.index(f0_1_max) * step
    print(f"f1: {f1_max} at th: {th_f1_max}")
    print(f"f0.1: {f0_1_max} at th: {th_f0_1_max}")
    print(f"AUC-PR: {metrics.average_precision_score(y_true, anomaly_scores_norm)}")
    print(f"AUC-ROC: {metrics.roc_auc_score(y_true, anomaly_scores_norm)}")
    return sens, fpr, th_f1_max

# another way to compute true_labels
def create_true_labels(df_test, df_collision, scores):
    """Label every row of ``df_test`` that falls inside a collision interval.

    Args:
        df_test: Frame with a ``time`` column.
        df_collision: Frame with ``start`` and ``end`` columns; both bounds
            are inclusive.
        scores: Array giving the shape and dtype of the returned labels.

    Returns:
        Array like ``scores`` with 1 at the positions of the rows whose
        ``time`` lies in at least one interval and 0 elsewhere.
    """
    times = df_test['time']
    in_collision = np.zeros(len(times), dtype=bool)
    # One vectorised comparison per interval replaces the row-by-interval loop
    for start, end in zip(df_collision['start'], df_collision['end']):
        in_collision |= ((times >= start) & (times <= end)).to_numpy()

    true_labels = np.zeros_like(scores)
    true_labels[np.flatnonzero(in_collision)] = 1
    logging.info(f"Anomalies detected: {int(true_labels.sum())}")
    return true_labels

# dataset division for testing with validation
def dataset_div(X_collisions, anomaly_scores_norm, df_test):
    """Split scores and rows at 90% of ``len(X_collisions)``.

    The part after the split point is the validation set, the part before it
    is the test set.

    Returns:
        Tuple ``(df_val, df_col, asn_val, asn_col)``: validation rows, test
        rows, validation scores and test scores.
    """
    split = 0.9
    cut = int(len(X_collisions) * split)

    asn_col, asn_val = anomaly_scores_norm[:cut], anomaly_scores_norm[cut:]

    # Keep only as many trailing rows as there are scores in each part
    df_val = df_test.iloc[cut:][-asn_val.shape[0]:]
    df_col = df_test.iloc[:cut][-asn_col.shape[0]:]

    return df_val, df_col, asn_val, asn_col

# Testing on a single model

##### Uploading scores

In [None]:
# Load previously saved scores from a pickle file.
# NOTE(review): `scores_hif` is not referenced by the cells visible below, which use `scores` - confirm intent.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('/content/hif_unsupervised_f100_trees1024_sample256.pkl', "rb") as file:
      scores_hif = pickle.load(file)

##### Computing true_labels and metrics

In [None]:
# Plot the score distribution and derive the 0/1 labels from the collision intervals
true_labels = plot_hist(scores, df_collision, df_test, title='HIF_distribution_f=10Hz')

In [None]:
# Threshold sweep on the whole set: prints best F1 / F0.1, AUC-PR and AUC-ROC
compute_metrics(scores, df_test, true_labels)

##### Testing with validation split

In [None]:
# Pick the F1-maximising threshold on the validation part, then evaluate the test part at that threshold
df_val, df_col, asn_val, asn_col = dataset_div(X_collisions, scores, df_test)
true_labels_val = plot_hist(asn_val, df_collision, df_val, title='HIF_supervised_Distribution_Val_f=10Hz')
_, _, th_f1_max = compute_metrics(asn_val, df_val, true_labels_val)
true_labels_test = plot_hist(asn_col, df_collision, df_col, title='HIF_supervised_Distribution_Test_f=10Hz')
compute_metrics(asn_col, df_col, true_labels_test, th_f1_max)

##### Downloading scores

In [None]:
# Bundle the scores with their labels for saving
anomaly_score = {
            'anomaly_scores_norm' : scores,
            'true_labels' : true_labels
        }

In [None]:
# Save the result dictionary to Drive, then trigger a browser download of the same file (Colab only)
with open('/content/drive/MyDrive/result_dict.pkl', 'wb') as file:
    pickle.dump(anomaly_score, file)
files.download('/content/drive/MyDrive/result_dict.pkl')