<a href="https://colab.research.google.com/github/ClaudiaMarano/Anomaly-Detection-and-Prediction/blob/main/physical_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Physical Anomaly Detection

## Import Librerie

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import classification_report
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Reshape
from tensorflow.keras.callbacks import Callback
from keras.optimizers import Adam
from keras.layers import BatchNormalization
from keras.callbacks import EarlyStopping
from keras.layers import LeakyReLU

## Funzione per caricamento e segmentazione dataset
Il dataset viene suddiviso in intervalli di un minuto in modo da addestrare la rete su intervalli di campioni senza anomalie, in modo da poter riconoscere in fase di test quando un intervallo contiene invece un'anomalia.

In [19]:
# 1.
def dataset_load_and_segmentation(path_norm, att_paths = []):
    # Carico il CSV con diversa codifica
    df_norm = pd.read_csv(path_norm, sep='\t', encoding='utf-16')

    df_norm = df_norm.drop(columns=["Label"])

    df_norm['Time'] = pd.to_datetime(df_norm['Time'])

    # Ordino per timestamp nel caso non siano già ordinati
    df_norm = df_norm.sort_values(by='Time')

    # Definisco la durata della finestra in due minuti
    window_duration = pd.Timedelta(minutes=1)

    # Lista per i segmenti
    segments = []
    start_time = df_norm['Time'].iloc[0]

    while start_time < df_norm['Time'].iloc[-1]:
        end_time = start_time + window_duration
        segment = df_norm[(df_norm['Time'] >= start_time) & (df_norm['Time'] < end_time)]
        if len(segment) > 0:
            segments.append(segment.drop(columns=['Time']).values)
        start_time = end_time

    # print(len(segments))
    # for seg in segments:
    #     print(len(seg))

    # Mantengo solo i segmenti che hanno lunghezza pari a 60 o 120 in base a se divido in intervalli di 2 o 1 minuti.
    uniform_segments = [segment for segment in segments if len(segment) == 60]

    print(len(uniform_segments))

    # Prendo tutti gli intervalli 'normali' dai file relativi alle anomalie e li aggiungo al mio dataset di addestramento
    # if att_paths:
    #     for path in att_paths:
    #         segments_from_att = load_from_att_file(path)
    #         uniform_segments.extend(segments_from_att)

    print(len(uniform_segments))

    # print("\n------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n")
    #
    # print(len(uniform_segments))
    # for seg in uniform_segments:
    #     print(len(seg))

    return uniform_segments

### Estrazione intervalli normali da anomalie

Funzione che estrae intervalli normali della stessa durata dai file che contengono anomalie. In questo modo ho un dataset più grande per l'addestramento

In [20]:
def load_from_att_file(path):
    # Carico il CSV con diversa codifica
    df_norm = pd.read_csv(path, sep='\t', encoding='utf-16')

    df_norm['Time'] = pd.to_datetime(df_norm['Time'])

    # Ordino per timestamp nel caso non siano già ordinati
    df_norm = df_norm.sort_values(by='Time')

    # Definisco la durata della finestra in due minuti
    window_duration = pd.Timedelta(minutes=1)

    # Lista per i segmenti
    segments = []
    start_time = df_norm['Time'].iloc[0]

    while start_time < df_norm['Time'].iloc[-1]:
        end_time = start_time + window_duration
        segment = df_norm[(df_norm['Time'] >= start_time) & (df_norm['Time'] < end_time)]
        if len(segment) > 0:
            segments.append(segment.drop(columns=['Time']).values)
        start_time = end_time

    # print(len(segments))
    # for seg in segments:
    #     print(len(seg))

    # Mantengo solo i segmenti che hanno lunghezza pari a 60 o 120 in base a se divido in intervalli di 2 o 1 minuti.
    uniform_segments = [
        segment for segment in segments
        if len(segment) == 60 and all(row[-1] == 'normal' for row in segment)
    ]

    # print("\n------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n")
    #
    # print(len(uniform_segments))
    # for seg in uniform_segments:
    #     print(len(seg))

    # Rimuove l'ultima colonna da ogni segmento
    clean_segments = [segment[:, :-1] for segment in uniform_segments]

    return clean_segments

## Funzione per il Preprocessing

In [21]:
# 2.
def preprocessing(segments):
    """
    1) Accorpo tutti i segmenti in un unico numpy array per poter applicare la normalizzazione.
    2) Normalizzo.
    3) Divido di nuovo i segmenti normalizzati nel numero originario dei segmenti in input.
    4) Ritorno i segmenti normalizzati e lo scaler utilizzato.
    :param segments:
    :return:
    """
    # 1)
    segments_array = np.vstack(segments)

    # print(f"Array Unico: {segments_array}")
    # print(f"Tipologia: {type(segments_array)}")
    # print(f"Lunghezza: {len(segments_array)}")
    # print("\n---------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n")

    # 2)
    scaler = StandardScaler()
    segments_scaled = scaler.fit_transform(segments_array)

    # print(f"Segmenti Normalizzati: {segments_scaled}")
    # print(f"Tipologia: {type(segments_scaled)}")
    # print(f"Lunghezza: {len(segments_scaled)}")
    # print("\n---------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n")

    # 3)
    segments_scaled_split = np.array_split(segments_scaled, len(segments))

    # print(f"Segmenti Normalizzati e Ricostruiti: {segments_scaled_split}")
    # print(f'Tipologia: {type(segments_scaled_split)}')
    # print(f"Lunghezza: {len(segments_scaled_split)}")
    # print("\n---------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n")

    # 4)
    return segments_scaled_split, scaler

## Funzione per il Training

In [22]:
# 3.
def building_and_training(segments_scaled_split):
    # Callback per visualizzare statistiche al termine dell'addestramento
    class TrainingSummary(Callback):
        def on_train_end(self, logs=None):
            print("\n--- Statistiche Finali ---")
            print(f"Loss finale su training set: {logs['loss']:.4f}")
            if 'val_loss' in logs:
                print(f"Loss finale su validation set: {logs['val_loss']:.4f}")

    # Definisco l'input shape
    input_shape = segments_scaled_split[0].shape  # Forma di un segmento

    # Definisco la struttura dell'autoencoder
    input_layer = Input(shape=input_shape)
    x = Flatten()(input_layer)
    x = Dense(256)(x)
    x = LeakyReLU(alpha=0.1)(x)
    x = BatchNormalization()(x)
    x = Dense(128)(x)
    x = LeakyReLU(alpha=0.1)(x)
    x = BatchNormalization()(x)
    encoded = Dense(64)(x)
    encoded = LeakyReLU(alpha=0.1)(encoded)
    x = Dense(128)(x)
    x = LeakyReLU(alpha=0.1)(x)
    x = BatchNormalization()(x)
    x = Dense(256 )(x)
    x = LeakyReLU(alpha=0.1)(x)
    x = BatchNormalization()(x)
    x = Dense(np.prod(input_shape), activation="sigmoid")(x)
    x = BatchNormalization()(x)
    decoded = Reshape(input_shape)(x)

    # Creo il Modello
    autoencoder = Model(inputs=input_layer, outputs=decoded)
    autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss="mse")

    early_stopping = EarlyStopping(
        monitor="val_loss", patience=10, restore_best_weights=True
    )

    # Addestramento dell'autoencoder sui dati "normali", senza anomalie
    history = autoencoder.fit(
        np.array(segments_scaled_split), np.array(segments_scaled_split),
        epochs=500, batch_size=16, shuffle=True, validation_split=0.1,
        callbacks=[TrainingSummary(), early_stopping]
    )

    # Stampo statistiche finali direttamente dal dizionario `history.history`
    final_loss = history.history['loss'][-1]
    final_val_loss = history.history['val_loss'][-1]

    print("\n--- Risultati Finali ---")
    print(f"Training Loss: {final_loss:.4f}")
    print(f"Validation Loss: {final_val_loss:.4f}")

    return autoencoder


## Calcolo dell'Errore di Ricostruzione

In [23]:
# 4.
def get_rebuilding_error(autoencoder, segments_scaled_split):
    # Ricostruzione dei dati di addestramento
    reconstructed_train = autoencoder.predict(np.array(segments_scaled_split))
    mse_train = np.mean(np.power(segments_scaled_split - reconstructed_train, 2), axis=(1, 2))

    # Imposto il threshold al 95° percentile dell'errore
    threshold = np.percentile(mse_train, 95)
    print("Soglia di errore di ricostruzione:", threshold)

    return threshold

## Test del modello

In [24]:
# 5.
def testing(path, scaler, autoencoder, threshold):
    # Carico il CSV con diversa codifica
    df_anomalous = pd.read_csv(path, sep='\t', encoding='utf-16')
    df_anomalous = df_anomalous.drop(columns=["Label"])

    df_anomalous['Time'] = pd.to_datetime(df_anomalous['Time'])
    df_anomalous = df_anomalous.sort_values(by='Time')

    # Segmentazione del file di test in blocchi di tot minuti
    test_segments = []
    start_time = df_anomalous['Time'].iloc[0]

    # Definisco la finestra di tot minuti
    window_duration = pd.Timedelta(minutes=1)

    while start_time < df_anomalous['Time'].iloc[-1]:
        end_time = start_time + window_duration
        segment = df_anomalous[(df_anomalous['Time'] >= start_time) & (df_anomalous['Time'] < end_time)]
        if len(segment) > 0:
            test_segments.append(segment.drop(columns=['Time']).values)
        start_time = end_time

    # Normalizzazione dei segmenti di test usando lo scaler già addestrato
    test_segments_scaled = [scaler.transform(segment) for segment in test_segments]

    # Mantengo solo i segmenti che hanno lunghezza pari a 60 o 120 in base a se divido in intervalli di tot minuti.
    uniform_segments = [segment for segment in test_segments_scaled if len(segment) == 60]

    for seg in uniform_segments:
        print(seg)

    # Ricostruzione e calcolo dell'errore per ogni segmento di test
    reconstructed_test = autoencoder.predict(np.array(uniform_segments))
    mse_test = np.mean(np.power(uniform_segments - reconstructed_test, 2), axis=(1, 2))

    # Identificazione delle anomalie nei blocchi di test
    anomalies = mse_test > threshold
    print("Numero di blocchi anomali rilevati:", np.sum(anomalies))
    print(f"Blocchi anomali: {anomalies}")

## Esecuzione del processo

In [25]:
segments = dataset_load_and_segmentation("phy_norm.csv", ["phy_att_1.csv", "phy_att_2.csv"])

57
57


In [26]:
segments_scaled_split, scaler = preprocessing(segments)

In [31]:
autoencoder = building_and_training(segments_scaled_split)

Epoch 1/500




[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 149ms/step - loss: 1.4571 - val_loss: 0.8411
Epoch 2/500
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - loss: 0.9248 - val_loss: 0.7825
Epoch 3/500
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - loss: 0.6142 - val_loss: 0.7266
Epoch 4/500
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - loss: 0.5507 - val_loss: 0.6853
Epoch 5/500
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - loss: 0.4495 - val_loss: 0.6534
Epoch 6/500
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - loss: 0.4485 - val_loss: 0.6249
Epoch 7/500
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - loss: 0.4024 - val_loss: 0.5989
Epoch 8/500
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - loss: 0.3940 - val_loss: 0.5814
Epoch 9/500
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

In [32]:
threshols = get_rebuilding_error(autoencoder, segments_scaled_split)



[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 200ms/step



[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 183ms/step
Soglia di errore di ricostruzione: 0.12967185166456438


In [33]:
testing("phy_att_3.csv", scaler, autoencoder, threshols)

[[-1.07374237 -1.12949751 -1.28165638 ...  0.         -1.04180893
   0.        ]
 [-1.07374237 -1.12949751 -1.28165638 ...  0.         -1.04180893
   0.        ]
 [-1.07374237 -1.12949751 -1.28165638 ...  0.         -1.04180893
   0.        ]
 ...
 [ 1.55041057  0.74633671 -1.28165638 ...  0.         -1.04180893
   0.        ]
 [ 1.59867085  0.76177568 -1.28165638 ...  0.         -1.04180893
   0.        ]
 [ 1.6408986   0.7957414  -1.28165638 ...  0.         -1.04180893
   0.        ]]
[[ 1.68614262  0.84514609 -1.28165638 ...  0.         -1.04180893
   0.        ]
 [ 1.71932156  0.87293623 -1.28165638 ...  0.         -1.04180893
   0.        ]
 [ 1.77059811  0.90535805 -1.28165638 ...  0.         -1.04180893
   0.        ]
 ...
 [ 1.95157418  1.45961689 -0.18218347 ...  0.         -1.04180893
   0.        ]
 [ 1.89426509  1.43954623 -0.16293199 ...  0.         -1.04180893
   0.        ]
 [ 1.9078383   1.42410727 -0.14154147 ...  0.         -1.04180893
   0.        ]]
[[ 1.88220002  1