<a href="https://colab.research.google.com/github/ClaudiaMarano/Anomaly-Detection-and-Prediction/blob/main/network_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Physical Anomaly Detection

## Import Librerie

In [22]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import classification_report
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Reshape
from tensorflow.keras.callbacks import Callback
from keras.optimizers import Adam
from keras.layers import BatchNormalization
from keras.callbacks import EarlyStopping
from keras.layers import LeakyReLU
from keras.layers import Dropout

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer


## Caricamento e Preprocessing
Il dataset viene suddiviso in intervalli di un minuto in modo da addestrare la rete su intervalli di campioni senza anomalie, in modo da poter riconoscere in fase di test quando un intervallo contiene invece un'anomalia.


In [24]:
def preprocess_network_data(path):
    # Caricamento del dataset
    df = pd.read_csv(path, encoding="utf-8")

    # Assicuriamoci che 'Time' sia in formato datetime
    df['Time'] = pd.to_datetime(df['Time'], format='%Y-%m-%d %H:%M:%S.%f', errors='coerce')

    # Rimuovi righe con valori di data non validi
    df = df.dropna(subset=['Time'])

    # Ordinare i dati per timestamp
    df_normal = df.sort_values(by='Time')

    # Divisione in intervalli di un minuto
    segments = []
    window_duration = pd.Timedelta(milliseconds=100)
    start_time = df_normal['Time'].iloc[0]

    print("Start Time: ", df_normal['Time'].iloc[0])
    print("Finish Time: ", df_normal['Time'].iloc[-1])

    while start_time < df_normal['Time'].iloc[-1]:
        end_time = start_time + window_duration
        segment = df_normal[(df_normal['Time'] >= start_time) & (df_normal['Time'] < end_time)]
        if len(segment) > 0:
            segments.append(segment.drop(columns=['Time', 'label', 'label_n']).reset_index(drop=True))
        start_time = end_time

    # Filtra i segmenti per ottenere solo quelli esattamente di 200 righe
    valid_segments = []
    for segment in segments:
        if len(segment) > 200:
            # Mantieni solo le prime 200 righe
            valid_segments.append(segment.iloc[:200].reset_index(drop=True))
        elif len(segment) == 200:
            # Segmento già valido
            valid_segments.append(segment)

    print(f"Number of valid segments: {len(valid_segments)}")

    # Preprocessing delle feature
    # Separiamo colonne categoriali e numeriche
    categorical_columns = ['mac_s', 'mac_d', 'ip_s', 'ip_d', 'proto', 'flags', 'modbus_fn']
    numerical_columns = ['sport', 'dport', 'size', 'n_pkt_src', 'n_pkt_dst']

    # Creiamo un trasformatore combinato
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
            ('num', MinMaxScaler(), numerical_columns)
        ]
    )

    # Normalizzazione e codifica di ogni segmento
    processed_segments = []
    i = 0
    for segment in valid_segments:
      print(len(segment))
      if i > 3:
        break
      processed = preprocessor.fit_transform(segment)
      processed_segments.append(processed)
      i += 1

    # Ritorniamo i segmenti processati (normalizzati e codificati)
    return processed_segments

In [None]:

def preprocess_and_pad_segments(segments, target_shape):
    """
    Preprocessa i segmenti per garantire che abbiano la stessa forma.
    Applica padding o troncamento per adattarsi a `target_shape`.

    Parameters:
        segments (list of np.ndarray): Lista di segmenti.
        target_shape (tuple): Forma obiettivo (righe, colonne).

    Returns:
        np.ndarray: Array NumPy uniforme con tutti i segmenti della stessa forma.
    """
    padded_segments = []
    for segment in segments:
        # Se il segmento è più lungo, lo tronchiamo
        if segment.shape[0] > target_shape[0]:
            segment = segment[:target_shape[0], :]
        # Se il segmento è più corto, lo riempiamo di zeri
        elif segment.shape[0] < target_shape[0]:
            padding = np.zeros((target_shape[0] - segment.shape[0], segment.shape[1]))
            segment = np.vstack((segment, padding))
        # Aggiungiamo il segmento uniformato
        padded_segments.append(segment)

    return np.array(padded_segments)

## Funzione per il Training

In [26]:
# 3.
def building_and_training(segments_scaled_split):
    # Uniformare i segmenti
    target_shape = (200, segments_scaled_split[0].shape[1])  # 200 righe, numero colonne dal primo segmento
    segments_padded = preprocess_and_pad_segments(segments_scaled_split, target_shape)

    # Callback per visualizzare statistiche al termine dell'addestramento
    class TrainingSummary(Callback):
        def on_train_end(self, logs=None):
            print("\n--- Statistiche Finali ---")
            print(f"Loss finale su training set: {logs['loss']:.4f}")
            if 'val_loss' in logs:
                print(f"Loss finale su validation set: {logs['val_loss']:.4f}")

    # Definisco l'input shape
    input_shape = segments_scaled_split[0].shape  # Forma di un segmento

    # Definisco la struttura dell'autoencoder
    input_layer = Input(shape=input_shape)
    x = Flatten()(input_layer)
    x = Dense(256)(x)
    x = LeakyReLU(alpha=0.1)(x)
    x = BatchNormalization()(x)
    x = Dropout(0.1)(x)  # Dropout 10%
    x = Dense(128)(x)
    x = LeakyReLU(alpha=0.1)(x)
    x = BatchNormalization()(x)
    x = Dropout(0.1)(x)  # Dropout 10%
    encoded = Dense(64)(x)
    encoded = LeakyReLU(alpha=0.1)(encoded)
    x = Dense(128)(encoded)
    x = LeakyReLU(alpha=0.1)(x)
    x = BatchNormalization()(x)
    x = Dropout(0.1)(x)  # Dropout 10%
    x = Dense(256)(x)
    x = LeakyReLU(alpha=0.1)(x)
    x = BatchNormalization()(x)
    x = Dropout(0.1)(x)  # Dropout 10%
    x = Dense(np.prod(input_shape), activation="sigmoid")(x)
    x = BatchNormalization()(x)
    decoded = Reshape(input_shape)(x)

    # Creo il Modello
    autoencoder = Model(inputs=input_layer, outputs=decoded)
    autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss="mse")

    early_stopping = EarlyStopping(
        monitor="val_loss", patience=10, restore_best_weights=True
    )

    # Addestramento dell'autoencoder sui dati "normali", senza anomalie
    history = autoencoder.fit(
        np.array(segments_scaled_split), np.array(segments_scaled_split),
        epochs=100, batch_size=16, shuffle=True, validation_split=0.1,
        callbacks=[TrainingSummary(), early_stopping]
    )

    # Stampo statistiche finali direttamente dal dizionario `history.history`

    print("\n--- Risultati Finali ---")
    print(f"Training Loss: {history.history['loss'][-1]:.4f}")
    print(f"Validation Loss: {history.history['val_loss'][-1]:.4f}")

    return autoencoder


## Calcolo dell'Errore di Ricostruzione

In [27]:
# 4.
def get_rebuilding_error(autoencoder, segments_scaled_split):
    # Ricostruzione dei dati di addestramento
    reconstructed_train = autoencoder.predict(np.array(segments_scaled_split))
    mse_train = np.mean(np.power(segments_scaled_split - reconstructed_train, 2), axis=(1, 2))

    # Imposto il threshold al 95° percentile dell'errore
    threshold = np.percentile(mse_train, 95)
    print("Soglia di errore di ricostruzione:", threshold)

    return threshold

## Test del modello

In [28]:
# 5.
def testing(path, scaler, autoencoder, threshold):
    # Carico il CSV con diversa codifica
    df_anomalous = pd.read_csv(path, sep='\t', encoding='utf-16')
    df_anomalous = df_anomalous.drop(columns=["Label"])

    df_anomalous['Time'] = pd.to_datetime(df_anomalous['Time'])
    df_anomalous = df_anomalous.sort_values(by='Time')

    # Segmentazione del file di test in blocchi di tot minuti
    test_segments = []
    start_time = df_anomalous['Time'].iloc[0]

    # Definisco la finestra di tot minuti
    window_duration = pd.Timedelta(minutes=1)

    while start_time < df_anomalous['Time'].iloc[-1]:
        end_time = start_time + window_duration
        segment = df_anomalous[(df_anomalous['Time'] >= start_time) & (df_anomalous['Time'] < end_time)]
        if len(segment) > 0:
            test_segments.append(segment.drop(columns=['Time']).values)
        start_time = end_time

    # Normalizzazione dei segmenti di test usando lo scaler già addestrato
    test_segments_scaled = [scaler.transform(segment) for segment in test_segments]

    # Mantengo solo i segmenti che hanno lunghezza pari a 60 o 120 in base a se divido in intervalli di tot minuti.
    uniform_segments = [segment for segment in test_segments_scaled if len(segment) == 60]

    for seg in uniform_segments:
        print(seg)

    # Ricostruzione e calcolo dell'errore per ogni segmento di test
    reconstructed_test = autoencoder.predict(np.array(uniform_segments))
    mse_test = np.mean(np.power(uniform_segments - reconstructed_test, 2), axis=(1, 2))

    # Identificazione delle anomalie nei blocchi di test
    anomalies = mse_test > threshold
    print("Numero di blocchi anomali rilevati:", np.sum(anomalies))
    print(f"Blocchi anomali: {anomalies}")

## Esecuzione del processo

In [30]:
segments = preprocess_network_data_50("normal_reduced_0005.csv")
print("Numero di segmenti:", len(segments))

Processing segment 1 with 50 rows
Processing segment 2 with 50 rows
Processing segment 3 with 50 rows
Processing segment 4 with 50 rows
Numero di segmenti: 4


In [None]:
autoencoder = building_and_training(segments)



ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (6,) + inhomogeneous part.

In [None]:
threshols = get_rebuilding_error(autoencoder, segments)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 222ms/step
Soglia di errore di ricostruzione: 0.18992139808024558


In [None]:
testing("attack_1.csv", scaler, autoencoder, threshols)