# Laboratory 3 — Anomaly Detection

text

## Setup

In [1]:
# --- Check Python and pip versions ---
!python --version
!pip install --upgrade pip

Python 3.12.12
Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.3


In [None]:
# --- Install required libraries ---
!pip install torch
!pip install numpy pandas scikit-learn matplotlib seaborn
!pip install tqdm

Collecting torch_geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
Downloading torch_geometric-2.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m11.2 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.7.0


In [1]:
# --- Import libraries ---
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import math

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, log_loss
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from tqdm import tqdm

### Colab Pro

In [2]:
# --- Check GPU availability ---
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

zsh:1: command not found: nvidia-smi


In [3]:
# --- Check RAM availability ---
from psutil import virtual_memory

# `.total` divided by 1e9 -> size expressed in decimal gigabytes
ram_gb = virtual_memory().total / 1e9
print(f'Your runtime has {ram_gb:.1f} gigabytes of available RAM\n')

# Anything from 20 GB upwards is reported as a high-RAM runtime
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

Your runtime has 8.6 gigabytes of available RAM

Not using a high-RAM runtime


### Paths setup


In [6]:
# --- Mount Google Drive (for Google Colab users) ---
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime, so this cell is expected to fail elsewhere — confirm before running locally.
from google.colab import drive
# '/content/drive' is the root under which `base_path` ('/content/drive/MyDrive/')
# is defined in the paths cell.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# --- Define Paths ---
group = 'AImSecure'
laboratory = 'Laboratory3'

# Every path keeps its trailing slash: later cells build sub-paths by plain
# string concatenation (e.g. results_path + 'images/' + ...).
base_path = '/content/drive/MyDrive/'
project_path = f'{base_path}Projects/{group}/{laboratory}/'
data_path = f'{project_path}data/'
results_path = f'{project_path}results/'

# Ensure directories exist (exist_ok makes re-running the cell harmless)
for _directory in (project_path, data_path, results_path):
    os.makedirs(_directory, exist_ok=True)

# Report the resolved locations
for _label, _location in (("Project", project_path), ("Data", data_path), ("Results", results_path)):
    print(f"{_label} path: {_location}")

Project path: /content/drive/MyDrive/Projects/AImSecure/Laboratory2/
Data path: /content/drive/MyDrive/Projects/AImSecure/Laboratory2/data/
Results path: /content/drive/MyDrive/Projects/AImSecure/Laboratory2/results/


In [8]:
# --- Set visual style ---
# Notebook-wide seaborn defaults: "whitegrid" style, "muted" colour palette,
# and font sizes scaled by a factor of 1.1.
sns.set(style="whitegrid", palette="muted", font_scale=1.1)

def save_plot(fig: plt.Figure, filename: str, path: str = "./plots/", fmt: str = "png", dpi: int = 300, close_fig: bool = False) -> None:
    """
    Save a Matplotlib figure to a directory in the requested file format.

    The output file is ``<path>/<filename>.<fmt>``. If ``filename`` already ends
    with ``.<fmt>`` (case-insensitive) the extension is not appended a second
    time, so both ``'plot'`` and ``'plot.png'`` produce ``plot.png``.

    Args:
        fig (plt.Figure): Matplotlib figure object to save.
        filename (str): Name of the output file, with or without the
            ``.<fmt>`` extension (e.g., 'plot' or 'plot.png').
        path (str, optional): Directory to save the figure in; it is created
            if it does not exist. Defaults to './plots/'.
        fmt (str, optional): File format for the saved figure. Defaults to 'png'.
        dpi (int, optional): Dots per inch for the saved figure. Defaults to 300.
        close_fig (bool, optional): If True, close the figure after saving.
            Defaults to False, which leaves the figure open so it is still
            displayed in the notebook.

    Returns:
        None
    """
    # Ensure the directory exists
    os.makedirs(path, exist_ok=True)

    # Append the extension only when it is missing, so a caller passing
    # 'plot.png' does not end up with 'plot.png.png'.
    extension = f".{fmt}"
    if not filename.lower().endswith(extension.lower()):
        filename = f"{filename}{extension}"
    save_path = os.path.join(path, filename)

    # Save the figure with a tight bounding box and a small padding
    fig.savefig(save_path, bbox_inches='tight', pad_inches=0.1, dpi=dpi, format=fmt)

    # Closing is opt-in: by default the figure stays open for inline display
    if close_fig:
        plt.close(fig)

    print(f"Saved plot: {save_path}")

## Task 1 — Dataset Characterization and Preprocessing

text


In [9]:
# Output folder for the Task 1 figures (created on first run)
save_dir = f"{results_path}images/task1_plots/"
os.makedirs(save_dir, exist_ok=True)

### Explore the dataset

Before preprocessing, we explore the data to understand the available features.

In [None]:
# [CODE BLOCK]
# Code to explore the dataset
# e.g., train_df.info()
# e.g., train_df.describe()
# Identify categorical and numerical features
# Check distribution of 'attack_label' and 'binary_label'

#### Q: What are the dataset characteristics? How many categorical and numerical attributes do you have? How are your attack labels and binary label distributed?

TODO

### Preprocessing

Preprocess features before performing any AI/ML algorithms.

In [None]:
# [CODE BLOCK]
# Implement preprocessing pipeline
# e.g., Use StandardScaler for numerical features
# e.g., Use OneHotEncoder for categorical features
# Fit the preprocessors on train.csv and transform both train.csv and test.csv

#### Q: How do you preprocess categorical and numerical data?

TODO

### Study your data from a domain expert perspective

We will plot heatmaps that describe the statistical characteristics of each feature for each attack label. We consider 0/1 features as numerical.

In [None]:
# [CODE BLOCK]
# 1. Mean heatmap
# Group data by 'attack_label' and calculate the mean of each feature
# Plot the heatmap

In [None]:
# [CODE BLOCK]
# 2. Standard Deviation heatmap
# Group data by 'attack_label' and calculate the standard deviation
# Plot the heatmap

In [None]:
# [CODE BLOCK]
# 3. Median Heatmap
# Group data by 'attack_label' and calculate the median
# Plot the heatmap

#### Q: Looking at the different heatmaps, do you find any main characteristics that are strongly correlated with a specific attack?

TODO

## Task 2 - Shallow Anomaly Detection - Supervised vs Unsupervised

text

In [28]:
# Output folder for the Task 2 figures (created on first run)
save_dir = f"{results_path}images/task2_plots/"
os.makedirs(save_dir, exist_ok=True)

### One-Class SVM with Normal data only

First, train a One-Class Support Vector Machine (OC-SVM) with benign (normal) traffic only using an rbf kernel. Then, evaluate the performance using all training data (normal + anomalies).

In [None]:
# [CODE BLOCK]
# 1. Filter the preprocessed training data to get 'normal' samples only
# 2. Define two OC-SVM models (rbf kernel):
#    - Model 1: Use your estimated 'nu'
#    - Model 2: Use the default 'nu'
# 3. Train both models on normal data only
# 4. Evaluate both models on the *full* training set (normal + anomalies)
# 5. Report classification report (precision, recall, f1-score)

#### Q: Considering that you are currently training only on normal data, what is a good estimate for the parameter $\nu$? What is the impact on training performance? Try both your estimate and the default value of $\nu$.

TODO

### One-Class SVM with All data

Now train the OC-SVM with both normal and anomalous data. Estimate nu as the ratio of anomalous data across the entire collection. Then, evaluate the performance.

In [None]:
# [CODE BLOCK]
# 1. Calculate 'nu' as the ratio of anomalies in the full training set
# 2. Train a new OC-SVM (rbf kernel) on the *full* training set using this 'nu'
# 3. Evaluate the model on the full training set
# 4. Report classification report

#### Q: Which model performs better? Why do you think that?

TODO

### One-Class SVM with normal traffic and some anomalies

Evaluate the impact of the percentage of anomalies while training. Train several OC-SVMs with an increasing subsample of anomalous classes (10%, 20%, 50%, 100% of anomalies). Estimate the nu parameter for each scenario.

In [None]:
# [CODE BLOCK]
# 1. Get all normal data
# 2. Get all anomalous data
# 3. Create a loop for percentages [0.1, 0.2, 0.5, 1.0]
# 4. In the loop:
#    - Subsample the anomalous data based on the percentage
#    - Combine with all normal data to create a new training subset
#    - Calculate 'nu' for this subset (ratio of anomalies)
#    - Train an OC-SVM on this subset
#    - Evaluate the trained model on the *full* training set (normal + all anomalies)
#    - Store the f1-macro score
# 5. Plot the f1-macro scores against the anomaly percentages

#### Q: Plot the f1-macro score for each scenario. How does the increasing ratio of anomalies affect the results?

TODO

### Robustness of the One-Class SVM model

Finally, use the test set to assess the robustness. Use models trained with:
1. Only normal data
2. All data
3. 10% of anomalous data

In [None]:
# [CODE BLOCK]
# 1. Take the three models trained previously (from point 1, 2, and 3 of Task 2)
# 2. Evaluate each of them on the *preprocessed test set*
# 3. For each model, print the classification report and a confusion matrix

#### Q: Is the best-performing model in the training set also the best here? Does it confuse normal data with anomalies? Which attack is the most confused?

TODO

## Task 3 - Deep Anomaly Detection and Data Representation

text

In [59]:
# Output folder for the Task 3 figures (created on first run)
save_dir = f"{results_path}images/task3_plots/"
os.makedirs(save_dir, exist_ok=True)

### Training and Validating Autoencoder with Normal data only

Create an Autoencoder with a shrinking encoder and an expansion decoder. Use normal data only; split into training and validation sets.

In [None]:
# [CODE BLOCK]
# 1. Get the preprocessed 'normal' data from the training set
# 2. Split this normal data into a new AE-training set and an AE-validation set (e.g., 80/20 split)
# 3. Define the Autoencoder (AE) architecture in PyTorch (e.g., an nn.Module built from nn.Linear layers)
#    - Shrinking encoder
#    - Bottleneck layer
#    - Expansion decoder
# 4. Define the optimizer and the loss (e.g., optim.Adam, nn.MSELoss)
# 5. Train the model using the AE-training set and validate on the AE-validation set
# 6. Use early stopping on the validation loss to find the best number of epochs
# 7. Plot the training and validation loss curves

### Estimate the Reconstruction Error Threshold

Estimate a threshold using the reconstruction error on the validation set. Plot the ECDF curve.

In [None]:
# [CODE BLOCK]
# 1. Get predictions (reconstructions) for the AE-validation set
# 2. Calculate the reconstruction error (e.g., MSE) for each sample in the validation set
# 3. Plot the ECDF (Empirical Cumulative Distribution Function) of these errors
# 4. Choose a threshold based on the ECDF (e.g., 95th or 99th percentile)

#### Q: How did you pick the threshold? What is its value?

TODO

### Anomaly Detection with reconstruction error

Use the trained model and threshold to classify anomalies in the full training set and test set.

In [None]:
# [CODE BLOCK]
# 1. Calculate reconstruction errors for:
#    - i) AE-validation set (already done)
#    - ii) Full training set (normal + anomalies)
#    - iii) Full test set
# 2. Plot the ECDFs for all three sets of errors on one graph
# 3. Using the threshold from the previous step:
#    - Classify anomalies in the full training set
#    - Classify anomalies in the test set
# 4. Report classification reports for both

#### Q: Plot and report the ECDF... Why are the reconstruction errors higher on the full training set than on the validation one? And why are the reconstruction errors in the test set even higher? How is the performance on the training... and test set?

TODO

### Auto-Encoder's bottleneck and OC-SVM

Use the encoder's bottleneck for data representation. Train an OC-SVM on the bottleneck embeddings of the normal data.

In [None]:
# [CODE BLOCK]
# 1. Create an 'encoder' model from the trained AE (inputs -> bottleneck layer)
# 2. Extract bottleneck embeddings for the *normal* training data
# 3. Train a new OC-SVM on these normal data embeddings (like in Task 2.1)
# 4. Extract bottleneck embeddings for the *full test set*
# 5. Use the trained OC-SVM to predict anomalies on the test embeddings
# 6. Report the classification report for the test set

#### Q: Compare the results with the best original OC-SVM and with the Autoencoder with reconstruction error. Describe the performance...

TODO

### PCA and OC-SVM

Use PCA for data representation. Analyze the explained variance on normal data only.

In [None]:
# [CODE BLOCK]
# 1. Use the preprocessed *normal* training data
# 2. Fit PCA on this data (try a large number of components first, e.g., n_components=0.99)
# 3. Plot the cumulative explained variance vs. number of components
# 4. Identify the 'elbow point' or the number of components explaining (e.g.) 95% of the variance
# 5. Fit and transform the normal training data with this 'best' number of components
# 6. Transform the *full test set* with the *same fitted PCA*
# 7. Train an OC-SVM on the PCA-transformed normal training data
# 8. Evaluate this OC-SVM on the PCA-transformed test set
# 9. Report classification report

#### Q: Compare results with the original OC-SVM and the OC-SVM trained using the Encoder embeddings. Describe the performance...

TODO

## Task 4 - Unsupervised Anomaly Detection and Interpretation

text

In [76]:
# Output folder for the Task 4 figures (created on first run)
save_dir = f"{results_path}images/task4_plots/"
os.makedirs(save_dir, exist_ok=True)

### K-means with little domain knowledge

Fit k-means with 4 clusters and the full training data (normal + anomalous). (Normal + 3 attack types = 4 clusters).

In [None]:
# [CODE BLOCK]
# 1. Use the *full preprocessed training data*
# 2. Fit a KMeans model with n_clusters=4
# 3. Get the cluster assignments (labels) for all training data

### K-means cluster interpretation

Examine the clusters to understand their quality.

In [None]:
# [CODE BLOCK]
# 1. Add the K-means cluster labels to the training dataframe (which has the real attack labels)
# 2. Calculate and print the size of each cluster
# 3. Create a contingency table (crosstab) of K-means clusters vs. true 'attack_label'
# 4. Calculate silhouette scores per cluster

#### Q: How big are the clusters? How are the attack labels distributed across the clusters? Are the clusters pure?

TODO

#### Q: How high is the silhouette per cluster? Are there any clusters with a lower silhouette value? If so, which attack labels are present in these clusters?

TODO

#### Q: Use the t-SNE algorithm... Plot and report: i) t-SNE... with cluster ID. ii) t-SNE... with the attack label. 

TODO

In [None]:
# [CODE BLOCK]
# 1. Run t-SNE on the full training data (n_components=2)
#    - Try a few perplexity values (e.g., 30, 50, 100)
# 2. Choose the best-looking perplexity
# 3. Plot 1: 2D t-SNE scatter plot, colored by *K-means cluster ID*
# 4. Plot 2: 2D t-SNE scatter plot, colored by *true attack_label*

#### Q: Can you find a difference between the two visualizations? What are the misinterpreted points?

TODO

### DB-Scan anomalies are anomalies?

Use DB-Scan to detect anomalous patterns.

In [None]:
# [CODE BLOCK]
# 1. Determine 'min_points'
#    - (Justify your choice. e.g., "From the k-means analysis, the smallest 'pure' normal cluster was not found. We will set min_points=... based on domain knowledge/heuristics...") 
# 2. Determine 'eps' (e) using the elbow rule 
#    - Calculate distance to the k-th nearest neighbor (k=min_points)
#    - Sort and plot these distances
#    - Find the 'elbow' (point of max curvature)
# 3. Run DBSCAN on the full training set with your chosen 'eps' and 'min_points' 
# 4. Get the DBSCAN cluster labels (includes -1 for noise)
# 5. Check the noise cluster (label -1)
# 6. Create a crosstab of DBSCAN cluster labels vs. true 'binary_label' (0=normal, 1=anomaly)

#### Q: Create the clustering results... Does the DB-Scan noise cluster (cluster -1) consist only of anomalous points (cross-reference with real attack labels)?

TODO

---