In [1]:
import torch.nn as nn
import pandas as pd
import numpy as np
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch import Tensor
from torchsummary import summary
import random
from numpy import load
from tqdm import tqdm, trange
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
import pickle
import os
from collections import Counter
import torch.optim as optim
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from torch.utils.data import DataLoader, TensorDataset

%run './Attention_based_model.ipynb'
# check the availability of cuda
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device: ", device, f"({torch.cuda.get_device_name(device)})" if torch.cuda.is_available() else "")

Tue Jul  2 13:49:30 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4080        Off | 00000000:01:00.0  On |                  N/A |
|  0%   38C    P8               7W / 320W |   4548MiB / 16376MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
with open('./new_dataset/CIC2018-dataset-all-benign-1000.pkl', 'rb') as f:
    test_data = pickle.load(f)  
test_loader = DataLoader(test_data, batch_size=32, shuffle=True, pin_memory=True)

In [9]:
data_points = []
labels = []

with torch.no_grad():
    # progress bar
    data_iter = tqdm.tqdm(
        enumerate(test_loader),
        total=len(test_loader),
        bar_format="{l_bar}{r_bar}"
    )
    for i, data in data_iter:  # Assuming you have an inference data loader
        data = {key: value.to(device) for key, value in data.items()}
        inputs, label = data["netformer_input"], data["sequence_label"]
        
        inputs = inputs.cpu().numpy()
        label = label.cpu().numpy()
        
        # Collect the data points and labels
        for j in range(inputs.shape[0]):
            data_points.append(inputs[j])
            labels.append(label[j])

# Convert lists to numpy arrays
data_points_np = np.array(data_points)
labels_np = np.array(labels)

# Optionally save to files
np.save('data_for_baselines_benign.npy', data_points_np)
np.save('labels_for_baselines_benign.npy', labels_np)

# Print shapes to verify
print('Data points shape:', data_points_np.shape)
print('Labels shape:', labels_np.shape)

100%|| 961/961 [00:01<00:00, 547.64it/s]

Data points shape: (30750, 200)
Labels shape: (30750,)





In [4]:
## Load the training and test datasets
print("Loading data...")
train_data_points = np.load('data_points_benign_training.npy')
train_labels = np.load('labels_benign_training.npy')
test_data_points = np.load('data_points_testing.npy')
test_labels = np.load('labels_testing.npy')

print('Training')
print(train_data_points.shape)
print(train_labels.shape)
print(Counter(train_labels))

print('testing')
print(test_data_points.shape)
print(test_labels.shape)
print(Counter(test_labels))


# Verify and reshape the data if necessary
print("Reshaping data if necessary...")
if len(train_data_points.shape) == 3:
    train_data_points = train_data_points.reshape(train_data_points.shape[0], -1)  # Flatten to (n_samples, n_features)
if len(test_data_points.shape) == 3:
    test_data_points = test_data_points.reshape(test_data_points.shape[0], -1)  # Flatten to (n_samples, n_features)

# Scale the data
scaler = MinMaxScaler()
train_data_points_scaled = scaler.fit_transform(train_data_points)
test_data_points_scaled = scaler.transform(test_data_points)

# Convert labels to binary (1 for anomaly, -1 for normal)
train_labels_binary = np.where(train_labels == 1, 1, -1)
test_labels_binary = np.where(test_labels == 1, 1, -1)

# Initialize results dataframe
results = pd.DataFrame(columns=['Model', 'F1 Score', 'Accuracy', 'Precision', 'Recall', 'AUC'])

# Prepare data for PyTorch
train_data_tensor = torch.tensor(train_data_points_scaled, dtype=torch.float32)
test_data_tensor = torch.tensor(test_data_points_scaled, dtype=torch.float32)
train_labels_tensor = torch.tensor(train_labels_binary, dtype=torch.float32)
test_labels_tensor = torch.tensor(test_labels_binary, dtype=torch.float32)
train_dataset = TensorDataset(train_data_tensor, train_labels_tensor)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

Loading data...
Training
(7565, 1000, 10)
(7565,)
Counter({0.0: 7565})
testing
(10000, 1000, 10)
(10000,)
Counter({0.0: 5000, 1.0: 5000})
Reshaping data if necessary...


In [5]:
# Autoencoder Model
class Autoencoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, input_dim),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# Variational Autoencoder Model
class VariationalAutoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(VariationalAutoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU()
        )
        self.z_mean = nn.Linear(64, latent_dim)
        self.z_log_var = nn.Linear(64, latent_dim)
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        h = self.encoder(x)
        z_mean = self.z_mean(h)
        z_log_var = self.z_log_var(h)
        std = torch.exp(0.5 * z_log_var)
        eps = torch.randn_like(std)
        z = z_mean + eps * std
        x_decoded = self.decoder(z)
        return x_decoded, z_mean, z_log_var

# LSTM Autoencoder Model
class LSTMAutoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers):
        super(LSTMAutoencoder, self).__init__()
        self.encoder = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.decoder = nn.LSTM(hidden_dim, input_dim, num_layers, batch_first=True)
    
    def forward(self, x):
        _, (hidden, _) = self.encoder(x)
        decoded, _ = self.decoder(hidden.repeat(x.size(1), 1, 1).permute(1, 0, 2))
        return decoded

# Function to train and evaluate the models
def train_model(model, dataloader, num_epochs=50):
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    for epoch in range(num_epochs):
        for data, _ in dataloader:
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, data)
            loss.backward()
            optimizer.step()

def evaluate_model(model, data):
    with torch.no_grad():
        reconstructions = model(data).numpy()
        mse_loss = np.mean(np.power(data.numpy() - reconstructions, 2), axis=1)
    return mse_loss


In [7]:
# Training LSTM Autoencoder
print("Training LSTM Autoencoder...")
timesteps = 10
train_data_points_reshaped = train_data_points_scaled.reshape((train_data_points_scaled.shape[0], timesteps, train_data_points_scaled.shape[1] // timesteps))
test_data_points_reshaped = test_data_points_scaled.reshape((test_data_points_scaled.shape[0], timesteps, test_data_points_scaled.shape[1] // timesteps))
train_data_tensor_reshaped = torch.tensor(train_data_points_reshaped, dtype=torch.float32)
test_data_tensor_reshaped = torch.tensor(test_data_points_reshaped, dtype=torch.float32)
train_dataset_reshaped = TensorDataset(train_data_tensor_reshaped, train_labels_tensor)
train_dataloader_reshaped = DataLoader(train_dataset_reshaped, batch_size=32, shuffle=True)

input_dim = train_data_points_scaled.shape[1] // timesteps
hidden_dim = 100
num_layers = 1
lstm_ae = LSTMAutoencoder(input_dim, hidden_dim, num_layers)
train_model(lstm_ae, train_dataloader_reshaped)

# Evaluating LSTM Autoencoder
with torch.no_grad():
    reconstructions = lstm_ae(test_data_tensor_reshaped).numpy()
    mse_loss = np.mean(np.power(test_data_tensor_reshaped.numpy() - reconstructions, 2), axis=(1, 2))
threshold = np.percentile(mse_loss, 95)
lstm_ae_pred = (mse_loss > threshold).astype(int)
lstm_ae_pred = np.where(lstm_ae_pred == 1, 1, -1)

Training LSTM Autoencoder...


In [5]:
# Function to calculate performance metrics
def calculate_metrics(y_true, y_pred, model_name):
    print(f"Calculating metrics for {model_name}...")
    f1 = f1_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)
    return pd.Series([model_name, f1, accuracy, precision, recall, auc], 
                     index=['Model', 'F1 Score', 'Accuracy', 'Precision', 'Recall', 'AUC'])

In [6]:
results = pd.concat([results, calculate_metrics(test_labels_binary, lstm_ae_pred, 'LSTM Autoencoder').to_frame().T], ignore_index=True)

# Display results
print("Displaying results...")
print(results)

# Save results to a CSV file
print("Saving results to CSV file...")
results.to_csv('deep_learning_anomaly_detection_baseline_results.csv', index=False)

print("Script execution completed.")

Calculating metrics for LSTM Autoencoder...
Displaying results...
              Model F1 Score Accuracy Precision Recall   AUC
0  LSTM Autoencoder      0.0     0.45       0.0    0.0  0.45
Saving results to CSV file...
Script execution completed.


In [None]:
              Model  F1 Score Accuracy Precision  Recall     AUC
0       Autoencoder  0.077818   0.4928     0.428  0.0428  0.4928
1  LSTM Autoencoder  0.152727    0.534      0.84   0.084   0.534
              Model F1 Score Accuracy Precision Recall   AUC
0  LSTM Autoencoder      0.0     0.45       0.0    0.0  0.45
Saving results to CSV file...
Script execution completed.

In [14]:
# Training Autoencoder
print("Training Autoencoder...")
input_dim = train_data_points_scaled.shape[1]
encoding_dim = 32
ae = Autoencoder(input_dim, encoding_dim)
train_model(ae, train_dataloader)
print("Evaluating Autoencoder...")
# Evaluating Autoencoder
mse_loss = evaluate_model(ae, test_data_tensor)
threshold = np.percentile(mse_loss, 95)
ae_pred = (mse_loss > threshold).astype(int)
ae_pred = np.where(ae_pred == 1, 1, -1)

Training Autoencoder...
Evaluating Autoencoder...


In [15]:
print("Calculating and storing results...")
results = pd.concat([results, calculate_metrics(labels_binary, ae_pred, 'Autoencoder').to_frame().T], ignore_index=True)
print(results)

Calculating and storing results...


NameError: name 'labels_binary' is not defined

In [5]:
# Function to train and evaluate the Variational Autoencoder model
def train_vae_model(model, dataloader, num_epochs=50):
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    for epoch in range(num_epochs):
        for data, _ in dataloader:
            optimizer.zero_grad()
            x_decoded, z_mean, z_log_var = model(data)
            recon_loss = nn.functional.mse_loss(x_decoded, data, reduction='sum')
            kl_loss = -0.5 * torch.sum(1 + z_log_var - z_mean**2 - torch.exp(z_log_var))
            loss = recon_loss + kl_loss
            loss.backward()
            optimizer.step()

def evaluate_vae_model(model, data):
    with torch.no_grad():
        reconstructions, _, _ = model(data)
        mse_loss = np.mean(np.power(data.numpy() - reconstructions.numpy(), 2), axis=1)
    return mse_loss


# Training Variational Autoencoder
print("Training Variational Autoencoder...")
latent_dim = 2
vae = VariationalAutoencoder(input_dim, latent_dim)
train_vae_model(vae, train_dataloader)
print("Evaluating Variational Autoencoder...")
# Evaluating Variational Autoencoder
mse_loss = evaluate_vae_model(vae, test_data_tensor)
threshold = np.percentile(mse_loss, 95)
vae_pred = (mse_loss > threshold).astype(int)
vae_pred = np.where(vae_pred == 1, 1, -1)

Training Variational Autoencoder...


NameError: name 'input_dim' is not defined