Library

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import matplotlib.pyplot as plt
import os

# Optional: Enable CUDA synchronous error reporting for debugging
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

Load Dataset & Normalize

In [None]:
# Load the dataset
file_path = '/home/abdul_desktop/jupyterENV/notebooks/CICIDS2017_dataset/cicids2017.csv'
data = pd.read_csv(file_path)

# Inspect columns to identify the correct label column
print("Columns in the dataset:", data.columns)

# Locate the label column. The two spellings this notebook has always accepted
# are tried first so existing files resolve exactly as before; otherwise fall
# back to any column named "label" once whitespace and case are ignored, and
# fail with an explicit message instead of a bare KeyError.
if ' Label' in data.columns:
    label_column = ' Label'
elif 'label' in data.columns:
    label_column = 'label'
else:
    label_candidates = [col for col in data.columns if str(col).strip().lower() == 'label']
    if not label_candidates:
        raise KeyError(
            f"No label column found in {file_path}; available columns: {list(data.columns)}"
        )
    label_column = label_candidates[0]
labels = data[label_column].astype(str).str.strip()  # Ensure labels are strings without spaces

# Extract features by dropping the label column
features = data.drop(columns=[label_column])

# Handle missing values: infinities are treated as missing, then every gap is
# filled with its column mean.
features = features.replace([np.inf, -np.inf], np.nan)
features = features.fillna(features.mean())
# A column that is entirely NaN has a NaN mean and survives the fill above;
# MinMaxScaler would pass those NaNs straight through, so zero-fill what is left.
features = features.fillna(0.0)

# Normalize features to [0, 1].
# NOTE(review): the scaler is fitted on every row of the file, i.e. before any
# train/validation/test split happens — confirm this leakage is acceptable.
scaler = MinMaxScaler()
features_scaled = scaler.fit_transform(features)

# Encode labels as integer class indices
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Verify label encoding
print("Unique labels after encoding:", np.unique(labels_encoded))

# Reshape data for Transformer: overlapping windows of `sequence_length` rows
sequence_length = 10
num_samples = len(features_scaled) - sequence_length
if num_samples <= 0:
    raise ValueError(
        f"Dataset has {len(features_scaled)} rows; more than {sequence_length} "
        "are required to build at least one sequence"
    )
X = np.array([features_scaled[i:i + sequence_length] for i in range(num_samples)])
# NOTE(review): y[i] is the label of row i + sequence_length, i.e. the row
# immediately AFTER window X[i], not a row inside it — confirm this is intended.
y = labels_encoded[sequence_length:]

# Verify that labels are within the expected range
print("Unique labels in y:", np.unique(y))

# Determine the number of classes
num_classes = len(np.unique(y))
print(f"Number of classes: {num_classes}")

Split

In [None]:
# Two-stage stratified split: first hold out 20% of all windows for testing,
# then hold out 20% of what remains for validation. Both stages share the same
# fraction and seed, so the options are declared once.
# NOTE(review): consecutive windows overlap, so a random split places
# near-identical windows on both sides — confirm this is acceptable.
split_options = {"test_size": 0.2, "random_state": 42}

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, **split_options
)
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train, stratify=y_train, **split_options
)

print(f"Train shape: {X_train_split.shape}, Validation shape: {X_val.shape}, Test shape: {X_test.shape}")

Convert & DataLoaders

In [None]:
# For each split: convert the NumPy arrays to tensors (float32 features,
# int64 class indices), pair them in a TensorDataset, and wrap that in a
# DataLoader that yields batches of 64.

# Training split — the only one whose batches are reshuffled every epoch
X_train_tensor = torch.tensor(X_train_split, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_split, dtype=torch.long)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Validation split — fixed order
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Test split — fixed order
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

Check GPU

In [None]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


Define Custom Transformer Model

In [None]:
# Define Custom MultiheadAttention Layer
class CustomMultiheadAttention(nn.Module):
    """Multi-head attention whose output is passed through a ReLU.

    Thin wrapper around ``nn.MultiheadAttention``; the attention weights
    returned by the wrapped module are discarded.

    Args:
        embed_dim: Size of the feature dimension of query/key/value.
        num_heads: Number of attention heads.
        dropout: Dropout probability forwarded to ``nn.MultiheadAttention``.
    """

    def __init__(self, embed_dim, num_heads, dropout=0.1):
        super().__init__()
        self.multihead_attn = nn.MultiheadAttention(
            embed_dim, num_heads, dropout=dropout
        )

    def forward(self, query, key, value):
        """Return ``relu(attention(query, key, value))``.

        The wrapped module is built without ``batch_first``, so inputs are
        presumably laid out as (seq_len, batch, embed_dim) — the caller in
        this file permutes to that layout before invoking the layers.
        """
        # Index 0 is the attended output; index 1 (the weights) is not needed.
        attended = self.multihead_attn(query, key, value)[0]
        return torch.relu(attended)

# Custom Transformer Encoder Layer
class CustomTransformerEncoderLayer(nn.Module):
    """One encoder layer: self-attention, then a position-wise feed-forward net.

    Each sub-layer's output goes through dropout, is added back to its input
    (residual connection), and the sum is layer-normalised. The feed-forward
    net expands to ``4 * embed_dim`` hidden units with a ReLU in between.

    Args:
        embed_dim: Feature size of the input and output.
        num_heads: Number of attention heads.
        dropout: Dropout probability used by every dropout in the layer.
    """

    def __init__(self, embed_dim, num_heads, dropout=0.1):
        super().__init__()
        hidden_dim = embed_dim * 4

        # Attention sub-layer
        self.self_attn = CustomMultiheadAttention(embed_dim, num_heads, dropout)

        # Feed-forward sub-layer: embed_dim -> hidden_dim -> embed_dim
        self.linear1 = nn.Linear(embed_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(hidden_dim, embed_dim)

        # One LayerNorm and one residual dropout per sub-layer
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = nn.ReLU()

    def forward(self, src):
        """Apply both sub-layers to ``src`` and return a tensor of the same shape."""
        # Self-attention sub-layer with residual connection and normalisation
        attended = self.self_attn(src, src, src)
        src = self.norm1(src + self.dropout1(attended))

        # Feed-forward sub-layer with residual connection and normalisation
        hidden = self.activation(self.linear1(src))
        projected = self.linear2(self.dropout(hidden))
        return self.norm2(src + self.dropout2(projected))

# Custom Transformer Model
class CustomTransformerModel(nn.Module):
    """Sequence classifier: stacked encoder layers, mean pooling, linear head.

    Args:
        input_dim: Number of features per time step; also used as the
            embedding size of every encoder layer.
        num_heads: Attention heads per encoder layer.
        num_layers: How many encoder layers to stack.
        output_dim: Number of output scores produced per sequence.
        dropout: Dropout probability passed to every encoder layer.
    """

    def __init__(self, input_dim, num_heads, num_layers, output_dim, dropout=0.1):
        super().__init__()
        layers = []
        for _ in range(num_layers):
            layers.append(CustomTransformerEncoderLayer(input_dim, num_heads, dropout))
        self.encoder_layers = nn.ModuleList(layers)
        self.fc = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Map ``x`` of shape (batch_size, seq_len, feature_dim) to (batch_size, output_dim)."""
        # The encoder layers work sequence-first: (seq_len, batch_size, feature_dim)
        hidden = x.permute(1, 0, 2)
        for layer in self.encoder_layers:
            hidden = layer(hidden)
        # Average over the sequence axis -> (batch_size, feature_dim)
        pooled = hidden.mean(dim=0)
        # Project to per-class scores -> (batch_size, output_dim)
        return self.fc(pooled)

Hyperparameters

In [None]:
# Hyperparameters
input_dim = X_train_tensor.shape[2]  # Number of features per time step
num_heads = 2                        # Number of attention heads (input_dim is used as the embedding size)
num_layers = 2                       # Number of transformer layers
output_dim = num_classes             # Number of classes
dropout = 0.1
num_epochs = 50                      # You can increase this as needed

print(f"Input Dimension: {input_dim}, Output Dimension: {output_dim}")

# Initialize the model, criterion, optimizer, and scheduler
model = CustomTransformerModel(input_dim=input_dim, num_heads=num_heads, num_layers=num_layers, output_dim=output_dim, dropout=dropout).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.01)
# Cut the learning rate by 10x after 3 epochs without a lower monitored loss.
# `verbose` is deliberately not passed: the keyword is deprecated since
# PyTorch 2.2, warns when supplied, and fails on releases that removed it.
# Inspect optimizer.param_groups[0]['lr'] to observe learning-rate reductions.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3)

# Early stopping parameters
early_stopping_patience = 5      # Epochs without improvement tolerated before stopping
best_val_loss = float('inf')     # Lowest validation loss seen so far
patience_counter = 0             # Consecutive epochs without improvement

# Lists to store loss and accuracy (one entry per completed epoch)
train_losses, val_losses, train_accuracies, val_accuracies = [], [], [], []

Train & Val

In [None]:
# Training and validation loop: each epoch runs one optimisation pass over the
# training loader and one gradient-free pass over the validation loader, then
# feeds the validation loss to the LR scheduler and the early-stopping check.
for epoch in range(num_epochs):
    # ---- Training phase ----
    model.train()
    running_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    for inputs, targets in train_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # The criterion returns a batch mean; weight it by batch size so the
        # epoch figure is a true per-sample average even with a short last batch.
        running_loss += loss.item() * inputs.size(0)
        predicted = outputs.max(dim=1).indices
        correct_predictions += (predicted == targets).sum().item()
        total_samples += targets.size(0)

    train_loss = running_loss / total_samples
    train_accuracy = correct_predictions / total_samples
    train_losses.append(train_loss)
    train_accuracies.append(train_accuracy)

    # ---- Validation phase ----
    model.eval()
    val_running_loss = 0.0
    val_correct_predictions = 0
    val_total_samples = 0

    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            val_running_loss += loss.item() * inputs.size(0)
            predicted = outputs.max(dim=1).indices
            val_correct_predictions += (predicted == targets).sum().item()
            val_total_samples += targets.size(0)

    val_loss = val_running_loss / val_total_samples
    val_accuracy = val_correct_predictions / val_total_samples
    val_losses.append(val_loss)
    val_accuracies.append(val_accuracy)

    # Print metrics for each epoch
    print(f'Epoch {epoch + 1}/{num_epochs} - '
          f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f} - '
          f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')

    # Let the scheduler react to this epoch's validation loss
    scheduler.step(val_loss)

    # Early stopping: an improved validation loss resets the counter and
    # checkpoints the weights; otherwise count the stalled epoch.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pth')
        continue

    patience_counter += 1
    if patience_counter >= early_stopping_patience:
        print("Early stopping triggered")
        break

Evaluate & Plotting

In [None]:

# Load the best checkpoint written during training. map_location remaps the
# stored tensors onto the active device, so a checkpoint saved on a GPU can
# still be restored when this cell runs on a CPU-only machine (and vice versa).
model.load_state_dict(torch.load('best_model.pth', map_location=device))

# Test the model: accuracy over the held-out test loader, without gradients
model.eval()
test_correct_predictions, test_total_samples = 0, 0

with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        # The predicted class is the index of the largest score in each row
        _, predicted = torch.max(outputs, 1)
        test_correct_predictions += (predicted == targets).sum().item()
        test_total_samples += targets.size(0)

test_accuracy = test_correct_predictions / test_total_samples
print(f'Test Accuracy: {test_accuracy:.4f}')

# Plot the per-epoch training history as two side-by-side panels:
# loss on the left, accuracy on the right. Each row of `panels` describes one
# panel: (train series, train legend, val series, val legend, y label, title).
plt.figure(figsize=(12, 5))

panels = (
    (train_losses, 'Train Loss', val_losses, 'Validation Loss',
     'Loss', 'Training and Validation Loss'),
    (train_accuracies, 'Train Accuracy', val_accuracies, 'Validation Accuracy',
     'Accuracy', 'Training and Validation Accuracy'),
)

for position, panel in enumerate(panels, start=1):
    train_series, train_label, val_series, val_label, y_label, title = panel
    plt.subplot(1, 2, position)
    plt.plot(train_series, label=train_label)
    plt.plot(val_series, label=val_label)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()

plt.tight_layout()
plt.show()
