<center><b><font size=6>Language Models exploration<b><center>

<center><b><font size=5>Install Dependencies<b><center>

In [1]:
!python ../scripts/install_dependencies.py section4

[34mInstalling common packages: pandas, pyarrow[0m
[0m[32mSuccessfully installed: pandas[0m
Collecting pyarrow
  Using cached pyarrow-12.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (39.1 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-12.0.1
[0m[32mSuccessfully installed: pyarrow[0m
[0m[34mInstalling Section 4 packages: matplotlib, scikit-learn, torch, transformers[0m
[0m[32mSuccessfully installed: matplotlib[0m
[0m[32mSuccessfully installed: scikit-learn[0m
Collecting torch
  Using cached torch-1.13.1-cp37-cp37m-manylinux1_x86_64.whl (887.5 MB)
Collecting nvidia-cublas-cu11==11.10.3.66
  Using cached nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl (317.1 MB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99
  Using cached nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl (21.0 MB)
Collecting nvidia-cuda-runtime-cu11==11.7.99
  Using cached nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (8

In [4]:
# Standard library imports
import logging   # For logging messages and debugging information
import datetime  # For tracking execution time and timestamps
from datetime import datetime

# Third-party imports for data processing and machine learning
import torch                                           # PyTorch deep learning framework
from torch import nn                                   # Neural network modules
from torch.utils.data import Dataset, DataLoader       # Data handling utilities
import pandas as pd                                    # For DataFrame operations
import numpy as np                                     # For numerical operations
from sklearn.model_selection import train_test_split   # For splitting dataset
from sklearn.preprocessing import MultiLabelBinarizer  # For label encoding
import matplotlib.pyplot as plt                        # For plotting learning curves
from tqdm import tqdm                                  # For progress bars

# Hugging Face transformers imports
from transformers import BertModel, BertTokenizer, AdamW  # BERT model and utilities

<center><b><font size=5>Training<b><center>

In [5]:
# Configure logging to show timestamp, log level, and message
logging.basicConfig(
    level=logging.INFO,  # Show all info messages and above (info, warning, error, critical)
    format='%(asctime)s - %(levelname)s - %(message)s',  # Format: timestamp - level - message
    datefmt='%Y-%m-%d %H:%M:%S'  # Date format for the timestamp
)

# Log the start of the program and verify logging is working
logging.info("Starting program and initializing imports...")

2025-01-10 12:25:44 - INFO - Starting program and initializing imports...


In [6]:
class ShellAttackDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        # Log dataset statistics
        logging.info(f"Created dataset with {len(texts)} samples")
        logging.info(f"Number of labels: {labels.shape[1]}")
        
        # Calculate and log average sequence length
        avg_len = np.mean([len(" ".join(text).split()) for text in texts])
        logging.info(f"Average sequence length (words): {avg_len:.2f}")
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = " ".join(self.texts[idx])
        
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.FloatTensor(self.labels[idx])
        }

In [7]:
class BertClassifier(nn.Module):
    def __init__(self, num_labels):
        super(BertClassifier, self).__init__()
        # Load pre-trained BERT
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        
        # Add custom classification head
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
        self.sigmoid = nn.Sigmoid()
        
        # Log model architecture details
        logging.info(f"Initialized BERT Classifier with:")
        logging.info(f"- BERT hidden size: {self.bert.config.hidden_size}")
        logging.info(f"- Number of labels: {num_labels}")
        logging.info(f"- Dropout rate: 0.1")
    
    def forward(self, input_ids, attention_mask):
        # Get BERT outputs
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        
        # Use the [CLS] token representation
        pooled_output = outputs.pooler_output
        
        # Apply dropout and classification
        x = self.dropout(pooled_output)
        x = self.classifier(x)
        return self.sigmoid(x)

In [8]:
def train_model(model, train_loader, val_loader, device, num_epochs=10):
    # Initialize optimizer
    optimizer = AdamW(model.parameters(), lr=2e-5)
    
    # Binary Cross Entropy loss for multi-label classification
    criterion = nn.BCELoss()
    
    # Store metrics
    train_losses = []
    val_losses = []
    
    # Calculate total number of training steps
    total_steps = len(train_loader) * num_epochs
    logging.info(f"Starting training with {total_steps} total steps")
    
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        total_train_loss = 0
        train_steps = 0
        
        epoch_start_time = datetime.now()
        
        for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            optimizer.zero_grad()
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)
            
            loss.backward()
            optimizer.step()
            
            total_train_loss += loss.item()
            train_steps += 1
        
        avg_train_loss = total_train_loss / train_steps
        train_losses.append(avg_train_loss)
        
        # Validation phase
        model.eval()
        total_val_loss = 0
        val_steps = 0
        
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                loss = criterion(outputs, labels)
                
                total_val_loss += loss.item()
                val_steps += 1
        
        avg_val_loss = total_val_loss / val_steps
        val_losses.append(avg_val_loss)
        
        epoch_time = datetime.now() - epoch_start_time
        
        logging.info(f"\nEpoch {epoch + 1} Summary:")
        logging.info(f"Time taken: {epoch_time}")
        logging.info(f"Average training loss: {avg_train_loss:.4f}")
        logging.info(f"Average validation loss: {avg_val_loss:.4f}")
        
        # Calculate and log loss improvement
        if epoch > 0:
            train_improvement = train_losses[-2] - train_losses[-1]
            val_improvement = val_losses[-2] - val_losses[-1]
            logging.info(f"Training loss improvement: {train_improvement:.4f}")
            logging.info(f"Validation loss improvement: {val_improvement:.4f}")
    
    return train_losses, val_losses

In [9]:
def save_training_results(model, training_history, mlb, save_path='training_results.pt'):
    """Save model, training history and label binarizer."""
    torch.save({
        'model_state_dict': model.state_dict(),
        'training_history': training_history,
        'label_binarizer_classes': mlb.classes_
    }, save_path)
    logging.info(f"Training results saved to {save_path}")

def load_training_results(model_class, save_path='training_results.pt'):
    """Load model and training history."""
    checkpoint = torch.load(save_path)
    
    # Recreate model with same number of labels
    model = model_class(num_labels=len(checkpoint['label_binarizer_classes']))
    model.load_state_dict(checkpoint['model_state_dict'])
    
    return model, checkpoint['training_history'], checkpoint['label_binarizer_classes']

In [10]:
# Start timing
start_time = datetime.now()
logging.info("Starting data processing and model training pipeline")
    
# Load data
logging.info("Loading dataset from parquet file...")
df = pd.read_parquet("../data/processed/ssh_attacks_decoded.parquet")
logging.info(f"Loaded dataset with {len(df)} rows")

"""
Temporarily working on only a percentage of the dataset
"""

# Define percentage of the dataset to work on
percentage = 0.05  # 5%

# Sample the dataset
df_sampled = df.sample(frac=percentage, random_state=42)
logging.info(f"Sampled dataset with {len(df_sampled)} rows (percentage={percentage*100}%)")
    
# Log data statistics
logging.info("\nDataset Statistics:")
logging.info(f"Number of unique session IDs: {df['session_id'].nunique()}")
logging.info(f"Date range: {df['first_timestamp'].min()} to {df['first_timestamp'].max()}")

2025-01-10 12:25:46 - INFO - Starting data processing and model training pipeline
2025-01-10 12:25:46 - INFO - Loading dataset from parquet file...
2025-01-10 12:25:47 - INFO - Loaded dataset with 233035 rows
2025-01-10 12:25:47 - INFO - Sampled dataset with 11652 rows (percentage=5.0%)
2025-01-10 12:25:47 - INFO - 
Dataset Statistics:
2025-01-10 12:25:47 - INFO - Number of unique session IDs: 233035
2025-01-10 12:25:47 - INFO - Date range: 2019-06-04 09:45:11.151186+00:00 to 2020-02-29 23:59:22.199490+00:00


In [11]:
# Process labels
logging.info("\nProcessing labels...")
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform([set(x) for x in df['Set_Fingerprint']])
logging.info(f"Number of unique labels: {len(mlb.classes_)}")
logging.info("Most common labels:")
label_counts = pd.Series([label for labels_list in df['Set_Fingerprint'] for label in labels_list]).value_counts()
for label, count in label_counts.head().items():
    logging.info(f"- {label}: {count} occurrences")

2025-01-10 12:25:47 - INFO - 
Processing labels...
2025-01-10 12:25:48 - INFO - Number of unique labels: 7
2025-01-10 12:25:48 - INFO - Most common labels:
2025-01-10 12:25:48 - INFO - - Discovery: 232145 occurrences
2025-01-10 12:25:48 - INFO - - Persistence: 211295 occurrences
2025-01-10 12:25:48 - INFO - - Execution: 92927 occurrences
2025-01-10 12:25:48 - INFO - - Defense Evasion: 18999 occurrences
2025-01-10 12:25:48 - INFO - - Harmless: 2206 occurrences


In [12]:
# Split data
logging.info("\nSplitting data into train and validation sets...")
X_train, X_val, y_train, y_val = train_test_split(
    df['full_session'].values,
    labels,
    test_size=0.2,
    random_state=42
)
logging.info(f"Training set size: {len(X_train)}")
logging.info(f"Validation set size: {len(X_val)}")

2025-01-10 12:25:49 - INFO - 
Splitting data into train and validation sets...
2025-01-10 12:25:49 - INFO - Training set size: 186428
2025-01-10 12:25:49 - INFO - Validation set size: 46607


In [13]:
# Initialize tokenizer
logging.info("\nInitializing BERT tokenizer...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

2025-01-10 12:25:50 - INFO - 
Initializing BERT tokenizer...


In [14]:
# Create datasets
logging.info("Creating datasets...")
train_dataset = ShellAttackDataset(X_train, y_train, tokenizer)
val_dataset = ShellAttackDataset(X_val, y_val, tokenizer)
    
# Create data loaders
batch_size = 16
logging.info(f"\nCreating data loaders with batch size {batch_size}...")
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
logging.info(f"Number of training batches: {len(train_loader)}")
logging.info(f"Number of validation batches: {len(val_loader)}")

2025-01-10 12:25:54 - INFO - Creating datasets...
2025-01-10 12:25:54 - INFO - Created dataset with 186428 samples
2025-01-10 12:25:54 - INFO - Number of labels: 7
2025-01-10 12:25:55 - INFO - Average sequence length (words): 80.43
2025-01-10 12:25:55 - INFO - Created dataset with 46607 samples
2025-01-10 12:25:55 - INFO - Number of labels: 7
2025-01-10 12:25:56 - INFO - Average sequence length (words): 80.26
2025-01-10 12:25:56 - INFO - 
Creating data loaders with batch size 16...
2025-01-10 12:25:56 - INFO - Number of training batches: 11652
2025-01-10 12:25:56 - INFO - Number of validation batches: 2913


In [15]:
# Initialize model
logging.info("\nInitializing model...")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
logging.info(f"Using device: {device}")
model = BertClassifier(num_labels=len(mlb.classes_))
model.to(device)
    
# Count model parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
logging.info(f"Total parameters: {total_params:,}")
logging.info(f"Trainable parameters: {trainable_params:,}")

2025-01-10 12:25:56 - INFO - 
Initializing model...
2025-01-10 12:25:56 - INFO - Using device: cpu
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2025-01-10 12:26:06 - INFO - Initialized BERT Classifier with:
2025-01-10 12:26:06 - INFO 

In [None]:
# Train the model
logging.info("\nStarting model training...")
train_losses, val_losses = train_model(model, train_loader, val_loader, device, num_epochs=4)

2025-01-10 09:17:33 - INFO - 
Starting model training...
2025-01-10 09:17:33 - INFO - Starting training with 46608 total steps
Epoch 1/4:   0%|          | 1/11652 [01:31<296:22:43, 91.58s/it]

In [None]:
# After training:
training_history = {
    'train_losses': train_losses,
    'val_losses': val_losses,
    'true_labels': y_true,
    'predicted_probabilities': y_pred_probs,
    'probabilities_per_epoch': y_pred_probs_per_epoch
}

save_training_results(model, training_history, mlb, save_path='../results/models/training_results.pt')

# Later, to load:
# model, history, class_names = load_training_results(BertClassifier)

In [None]:
# Log total execution time
total_time = datetime.now() - start_time
logging.info(f"\nTotal execution time: {total_time}")

<center><b><font size=5>Plots<b><center>

In [None]:
import sys
sys.path.append("../")

In [None]:
# Global variables
global_overwrite = True
plot_directory = "../results/figures/plots/section4"

In [None]:
from scripts.data_storage_utils import save_plot, plot_and_save
from scripts.plotting_utils import *

In [None]:
# Assuming you have these variables from your model training:
# y_true: Ground truth labels
# y_pred_probs: Model's predicted probabilities
# train_losses: List of training losses per epoch
# val_losses: List of validation losses per epoch
# class_names: List of your label names
# y_pred_probs_per_epoch: List of prediction probabilities for each epoch

y_true = model_evaluation['true_labels']
y_pred_probs = model_evaluation['predicted_probabilities']
y_pred_probs_per_epoch = model_evaluation['probabilities_per_epoch']
train_losses = model_evaluation['train_losses']
val_losses = model_evaluation['val_losses']
class_names = mlb.classes_  # from your MultiLabelBinarizer

# Generate all plots

# 1. Metrics over epochs
plot_metrics_over_epochs(y_true, y_pred_probs_per_epoch, len(train_losses))
    
# 2. Loss curves
plot_loss_curves(train_losses, val_losses)
    
# 3. ROC curves
plot_roc_curves(y_true, y_pred_probs, class_names)
    
# 4. Precision-Recall curves
plot_pr_curves(y_true, y_pred_probs, class_names)
    
# 5. Probability histograms
plot_prob_histograms(y_pred_probs, class_names)
    
# 6. 3D ROC curve
plot_3d_roc(y_true, y_pred_probs, class_names)
    
# 7. F1 scores
plot_f1_scores(y_true, y_pred_probs, class_names)
    
# 8. Performance metrics
plot_performance_metrics(y_true, y_pred_probs, class_names)

In [None]:
# Generate and save all plots using plot_and_save
plots_to_generate = [
    {
        "func": plot_metrics_over_epochs,
        "args": {"y_true": y_true, "y_pred_probs_per_epoch": y_pred_probs_per_epoch, "num_epochs": len(train_losses)},
        "filename": "metrics_over_epochs"
    },
    {
        "func": plot_loss_curves,
        "args": {"train_losses": train_losses, "val_losses": val_losses},
        "filename": "loss_curves"
    },
    {
        "func": plot_roc_curves,
        "args": {"y_true": y_true, "y_pred_probs": y_pred_probs, "class_names": class_names},
        "filename": "roc_curves"
    },
    {
        "func": plot_pr_curves,
        "args": {"y_true": y_true, "y_pred_probs": y_pred_probs, "class_names": class_names},
        "filename": "precision_recall_curves"
    },
    {
        "func": plot_prob_histograms,
        "args": {"y_pred_probs": y_pred_probs, "class_names": class_names},
        "filename": "probability_histograms"
    },
    {
        "func": plot_3d_roc,
        "args": {"y_true": y_true, "y_pred_probs": y_pred_probs, "class_names": class_names},
        "filename": "3d_roc_curve"
    },
    {
        "func": plot_f1_scores,
        "args": {"y_true": y_true, "y_pred_probs": y_pred_probs, "class_names": class_names},
        "filename": "f1_scores"
    },
    {
        "func": plot_performance_metrics,
        "args": {"y_true": y_true, "y_pred_probs": y_pred_probs, "class_names": class_names},
        "filename": "performance_metrics"
    },
]

# Generate and save each plot
for plot in plots_to_generate:
    plot_and_save(
        plot_func=plot["func"],
        plot_args=plot["args"],
        directory=plot_directory,
        filename=plot["filename"],
        filetype="png",
        overwrite=global_overwrite,
        show_plot=True  # Set to False if you don't want to display plots in the notebook
    )