In [1]:
import polars as pl
import numpy as np
import pickle
from pathlib import Path

In [2]:
RAW_TRAIN_PATH = '../data/raw/GUIDE_Train.parquet'
PROCESSED_INCIDENT_FEATURES_PATH = '../data/processed/incident_features.parquet'
DL_PROCESSED_DATA_DIR = Path('../data/processed_dl/')
DL_PROCESSED_DATA_DIR.mkdir(exist_ok=True)

MAX_SEQ_LENGTH = 128

In [None]:
print("--- Phase 4.1: Preparing Data for Sequence Modeling ---")

#  1. Load Data 
print("Loading raw and processed data...")
raw_df = pl.read_parquet(RAW_TRAIN_PATH)
incident_features_df = pl.read_parquet(PROCESSED_INCIDENT_FEATURES_PATH)

sequential_features = [
    'Category',
    'DetectorId',
    'EntityType',
    'MitreTechniques' 
]

static_features = [
    'OrgId',
    'evidence_count',
    'unique_alert_count',
    'incident_duration_seconds',
    'evidence_rate',
    'alert_rate'
]


--- Phase 4.1: Preparing Data for Sequence Modeling ---
Loading raw and processed data...


In [4]:
print("Creating vocabularies for sequential features...")
vocabularies = {}
for col in sequential_features:
    unique_vals = raw_df[col].fill_null('[NULL]').unique().to_list()
    vocab = {val: i + 1 for i, val in enumerate(unique_vals)}
    vocab['[PAD]'] = 0  # Padding token
    vocabularies[col] = vocab
    print(f"  Vocabulary for '{col}' has {len(vocab)} unique tokens.")

with open(DL_PROCESSED_DATA_DIR / 'vocabularies.pkl', 'wb') as f:
    pickle.dump(vocabularies, f)


Creating vocabularies for sequential features...
  Vocabulary for 'Category' has 21 unique tokens.
  Vocabulary for 'DetectorId' has 8429 unique tokens.
  Vocabulary for 'EntityType' has 34 unique tokens.
  Vocabulary for 'MitreTechniques' has 1195 unique tokens.


In [None]:
print("Grouping by IncidentId and creating tokenized sequences...")

def tokenize_and_pad_list(values_list: list, vocab: dict) -> list:
    tokens = [vocab.get(val, 0) for val in values_list] 
    tokens = tokens[:MAX_SEQ_LENGTH]
    padding_needed = MAX_SEQ_LENGTH - len(tokens)
    return tokens + [vocab['[PAD]']] * padding_needed

sequential_data = (
    raw_df.sort(['IncidentId', 'Timestamp'])
          .group_by('IncidentId')
          .agg([
              pl.col(col).fill_null('[NULL]').alias(f'{col}_list')
              for col in sequential_features
          ])
)

for col in sequential_features:
    sequential_data = sequential_data.with_columns(
        pl.col(f'{col}_list').map_elements(
            lambda values_list: tokenize_and_pad_list(values_list, vocabularies[col]),
            return_dtype=pl.List(pl.Int32)
        ).alias(f'{col}_seq')
    ).drop(f'{col}_list')

print("Tokenization complete.")
print("Shape of sequential data:", sequential_data.shape)

Grouping by IncidentId and creating tokenized sequences...
Tokenization complete.
Shape of sequential data: (466151, 5)


In [6]:
print("Preparing static features and labels...")
org_id_map = raw_df.group_by('IncidentId').agg(pl.first('OrgId'))

final_dl_data = (
    sequential_data.join(incident_features_df, on='IncidentId', how='left').join(org_id_map, on='IncidentId', how='left')
)

label_col = 'IncidentGrade'
final_dl_data = final_dl_data.select(
    ['IncidentId', label_col] + [f'{col}_seq' for col in sequential_features] + static_features
).drop_nulls()

Preparing static features and labels...


In [7]:
# Save the final processed dataset
print(f"Saving final processed dataset for PyTorch to {DL_PROCESSED_DATA_DIR}")
final_dl_data.write_parquet(DL_PROCESSED_DATA_DIR / 'train_sequential_data.parquet')

print("\n Data Preparation Complete")
print("Final dataset shape:", final_dl_data.shape)
print("\nSchema of the final dataframe:")
print(final_dl_data.schema)
print("\nExample of a processed incident:")
print(final_dl_data.head(1))

Saving final processed dataset for PyTorch to ..\data\processed_dl

 Data Preparation Complete
Final dataset shape: (567609, 12)

Schema of the final dataframe:
Schema({'IncidentId': Int64, 'IncidentGrade': String, 'Category_seq': List(Int32), 'DetectorId_seq': List(Int32), 'EntityType_seq': List(Int32), 'MitreTechniques_seq': List(Int32), 'OrgId': Int64, 'evidence_count': UInt32, 'unique_alert_count': UInt32, 'incident_duration_seconds': Int64, 'evidence_rate': Float64, 'alert_rate': Float64})

Example of a processed incident:
shape: (1, 12)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ IncidentI ┆ IncidentG ┆ Category_ ┆ DetectorI ┆ … ┆ unique_al ┆ incident_ ┆ evidence_ ┆ alert_ra │
│ d         ┆ rade      ┆ seq       ┆ d_seq     ┆   ┆ ert_count ┆ duration_ ┆ rate      ┆ te       │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ seconds   ┆ ---       ┆ ---      │
│ i64       ┆ str       ┆ list[i32] ┆ list[i32]

In [8]:
final_dl_data.head()

IncidentId,IncidentGrade,Category_seq,DetectorId_seq,EntityType_seq,MitreTechniques_seq,OrgId,evidence_count,unique_alert_count,incident_duration_seconds,evidence_rate,alert_rate
i64,str,list[i32],list[i32],list[i32],list[i32],i64,u32,u32,i64,f64,f64
0,"""TruePositive""","[16, 16, … 16]","[3104, 3104, … 3104]","[2, 2, … 2]","[840, 840, … 840]",50,29997,3027,825528,0.036337,0.003667
2,"""TruePositive""","[17, 17, … 17]","[2429, 2429, … 2429]","[6, 28, … 28]","[1053, 1053, … 1053]",42,20525,5372,252222,0.081377,0.021299
2,"""BenignPositive""","[17, 17, … 17]","[2429, 2429, … 2429]","[6, 28, … 28]","[1053, 1053, … 1053]",42,20525,5372,252222,0.081377,0.021299
3,"""TruePositive""","[5, 5, … 0]","[6926, 6926, … 0]","[29, 10, … 0]","[1053, 1053, … 0]",457,3,1,1,3.0,1.0
7,"""FalsePositive""","[18, 18, … 18]","[1460, 1460, … 1460]","[2, 2, … 2]","[1053, 1053, … 1053]",14,12252,73,1932573,0.00634,3.8e-05


# Transformer

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
import pandas as pd
import numpy as np
from pathlib import Path


In [None]:
# Configuration 
DL_PROCESSED_DATA_DIR = Path('../data/processed_dl/')
VOCAB_PATH = DL_PROCESSED_DATA_DIR / 'vocabularies.pkl'
TRAIN_DATA_PATH = DL_PROCESSED_DATA_DIR / 'train_sequential_data.parquet'

In [12]:
# PyTorch Dataset Class
class IncidentDataset(Dataset):
    def __init__(self, data_path, vocab_path):
        super().__init__()
        self.df = pd.read_parquet(data_path)

        with open(vocab_path, 'rb') as f:
            self.vocabs = pickle.load(f)

        self.labels, self.class_names = pd.factorize(self.df['IncidentGrade'])

        # Define feature columns
        self.sequential_cols = [c for c in self.df.columns if '_seq' in c]
        self.static_cols = [
            'OrgId', 'evidence_count', 'unique_alert_count', 
            'incident_duration_seconds', 'evidence_rate', 'alert_rate'
        ]
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
    
        sequences = {
            col: torch.LongTensor(row[col])
            for col in self.sequential_cols
        }

        static_data = torch.FloatTensor([row[col] for col in self.static_cols])

        label = torch.LongTensor([self.labels[idx]])

        return sequences, static_data, label

In [None]:
# Testing Dataset
print("Initializing and testing the PyTorch Dataset...")
dataset = IncidentDataset(TRAIN_DATA_PATH, VOCAB_PATH)
sequences, static_data, label = dataset[0]

print("Dataset initialised successfully")
print("Example item from dataset:")
print("Sequences keys:", sequences.keys())
print("DetectorId_seq shape:", sequences['DetectorId_seq'].shape)
print("Static data shape:", static_data.shape)
print("Label shape:", label.shape)
print(f"Total number of classes: {len(dataset.class_names)}")

Initializing and testing the PyTorch Dataset...
Dataset initialised successfully
Example item from dataset:
Sequences keys: dict_keys(['Category_seq', 'DetectorId_seq', 'EntityType_seq', 'MitreTechniques_seq'])
DetectorId_seq shape: torch.Size([128])
Static data shape: torch.Size([6])
Label shape: torch.Size([1])
Total number of classes: 3


In [None]:
# 2. Model Architecture
class IncidentTransformer(nn.Module):
    def __init__(self, vocab_sizes, num_static_features, num_classes, embed_dim=64, nhead=4, num_encoder_layers=2, dim_feedforward=256, dropout=0.1):
        super().__init__()
        self.vocab_sizes = vocab_sizes
        
        self.embeddings = nn.ModuleDict({
            f'embed_{name}': nn.Embedding(size, embed_dim, padding_idx=0)
            for name, size in vocab_sizes.items()
        })
        
        total_seq_embed_dim = embed_dim * len(vocab_sizes) 
        
        # Transformer Encoder 
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=total_seq_embed_dim, nhead=nhead, 
            dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        
        # MLP for Static Features 
        self.static_mlp = nn.Sequential(
            nn.Linear(num_static_features, embed_dim * 2),
            nn.BatchNorm1d(embed_dim * 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(embed_dim * 2, embed_dim)
        )
        
        # Final Classifier
        self.classifier = nn.Sequential(
            nn.Linear(total_seq_embed_dim + embed_dim, embed_dim),
            nn.ReLU(),
            nn.Linear(embed_dim, num_classes)
        )

    def forward(self, sequences, static_data):
        # 1. Process Sequences
        all_embeds = [self.embeddings[f'embed_{name}'](sequences[f'{name}_seq']) 
                      for name in self.vocab_sizes.keys()]
        concatenated_embeds = torch.cat(all_embeds, dim=2)
        
        padding_mask = (sequences['Category_seq'] == 0) 

        transformer_out = self.transformer_encoder(concatenated_embeds, src_key_padding_mask=padding_mask)
        
        transformer_out[padding_mask] = 0 
        pooled_out = transformer_out.sum(dim=1) / (~padding_mask).sum(dim=1, keepdim=True)

        # 2. Process Static Features
        static_out = self.static_mlp(static_data)
        
        # 3. Combine and Classify
        combined = torch.cat([pooled_out, static_out], dim=1)
        logits = self.classifier(combined)
        
        return logits

In [None]:
# Test the Model
print("\nInitializing and testing the Transformer model architecture...")
vocab_sizes = {name: len(vocab) for name, vocab in dataset.vocabs.items()}
num_static_features = len(dataset.static_cols)
num_classes = len(dataset.class_names)

# Create a dummy batch of data to test the forward pass
dummy_loader = DataLoader(dataset, batch_size=4)
sequences_batch, static_batch, _ = next(iter(dummy_loader))

model = IncidentTransformer(vocab_sizes, num_static_features, num_classes)

# Test the forward pass
with torch.no_grad():
    logits = model(sequences_batch, static_batch)

print("Model initialized successfully.")
print("Output logits shape (batch_size, num_classes):", logits.shape)



Initializing and testing the Transformer model architecture...
Model initialized successfully.
Output logits shape (batch_size, num_classes): torch.Size([4, 3])
