In [1]:
import polars as pl
import numpy as np
import pickle
from pathlib import Path

In [2]:
RAW_TRAIN_PATH = '../data/raw/GUIDE_Train.parquet'
PROCESSED_INCIDENT_FEATURES_PATH = '../data/processed/incident_features.parquet'
DL_PROCESSED_DATA_DIR = Path('../data/processed_dl/')
DL_PROCESSED_DATA_DIR.mkdir(exist_ok=True)

MAX_SEQ_LENGTH = 128

In [None]:
print("--- Phase 4.1: Preparing Data for Sequence Modeling ---")

#  1. Load Data 
print("Loading raw and processed data...")
raw_df = pl.read_parquet(RAW_TRAIN_PATH)
incident_features_df = pl.read_parquet(PROCESSED_INCIDENT_FEATURES_PATH)

sequential_features = [
    'Category',
    'DetectorId',
    'EntityType',
    'MitreTechniques' 
]

static_features = [
    'OrgId',
    'evidence_count',
    'unique_alert_count',
    'incident_duration_seconds',
    'evidence_rate',
    'alert_rate'
]


--- Phase 4.1: Preparing Data for Sequence Modeling ---
Loading raw and processed data...


In [4]:
print("Creating vocabularies for sequential features...")
vocabularies = {}
for col in sequential_features:
    unique_vals = raw_df[col].fill_null('[NULL]').unique().to_list()
    vocab = {val: i + 1 for i, val in enumerate(unique_vals)}
    vocab['[PAD]'] = 0  # Padding token
    vocabularies[col] = vocab
    print(f"  Vocabulary for '{col}' has {len(vocab)} unique tokens.")

with open(DL_PROCESSED_DATA_DIR / 'vocabularies.pkl', 'wb') as f:
    pickle.dump(vocabularies, f)


Creating vocabularies for sequential features...
  Vocabulary for 'Category' has 21 unique tokens.
  Vocabulary for 'DetectorId' has 8429 unique tokens.
  Vocabulary for 'EntityType' has 34 unique tokens.
  Vocabulary for 'MitreTechniques' has 1195 unique tokens.


In [None]:
print("Grouping by IncidentId and creating tokenized sequences...")

def tokenize_and_pad_list(values_list: list, vocab: dict) -> list:
    tokens = [vocab.get(val, 0) for val in values_list] 
    tokens = tokens[:MAX_SEQ_LENGTH]
    padding_needed = MAX_SEQ_LENGTH - len(tokens)
    return tokens + [vocab['[PAD]']] * padding_needed

sequential_data = (
    raw_df.sort(['IncidentId', 'Timestamp'])
          .group_by('IncidentId')
          .agg([
              pl.col(col).fill_null('[NULL]').alias(f'{col}_list')
              for col in sequential_features
          ])
)

for col in sequential_features:
    sequential_data = sequential_data.with_columns(
        pl.col(f'{col}_list').map_elements(
            lambda values_list: tokenize_and_pad_list(values_list, vocabularies[col]),
            return_dtype=pl.List(pl.Int32)
        ).alias(f'{col}_seq')
    ).drop(f'{col}_list')

print("Tokenization complete.")
print("Shape of sequential data:", sequential_data.shape)

Grouping by IncidentId and creating tokenized sequences...
Tokenization complete.
Shape of sequential data: (466151, 5)


In [6]:
print("Preparing static features and labels...")
org_id_map = raw_df.group_by('IncidentId').agg(pl.first('OrgId'))

final_dl_data = (
    sequential_data.join(incident_features_df, on='IncidentId', how='left').join(org_id_map, on='IncidentId', how='left')
)

label_col = 'IncidentGrade'
final_dl_data = final_dl_data.select(
    ['IncidentId', label_col] + [f'{col}_seq' for col in sequential_features] + static_features
).drop_nulls()

Preparing static features and labels...


In [7]:
# Save the final processed dataset
print(f"Saving final processed dataset for PyTorch to {DL_PROCESSED_DATA_DIR}")
final_dl_data.write_parquet(DL_PROCESSED_DATA_DIR / 'train_sequential_data.parquet')

print("\n Data Preparation Complete")
print("Final dataset shape:", final_dl_data.shape)
print("\nSchema of the final dataframe:")
print(final_dl_data.schema)
print("\nExample of a processed incident:")
print(final_dl_data.head(1))

Saving final processed dataset for PyTorch to ..\data\processed_dl

 Data Preparation Complete
Final dataset shape: (567609, 12)

Schema of the final dataframe:
Schema({'IncidentId': Int64, 'IncidentGrade': String, 'Category_seq': List(Int32), 'DetectorId_seq': List(Int32), 'EntityType_seq': List(Int32), 'MitreTechniques_seq': List(Int32), 'OrgId': Int64, 'evidence_count': UInt32, 'unique_alert_count': UInt32, 'incident_duration_seconds': Int64, 'evidence_rate': Float64, 'alert_rate': Float64})

Example of a processed incident:
shape: (1, 12)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ IncidentI ┆ IncidentG ┆ Category_ ┆ DetectorI ┆ … ┆ unique_al ┆ incident_ ┆ evidence_ ┆ alert_ra │
│ d         ┆ rade      ┆ seq       ┆ d_seq     ┆   ┆ ert_count ┆ duration_ ┆ rate      ┆ te       │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ seconds   ┆ ---       ┆ ---      │
│ i64       ┆ str       ┆ list[i32] ┆ list[i32]

In [8]:
final_dl_data.head()

IncidentId,IncidentGrade,Category_seq,DetectorId_seq,EntityType_seq,MitreTechniques_seq,OrgId,evidence_count,unique_alert_count,incident_duration_seconds,evidence_rate,alert_rate
i64,str,list[i32],list[i32],list[i32],list[i32],i64,u32,u32,i64,f64,f64
0,"""TruePositive""","[16, 16, … 16]","[3104, 3104, … 3104]","[2, 2, … 2]","[840, 840, … 840]",50,29997,3027,825528,0.036337,0.003667
2,"""TruePositive""","[17, 17, … 17]","[2429, 2429, … 2429]","[6, 28, … 28]","[1053, 1053, … 1053]",42,20525,5372,252222,0.081377,0.021299
2,"""BenignPositive""","[17, 17, … 17]","[2429, 2429, … 2429]","[6, 28, … 28]","[1053, 1053, … 1053]",42,20525,5372,252222,0.081377,0.021299
3,"""TruePositive""","[5, 5, … 0]","[6926, 6926, … 0]","[29, 10, … 0]","[1053, 1053, … 0]",457,3,1,1,3.0,1.0
7,"""FalsePositive""","[18, 18, … 18]","[1460, 1460, … 1460]","[2, 2, … 2]","[1053, 1053, … 1053]",14,12252,73,1932573,0.00634,3.8e-05


# Transformer

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
import pandas as pd
import numpy as np
from pathlib import Path
import pickle

In [2]:
# Configuration 
DL_PROCESSED_DATA_DIR = Path('../data/processed_dl/')
VOCAB_PATH = DL_PROCESSED_DATA_DIR / 'vocabularies.pkl'
TRAIN_DATA_PATH = DL_PROCESSED_DATA_DIR / 'train_sequential_data.parquet'

In [None]:
torch.set_float32_matmul_precision('medium')  
torch.backends.cudnn.benchmark = True  

In [4]:
# PyTorch Dataset Class
class OptimizedIncidentDataset(Dataset):
    def __init__(self, data_path, vocab_path):
        super().__init__()
        print("Loading data...")
        self.df = pd.read_parquet(data_path)
        
        with open(vocab_path, 'rb') as f:
            self.vocabs = pickle.load(f)
        
        self.labels, self.class_names = pd.factorize(self.df['IncidentGrade'])
        
        # Define feature columns
        self.sequential_cols = [c for c in self.df.columns if '_seq' in c]
        self.static_cols = [
            'OrgId', 'evidence_count', 'unique_alert_count', 
            'incident_duration_seconds', 'evidence_rate', 'alert_rate'
        ]
        
        # Pre-convert data to tensors for faster access
        print("Pre-processing tensors...")
        self._preprocess_data()
        print("Data preprocessing complete!")
    
    def _preprocess_data(self):
        self.sequences_cache = {}
        self.static_cache = torch.zeros((len(self.df), len(self.static_cols)), dtype=torch.float32)
        self.labels_cache = torch.tensor(self.labels, dtype=torch.long)
        
        # Cache sequential data
        for col in self.sequential_cols:
            self.sequences_cache[col] = [torch.tensor(seq, dtype=torch.long) for seq in self.df[col]]
        
        # Cache static data
        for i, col in enumerate(self.static_cols):
            self.static_cache[:, i] = torch.tensor(self.df[col].values, dtype=torch.float32)
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        sequences = {col: self.sequences_cache[col][idx] for col in self.sequential_cols}
        static_data = self.static_cache[idx]
        label = self.labels_cache[idx]
        
        return sequences, static_data, label

In [5]:
# 2. Model Architecture
class IncidentTransformer(nn.Module):
    def __init__(self, vocab_sizes, num_static_features, num_classes, embed_dim=64, nhead=4, num_encoder_layers=2, dim_feedforward=256, dropout=0.1):
        super().__init__()
        self.vocab_sizes = vocab_sizes
        
        self.embeddings = nn.ModuleDict({
            f'embed_{name}': nn.Embedding(size, embed_dim, padding_idx=0)
            for name, size in vocab_sizes.items()
        })
        
        total_seq_embed_dim = embed_dim * len(vocab_sizes) 
        
        # Transformer Encoder 
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=total_seq_embed_dim, nhead=nhead, 
            dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        
        # MLP for Static Features 
        self.static_mlp = nn.Sequential(
            nn.Linear(num_static_features, embed_dim * 2),
            nn.BatchNorm1d(embed_dim * 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(embed_dim * 2, embed_dim)
        )
        
        # Final Classifier
        self.classifier = nn.Sequential(
            nn.Linear(total_seq_embed_dim + embed_dim, embed_dim),
            nn.ReLU(),
            nn.Linear(embed_dim, num_classes)
        )

    def forward(self, sequences, static_data):
        # 1. Process Sequences
        all_embeds = [self.embeddings[f'embed_{name}'](sequences[f'{name}_seq']) 
                      for name in self.vocab_sizes.keys()]
        concatenated_embeds = torch.cat(all_embeds, dim=2)
        
        padding_mask = (sequences['Category_seq'] == 0) 

        transformer_out = self.transformer_encoder(concatenated_embeds, src_key_padding_mask=padding_mask)
        
        transformer_out[padding_mask] = 0 
        pooled_out = transformer_out.sum(dim=1) / (~padding_mask).sum(dim=1, keepdim=True)

        # 2. Process Static Features
        static_out = self.static_mlp(static_data)
        
        # 3. Combine and Classify
        combined = torch.cat([pooled_out, static_out], dim=1)
        logits = self.classifier(combined)
        
        return logits

# Train it!

In [6]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import StratifiedGroupKFold
import torchmetrics


In [7]:
class OptimizedIncidentClassifier(pl.LightningModule):
    def __init__(self, model, class_names, learning_rate=1e-4):
        super().__init__()
        self.model = model
        self.class_names = class_names
        self.num_classes = len(class_names)
        self.learning_rate = learning_rate
        
        # Loss function with label smoothing for better generalization
        self.criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
        
        # Metrics
        self.f1_macro = torchmetrics.F1Score(task='multiclass', num_classes=self.num_classes, average='macro')
        self.precision_macro = torchmetrics.Precision(task='multiclass', num_classes=self.num_classes, average='macro')
        self.recall_macro = torchmetrics.Recall(task='multiclass', num_classes=self.num_classes, average='macro')
        
        # Save hyperparameters
        self.save_hyperparameters(ignore=['model'])
    
    def forward(self, sequences, static_data):
        return self.model(sequences, static_data)
    
    def training_step(self, batch, batch_idx):
        sequences, static_data, labels = batch
        logits = self(sequences, static_data)
        loss = self.criterion(logits, labels)
        
        # Log metrics
        self.log('train_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        sequences, static_data, labels = batch
        logits = self(sequences, static_data)
        loss = self.criterion(logits, labels)
        preds = torch.argmax(logits, dim=1)
        
        # Update metrics
        self.f1_macro.update(preds, labels)
        self.precision_macro.update(preds, labels)
        self.recall_macro.update(preds, labels)
        
        # Log metrics
        self.log('val_loss', loss, prog_bar=True, sync_dist=True)
        return {'val_loss': loss, 'preds': preds, 'labels': labels}
    
    def on_validation_epoch_end(self):
        # Compute and log epoch metrics
        self.log('val_f1_macro', self.f1_macro.compute(), prog_bar=True, sync_dist=True)
        self.log('val_precision', self.precision_macro.compute(), sync_dist=True)
        self.log('val_recall', self.recall_macro.compute(), sync_dist=True)
        
        # Reset metrics
        self.f1_macro.reset()
        self.precision_macro.reset()
        self.recall_macro.reset()
    
    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(
            self.parameters(), 
            lr=self.learning_rate, 
            weight_decay=0.01,  # L2 regularization
            eps=1e-8
        )
        
        # Learning rate scheduler
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, 
            mode='max', 
            factor=0.5, 
            patience=2
        )
        
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'monitor': 'val_f1_macro',
                'frequency': 1
            }
        }

In [8]:
def collate_fn(batch):
    sequences_batch = {}
    static_batch = []
    labels_batch = []
    
    # Get all sequence keys from first item
    seq_keys = batch[0][0].keys()
    
    for key in seq_keys:
        sequences_batch[key] = torch.stack([item[0][key] for item in batch])
    
    static_batch = torch.stack([item[1] for item in batch])
    labels_batch = torch.stack([item[2] for item in batch])
    
    return sequences_batch, static_batch, labels_batch

In [None]:
print('Starting Optimized Cross Validation Training for the Transformer Model')

N_SPLITS = 5
BATCH_SIZE = 64  
EPOCHS = 15
NUM_WORKERS = 0  
LEARNING_RATE = 2e-4

print("Creating optimized dataset...")
dataset = OptimizedIncidentDataset(TRAIN_DATA_PATH, VOCAB_PATH)

print("Validating dataset for NaN values...")

static_sample = dataset.static_cache[:1000]  
if torch.isnan(static_sample).any():
    print("WARNING: NaN values found in static features!")
    nan_cols = []
    for i, col in enumerate(dataset.static_cols):
        if torch.isnan(static_sample[:, i]).any():
            nan_cols.append(col)
    print(f"Columns with NaN: {nan_cols}")
    
    # Replace NaN with 0 or mean
    dataset.static_cache = torch.nan_to_num(dataset.static_cache, nan=0.0)
    print("NaN values replaced with 0.")
else:
    print("No NaN values found in static features - good!")

print("Dataset validation complete.")


# Get model parameters
vocab_sizes = {name: len(vocab) for name, vocab in dataset.vocabs.items()}
num_static_features = len(dataset.static_cols)
num_classes = len(dataset.class_names)

# Cross-validation setup
groups = dataset.df['OrgId'].values
labels_for_split = dataset.labels
skf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

oof_preds = np.zeros(len(dataset))
oof_labels = np.zeros(len(dataset))


Starting Optimized Cross Validation Training for the Transformer Model
Creating optimized dataset...
Loading data...
Pre-processing tensors...
Data preprocessing complete!
Validating dataset for NaN values...
No NaN values found in static features - good!
Dataset validation complete.


In [10]:
def debug_model_and_data(model, train_loader):
    """Debug function to identify NaN issues"""
    print("=== DEBUGGING NAN ISSUES ===")
    
    # Check model parameters
    nan_params = []
    for name, param in model.named_parameters():
        if torch.isnan(param).any():
            nan_params.append(name)
    
    if nan_params:
        print(f"WARNING: NaN found in model parameters: {nan_params}")
    else:
        print("✓ Model parameters are clean (no NaN)")
    
    # Check a batch of data
    print("Checking first batch of training data...")
    batch = next(iter(train_loader))
    sequences, static_data, labels = batch
    
    print(f"Batch size: {len(labels)}")
    print(f"Static data shape: {static_data.shape}")
    print(f"Static data range: [{static_data.min():.4f}, {static_data.max():.4f}]")
    
    if torch.isnan(static_data).any():
        print("NaN found in static_data!")
        nan_mask = torch.isnan(static_data)
        print(f"NaN count: {nan_mask.sum().item()}")
        return False
    else:
        print("Static data is clean")
    
    if torch.isinf(static_data).any():
        print("Inf found in static_data!")
        return False
    else:
        print("No infinite values in static data")
    
    # Check sequence data
    for seq_name, seq_data in sequences.items():
        if torch.isnan(seq_data).any():
            print(f"NaN found in {seq_name}")
            return False
        if (seq_data < 0).any():
            print(f"Negative values found in {seq_name} (should be token indices)")
            return False
    
    print("✓ Sequence data is clean")
    
    # Test forward pass
    print("Testing forward pass...")
    model.eval()
    with torch.no_grad():
        try:
            if torch.cuda.is_available():
                model = model.cuda()
                sequences = {k: v.cuda() for k, v in sequences.items()}
                static_data = static_data.cuda()
            
            logits = model(sequences, static_data)
            
            if torch.isnan(logits).any():
                print("NaN in model output!")
                return False
            else:
                print("Model forward pass successful")
                print(f"Logits range: [{logits.min():.4f}, {logits.max():.4f}]")
        
        except Exception as e:
            print(f"Forward pass failed: {e}")
            return False
    
    return True


In [None]:
class StableIncidentTransformer(nn.Module):
    def __init__(self, vocab_sizes, num_static_features, num_classes, embed_dim=64, nhead=4, num_encoder_layers=2, dim_feedforward=256, dropout=0.1):
        super().__init__()
        self.vocab_sizes = vocab_sizes
        
        self.embeddings = nn.ModuleDict({
            f'embed_{name}': nn.Embedding(size, embed_dim, padding_idx=0)
            for name, size in vocab_sizes.items()
        })
        
        for embed in self.embeddings.values():
            nn.init.normal_(embed.weight, mean=0, std=0.1)
            if embed.padding_idx is not None:
                nn.init.constant_(embed.weight[embed.padding_idx], 0)
        
        total_seq_embed_dim = embed_dim * len(vocab_sizes)
        
        self.input_norm = nn.LayerNorm(total_seq_embed_dim)
        
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=total_seq_embed_dim, 
            nhead=nhead, 
            dim_feedforward=dim_feedforward, 
            dropout=dropout, 
            batch_first=True,
            norm_first=True  
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        
        self.static_mlp = nn.Sequential(
            nn.Linear(num_static_features, embed_dim * 2),
            nn.BatchNorm1d(embed_dim * 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(embed_dim * 2, embed_dim),
            nn.BatchNorm1d(embed_dim)
        )
        
        # Initialize static MLP
        for layer in self.static_mlp:
            if isinstance(layer, nn.Linear):
                nn.init.xavier_normal_(layer.weight)
                nn.init.constant_(layer.bias, 0)
        
        self.classifier = nn.Sequential(
            nn.Linear(total_seq_embed_dim + embed_dim, embed_dim),
            nn.LayerNorm(embed_dim),  
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(embed_dim, num_classes)
        )
        
        for layer in self.classifier:
            if isinstance(layer, nn.Linear):
                nn.init.xavier_normal_(layer.weight)
                nn.init.constant_(layer.bias, 0)

    def forward(self, sequences, static_data):
        if torch.isnan(static_data).any():
            raise ValueError("NaN detected in static_data input")
        
        # 1. Process Sequences
        all_embeds = []
        for name in self.vocab_sizes.keys():
            seq_key = f'{name}_seq'
            if seq_key in sequences:
                embed = self.embeddings[f'embed_{name}'](sequences[seq_key])
                all_embeds.append(embed)
        
        concatenated_embeds = torch.cat(all_embeds, dim=2)
        
        concatenated_embeds = self.input_norm(concatenated_embeds)
        
        first_seq_key = list(sequences.keys())[0]
        padding_mask = (sequences[first_seq_key] == 0)
        
        transformer_out = self.transformer_encoder(
            concatenated_embeds, 
            src_key_padding_mask=padding_mask
        )
        
        transformer_out = transformer_out * (~padding_mask).unsqueeze(-1).float()
        valid_lengths = (~padding_mask).sum(dim=1, keepdim=True).float()
        valid_lengths = torch.clamp(valid_lengths, min=1.0)  
        pooled_out = transformer_out.sum(dim=1) / valid_lengths
        
        pooled_out = torch.clamp(pooled_out, min=-10, max=10)
        
        static_data_normalized = torch.clamp(static_data, min=-100, max=100)
        static_out = self.static_mlp(static_data_normalized)
        static_out = torch.clamp(static_out, min=-10, max=10)
        
        # 3. Combine and Classify
        combined = torch.cat([pooled_out, static_out], dim=1)
        logits = self.classifier(combined)
        
        # Final stability check
        if torch.isnan(logits).any():
            raise ValueError("NaN detected in model output")
        
        return logits


In [None]:
class StableIncidentClassifier(pl.LightningModule):
    def __init__(self, model, class_names, learning_rate=1e-5):
        super().__init__()
        self.model = model
        self.class_names = class_names
        self.num_classes = len(class_names)
        self.learning_rate = learning_rate
        
        # More stable loss function
        self.criterion = nn.CrossEntropyLoss(label_smoothing=0.05)
        
        # Metrics
        self.train_f1 = torchmetrics.F1Score(task='multiclass', num_classes=self.num_classes, average='macro')
        self.val_f1 = torchmetrics.F1Score(task='multiclass', num_classes=self.num_classes, average='macro')
        
        self.save_hyperparameters(ignore=['model'])
    
    def forward(self, sequences, static_data):
        return self.model(sequences, static_data)
    
    def training_step(self, batch, batch_idx):
        sequences, static_data, labels = batch
        
        try:
            logits = self(sequences, static_data)
            loss = self.criterion(logits, labels)
            
            # Stability checks
            if torch.isnan(loss):
                print(f"NaN loss at batch {batch_idx}")
                return None
            
            if loss > 100:  
                print(f"Very high loss ({loss:.4f}) at batch {batch_idx}")
                return None
            
            # Calculate accuracy for monitoring
            preds = torch.argmax(logits, dim=1)
            self.train_f1.update(preds, labels)
            
            # Log with error handling
            if batch_idx % 100 == 0:
                self.log('train_loss_step', loss, on_step=True, on_epoch=False, prog_bar=False)
            
            self.log('train_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
            return loss
            
        except Exception as e:
            print(f"Training step error at batch {batch_idx}: {e}")
            return None
    
    def validation_step(self, batch, batch_idx):
        sequences, static_data, labels = batch
        
        try:
            logits = self(sequences, static_data)
            loss = self.criterion(logits, labels)
            
            if torch.isnan(loss):
                print(f"NaN validation loss at batch {batch_idx}")
                return {'val_loss': torch.tensor(10.0)}  
            
            preds = torch.argmax(logits, dim=1)
            self.val_f1.update(preds, labels)
            
            return {'val_loss': loss}
            
        except Exception as e:
            print(f"Validation step error at batch {batch_idx}: {e}")
            return {'val_loss': torch.tensor(10.0)}
    
    def on_train_epoch_end(self):
        train_f1 = self.train_f1.compute()
        self.log('train_f1_macro', train_f1, prog_bar=True)
        self.train_f1.reset()
    
    def on_validation_epoch_end(self):
        val_f1 = self.val_f1.compute()
        self.log('val_f1_macro', val_f1, prog_bar=True)
        self.val_f1.reset()
    
    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(
            self.parameters(), 
            lr=self.learning_rate, 
            weight_decay=0.01,
            eps=1e-8,
            betas=(0.9, 0.95)  
        )
        
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=15,  
            eta_min=self.learning_rate * 0.1
        )
        
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'frequency': 1,
                'interval': 'epoch'  
            }
        }


In [None]:
def preprocess_static_features(static_tensor):
    """Clean and normalize static features"""
    static_tensor = torch.nan_to_num(static_tensor, nan=0.0, posinf=1e6, neginf=-1e6)
    
    # Clip extreme values
    static_tensor = torch.clamp(static_tensor, min=-1e6, max=1e6)
    
    mean_vals = static_tensor.mean(dim=0, keepdim=True)
    std_vals = static_tensor.std(dim=0, keepdim=True)
    std_vals = torch.clamp(std_vals, min=1e-8)  
    
    static_tensor = (static_tensor - mean_vals) / std_vals
    
    # Final clipping after normalization
    static_tensor = torch.clamp(static_tensor, min=-5, max=5)
    
    return static_tensor

In [None]:
from sklearn.metrics import f1_score
import gc

for fold, (train_idx, val_idx) in enumerate(skf.split(np.zeros(len(dataset)), labels_for_split, groups)):
    print(f"\n{'='*50}")
    print(f"FOLD {fold+1}/{N_SPLITS}")
    print(f"{'='*50}")
    
    # Create subsets
    train_subset = Subset(dataset, train_idx)
    val_subset = Subset(dataset, val_idx)
    
    print(f"Train samples: {len(train_subset)}, Val samples: {len(val_subset)}")
    
    # Optimized data loaders
    train_loader = DataLoader(
        train_subset, 
        batch_size=BATCH_SIZE, 
        shuffle=True,
        num_workers=NUM_WORKERS,
        pin_memory=True,
        drop_last=True,  
        collate_fn=collate_fn
    )
    
    val_loader = DataLoader(
        val_subset, 
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=NUM_WORKERS,
        pin_memory=True,
        collate_fn=collate_fn
    )
    
    # Initialize model
    model_instance = StableIncidentTransformer(
    vocab_sizes=vocab_sizes, 
    num_static_features=num_static_features, 
    num_classes=num_classes,
    embed_dim=64,
    nhead=4,
    num_encoder_layers=2,
    dim_feedforward=256,
    dropout=0.1
    )
    
    lightning_model = StableIncidentClassifier(
    model_instance, 
    dataset.class_names,
    learning_rate=1e-5  
    )
    
    # Callbacks
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        monitor='val_f1_macro',
        mode='max',
        dirpath=f'checkpoints/fold_{fold+1}',
        filename='best-checkpoint-{epoch:02d}-{val_f1_macro:.3f}',
        save_top_k=1,
        verbose=True
    )
    
    early_stop_callback = pl.callbacks.EarlyStopping(
        monitor='val_f1_macro',
        patience=5,  
        verbose=True,
        mode='max',
        min_delta=0.001
    )
    
    lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval='epoch')
    
    logger = TensorBoardLogger(
        "tb_logs", 
        name=f"optimized_transformer_fold_{fold+1}",
        version=None
    )
    
    trainer = pl.Trainer(
        max_epochs=EPOCHS,
        logger=logger,
        callbacks=[checkpoint_callback, early_stop_callback, lr_monitor],
        accelerator='gpu',
        devices=1,
        precision='16-mixed',
        gradient_clip_val=1.0,
        accumulate_grad_batches=1,  
        log_every_n_steps=min(50, len(train_loader) // 4),  
        enable_progress_bar=True,
        enable_model_summary=False,  
        deterministic=False,  
        benchmark=True,  
        sync_batchnorm=False,
        enable_checkpointing=True,
        detect_anomaly=False  
    )
    
    try:
        print("DEBUGGING MODEL AND DATA")
        is_data_clean = debug_model_and_data(model_instance, train_loader)
        if not is_data_clean:
            print("Data issues detected! Fixing...")
            dataset.static_cache = preprocess_static_features(dataset.static_cache)
            print("✓ Static features preprocessed and normalized")    

        print(f"Starting training for fold {fold+1}...")
        
        trainer.fit(lightning_model, train_loader, val_loader)
        
        print("Loading best model for inference...")
        best_model = OptimizedIncidentClassifier.load_from_checkpoint(
            checkpoint_callback.best_model_path,
            model=model_instance,
            class_names=dataset.class_names
        )
        
        print("Running manual validation predictions...")
        best_model.eval()
        best_model = best_model.cuda()

        val_predictions = []
        val_true_labels = []

        with torch.no_grad():
            for batch_idx, batch in enumerate(val_loader):
                sequences, static_data, labels = batch
                
                sequences = {k: v.cuda() for k, v in sequences.items()}
                static_data = static_data.cuda()
                
                try:
                    logits = best_model(sequences, static_data)
                    preds = torch.argmax(logits, dim=1)
                    
                    val_predictions.extend(preds.cpu().numpy())
                    val_true_labels.extend(labels.cpu().numpy())
                    
                except Exception as e:
                    print(f"Error in batch {batch_idx}: {e}")
                    continue
                
                if batch_idx % 100 == 0:
                    print(f"Processed {batch_idx}/{len(val_loader)} validation batches")
        
        oof_preds[val_idx] = val_predictions[:len(val_idx)]
        oof_labels[val_idx] = val_true_labels[:len(val_idx)]
        
        fold_f1 = f1_score(val_true_labels[:len(val_idx)], val_predictions[:len(val_idx)], average='macro')
        print(f"Fold {fold+1} Best Macro F1: {fold_f1:.5f}")
        
        del best_model, lightning_model, model_instance, trainer,
        torch.cuda.empty_cache()
        gc.collect()
        
    except Exception as e:
        print(f"Error in fold {fold+1}: {str(e)}")
        import traceback
        traceback.print_exc()
        continue
    
    print(f"Fold {fold+1} completed successfully!")


FOLD 1/5
Train samples: 460741, Val samples: 106868


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


DEBUGGING MODEL AND DATA
=== DEBUGGING NAN ISSUES ===
✓ Model parameters are clean (no NaN)
Checking first batch of training data...
Batch size: 64
Static data shape: torch.Size([64, 6])
Static data range: [0.0000, 2026635.0000]
Static data is clean
No infinite values in static data
✓ Sequence data is clean
Testing forward pass...
Model forward pass successful
Logits range: [-0.6152, 2.6800]
Starting training for fold 1...


c:\Project\Micosoft Security Indicent\.venv\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:751: Checkpoint directory C:\Project\Micosoft Security Indicent\notebooks\checkpoints\fold_1 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Project\Micosoft Security Indicent\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
c:\Project\Micosoft Security Indicent\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved. New best score: 0.354
Epoch 0, global step 7199: 'val_f1_macro' reached 0.35417 (best 0.35417), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_1\\best-checkpoint-epoch=00-val_f1_macro=0.354.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved by 0.012 >= min_delta = 0.001. New best score: 0.366
Epoch 1, global step 14398: 'val_f1_macro' reached 0.36641 (best 0.36641), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_1\\best-checkpoint-epoch=01-val_f1_macro=0.366.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved by 0.016 >= min_delta = 0.001. New best score: 0.382
Epoch 2, global step 21597: 'val_f1_macro' reached 0.38213 (best 0.38213), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_1\\best-checkpoint-epoch=02-val_f1_macro=0.382.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved by 0.002 >= min_delta = 0.001. New best score: 0.384
Epoch 3, global step 28796: 'val_f1_macro' reached 0.38402 (best 0.38402), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_1\\best-checkpoint-epoch=03-val_f1_macro=0.384.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved by 0.005 >= min_delta = 0.001. New best score: 0.389
Epoch 4, global step 35995: 'val_f1_macro' reached 0.38929 (best 0.38929), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_1\\best-checkpoint-epoch=04-val_f1_macro=0.389.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 5, global step 43194: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 6, global step 50393: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved by 0.010 >= min_delta = 0.001. New best score: 0.399
Epoch 7, global step 57592: 'val_f1_macro' reached 0.39896 (best 0.39896), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_1\\best-checkpoint-epoch=07-val_f1_macro=0.399.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved by 0.002 >= min_delta = 0.001. New best score: 0.401
Epoch 8, global step 64791: 'val_f1_macro' reached 0.40084 (best 0.40084), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_1\\best-checkpoint-epoch=08-val_f1_macro=0.401.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 9, global step 71990: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 10, global step 79189: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 11, global step 86388: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 12, global step 93587: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved by 0.003 >= min_delta = 0.001. New best score: 0.404
Epoch 13, global step 100786: 'val_f1_macro' reached 0.40372 (best 0.40372), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_1\\best-checkpoint-epoch=13-val_f1_macro=0.404.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 14, global step 107985: 'val_f1_macro' was not in top 1
`Trainer.fit` stopped: `max_epochs=15` reached.


Loading best model for inference...
Running manual validation predictions...
Processed 0/1670 validation batches
Processed 100/1670 validation batches
Processed 200/1670 validation batches
Processed 300/1670 validation batches
Processed 400/1670 validation batches
Processed 500/1670 validation batches
Processed 600/1670 validation batches
Processed 700/1670 validation batches
Processed 800/1670 validation batches
Processed 900/1670 validation batches
Processed 1000/1670 validation batches
Processed 1100/1670 validation batches
Processed 1200/1670 validation batches
Processed 1300/1670 validation batches
Processed 1400/1670 validation batches
Processed 1500/1670 validation batches
Processed 1600/1670 validation batches
Fold 1 Best Macro F1: 0.40380


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Project\Micosoft Security Indicent\.venv\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:751: Checkpoint directory C:\Project\Micosoft Security Indicent\notebooks\checkpoints\fold_2 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Fold 1 completed successfully!

FOLD 2/5
Train samples: 502181, Val samples: 65428
DEBUGGING MODEL AND DATA
=== DEBUGGING NAN ISSUES ===
✓ Model parameters are clean (no NaN)
Checking first batch of training data...
Batch size: 64
Static data shape: torch.Size([64, 6])
Static data range: [0.0000, 1009319.0000]
Static data is clean
No infinite values in static data
✓ Sequence data is clean
Testing forward pass...
Model forward pass successful
Logits range: [-2.2148, 1.8320]
Starting training for fold 2...


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Project\Micosoft Security Indicent\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
c:\Project\Micosoft Security Indicent\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved. New best score: 0.391
Epoch 0, global step 7846: 'val_f1_macro' reached 0.39115 (best 0.39115), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_2\\best-checkpoint-epoch=00-val_f1_macro=0.391.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved by 0.008 >= min_delta = 0.001. New best score: 0.399
Epoch 1, global step 15692: 'val_f1_macro' reached 0.39867 (best 0.39867), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_2\\best-checkpoint-epoch=01-val_f1_macro=0.399.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 2, global step 23538: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved by 0.003 >= min_delta = 0.001. New best score: 0.402
Epoch 3, global step 31384: 'val_f1_macro' reached 0.40163 (best 0.40163), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_2\\best-checkpoint-epoch=03-val_f1_macro=0.402.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved by 0.073 >= min_delta = 0.001. New best score: 0.474
Epoch 4, global step 39230: 'val_f1_macro' reached 0.47438 (best 0.47438), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_2\\best-checkpoint-epoch=04-val_f1_macro=0.474.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved by 0.011 >= min_delta = 0.001. New best score: 0.485
Epoch 5, global step 47076: 'val_f1_macro' reached 0.48548 (best 0.48548), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_2\\best-checkpoint-epoch=05-val_f1_macro=0.485.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 6, global step 54922: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 7, global step 62768: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 8, global step 70614: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 9, global step 78460: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Monitored metric val_f1_macro did not improve in the last 5 records. Best score: 0.485. Signaling Trainer to stop.
Epoch 10, global step 86306: 'val_f1_macro' was not in top 1


Loading best model for inference...
Running manual validation predictions...
Processed 0/1023 validation batches
Processed 100/1023 validation batches
Processed 200/1023 validation batches
Processed 300/1023 validation batches
Processed 400/1023 validation batches
Processed 500/1023 validation batches
Processed 600/1023 validation batches
Processed 700/1023 validation batches
Processed 800/1023 validation batches
Processed 900/1023 validation batches
Processed 1000/1023 validation batches
Fold 2 Best Macro F1: 0.48541


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Project\Micosoft Security Indicent\.venv\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:751: Checkpoint directory C:\Project\Micosoft Security Indicent\notebooks\checkpoints\fold_3 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Fold 2 completed successfully!

FOLD 3/5
Train samples: 428227, Val samples: 139382
DEBUGGING MODEL AND DATA
=== DEBUGGING NAN ISSUES ===
✓ Model parameters are clean (no NaN)
Checking first batch of training data...
Batch size: 64
Static data shape: torch.Size([64, 6])
Static data range: [0.0000, 1028103.0000]
Static data is clean
No infinite values in static data
✓ Sequence data is clean
Testing forward pass...
Model forward pass successful
Logits range: [-2.3340, 0.8071]
Starting training for fold 3...


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Project\Micosoft Security Indicent\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
c:\Project\Micosoft Security Indicent\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved. New best score: 0.393
Epoch 0, global step 6691: 'val_f1_macro' reached 0.39271 (best 0.39271), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_3\\best-checkpoint-epoch=00-val_f1_macro=0.393.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved by 0.024 >= min_delta = 0.001. New best score: 0.417
Epoch 1, global step 13382: 'val_f1_macro' reached 0.41670 (best 0.41670), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_3\\best-checkpoint-epoch=01-val_f1_macro=0.417.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 2, global step 20073: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved by 0.010 >= min_delta = 0.001. New best score: 0.426
Epoch 3, global step 26764: 'val_f1_macro' reached 0.42625 (best 0.42625), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_3\\best-checkpoint-epoch=03-val_f1_macro=0.426.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved by 0.005 >= min_delta = 0.001. New best score: 0.431
Epoch 4, global step 33455: 'val_f1_macro' reached 0.43123 (best 0.43123), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_3\\best-checkpoint-epoch=04-val_f1_macro=0.431.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved by 0.007 >= min_delta = 0.001. New best score: 0.438
Epoch 5, global step 40146: 'val_f1_macro' reached 0.43819 (best 0.43819), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_3\\best-checkpoint-epoch=05-val_f1_macro=0.438.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved by 0.010 >= min_delta = 0.001. New best score: 0.449
Epoch 6, global step 46837: 'val_f1_macro' reached 0.44859 (best 0.44859), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_3\\best-checkpoint-epoch=06-val_f1_macro=0.449.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 7, global step 53528: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 8, global step 60219: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved by 0.007 >= min_delta = 0.001. New best score: 0.455
Epoch 9, global step 66910: 'val_f1_macro' reached 0.45523 (best 0.45523), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_3\\best-checkpoint-epoch=09-val_f1_macro=0.455.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 10, global step 73601: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 11, global step 80292: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 12, global step 86983: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 13, global step 93674: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved by 0.019 >= min_delta = 0.001. New best score: 0.475
Epoch 14, global step 100365: 'val_f1_macro' reached 0.47453 (best 0.47453), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_3\\best-checkpoint-epoch=14-val_f1_macro=0.475.ckpt' as top 1
`Trainer.fit` stopped: `max_epochs=15` reached.


Loading best model for inference...
Running manual validation predictions...
Processed 0/2178 validation batches
Processed 100/2178 validation batches
Processed 200/2178 validation batches
Processed 300/2178 validation batches
Processed 400/2178 validation batches
Processed 500/2178 validation batches
Processed 600/2178 validation batches
Processed 700/2178 validation batches
Processed 800/2178 validation batches
Processed 900/2178 validation batches
Processed 1000/2178 validation batches
Processed 1100/2178 validation batches
Processed 1200/2178 validation batches
Processed 1300/2178 validation batches
Processed 1400/2178 validation batches
Processed 1500/2178 validation batches
Processed 1600/2178 validation batches
Processed 1700/2178 validation batches
Processed 1800/2178 validation batches
Processed 1900/2178 validation batches
Processed 2000/2178 validation batches
Processed 2100/2178 validation batches
Fold 3 Best Macro F1: 0.47452


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Fold 3 completed successfully!

FOLD 4/5
Train samples: 463241, Val samples: 104368
DEBUGGING MODEL AND DATA
=== DEBUGGING NAN ISSUES ===
✓ Model parameters are clean (no NaN)
Checking first batch of training data...
Batch size: 64
Static data shape: torch.Size([64, 6])
Static data range: [0.0000, 1465350.0000]
Static data is clean
No infinite values in static data
✓ Sequence data is clean
Testing forward pass...
Model forward pass successful
Logits range: [-2.0370, 1.5383]
Starting training for fold 4...


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Project\Micosoft Security Indicent\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
c:\Project\Micosoft Security Indicent\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved. New best score: 0.443
Epoch 0, global step 7238: 'val_f1_macro' reached 0.44324 (best 0.44324), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_4\\best-checkpoint-epoch=00-val_f1_macro=0.443.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 1, global step 14476: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 2, global step 21714: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 3, global step 28952: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved by 0.019 >= min_delta = 0.001. New best score: 0.462
Epoch 4, global step 36190: 'val_f1_macro' reached 0.46184 (best 0.46184), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_4\\best-checkpoint-epoch=04-val_f1_macro=0.462.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 5, global step 43428: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved by 0.002 >= min_delta = 0.001. New best score: 0.464
Epoch 6, global step 50666: 'val_f1_macro' reached 0.46396 (best 0.46396), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_4\\best-checkpoint-epoch=06-val_f1_macro=0.464.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 7, global step 57904: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved by 0.003 >= min_delta = 0.001. New best score: 0.467
Epoch 8, global step 65142: 'val_f1_macro' reached 0.46735 (best 0.46735), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_4\\best-checkpoint-epoch=08-val_f1_macro=0.467.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 9, global step 72380: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 10, global step 79618: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved by 0.002 >= min_delta = 0.001. New best score: 0.469
Epoch 11, global step 86856: 'val_f1_macro' reached 0.46919 (best 0.46919), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_4\\best-checkpoint-epoch=11-val_f1_macro=0.469.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 12, global step 94094: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 13, global step 101332: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 14, global step 108570: 'val_f1_macro' was not in top 1
`Trainer.fit` stopped: `max_epochs=15` reached.


Loading best model for inference...
Running manual validation predictions...
Processed 0/1631 validation batches
Processed 100/1631 validation batches
Processed 200/1631 validation batches
Processed 300/1631 validation batches
Processed 400/1631 validation batches
Processed 500/1631 validation batches
Processed 600/1631 validation batches
Processed 700/1631 validation batches
Processed 800/1631 validation batches
Processed 900/1631 validation batches
Processed 1000/1631 validation batches
Processed 1100/1631 validation batches
Processed 1200/1631 validation batches
Processed 1300/1631 validation batches
Processed 1400/1631 validation batches
Processed 1500/1631 validation batches
Processed 1600/1631 validation batches
Fold 4 Best Macro F1: 0.46922


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Fold 4 completed successfully!

FOLD 5/5
Train samples: 416046, Val samples: 151563
DEBUGGING MODEL AND DATA
=== DEBUGGING NAN ISSUES ===
✓ Model parameters are clean (no NaN)
Checking first batch of training data...
Batch size: 64
Static data shape: torch.Size([64, 6])
Static data range: [0.0000, 2044799.0000]
Static data is clean
No infinite values in static data
✓ Sequence data is clean
Testing forward pass...
Model forward pass successful
Logits range: [-1.4639, 2.5064]
Starting training for fold 5...


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Project\Micosoft Security Indicent\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
c:\Project\Micosoft Security Indicent\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved. New best score: 0.319
Epoch 0, global step 6500: 'val_f1_macro' reached 0.31927 (best 0.31927), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_5\\best-checkpoint-epoch=00-val_f1_macro=0.319.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved by 0.010 >= min_delta = 0.001. New best score: 0.330
Epoch 1, global step 13000: 'val_f1_macro' reached 0.32964 (best 0.32964), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_5\\best-checkpoint-epoch=01-val_f1_macro=0.330.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 2, global step 19500: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved by 0.005 >= min_delta = 0.001. New best score: 0.334
Epoch 3, global step 26000: 'val_f1_macro' reached 0.33425 (best 0.33425), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_5\\best-checkpoint-epoch=03-val_f1_macro=0.334.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 4, global step 32500: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_f1_macro improved by 0.002 >= min_delta = 0.001. New best score: 0.336
Epoch 5, global step 39000: 'val_f1_macro' reached 0.33644 (best 0.33644), saving model to 'C:\\Project\\Micosoft Security Indicent\\notebooks\\checkpoints\\fold_5\\best-checkpoint-epoch=05-val_f1_macro=0.336.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 6, global step 45500: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 7, global step 52000: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 8, global step 58500: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 9, global step 65000: 'val_f1_macro' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Monitored metric val_f1_macro did not improve in the last 5 records. Best score: 0.336. Signaling Trainer to stop.
Epoch 10, global step 71500: 'val_f1_macro' was not in top 1


Loading best model for inference...
Running manual validation predictions...
Processed 0/2369 validation batches
Processed 100/2369 validation batches
Processed 200/2369 validation batches
Processed 300/2369 validation batches
Processed 400/2369 validation batches
Processed 500/2369 validation batches
Processed 600/2369 validation batches
Processed 700/2369 validation batches
Processed 800/2369 validation batches
Processed 900/2369 validation batches
Processed 1000/2369 validation batches
Processed 1100/2369 validation batches
Processed 1200/2369 validation batches
Processed 1300/2369 validation batches
Processed 1400/2369 validation batches
Processed 1500/2369 validation batches
Processed 1600/2369 validation batches
Processed 1700/2369 validation batches
Processed 1800/2369 validation batches
Processed 1900/2369 validation batches
Processed 2000/2369 validation batches
Processed 2100/2369 validation batches
Processed 2200/2369 validation batches
Processed 2300/2369 validation batches

In [None]:
from sklearn.metrics import classification_report, accuracy_score

valid_mask = (oof_preds != -1) & (oof_labels != -1)  
print(f"Total samples in dataset: {len(dataset)}")
print(f"Samples with predictions: {np.sum(valid_mask)}")
print(f"Missing predictions: {len(dataset) - np.sum(valid_mask)}")

overall_f1 = f1_score(oof_labels[valid_mask], oof_preds[valid_mask], average='macro')
overall_accuracy = accuracy_score(oof_labels[valid_mask], oof_preds[valid_mask])

print(f"\n{'='*60}")
print(f"FINAL RESULTS")
print(f"{'='*60}")
print(f"Overall Out-of-Fold Macro F1: {overall_f1:.5f}")
print(f"Overall Accuracy: {overall_accuracy:.5f}")
print(f"Total samples processed: {np.sum(valid_mask)}")


Total samples in dataset: 567609
Samples with predictions: 567609
Missing predictions: 0

FINAL RESULTS
Overall Out-of-Fold Macro F1: 0.42694
Overall Accuracy: 0.48395
Total samples processed: 567609


In [20]:
# Detailed classification report
print("CLASSIFICATION REPORT")
print(classification_report(
    oof_labels[valid_mask], 
    oof_preds[valid_mask], 
    target_names=dataset.class_names,
    digits=4
))

# Confusion matrix for additional insights
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(oof_labels[valid_mask], oof_preds[valid_mask])
print("CONFUSION MATRIX")
print("Rows = Actual, Columns = Predicted")
print(f"Classes: {dataset.class_names}")
print(cm)

# Per-fold performance breakdown
print("PER-FOLD BREAKDOWN")


CLASSIFICATION REPORT
                precision    recall  f1-score   support

  TruePositive     0.2511    0.2280    0.2390    129343
BenignPositive     0.5472    0.6986    0.6137    260495
 FalsePositive     0.5377    0.3557    0.4281    177771

      accuracy                         0.4839    567609
     macro avg     0.4453    0.4274    0.4269    567609
  weighted avg     0.4767    0.4839    0.4702    567609

CONFUSION MATRIX
Rows = Actual, Columns = Predicted
Classes: Index(['TruePositive', 'BenignPositive', 'FalsePositive'], dtype='object')
[[ 29494  80431  19418]
 [ 43573 181970  34952]
 [ 44389  70154  63228]]
PER-FOLD BREAKDOWN
