In [1]:
import polars as pl
import numpy as np
import pickle
from pathlib import Path

In [2]:
# Input locations, relative to the notebook's working directory.
RAW_TRAIN_PATH = '../data/raw/GUIDE_Train.parquet'
PROCESSED_INCIDENT_FEATURES_PATH = '../data/processed/incident_features.parquet'

# Output directory for the artifacts this notebook writes (e.g. vocabularies.pkl).
DL_PROCESSED_DATA_DIR = Path('../data/processed_dl/')
# parents=True: without it mkdir raises FileNotFoundError when '../data' does
# not exist yet (fresh checkout); exist_ok=True keeps the cell re-runnable.
DL_PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

# Fixed length that every per-incident token sequence is truncated/padded to.
MAX_SEQ_LENGTH = 128

In [None]:
print("--- Phase 4.1: Preparing Data for Sequence Modeling ---")

# Step 1: read both parquet inputs eagerly, raw events first.
print("Loading raw and processed data...")
raw_df, incident_features_df = (
    pl.read_parquet(parquet_path)
    for parquet_path in (RAW_TRAIN_PATH, PROCESSED_INCIDENT_FEATURES_PATH)
)

# Columns of raw_df that are turned into vocabularies and token sequences.
sequential_features = ['Category', 'DetectorId', 'EntityType', 'MitreTechniques']

# Column names treated as static features.
# NOTE(review): presumably columns of incident_features_df; their use is not
# visible in this part of the notebook -- confirm.
static_features = [
    'OrgId', 'evidence_count', 'unique_alert_count',
    'incident_duration_seconds', 'evidence_rate', 'alert_rate',
]


--- Phase 4.1: Preparing Data for Sequence Modeling ---
Loading raw and processed data...


In [4]:
print("Creating vocabularies for sequential features...")

# Maps each sequential feature name to its {value: token_id} dictionary.
vocabularies = {}
for col in sequential_features:
    # Series.unique() gives no ordering guarantee, so sort the distinct values:
    # otherwise the value -> id mapping (and hence the pickled vocabulary) can
    # differ from one run to the next.
    unique_vals = raw_df[col].fill_null('[NULL]').unique().sort().to_list()
    # Real values get ids starting at 1; id 0 is reserved for padding.
    vocab = {val: token_id for token_id, val in enumerate(unique_vals, start=1)}
    vocab['[PAD]'] = 0  # Padding token
    vocabularies[col] = vocab
    # The reported size includes the '[PAD]' entry.
    print(f"  Vocabulary for '{col}' has {len(vocab)} unique tokens.")

# Persist the mappings so the same token ids can be reused later.
with open(DL_PROCESSED_DATA_DIR / 'vocabularies.pkl', 'wb') as f:
    pickle.dump(vocabularies, f)


Creating vocabularies for sequential features...
  Vocabulary for 'Category' has 21 unique tokens.
  Vocabulary for 'DetectorId' has 8429 unique tokens.
  Vocabulary for 'EntityType' has 34 unique tokens.
  Vocabulary for 'MitreTechniques' has 1195 unique tokens.


In [None]:
print("Grouping by IncidentId and creating tokenized sequences...")

def tokenize_and_pad_list(values_list: list, vocab: dict, max_len: "int | None" = None) -> list:
    """Convert a sequence of raw values into a fixed-length list of token ids.

    The first ``max_len`` values are looked up in ``vocab``; anything beyond
    that is dropped. Shorter sequences are right-padded with the padding id.

    Args:
        values_list: Ordered raw values for one group (anything that supports
            slicing and iteration).
        vocab: Mapping from raw value to token id. Must contain ``'[PAD]'``.
        max_len: Target length of the result. Defaults to the module-level
            ``MAX_SEQ_LENGTH`` when omitted.

    Returns:
        A list of exactly ``max_len`` token ids.

    Raises:
        KeyError: If ``vocab`` has no ``'[PAD]'`` entry.
    """
    if max_len is None:
        max_len = MAX_SEQ_LENGTH
    pad_id = vocab['[PAD]']
    # Truncate before the lookup so overly long groups are not fully tokenized.
    # Values missing from the vocabulary collapse onto the padding id.
    tokens = [vocab.get(val, pad_id) for val in values_list[:max_len]]
    return tokens + [pad_id] * (max_len - len(tokens))

# Only the grouping key, the ordering key and the tokenized columns are needed
# here; selecting them first avoids sorting every column of the raw frame.
sequence_columns = ['IncidentId', 'Timestamp', *sequential_features]

# One row per IncidentId, with each sequential feature collected into a list
# that follows the (IncidentId, Timestamp) sort order.
sequential_data = (
    raw_df.select(sequence_columns)
          .sort(['IncidentId', 'Timestamp'])
          # maintain_order=True makes the row order of the result reproducible
          # across runs instead of depending on the hash-grouping order.
          .group_by('IncidentId', maintain_order=True)
          .agg([
              pl.col(col).fill_null('[NULL]').alias(f'{col}_list')
              for col in sequential_features
          ])
)

# Replace each '<col>_list' column with a fixed-length '<col>_seq' id column.
for col in sequential_features:
    sequential_data = sequential_data.with_columns(
        pl.col(f'{col}_list').map_elements(
            # Bind the vocabulary as a default argument so the lambda does not
            # depend on the loop variable's value at call time.
            lambda values_list, vocab=vocabularies[col]: tokenize_and_pad_list(values_list, vocab),
            return_dtype=pl.List(pl.Int32)
        ).alias(f'{col}_seq')
    ).drop(f'{col}_list')

print("Tokenization complete.")
print("Shape of sequential data:", sequential_data.shape)

Grouping by IncidentId and creating tokenized sequences...
Tokenization complete.
Shape of sequential data: (466151, 5)


In [None]:
print("Preparing static features and labels...")