In [1]:
import polars as pl
import numpy as np

In [3]:
TRAIN_DATA_PATH = '../data/raw/GUIDE_Train.parquet'
PROCESSED_FEATURES_PATH = '../data/processed/incident_features.parquet'

print("Starting Feature Engineering")

# Load data
lazy_df = pl.scan_parquet(TRAIN_DATA_PATH)

# 3. Create base dataframe for incidents
print("Step 1: Creating base incident table...")
incident_base = (
    lazy_df.select(['IncidentId', 'IncidentGrade']).drop_nulls().unique(subset=['IncidentId', 'IncidentGrade'])
)

# 4. Engineer Count and Diversity Features
print("Step 2: Engineering count and diversity features...")
feature_expressions = [
    pl.len().alias('evidence_count'),
    pl.n_unique('AlertId').alias('unique_alert_count'),
    pl.n_unique('EntityType').alias('unique_entity_type_count'),
    pl.n_unique('DetectorId').alias('unique_detector_id_count'),
    pl.n_unique('MitreTechniques').alias('unique_mitre_techniques_count'),
    pl.n_unique('OrgId').alias('unique_org_id_count'),
]

incident_features = lazy_df.group_by('IncidentId').agg(feature_expressions)

Starting Feature Engineering
Step 1: Creating base incident table...
Step 2: Engineering count and diversity features...


In [4]:
print("Step 3: Joining features to base table...")
incident_base_df = incident_base.collect()
incident_features_df = incident_features.collect()

final_df = incident_base_df.join(incident_features_df, on='IncidentId', how='left')

print(f"Step 4: Saving processed features to {PROCESSED_FEATURES_PATH}...")
final_df.write_parquet(PROCESSED_FEATURES_PATH)

print("\nFeature Engineering Complete...")
print("Shape of the final incident level feature dataframe:", final_df.shape)
print(f"\nFirst 5 rows of the new dataframe:")
print(final_df.head())

Step 3: Joining features to base table...
Step 4: Saving processed features to ../data/processed/incident_features.parquet...

Feature Engineering Complete...
Shape of the final incident level feature dataframe: (567609, 8)

First 5 rows of the new dataframe:
shape: (5, 8)
┌────────────┬────────────┬────────────┬───────────┬───────────┬───────────┬───────────┬───────────┐
│ IncidentId ┆ IncidentGr ┆ evidence_c ┆ unique_al ┆ unique_en ┆ unique_de ┆ unique_mi ┆ unique_or │
│ ---        ┆ ade        ┆ ount       ┆ ert_count ┆ tity_type ┆ tector_id ┆ tre_techn ┆ g_id_coun │
│ i64        ┆ ---        ┆ ---        ┆ ---       ┆ _count    ┆ _count    ┆ iques_cou ┆ t         │
│            ┆ str        ┆ u32        ┆ u32       ┆ ---       ┆ ---       ┆ nt        ┆ ---       │
│            ┆            ┆            ┆           ┆ u32       ┆ u32       ┆ ---       ┆ u32       │
│            ┆            ┆            ┆           ┆           ┆           ┆ u32       ┆           │
╞════════════╪═════

In [None]:
lazy_features_df = pl.scan_parquet(PROCESSED_FEATURES_PATH)
lazy_df = pl.scan_parquet(TRAIN_DATA_PATH)

print("Starting Advanced Feature Engineering")

# 1. Temporal Feature Engineering
print("Step 1: Engineering temporal features...")
temporal_features = (
    lazy_df.with_columns(
        pl.col('Timestamp').str.to_datetime()
    ).group_by('IncidentId').agg(pl.min('Timestamp').alias('first_evidence_ts'), pl.max('Timestamp').alias('last_evidence_ts')).with_columns(
        (pl.col('last_evidence_ts') - pl.col('first_evidence_ts')).dt.total_seconds().alias('incident_duration_seconds') +1
    )
)

# 2. Categorical Aggregation
print("Step 2: Engineering categorical aggregations...")
top_entity_types = ['Ip', 'User', 'MailMessage', 'Machine', 'File']
entity_type_expressions = [
    pl.col('EntityType').filter(pl.col('EntityType') == entity).count().alias(f'entity_{entity}_count')
    for entity in top_entity_types
]

top_categories = ['InitialAccess', 'Exfiltration', 'SuspiciousActivity', 'CommandAndControl', 'Impact']
category_expressions = [
    pl.col('Category').filter(pl.col('Category') == cat).count().alias(f'category_{cat}_count')
    for cat in top_categories
]

categorical_features = lazy_df.group_by('IncidentId').agg(*entity_type_expressions, *category_expressions)

Starting Advanced Feature Engineering
Step 1: Engineering temporal features...
Step 2: Engineering categorical aggregations...


In [None]:
# 3. Join All Features Together ---
print("Step 3: Joining all feature sets...")
# Let's execute and join the new features
temporal_features_df = temporal_features.collect()
categorical_features_df = categorical_features.collect()
base_features_df = lazy_features_df.collect() # Load our first feature set

# Join temporal features
final_df_enhanced = base_features_df.join(temporal_features_df.select(['IncidentId', 'incident_duration_seconds']), on='IncidentId', how='left')

# Join categorical features
final_df_enhanced = final_df_enhanced.join(categorical_features_df, on='IncidentId', how='left')

# --- 4. Create Rate Features ---
# Now that we have duration and counts, we can create rates
print("Step 4: Creating rate features...")
final_df_enhanced = final_df_enhanced.with_columns(
    (pl.col('evidence_count') / pl.col('incident_duration_seconds')).alias('evidence_rate'),
    (pl.col('unique_alert_count') / pl.col('incident_duration_seconds')).alias('alert_rate')
)



Step 3: Joining all feature sets...


ComputeError: `strptime` / `to_datetime` was called with no format and no time zone, but a time zone is part of the data.

This was previously allowed but led to unpredictable and erroneous results. Give a format string, set a time zone or perform the operation eagerly on a Series instead of on an Expr.