In [1]:
import polars as pl
import lightgbm as lgb
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pandas as pd

In [2]:
PROCESSED_FEATURES_PATH = '../data/processed/incident_features.parquet'
df = pl.read_parquet(PROCESSED_FEATURES_PATH)

raw_lazy_df = pl.scan_parquet('../data/raw/GUIDE_Train.parquet')
org_id_map = raw_lazy_df.select(['IncidentId', 'OrgId']).unique(subset=['IncidentId']).collect()
df = df.join(org_id_map, on='IncidentId', how='left')

df = df.fill_null(0)

print("Starting Model Training")
print(f"Loaded feature matrix with shape: {df.shape}")

Starting Model Training
Loaded feature matrix with shape: (567609, 22)


In [4]:
df.columns

['IncidentId',
 'IncidentGrade',
 'evidence_count',
 'unique_alert_count',
 'unique_entity_type_count',
 'unique_detector_id_count',
 'unique_mitre_techniques_count',
 'unique_org_id_count',
 'incident_duration_seconds',
 'entity_Ip_count',
 'entity_User_count',
 'entity_MailMessage_count',
 'entity_Machine_count',
 'entity_File_count',
 'category_InitialAccess_count',
 'category_Exfiltration_count',
 'category_SuspiciousActivity_count',
 'category_CommandAndControl_count',
 'category_Impact_count',
 'evidence_rate',
 'alert_rate',
 'OrgId']

In [3]:
df_pd = df.to_pandas()

TARGET = 'IncidentGrade'
features = [col for col in df_pd.columns if col not in ['IncidentId', TARGET, 'first_evidence_ts', 'last_evidence_ts']]
X = df_pd[features]
y_raw = df_pd[TARGET]
groups = df_pd['OrgId']

# Encode the string labels into integers
le = LabelEncoder()
y = le.fit_transform(y_raw)
class_names = le.classes_
print(f"Target classes encoded: {list(zip(range(len(class_names)), class_names))}")

Target classes encoded: [(0, 'BenignPositive'), (1, 'FalsePositive'), (2, 'TruePositive')]


In [None]:
# Set up cross validation
N_SPLITS = 5
cv = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weight_dict = dict(zip(np.unique(y), class_weights))

lgbm = lgb.LGBMClassifier(random_state=42,               
                          n_jobs=-1,
                          objective='multiclass',
                          class_weight=class_weight_dict,
                          num_leaves=100,
                          learning_rate=0.05,
                          feature_fraction=0.8,
                          bagging_fraction=0.8,
                          bagging_freq=5,
                          min_child_samples=100,
                          reg_alpha=0.1,
                          reg_lambda=0.1
)

In [5]:
oof_preds = np.zeros((len(df_pd), len(class_names)))
f1_scores, precision_scores, recall_scores = [], [], []

print(f"\nStarting {N_SPLITS}-Fold Stratified Group Cross Validation...")

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    print(f"=== Fold {fold+1}/{N_SPLITS} ===")

    X_train, y_train = X.iloc[train_idx], y[train_idx]
    X_val, y_val = X.iloc[val_idx], y[val_idx]

    # Train it
    lgbm.fit(X_train, y_train,
             eval_set=[(X_val, y_val)],
             eval_metric='multi_logloss',
             callbacks=[lgb.early_stopping(100, verbose=False)])
    
    # Predict on Validation set
    val_preds = lgbm.predict(X_val)
    val_preds_proba = lgbm.predict_proba(X_val)
    oof_preds[val_idx] = val_preds_proba

    # Calculate and store metrics
    f1 = f1_score(y_val, val_preds, average='macro')
    precision = precision_score(y_val, val_preds, average='macro')
    recall = recall_score(y_val, val_preds, average='macro')

    f1_scores.append(f1)
    precision_scores.append(precision)
    recall_scores.append(recall)

    print(f"Macro F1 {f1:.5f} | Macro Precision: {precision:.5f} | Macro Recall: {recall:.5f}")


Starting 5-Fold Stratified Group Cross Validation...
=== Fold 1/5 ===
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009256 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3973
[LightGBM] [Info] Number of data points in the train set: 476009, number of used features: 20
[LightGBM] [Info] Start training from score -1.104123
[LightGBM] [Info] Start training from score -1.063905
[LightGBM] [Info] Start training from score -1.128887
Macro F1 0.36108 | Macro Precision: 0.36748 | Macro Recall: 0.38264
=== Fold 2/5 ===
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008847 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3972
[LightGBM] [Info] Number of data points in the train set: 493613, nu

In [6]:
print("Overcall Cross Validation Results")
print(f"Mean Macro F1: {np.mean(f1_scores):.5f} ± {np.std(f1_scores):.5f}")
print(f"Mean Macro Precision: {np.mean(precision_scores):.5f} ± {np.std(precision_scores):.5f}")
print(f"Mean Macro Recall: {np.mean(recall_scores):.5f} ± {np.std(recall_scores):.5}")

Overcall Cross Validation Results
Mean Macro F1: 0.44820 ± 0.05403
Mean Macro Precision: 0.48658 ± 0.07018
Mean Macro Recall: 0.46240 ± 0.049451


In [7]:
from sklearn.metrics import classification_report

oof_pred_classes = np.argmax(oof_preds, axis=1)
print(classification_report(y, oof_pred_classes, target_names=class_names))

                precision    recall  f1-score   support

BenignPositive       0.55      0.59      0.57    260495
 FalsePositive       0.61      0.49      0.54    177771
  TruePositive       0.26      0.29      0.28    129343

      accuracy                           0.49    567609
     macro avg       0.47      0.46      0.46    567609
  weighted avg       0.50      0.49      0.50    567609



# XGBoost

In [15]:
import xgboost as xgb
from sklearn.utils.class_weight import compute_class_weight
import numpy as np


In [None]:
# Calculate class weights for XGBoost
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)

xgb_classifier = xgb.XGBClassifier(
    objective='multi:softprob',  
    num_class=len(class_names),  
    random_state=42,
    n_jobs=-1,
    n_estimators=500,           
    max_depth=8,                
    learning_rate=0.05,         
    reg_alpha=0.1,              
    reg_lambda=0.1,             
    gamma=0.1,                  
    
    subsample=0.8,              
    colsample_bytree=0.8,       
    colsample_bylevel=0.8,      
    
    min_child_weight=5,         
    max_delta_step=1,           
    
    eval_metric='mlogloss'
)


In [None]:
print(f"\nStarting {N_SPLITS}-Fold XGBoost Cross Validation...")

oof_preds_xgb = np.zeros((len(df_pd), len(class_names)))
f1_scores_xgb, precision_scores_xgb, recall_scores_xgb = [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    print(f"=== Fold {fold+1}/{N_SPLITS} ===")

    X_train, y_train = X.iloc[train_idx], y[train_idx]
    X_val, y_val = X.iloc[val_idx], y[val_idx]
    
    sample_weights = np.array([class_weights[label] for label in y_train])
    
    xgb_fold = xgb.XGBClassifier(
        objective='multi:softprob',
        num_class=len(class_names),
        random_state=42,
        n_jobs=-1,
        n_estimators=500,
        max_depth=8,
        learning_rate=0.05,
        reg_alpha=0.1,
        reg_lambda=0.1,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        colsample_bylevel=0.8,
        min_child_weight=5,
        max_delta_step=1,
        eval_metric='mlogloss',
        early_stopping_rounds=50
    )
    
    xgb_fold.fit(
        X_train, y_train,
        sample_weight=sample_weights,
        eval_set=[(X_val, y_val)],
        verbose=False
    )
    
    val_preds = xgb_fold.predict(X_val)
    val_preds_proba = xgb_fold.predict_proba(X_val)
    oof_preds_xgb[val_idx] = val_preds_proba

    f1 = f1_score(y_val, val_preds, average='macro')
    precision = precision_score(y_val, val_preds, average='macro')
    recall = recall_score(y_val, val_preds, average='macro')

    f1_scores_xgb.append(f1)
    precision_scores_xgb.append(precision)
    recall_scores_xgb.append(recall)

    print(f"Macro F1: {f1:.5f} | Macro Precision: {precision:.5f} | Macro Recall: {recall:.5f}")



Starting 5-Fold XGBoost Cross Validation...
=== Fold 1/5 ===
Macro F1: 0.43787 | Macro Precision: 0.44744 | Macro Recall: 0.44152
=== Fold 2/5 ===
Macro F1: 0.42636 | Macro Precision: 0.44785 | Macro Recall: 0.44371
=== Fold 3/5 ===
Macro F1: 0.46694 | Macro Precision: 0.47221 | Macro Recall: 0.46730
=== Fold 4/5 ===
Macro F1: 0.50541 | Macro Precision: 0.51983 | Macro Recall: 0.51048
=== Fold 5/5 ===
Macro F1: 0.45478 | Macro Precision: 0.50010 | Macro Recall: 0.45974


In [18]:
print(f"\n=== XGBoost Cross-Validation Results ===")
print(f"Mean Macro F1: {np.mean(f1_scores_xgb):.5f} ± {np.std(f1_scores_xgb):.5f}")
print(f"Mean Macro Precision: {np.mean(precision_scores_xgb):.5f} ± {np.std(precision_scores_xgb):.5f}")
print(f"Mean Macro Recall: {np.mean(recall_scores_xgb):.5f} ± {np.std(recall_scores_xgb):.5f}")

# Classification report
oof_pred_classes_xgb = np.argmax(oof_preds_xgb, axis=1)
print(f"\n=== XGBoost Classification Report ===")
print(classification_report(y, oof_pred_classes_xgb, target_names=class_names))



=== XGBoost Cross-Validation Results ===
Mean Macro F1: 0.45827 ± 0.02736
Mean Macro Precision: 0.47749 ± 0.02868
Mean Macro Recall: 0.46455 ± 0.02492

=== XGBoost Classification Report ===
                precision    recall  f1-score   support

BenignPositive       0.60      0.54      0.57    260495
 FalsePositive       0.54      0.44      0.49    177771
  TruePositive       0.29      0.43      0.34    129343

      accuracy                           0.48    567609
     macro avg       0.48      0.47      0.47    567609
  weighted avg       0.51      0.48      0.49    567609



In [None]:
# Feature importance analysis
print(f"\n=== Feature Importance Analysis ===")
final_xgb = xgb.XGBClassifier(
    objective='multi:softprob',
    num_class=len(class_names),
    random_state=42,
    n_jobs=-1,
    n_estimators=200,  
    max_depth=8,
    learning_rate=0.05,
    reg_alpha=0.1,
    reg_lambda=0.1,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    colsample_bylevel=0.8,
    min_child_weight=5,
    max_delta_step=1,
    eval_metric='mlogloss'
)

final_sample_weights = np.array([class_weights[label] for label in y])
final_xgb.fit(X, y, sample_weight=final_sample_weights)

feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': final_xgb.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 10 Most Important Features:")
print(feature_importance.head(10))


=== Feature Importance Analysis ===
Top 10 Most Important Features:
                              feature  importance
19                              OrgId    0.153517
12       category_InitialAccess_count    0.104378
0                      evidence_count    0.079487
11                  entity_File_count    0.066091
2            unique_entity_type_count    0.059980
14  category_SuspiciousActivity_count    0.059542
13        category_Exfiltration_count    0.055212
5                 unique_org_id_count    0.055055
17                      evidence_rate    0.053357
7                     entity_Ip_count    0.051994


In [20]:
ensemble_preds = 0.5 * oof_preds + 0.5 * oof_preds_xgb
ensemble_classes = np.argmax(ensemble_preds, axis=1)

print("=== Ensemble Results ===")
print(classification_report(y, ensemble_classes, target_names=class_names))

=== Ensemble Results ===
                precision    recall  f1-score   support

BenignPositive       0.60      0.55      0.57    260495
 FalsePositive       0.60      0.49      0.54    177771
  TruePositive       0.29      0.41      0.34    129343

      accuracy                           0.50    567609
     macro avg       0.50      0.48      0.48    567609
  weighted avg       0.53      0.50      0.51    567609

