In [1]:
import os
import gc
import pickle
import psutil
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
from tqdm import tqdm

import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold


In [2]:
PROCESSED_FEATURES_PATH = '../data/processed/incident_features.parquet'
RAW_FEATURES_PATH = '../data/raw/GUIDE_Train.parquet'
TARGET = 'IncidentGrade'
N_SPLITS = 5
RANDOM_STATE = 42
MAX_FEATURES_VARIANCE = 0.01

OUT_DIR = Path('models')
LGB_DIR = OUT_DIR / 'lightgbm'
XGB_DIR = OUT_DIR / 'xgboost'
ENS_DIR = OUT_DIR / 'ensemble'
for d in (LGB_DIR, XGB_DIR, ENS_DIR):
    d.mkdir(parents=True, exist_ok=True)

TIMESTAMP = datetime.now().strftime('%Y%m%d_%H%M%S')

# Memory monitoring
def print_memory_usage(stage=""):
    process = psutil.Process(os.getpid())
    memory_mb = process.memory_info().rss / 1024 / 1024
    memory_percent = process.memory_percent()
    print(f"[{stage}] Memory: {memory_mb:.1f} MB ({memory_percent:.1f}%)")


# Helpers

In [3]:
def reduce_mem_usage(df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
    """Aggressively downcast numeric columns to save memory."""
    start_mem = df.memory_usage(deep=True).sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtype
        if pd.api.types.is_numeric_dtype(col_type):
            c_min = df[col].min()
            c_max = df[col].max()
            if pd.api.types.is_integer_dtype(col_type):
                if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                else:
                    df[col] = df[col].astype(np.int64)
            else:
                # Always use float32 for better memory and speed
                df[col] = pd.to_numeric(df[col], downcast='float').astype(np.float32)
        elif pd.api.types.is_object_dtype(col_type):
            # Convert to category immediately for memory savings
            nunique = df[col].nunique(dropna=False)
            if nunique < len(df) * 0.5:  # If less than 50% unique
                df[col] = df[col].astype('category')
    
    end_mem = df.memory_usage(deep=True).sum() / 1024 ** 2
    if verbose:
        print(f"Memory usage: {start_mem:.2f} MB -> {end_mem:.2f} MB ({100 * (start_mem - end_mem) / start_mem:.1f}% reduction)")
    return df

def save_pickle(obj, path: Path):
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


# SECTION 1: PREPROCESSING


In [None]:
print('SECTION 1: PREPROCESSING (MEMORY-OPTIMIZED)')

print_memory_usage("Start")

print('Loading parquet files with Polars...')
df_features = pl.read_parquet(PROCESSED_FEATURES_PATH)
print(f"Features shape: {df_features.shape}")

raw_df = pl.read_parquet(RAW_FEATURES_PATH).select([
    'IncidentId', 'OrgId', 'DetectorId', 'AlertTitle', 'EntityType', 'Category'
])
print(f"Raw data shape: {raw_df.shape}")

print('Joining and preprocessing with Polars...')
categorical_cols = ['OrgId', 'DetectorId', 'AlertTitle', 'EntityType', 'Category']
for col in categorical_cols:
    if col in raw_df.columns:
        raw_df = raw_df.with_columns(pl.col(col).fill_null('missing'))

df = df_features.join(raw_df, on='IncidentId', how='left')
print(f"Joined shape: {df.shape}")

for col in categorical_cols:
    if col not in df.columns:
        continue
    
    # Check cardinality
    nuni = df.select(pl.col(col).n_unique()).item()
    if nuni > 1000:
        print(f'High-cardinality: {col} ({nuni} unique). Creating frequency feature.')
        
        # Create frequency feature
        freq_expr = (
            df.select(col)
            .group_by(col)
            .agg(pl.len().alias('freq'))
        )
        df = df.join(freq_expr, on=col, how='left')
        df = df.rename({'freq': f'{col}_freq'})
        
        # Keep only top 100 categories
        top_cats = (
            df.select(col)
            .group_by(col)
            .agg(pl.len().alias('count'))
            .sort('count', descending=True)
            .head(100)
            .select(col)
            .to_series()
            .to_list()
        )
        
        df = df.with_columns(
            pl.when(pl.col(col).is_in(top_cats))
            .then(pl.col(col))
            .otherwise(pl.lit('other'))
            .alias(col)
        )

print_memory_usage("After Polars preprocessing")

SECTION 1: PREPROCESSING (MEMORY-OPTIMIZED)
[Start] Memory: 214.5 MB (1.4%)
Loading parquet files with Polars...
Features shape: (567609, 21)
Raw data shape: (9516837, 6)
Joining and preprocessing with Polars...
Joined shape: (14857136, 26)
High-cardinality: OrgId (5746 unique). Creating frequency feature.
High-cardinality: DetectorId (8149 unique). Creating frequency feature.
High-cardinality: AlertTitle (81172 unique). Creating frequency feature.
[After Polars preprocessing] Memory: 5214.7 MB (33.3%)


In [5]:
print('Converting to pandas...')
df = df.to_pandas()

print_memory_usage("After pandas conversion")

# Aggressive memory reduction
print('Reducing memory usage...')
df = reduce_mem_usage(df, verbose=True)

# Convert remaining categoricals
for col in categorical_cols:
    if col in df.columns and df[col].dtype != 'category':
        df[col] = df[col].astype('category')

print_memory_usage("After memory optimization")

# Prepare target
print('Preparing target...')
y, class_names = pd.factorize(df[TARGET])
num_classes = len(np.unique(y))
print(f'Target classes: {num_classes} -> {class_names}')

# Compute class weights more efficiently
unique_y, counts_y = np.unique(y, return_counts=True)
class_weights = len(y) / (num_classes * counts_y)
class_weight_dict = dict(zip(unique_y, class_weights))
print('Class weights:', class_weight_dict)

# Prepare features
FEATURES_DROP = [TARGET, 'IncidentId']
X = df.drop(columns=[c for c in FEATURES_DROP if c in df.columns])


Converting to pandas...
[After pandas conversion] Memory: 7937.2 MB (50.7%)
Reducing memory usage...
Memory usage: 6256.03 MB -> 906.84 MB (85.5% reduction)
[After memory optimization] Memory: 8015.7 MB (51.2%)
Preparing target...
Target classes: 3 -> CategoricalIndex(['BenignPositive', 'FalsePositive', 'TruePositive'], categories=['BenignPositive', 'FalsePositive', 'TruePositive'], ordered=False, dtype='category')
Class weights: {np.int64(0): np.float64(0.8168810045592506), np.int64(1): np.float64(1.2439955596036465), np.int64(2): np.float64(1.0288382301900767)}


In [None]:
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
print(f'Numeric features: {len(numeric_cols)}')

if len(numeric_cols) > 0:
    sample_size = min(10000, len(X))
    if len(X) > sample_size:
        sample_idx = np.random.choice(len(X), sample_size, replace=False)
        X_sample = X.iloc[sample_idx]
    else:
        X_sample = X
    
    selector = VarianceThreshold(threshold=MAX_FEATURES_VARIANCE)
    selector.fit(X_sample[numeric_cols])
    selected_numeric = [c for i, c in enumerate(numeric_cols) if selector.get_support()[i]]
    print(f'Selected numeric features: {len(selected_numeric)}/{len(numeric_cols)}')
else:
    selected_numeric = []

# Final feature set
CATEGORICAL_FEATURES = [c for c in categorical_cols if c in X.columns]
all_selected_features = selected_numeric + CATEGORICAL_FEATURES
X_selected = X[all_selected_features].copy()

print(f'Final features: {X_selected.shape[1]} (numeric: {len(selected_numeric)}, categorical: {len(CATEGORICAL_FEATURES)})')

# Ensure numeric columns are float32
for c in selected_numeric:
    if X_selected[c].dtype != np.float32:
        X_selected[c] = X_selected[c].astype(np.float32)

print_memory_usage("After feature selection")

# Save feature metadata
feature_info = {
    'feature_names': list(X_selected.columns),
    'categorical_features': CATEGORICAL_FEATURES,
    'class_names': list(class_names),
    'class_weights': class_weight_dict,
}
save_pickle(feature_info, OUT_DIR / f'feature_info_{TIMESTAMP}.pkl')

Numeric features: 22
Selected numeric features: 22/22
Final features: 27 (numeric: 22, categorical: 5)
[After feature selection] Memory: 7641.7 MB (48.8%)


In [7]:
# Clean up memory
del df, df_features, raw_df, X
gc.collect()
print_memory_usage("After cleanup")

[After cleanup] Memory: 5999.9 MB (38.3%)


# SECTION 2: LIGHTGBM


In [None]:
print('SECTION 2: LIGHTGBM TRAINING (OPTIMIZED)')

cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

# Pre-allocate OOF array
oof_lgb = np.zeros((len(X_selected), num_classes), dtype=np.float32)
feature_importances_lgb = pd.DataFrame(index=X_selected.columns)

lgb_params = {
    'objective': 'multiclass',
    'num_class': num_classes,
    'boosting_type': 'gbdt',
    'random_state': RANDOM_STATE,
    'n_jobs': 4,  
    'learning_rate': 0.1,  
    'n_estimators': 1000,  
    'num_leaves': 31,
    'max_depth': 6,
    'subsample': 0.8,
    'subsample_freq': 5,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'min_child_samples': 20,
    'min_child_weight': 0.001,
    'min_split_gain': 0.0,
    'class_weight': 'balanced',  
    'verbose': -1,
    'force_col_wise': True,  
    'max_bin': 255,  
}

SECTION 2: LIGHTGBM TRAINING (OPTIMIZED)


In [None]:
for fold, (train_idx, val_idx) in enumerate(tqdm(cv.split(X_selected, y), total=N_SPLITS, desc='LGB folds')):
    print(f"\nLGB FOLD {fold+1}/{N_SPLITS}")
    print_memory_usage(f"Fold {fold+1} start")
    
    # Get fold data
    X_train = X_selected.iloc[train_idx]
    X_val = X_selected.iloc[val_idx]
    y_train = y[train_idx]
    y_val = y[val_idx]
    
    print(f"Train: {X_train.shape}, Val: {X_val.shape}")
    
    lgbm = lgb.LGBMClassifier(**lgb_params)
    
    print("Starting LightGBM training...")
    start_time = datetime.now()
    
    callbacks = [
        lgb.early_stopping(stopping_rounds=50, verbose=False),
        lgb.log_evaluation(period=50)  
    ]
    
    lgbm.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='multi_logloss',
        categorical_feature=CATEGORICAL_FEATURES,
        callbacks=callbacks
    )
    
    train_time = datetime.now() - start_time
    print(f"Training completed in: {train_time}")
    
    # Predict OOF
    val_proba = lgbm.predict_proba(X_val)
    oof_lgb[val_idx] = val_proba
    
    # Feature importance
    importance = lgbm.feature_importances_
    feature_importances_lgb[f'fold_{fold+1}'] = importance
    
    # Save model
    lgb_model_path = LGB_DIR / f'lgb_fold{fold+1}_{TIMESTAMP}.pkl'
    save_pickle(lgbm, lgb_model_path)
    print(f'Saved LightGBM fold {fold+1}')
    
    # Evaluate
    fold_pred = np.argmax(val_proba, axis=1)
    f1_macro = f1_score(y_val, fold_pred, average='macro')
    f1_weighted = f1_score(y_val, fold_pred, average='weighted')
    print(f'Fold {fold+1} - Macro F1: {f1_macro:.5f}, Weighted F1: {f1_weighted:.5f}')
    
    # Show top features
    imp_series = pd.Series(importance, index=X_selected.columns).sort_values(ascending=False)
    print(f'\nTop 10 features: {list(imp_series.head(10).index)}')
    
    # Clean up
    del lgbm, X_train, X_val, y_train, y_val, val_proba, importance
    gc.collect()
    print_memory_usage(f"Fold {fold+1} end")


LGB folds:   0%|          | 0/5 [00:00<?, ?it/s]


LGB FOLD 1/5
[Fold 1 start] Memory: 5780.3 MB (36.9%)
Train: (11885708, 27), Val: (2971428, 27)
Starting LightGBM training...
[50]	valid_0's multi_logloss: 0.756767
[100]	valid_0's multi_logloss: 0.733933
[150]	valid_0's multi_logloss: 0.720216
[200]	valid_0's multi_logloss: 0.709568
[250]	valid_0's multi_logloss: 0.702159
[300]	valid_0's multi_logloss: 0.696551
[350]	valid_0's multi_logloss: 0.691461
[400]	valid_0's multi_logloss: 0.687267
[450]	valid_0's multi_logloss: 0.683842
[500]	valid_0's multi_logloss: 0.68089
[550]	valid_0's multi_logloss: 0.678308
[600]	valid_0's multi_logloss: 0.675998
[650]	valid_0's multi_logloss: 0.673934
[700]	valid_0's multi_logloss: 0.67196
[750]	valid_0's multi_logloss: 0.670317
[800]	valid_0's multi_logloss: 0.668782
[850]	valid_0's multi_logloss: 0.667342
[900]	valid_0's multi_logloss: 0.665996
[950]	valid_0's multi_logloss: 0.664731
[1000]	valid_0's multi_logloss: 0.663623
Training completed in: 0:19:39.761328
Saved LightGBM fold 1
Fold 1 - Macro 

LGB folds:  20%|██        | 1/5 [21:22<1:25:30, 1282.60s/it]

[Fold 1 end] Memory: 711.5 MB (4.5%)

LGB FOLD 2/5
[Fold 2 start] Memory: 881.6 MB (5.6%)
Train: (11885709, 27), Val: (2971427, 27)
Starting LightGBM training...
[50]	valid_0's multi_logloss: 0.756348
[100]	valid_0's multi_logloss: 0.73349
[150]	valid_0's multi_logloss: 0.719345
[200]	valid_0's multi_logloss: 0.708931
[250]	valid_0's multi_logloss: 0.701454
[300]	valid_0's multi_logloss: 0.695607
[350]	valid_0's multi_logloss: 0.690707
[400]	valid_0's multi_logloss: 0.686649
[450]	valid_0's multi_logloss: 0.683143
[500]	valid_0's multi_logloss: 0.67995
[550]	valid_0's multi_logloss: 0.677355
[600]	valid_0's multi_logloss: 0.674801
[650]	valid_0's multi_logloss: 0.672727
[700]	valid_0's multi_logloss: 0.6707
[750]	valid_0's multi_logloss: 0.669007
[800]	valid_0's multi_logloss: 0.667504
[850]	valid_0's multi_logloss: 0.666117
[900]	valid_0's multi_logloss: 0.664736
[950]	valid_0's multi_logloss: 0.663622
[1000]	valid_0's multi_logloss: 0.662503
Training completed in: 0:18:29.017962
Save

LGB folds:  40%|████      | 2/5 [41:16<1:01:30, 1230.22s/it]

Fold 2 - Macro F1: 0.61951, Weighted F1: 0.61881

Top 10 features: ['OrgId', 'OrgId_freq', 'DetectorId', 'incident_duration_seconds', 'AlertTitle', 'evidence_rate', 'evidence_count', 'category_InitialAccess_count', 'alert_rate', 'unique_entity_type_count']
[Fold 2 end] Memory: 880.0 MB (5.6%)

LGB FOLD 3/5
[Fold 3 start] Memory: 1132.7 MB (7.2%)
Train: (11885709, 27), Val: (2971427, 27)
Starting LightGBM training...
[50]	valid_0's multi_logloss: 0.756882
[100]	valid_0's multi_logloss: 0.733314
[150]	valid_0's multi_logloss: 0.718768
[200]	valid_0's multi_logloss: 0.708276
[250]	valid_0's multi_logloss: 0.70143
[300]	valid_0's multi_logloss: 0.695659
[350]	valid_0's multi_logloss: 0.69089
[400]	valid_0's multi_logloss: 0.68701
[450]	valid_0's multi_logloss: 0.683276
[500]	valid_0's multi_logloss: 0.680243
[550]	valid_0's multi_logloss: 0.677489
[600]	valid_0's multi_logloss: 0.675113
[650]	valid_0's multi_logloss: 0.673064
[700]	valid_0's multi_logloss: 0.671148
[750]	valid_0's multi_lo

LGB folds:  60%|██████    | 3/5 [59:57<39:20, 1180.49s/it]  

Fold 3 - Macro F1: 0.62012, Weighted F1: 0.61943

Top 10 features: ['OrgId', 'OrgId_freq', 'DetectorId', 'incident_duration_seconds', 'AlertTitle', 'evidence_rate', 'evidence_count', 'category_InitialAccess_count', 'alert_rate', 'unique_entity_type_count']
[Fold 3 end] Memory: 2055.1 MB (13.1%)

LGB FOLD 4/5
[Fold 4 start] Memory: 2055.1 MB (13.1%)
Train: (11885709, 27), Val: (2971427, 27)
Starting LightGBM training...
[50]	valid_0's multi_logloss: 0.756676
[100]	valid_0's multi_logloss: 0.732913
[150]	valid_0's multi_logloss: 0.718369
[200]	valid_0's multi_logloss: 0.708527
[250]	valid_0's multi_logloss: 0.701409
[300]	valid_0's multi_logloss: 0.695501
[350]	valid_0's multi_logloss: 0.691001
[400]	valid_0's multi_logloss: 0.68718
[450]	valid_0's multi_logloss: 0.683962
[500]	valid_0's multi_logloss: 0.68089
[550]	valid_0's multi_logloss: 0.678355
[600]	valid_0's multi_logloss: 0.675766
[650]	valid_0's multi_logloss: 0.673499
[700]	valid_0's multi_logloss: 0.671584
[750]	valid_0's mult

LGB folds:  80%|████████  | 4/5 [1:22:37<20:51, 1251.56s/it]


LGB FOLD 5/5
[Fold 5 start] Memory: 919.1 MB (5.9%)
Train: (11885709, 27), Val: (2971427, 27)
Starting LightGBM training...
[50]	valid_0's multi_logloss: 0.75586
[100]	valid_0's multi_logloss: 0.732254
[150]	valid_0's multi_logloss: 0.718298
[200]	valid_0's multi_logloss: 0.708338
[250]	valid_0's multi_logloss: 0.700654
[300]	valid_0's multi_logloss: 0.694826
[350]	valid_0's multi_logloss: 0.690144
[400]	valid_0's multi_logloss: 0.686526
[450]	valid_0's multi_logloss: 0.683024
[500]	valid_0's multi_logloss: 0.679993
[550]	valid_0's multi_logloss: 0.67736
[600]	valid_0's multi_logloss: 0.674944
[650]	valid_0's multi_logloss: 0.672742
[700]	valid_0's multi_logloss: 0.67082
[750]	valid_0's multi_logloss: 0.668983
[800]	valid_0's multi_logloss: 0.667524
[850]	valid_0's multi_logloss: 0.66619
[900]	valid_0's multi_logloss: 0.664913
[950]	valid_0's multi_logloss: 0.66372
[1000]	valid_0's multi_logloss: 0.662426
Training completed in: 0:19:12.547230
Saved LightGBM fold 5


LGB folds: 100%|██████████| 5/5 [1:43:30<00:00, 1242.07s/it]

Fold 5 - Macro F1: 0.61978, Weighted F1: 0.61898

Top 10 features: ['OrgId', 'OrgId_freq', 'DetectorId', 'incident_duration_seconds', 'AlertTitle', 'evidence_rate', 'evidence_count', 'category_InitialAccess_count', 'alert_rate', 'unique_entity_type_count']
[Fold 5 end] Memory: 769.3 MB (4.9%)





In [10]:
# Overall results
oof_lgb_preds = np.argmax(oof_lgb, axis=1)
overall_f1_macro_lgb = f1_score(y, oof_lgb_preds, average='macro')
overall_f1_weighted_lgb = f1_score(y, oof_lgb_preds, average='weighted')

print("LightGBM Overall Results")
print(f'Overall Macro F1: {overall_f1_macro_lgb:.5f}')
print(f"Overall Weighted F1: {overall_f1_weighted_lgb:.5f}")

LightGBM Overall Results
Overall Macro F1: 0.61963
Overall Weighted F1: 0.61889


In [11]:
feature_importances_lgb['mean'] = feature_importances_lgb.mean(axis=1)
feature_importances_lgb['std'] = feature_importances_lgb.std(axis=1)
top_features_lgb = feature_importances_lgb.sort_values('mean', ascending=False)

print('Top 20 LightGBM features:')
for i, (feat, row) in enumerate(top_features_lgb.head(20).iterrows(), 1):
    print(f"{i:2d}. {feat:<30} {row['mean']:8.2f} (±{row['std']:5.2f})")

Top 20 LightGBM features:
 1. OrgId                          10126.00 (±41.29)
 2. OrgId_freq                      6666.20 (±63.32)
 3. DetectorId                      5746.00 (±46.89)
 4. incident_duration_seconds       5443.00 (±112.79)
 5. AlertTitle                      4872.80 (±70.26)
 6. evidence_rate                   4801.80 (±70.75)
 7. evidence_count                  4224.80 (±51.89)
 8. alert_rate                      3571.20 (±49.60)
 9. category_InitialAccess_count    3517.00 (±45.20)
10. unique_entity_type_count        3286.20 (±40.52)
11. entity_User_count               2980.00 (±67.44)
12. entity_MailMessage_count        2879.40 (±58.48)
13. unique_org_id_count             2846.20 (±29.34)
14. entity_Ip_count                 2810.20 (±57.58)
15. entity_Machine_count            2614.00 (±31.55)
16. unique_alert_count              2396.20 (±26.41)
17. category_SuspiciousActivity_count  2382.20 (±25.36)
18. unique_detector_id_count        2378.20 (±19.95)
19. entity_File_

In [12]:
# Save it
np.save(LGB_DIR / f'oof_lgb_{TIMESTAMP}.npy', oof_lgb)
save_pickle(feature_importances_lgb, LGB_DIR / f'feature_importances_lgb_{TIMESTAMP}.pkl')

# XGBOOST

In [8]:
print('Section 3: XGBOOST Training')

cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
oof_xgb = np.zeros((len(X_selected), num_classes), dtype=np.float32)
feature_importances_xgb = pd.DataFrame(index=X_selected.columns)

xgb_params = {
    'objective': 'multi:softprob',
    'num_class': num_classes,
    'random_state': RANDOM_STATE,
    'learning_rate': 0.1,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'min_child_weight': 5,
    'gamma': 0.1,
    'tree_method': 'hist',    
    'device': 'cuda',         
    'max_bin': 256,
    'eval_metric': 'mlogloss'
}



Section 3: XGBOOST Training


In [None]:
for fold, (train_idx, val_idx) in enumerate(tqdm(cv.split(X_selected, y), total=N_SPLITS, desc='XGB folds')):
    print(f"XGB FOLD {fold+1}/{N_SPLITS}")
    print_memory_usage(f"XGB FOLD {fold+1} start")
    
    X_train = X_selected.iloc[train_idx].copy()
    X_val = X_selected.iloc[val_idx].copy()
    y_train = y[train_idx]
    y_val = y[val_idx]

    # Label encoding categorical features
    label_encoders = {}
    for col in CATEGORICAL_FEATURES:
        if col in X_train.columns:
            le = LabelEncoder()
            combined_data = pd.concat([X_train[col].astype(str), X_val[col].astype(str)])
            le.fit(combined_data)
            X_train[col] = le.transform(X_train[col].astype(str))
            X_val[col] = le.transform(X_val[col].astype(str))
            label_encoders[col] = le

    for c in X_train.select_dtypes(include=[np.number]).columns:
        X_train[c] = X_train[c].astype(np.float32)
        X_val[c] = X_val[c].astype(np.float32)

    # Convert to DMatrix
    dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=X_train.columns.tolist())
    dval = xgb.DMatrix(X_val, label=y_val, feature_names=X_val.columns.tolist())
    evals = [(dtrain, 'train'), (dval, 'eval')]
    
    print("Starting XGBoost training...")
    start_time = datetime.now()

    booster = xgb.train(
        params=xgb_params,
        dtrain=dtrain,
        num_boost_round=1000,          
        evals=evals,
        early_stopping_rounds=50,
        verbose_eval=50
    )
    
    train_time = datetime.now() - start_time
    print(f"XGBoost training completed in: {train_time}")

    # Predict OOF
    xgb_val_proba = booster.predict(dval)
    oof_xgb[val_idx] = xgb_val_proba

    # Feature importance
    importance_dict = booster.get_score(importance_type='weight')
    importance = pd.Series(importance_dict, index=X_train.columns).reindex(X_selected.columns).fillna(0)
    feature_importances_xgb[f'fold_{fold+1}'] = importance
    
    # Save model + encoders
    xgb_pkl_path = XGB_DIR / f'xgb_fold{fold+1}_{TIMESTAMP}.pkl'
    save_pickle({'model': booster, 'label_encoders': label_encoders}, xgb_pkl_path)
    print(f'Saved XGBoost fold {fold+1}')
    
    # Evaluate
    fold_pred = np.argmax(xgb_val_proba, axis=1)
    f1_macro = f1_score(y_val, fold_pred, average='macro')
    f1_weighted = f1_score(y_val, fold_pred, average='weighted')
    print(f'Fold {fold+1} - Macro F1: {f1_macro:.5f}, Weighted F1: {f1_weighted:.5f}')
    
    # Clean up
    del booster, X_train, X_val, y_train, y_val, dtrain, dval, label_encoders
    gc.collect()
    print_memory_usage(f"XGB Fold {fold+1} end")


XGB folds:   0%|          | 0/5 [00:00<?, ?it/s]

XGB FOLD 1/5
[XGB FOLD 1 start] Memory: 6286.3 MB (40.1%)
Starting XGBoost training...
[0]	train-mlogloss:1.06491	eval-mlogloss:1.06494
[50]	train-mlogloss:0.78980	eval-mlogloss:0.79016
[100]	train-mlogloss:0.75778	eval-mlogloss:0.75828
[150]	train-mlogloss:0.73757	eval-mlogloss:0.73814
[200]	train-mlogloss:0.72360	eval-mlogloss:0.72422
[250]	train-mlogloss:0.71235	eval-mlogloss:0.71304
[300]	train-mlogloss:0.70388	eval-mlogloss:0.70463
[350]	train-mlogloss:0.69769	eval-mlogloss:0.69848
[400]	train-mlogloss:0.69220	eval-mlogloss:0.69304
[450]	train-mlogloss:0.68748	eval-mlogloss:0.68837
[500]	train-mlogloss:0.68350	eval-mlogloss:0.68443
[550]	train-mlogloss:0.67986	eval-mlogloss:0.68086
[600]	train-mlogloss:0.67650	eval-mlogloss:0.67757
[650]	train-mlogloss:0.67372	eval-mlogloss:0.67486
[700]	train-mlogloss:0.67103	eval-mlogloss:0.67223
[750]	train-mlogloss:0.66849	eval-mlogloss:0.66975
[800]	train-mlogloss:0.66609	eval-mlogloss:0.66741
[850]	train-mlogloss:0.66406	eval-mlogloss:0.6654

XGB folds:  20%|██        | 1/5 [03:07<12:29, 187.46s/it]

[XGB Fold 1 end] Memory: 5157.9 MB (32.9%)
XGB FOLD 2/5
[XGB FOLD 2 start] Memory: 5398.7 MB (34.5%)
Starting XGBoost training...
[0]	train-mlogloss:1.06543	eval-mlogloss:1.06543
[50]	train-mlogloss:0.79014	eval-mlogloss:0.78991
[100]	train-mlogloss:0.75587	eval-mlogloss:0.75569
[150]	train-mlogloss:0.73668	eval-mlogloss:0.73656
[200]	train-mlogloss:0.72255	eval-mlogloss:0.72250
[250]	train-mlogloss:0.71234	eval-mlogloss:0.71234
[300]	train-mlogloss:0.70375	eval-mlogloss:0.70380
[350]	train-mlogloss:0.69729	eval-mlogloss:0.69741
[400]	train-mlogloss:0.69178	eval-mlogloss:0.69198
[450]	train-mlogloss:0.68734	eval-mlogloss:0.68760
[500]	train-mlogloss:0.68318	eval-mlogloss:0.68351
[550]	train-mlogloss:0.67972	eval-mlogloss:0.68011
[600]	train-mlogloss:0.67634	eval-mlogloss:0.67680
[650]	train-mlogloss:0.67329	eval-mlogloss:0.67381
[700]	train-mlogloss:0.67079	eval-mlogloss:0.67138
[750]	train-mlogloss:0.66811	eval-mlogloss:0.66878
[800]	train-mlogloss:0.66582	eval-mlogloss:0.66654
[850]	

XGB folds:  40%|████      | 2/5 [06:14<09:21, 187.03s/it]

Fold 2 - Macro F1: 0.60746, Weighted F1: 0.61759
[XGB Fold 2 end] Memory: 6320.0 MB (40.4%)
XGB FOLD 3/5
[XGB FOLD 3 start] Memory: 6480.0 MB (41.4%)
Starting XGBoost training...
[0]	train-mlogloss:1.06492	eval-mlogloss:1.06494
[50]	train-mlogloss:0.78841	eval-mlogloss:0.78865
[100]	train-mlogloss:0.75606	eval-mlogloss:0.75629
[150]	train-mlogloss:0.73486	eval-mlogloss:0.73508
[200]	train-mlogloss:0.72230	eval-mlogloss:0.72255
[250]	train-mlogloss:0.71207	eval-mlogloss:0.71235
[300]	train-mlogloss:0.70355	eval-mlogloss:0.70390
[350]	train-mlogloss:0.69703	eval-mlogloss:0.69743
[400]	train-mlogloss:0.69203	eval-mlogloss:0.69248
[450]	train-mlogloss:0.68739	eval-mlogloss:0.68790
[500]	train-mlogloss:0.68331	eval-mlogloss:0.68387
[550]	train-mlogloss:0.67972	eval-mlogloss:0.68034
[600]	train-mlogloss:0.67659	eval-mlogloss:0.67727
[650]	train-mlogloss:0.67374	eval-mlogloss:0.67447
[700]	train-mlogloss:0.67093	eval-mlogloss:0.67172
[750]	train-mlogloss:0.66836	eval-mlogloss:0.66921
[800]	tr

XGB folds:  60%|██████    | 3/5 [09:19<06:12, 186.38s/it]

XGB FOLD 4/5
[XGB FOLD 4 start] Memory: 6438.7 MB (41.1%)
Starting XGBoost training...
[0]	train-mlogloss:1.06549	eval-mlogloss:1.06554
[50]	train-mlogloss:0.78975	eval-mlogloss:0.78998
[100]	train-mlogloss:0.75824	eval-mlogloss:0.75855
[150]	train-mlogloss:0.73669	eval-mlogloss:0.73706
[200]	train-mlogloss:0.72268	eval-mlogloss:0.72311
[250]	train-mlogloss:0.71145	eval-mlogloss:0.71198
[300]	train-mlogloss:0.70319	eval-mlogloss:0.70377
[350]	train-mlogloss:0.69651	eval-mlogloss:0.69714
[400]	train-mlogloss:0.69135	eval-mlogloss:0.69201
[450]	train-mlogloss:0.68653	eval-mlogloss:0.68723
[500]	train-mlogloss:0.68266	eval-mlogloss:0.68341
[550]	train-mlogloss:0.67931	eval-mlogloss:0.68011
[600]	train-mlogloss:0.67623	eval-mlogloss:0.67708
[650]	train-mlogloss:0.67331	eval-mlogloss:0.67421
[700]	train-mlogloss:0.67078	eval-mlogloss:0.67173
[750]	train-mlogloss:0.66828	eval-mlogloss:0.66928
[800]	train-mlogloss:0.66601	eval-mlogloss:0.66707
[850]	train-mlogloss:0.66386	eval-mlogloss:0.6649

XGB folds:  80%|████████  | 4/5 [12:34<03:09, 189.59s/it]

[XGB Fold 4 end] Memory: 4834.5 MB (30.9%)
XGB FOLD 5/5
[XGB FOLD 5 start] Memory: 5086.0 MB (32.5%)
Starting XGBoost training...
[0]	train-mlogloss:1.06563	eval-mlogloss:1.06557
[50]	train-mlogloss:0.79048	eval-mlogloss:0.79007
[100]	train-mlogloss:0.75434	eval-mlogloss:0.75392
[150]	train-mlogloss:0.73490	eval-mlogloss:0.73454
[200]	train-mlogloss:0.72187	eval-mlogloss:0.72158
[250]	train-mlogloss:0.71144	eval-mlogloss:0.71125
[300]	train-mlogloss:0.70339	eval-mlogloss:0.70328
[350]	train-mlogloss:0.69695	eval-mlogloss:0.69691
[400]	train-mlogloss:0.69178	eval-mlogloss:0.69180
[450]	train-mlogloss:0.68715	eval-mlogloss:0.68724
[500]	train-mlogloss:0.68283	eval-mlogloss:0.68301
[550]	train-mlogloss:0.67942	eval-mlogloss:0.67965
[600]	train-mlogloss:0.67635	eval-mlogloss:0.67666
[650]	train-mlogloss:0.67339	eval-mlogloss:0.67377
[700]	train-mlogloss:0.67075	eval-mlogloss:0.67120
[750]	train-mlogloss:0.66830	eval-mlogloss:0.66881
[800]	train-mlogloss:0.66600	eval-mlogloss:0.66658
[850]	

XGB folds: 100%|██████████| 5/5 [15:43<00:00, 188.74s/it]


In [10]:
oof_xgb_preds = np.argmax(oof_xgb, axis=1)
overall_f1_macro_xgb = f1_score(y, oof_xgb_preds, average='macro')
overall_f1_weighted_xgb = f1_score(y, oof_xgb_preds, average='weighted')

print("XGBoost Overall Results")
print(f'Overall Macro F1: {overall_f1_macro_xgb:.5f}')
print(f'Overall Weighted F1: {overall_f1_weighted_xgb:.5f}')

XGBoost Overall Results
Overall Macro F1: 0.60774
Overall Weighted F1: 0.61782


In [11]:
feature_importances_xgb["mean_importance"] = feature_importances_xgb.mean(axis=1)

# Sort by importance
top20_features = feature_importances_xgb["mean_importance"].sort_values(ascending=False).head(20)

print("Top 20 Feature Importances:")
print(top20_features)

Top 20 Feature Importances:
OrgId_freq                           15591.2
incident_duration_seconds             9289.0
evidence_count                        8978.8
evidence_rate                         7675.2
OrgId                                 6293.0
category_InitialAccess_count          6017.6
unique_entity_type_count              5956.0
alert_rate                            5886.2
DetectorId_freq                       5558.0
AlertTitle_freq                       5528.0
entity_Ip_count                       5516.8
entity_User_count                     5381.8
unique_alert_count                    5030.4
unique_org_id_count                   4993.2
entity_MailMessage_count              4690.8
unique_detector_id_count              4502.2
entity_Machine_count                  4296.4
entity_File_count                     4078.8
category_SuspiciousActivity_count     3513.8
unique_mitre_techniques_count         3061.6
Name: mean_importance, dtype: float64


In [12]:
# Save the model
np.save(XGB_DIR / f'oof_xgb_{TIMESTAMP}.npy', oof_xgb)
save_pickle(feature_importances_xgb, XGB_DIR / f'feature_importances_xgb_{TIMESTAMP}.pkl')

# Final Ensemble

In [None]:
# Final Ensemble
PROCESSED_FEATURES_PATH = '../data/processed/incident_features.parquet'
RAW_FEATURES_PATH = '../data/raw/GUIDE_Train.parquet'
TARGET = 'IncidentGrade'
OUT_DIR = Path('models')
LGB_DIR = OUT_DIR / 'lightgbm'
XGB_DIR = OUT_DIR / 'xgboost'


In [None]:
print("Recreating the target variable 'y' with the correct join logic...")
df_features = pl.read_parquet(PROCESSED_FEATURES_PATH)
raw_df = pl.read_parquet(RAW_FEATURES_PATH).select([
    'IncidentId', 'OrgId', 'DetectorId', 'AlertTitle', 'EntityType', 'Category'
])

df_for_y = df_features.join(raw_df, on='IncidentId', how='left')

df_pandas = df_for_y.to_pandas()
y, class_names = pd.factorize(df_pandas[TARGET])

print(f"Target variable 'y' is ready. Shape: {y.shape}")


Recreating the target variable 'y' with the correct join logic...
Target variable 'y' is ready. Shape: (14857136,)


In [None]:
import glob
print("\nLoading saved OOF prediction files...")
list_of_lgb_files = glob.glob(os.path.join(LGB_DIR, 'oof_lgb_*.npy'))
latest_lgb_file = max(list_of_lgb_files, key=os.path.getctime)
print(f"Found LightGBM OOF file: {latest_lgb_file}")

list_of_xgb_files = glob.glob(os.path.join(XGB_DIR, 'oof_xgb_*.npy'))
latest_xgb_file = max(list_of_xgb_files, key=os.path.getctime)
print(f"Found XGBoost OOF file: {latest_xgb_file}")

# Load the numpy arrays
oof_lgb = np.load(latest_lgb_file)
oof_xgb = np.load(latest_xgb_file)
print(f"Loaded OOF predictions. Shape: {oof_lgb.shape}")



Loading saved OOF prediction files...
Found LightGBM OOF file: models\lightgbm\oof_lgb_20250925_123541.npy
Found XGBoost OOF file: models\xgboost\oof_xgb_20250925_193826.npy
Loaded OOF predictions. Shape: (14857136, 3)


In [21]:
print('\nFinal Ensemble Calculation')

# Recalculate individual model scores for comparison
overall_f1_macro_lgb = f1_score(y, np.argmax(oof_lgb, axis=1), average='macro')
overall_f1_macro_xgb = f1_score(y, np.argmax(oof_xgb, axis=1), average='macro')

# Create the ensemble by averaging the predictions
oof_ensemble = (oof_lgb + oof_xgb) / 2.0
oof_ens_preds = np.argmax(oof_ensemble, axis=1)
f1_ens_macro = f1_score(y, oof_ens_preds, average='macro')

print('\nFinal Comparison (Macro F1):')
print(f"LightGBM: {overall_f1_macro_lgb:.5f}")
print(f"XGBoost:  {overall_f1_macro_xgb:.5f}")
print(f"Ensemble: {f1_ens_macro:.5f}")



Final Ensemble Calculation

Final Comparison (Macro F1):
LightGBM: 0.61963
XGBoost:  0.60774
Ensemble: 0.62343


In [23]:
improvement_threshold = 0.005
best_single = max(overall_f1_macro_lgb, overall_f1_macro_xgb)
if f1_ens_macro > best_single + improvement_threshold:
    recommendation = "ENSEMBLE"
elif overall_f1_macro_lgb > overall_f1_macro_xgb:
    recommendation = "LightGBM"
else:
    recommendation = "XGBoost"

print(f'RECOMMENDATION: Use {recommendation}')


RECOMMENDATION: Use LightGBM


In [24]:
# Save final results
results_summary = {
    'lightgbm_macro_f1': float(overall_f1_macro_lgb),
    'xgboost_macro_f1': float(overall_f1_macro_xgb), 
    'ensemble_macro_f1': float(f1_ens_macro),
    'recommendation': recommendation,
    'timestamp': TIMESTAMP
}

np.save(ENS_DIR / f'oof_ensemble_{TIMESTAMP}.npy', oof_ensemble)
save_pickle(results_summary, ENS_DIR / f'results_summary_{TIMESTAMP}.pkl')

print_memory_usage("Final")
print('\nTRAINING COMPLETED SUCCESSFULLY!')
print(f'All models saved with timestamp: {TIMESTAMP}')

[Final] Memory: 2346.7 MB (15.0%)

TRAINING COMPLETED SUCCESSFULLY!
All models saved with timestamp: 20250925_193826
