# Fusion Model Training (XGBoost)

Train an XGBoost classifier on fused features, i.e. the concatenation `[tabular features || graph embeddings]`.

**Protocol A (default):** Local features (AF1-93) + embeddings

**Steps:**
1. Load embeddings and features
2. Merge on txId
3. Train XGBoost with early stopping
4. Evaluate and compare with baseline metrics

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import yaml
from pathlib import Path

from src.data.elliptic_loader import EllipticDataset
from src.data.merge_embeddings import merge_embeddings_with_features
from src.train.fusion_xgb import train_xgb_fusion
from src.eval.fusion_report import create_comparison_report
from src.utils.seed import set_all_seeds

In [None]:
# Load the experiment configuration.
# The encoding is passed explicitly so the YAML file is decoded the same way
# on every platform; without it open() falls back to the locale's default
# encoding.
with open('../configs/fusion_xgb.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Apply the configured seed through the project's set_all_seeds helper
# before any later cell runs.
set_all_seeds(config['seed'])
print(f"Config loaded: {config['experiment']}")

In [None]:
# Load dataset
# Both the root directory and the local-features-only switch are read from
# the `data` section of the config.
data_root = Path(config['data']['root'])
dataset = EllipticDataset(data_root, use_local_only=config['data']['use_local_only'])
# Plain string literal: nothing is interpolated, so no f-prefix is needed.
print("Dataset loaded")

In [None]:
# Build the fused table: saved embeddings joined with the tabular features.
# The feature file name and the local-only switch live in the `data` section
# of the config; the embeddings path lives in the `embed` section.
data_cfg = config['data']
fused_df = merge_embeddings_with_features(
    embeddings_path=config['embed']['save_path'],
    features_path=data_root / data_cfg['features'],
    use_local_only=data_cfg['use_local_only'],
)

print(f"Fused dataset shape: {fused_df.shape}")

In [None]:
# Build the train/val/test arrays handed to XGBoost.
# The split masks come from the dataset and are applied to fused_df as-is.
# NOTE(review): this assumes fused_df rows line up with the dataset's node
# order -- confirm against merge_embeddings_with_features.
train_mask, val_mask, test_mask = (
    dataset.splits[split_name] for split_name in ('train', 'val', 'test')
)

# Narrow each split down to its labeled nodes.
train_labeled, val_labeled, test_labeled = (
    dataset.get_labeled_mask(mask) for mask in (train_mask, val_mask, test_mask)
)

# Every column except the transaction id and the time step is a model input.
non_feature_cols = ('txId', 'Time step')
feature_cols = [name for name in fused_df.columns if name not in non_feature_cols]


def _split_arrays(labeled_mask):
    """Return (features, labels) for the rows selected by labeled_mask."""
    features = fused_df.loc[labeled_mask, feature_cols].values
    return features, dataset.labels[labeled_mask]


X_train, y_train = _split_arrays(train_labeled)
X_val, y_val = _split_arrays(val_labeled)
X_test, y_test = _split_arrays(test_labeled)

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

In [None]:
# Fit the fusion model. The training helper receives all three splits, the
# XGBoost section of the config and the configured output directory, and
# returns the model together with a metrics mapping.
xgb_config = config['fusion']['xgb']
train_out_dir = config['logging']['out_dir']
model, metrics = train_xgb_fusion(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    config=xgb_config,
    output_dir=train_out_dir,
)

# Print each test-split metric to four decimal places.
print("\n=== Test Metrics ===")
for metric_name, metric_value in metrics['test'].items():
    print(f"{metric_name}: {metric_value:.4f}")

In [None]:
# Build the fusion-vs-baseline comparison table. The baseline metrics CSV
# path and the output directory are both read from the config.
baseline_csv_path = config['baseline']['metrics_csv']
report_dir = config['logging']['out_dir']
comparison_df = create_comparison_report(
    fusion_metrics=metrics,
    baseline_csv=baseline_csv_path,
    output_dir=report_dir,
)

print("\n=== Model Comparison (Test Set) ===")
# Keep only the test-split rows, then only the summary columns.
is_test_row = comparison_df['split'] == 'test'
summary_cols = ['model', 'pr_auc', 'roc_auc', 'f1']
test_rows = comparison_df[is_test_row]
test_comparison = test_rows[summary_cols]
print(test_comparison.to_string(index=False))