# M5: Tabular Baselines (No Graph Structure)

**Question:** Does the graph help fraud detection, or are features alone sufficient?

**Approach:**
- Train ML models on node features ONLY (ignore edges)
- Use SAME temporal splits as GNN models
- Compare: Best ML vs GraphSAGE (0.4483 PR-AUC)

**Models:**
1. Logistic Regression (linear baseline)
2. Random Forest (tree ensemble)
3. XGBoost (gradient boosting - expected best)
4. MLP (neural net, no graph)

**Expected:** XGBoost PR-AUC ≈ 0.25-0.35 (worse than GraphSAGE 0.45)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from pathlib import Path

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    average_precision_score,
    roc_auc_score,
    f1_score,
    precision_recall_curve,
    roc_curve
)
from sklearn.utils.class_weight import compute_class_weight

import xgboost as xgb

print("‚úÖ Libraries imported")

## 1. Load Data & Temporal Splits

In [None]:
# Load per-transaction features and class labels from the Elliptic data directory
data_dir = Path('../data/elliptic')

features = pd.read_csv(data_dir / 'txs_features.csv')
classes = pd.read_csv(data_dir / 'txs_classes.csv')

print(f"Features shape: {features.shape}")
print(f"Classes shape: {classes.shape}")

# Left join on txId: every feature row is kept, even one without a class entry
df = features.merge(classes, on='txId', how='left')

# Rename timestamp column
df = df.rename(columns={'Time step': 'timestamp'})

print(f"\nMerged shape: {df.shape}")
print(f"Columns: {list(df.columns[:5])}...")

# Keep only labeled transactions (class 1 or 2); any other class value is dropped.
# Dataset convention: class 1 = illicit (fraud), class 2 = licit. The positive
# label must therefore be class 1 -- mapping class 2 to "fraud" would make the
# positive class the ~90% majority and contradict the ~8-10% expectation below.
df_labeled = df[df['class'].isin([1, 2])].copy()
df_labeled['label'] = (df_labeled['class'] == 1).astype(int)  # 1=fraud, 2=legit

print(f"\nLabeled transactions: {len(df_labeled)}")
print(f"Fraud percentage: {df_labeled['label'].mean()*100:.2f}%")  # Should be ~8-10%
print(f"Timestamp range: {df_labeled['timestamp'].min()} to {df_labeled['timestamp'].max()}")

In [None]:
# Temporal partition of the labeled data (SAME AS GNN MODELS):
#   train -> timestamp <= 29
#   val   -> 29 < timestamp <= 39
#   test  -> timestamp > 39
ts = df_labeled['timestamp']

train_df = df_labeled[ts <= 29]
val_df = df_labeled[(ts > 29) & (ts <= 39)]
test_df = df_labeled[ts > 39]

print("Temporal Splits:")
# Tags are pre-padded so the three summary lines stay column-aligned
for tag, part, span in (
    ("Train:", train_df, "1-29"),
    ("Val:  ", val_df, "30-39"),
    ("Test: ", test_df, "40-49"),
):
    print(f"{tag} {len(part):5d} samples | Fraud: {part['label'].mean()*100:5.2f}% | Time: {span}")

# Sanity check: no row index may appear in more than one split
assert set(train_df.index).isdisjoint(val_df.index), "Train-Val overlap!"
assert set(train_df.index).isdisjoint(test_df.index), "Train-Test overlap!"
assert set(val_df.index).isdisjoint(test_df.index), "Val-Test overlap!"
print("\n‚úÖ No data leakage - splits are clean")

In [None]:
# Model inputs: every column whose name starts with one of the two feature prefixes
feature_cols = [
    col for col in df_labeled.columns
    if col.startswith(('Local_feature', 'Aggregate_feature'))
]

# Feature matrix / label vector per split, as plain NumPy arrays
X_train, y_train = train_df[feature_cols].values, train_df['label'].values
X_val, y_val = val_df[feature_cols].values, val_df['label'].values
X_test, y_test = test_df[feature_cols].values, test_df['label'].values

print("Feature matrix shapes:")
# Names are pre-padded so the shapes line up
for matrix_name, matrix in (("X_train:", X_train), ("X_val:  ", X_val), ("X_test: ", X_test)):
    print(f"{matrix_name} {matrix.shape}")
print(f"\nNumber of features: {len(feature_cols)}")

# 'balanced' class weights computed from the training labels only
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train,
)
class_weight_dict = {cls: class_weights[cls] for cls in (0, 1)}
print(f"\nClass weights: {class_weight_dict}")

## 2. Train Models

### Model 1: Logistic Regression

In [None]:
print("Training Logistic Regression...")

# Linear baseline on node features only, with 'balanced' class weighting
lr_params = dict(max_iter=1000, class_weight='balanced', random_state=42, n_jobs=-1)
lr_model = LogisticRegression(**lr_params).fit(X_train, y_train)

# Positive-class probabilities on the test split
lr_probs = lr_model.predict_proba(X_test)[:, 1]

# Threshold-free ranking metrics
lr_pr_auc = average_precision_score(y_test, lr_probs)
lr_roc_auc = roc_auc_score(y_test, lr_probs)

# F1: pick the threshold that maximises F1 on validation, then apply it to test
val_probs = lr_model.predict_proba(X_val)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_val, val_probs)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)  # epsilon avoids 0/0
best_idx = f1_scores.argmax()
# Clamp the index so it always addresses a valid entry of `thresholds`
best_threshold = thresholds[min(best_idx, len(thresholds) - 1)]
lr_preds = np.asarray(lr_probs >= best_threshold).astype(int)
lr_f1 = f1_score(y_test, lr_preds)

# Recall@1%: share of all test positives found among the top-1% highest scores
k = max(1, int(0.01 * len(y_test)))
top_k_indices = np.argsort(lr_probs)[-k:]
top_k_hits = y_test[top_k_indices].sum()
lr_recall_1pct = top_k_hits / y_test.sum()

print(f"\n‚úÖ Logistic Regression Results:")
for metric_name, metric_value in (
    ("PR-AUC:", lr_pr_auc),
    ("ROC-AUC:", lr_roc_auc),
    ("F1 Score:", lr_f1),
    ("Recall@1%:", lr_recall_1pct),
):
    # Left-pad the name to 13 characters so the values align in one column
    print(f"   {metric_name:<13}{metric_value:.4f}")

### Model 2: Random Forest

In [None]:
print("Training Random Forest...")

# Tree-ensemble baseline on node features only, with 'balanced' class weighting
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train, y_train)

# Positive-class probabilities on the test split
rf_probs = rf_model.predict_proba(X_test)[:, 1]

# Threshold-free ranking metrics
rf_pr_auc = average_precision_score(y_test, rf_probs)
rf_roc_auc = roc_auc_score(y_test, rf_probs)

# F1: pick the threshold that maximises F1 on validation, then apply it to test
val_probs = rf_model.predict_proba(X_val)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_val, val_probs)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)  # epsilon avoids 0/0
best_idx = np.argmax(f1_scores)
# Clamp the index so it always addresses a valid entry of `thresholds`
best_threshold = thresholds[min(best_idx, len(thresholds)-1)]
rf_preds = (rf_probs >= best_threshold).astype(int)
rf_f1 = f1_score(y_test, rf_preds)

# Recall@1%: share of all test positives found among the top-1% highest scores.
# k is computed here rather than reused from another cell, so this cell gives
# the right answer when run on its own or after y_test changes.
k = max(1, int(0.01 * len(y_test)))
top_k_indices = np.argsort(rf_probs)[-k:]
rf_recall_1pct = y_test[top_k_indices].sum() / y_test.sum()

print(f"\n‚úÖ Random Forest Results:")
print(f"   PR-AUC:      {rf_pr_auc:.4f}")
print(f"   ROC-AUC:     {rf_roc_auc:.4f}")
print(f"   F1 Score:    {rf_f1:.4f}")
print(f"   Recall@1%:   {rf_recall_1pct:.4f}")

### Model 3: XGBoost (Expected Best)

In [None]:
print("Training XGBoost...")

# Imbalance handling: weight positives by the negative-to-positive ratio in train
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1,
    eval_metric='aucpr'
)

# The validation split is passed as eval_set for metric tracking only;
# no early-stopping option is configured here.
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)

# Positive-class probabilities on the test split
xgb_probs = xgb_model.predict_proba(X_test)[:, 1]

# Threshold-free ranking metrics
xgb_pr_auc = average_precision_score(y_test, xgb_probs)
xgb_roc_auc = roc_auc_score(y_test, xgb_probs)

# F1: pick the threshold that maximises F1 on validation, then apply it to test
val_probs = xgb_model.predict_proba(X_val)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_val, val_probs)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)  # epsilon avoids 0/0
best_idx = np.argmax(f1_scores)
# Clamp the index so it always addresses a valid entry of `thresholds`
best_threshold = thresholds[min(best_idx, len(thresholds)-1)]
xgb_preds = (xgb_probs >= best_threshold).astype(int)
xgb_f1 = f1_score(y_test, xgb_preds)

# Recall@1%: share of all test positives found among the top-1% highest scores.
# k is computed here rather than reused from another cell, so this cell gives
# the right answer when run on its own or after y_test changes.
k = max(1, int(0.01 * len(y_test)))
top_k_indices = np.argsort(xgb_probs)[-k:]
xgb_recall_1pct = y_test[top_k_indices].sum() / y_test.sum()

print(f"\n‚úÖ XGBoost Results:")
print(f"   PR-AUC:      {xgb_pr_auc:.4f}")
print(f"   ROC-AUC:     {xgb_roc_auc:.4f}")
print(f"   F1 Score:    {xgb_f1:.4f}")
print(f"   Recall@1%:   {xgb_recall_1pct:.4f}")

### Model 4: MLP (Neural Network, No Graph)

In [None]:
print("Training MLP...")

# Feed-forward network on node features only (no edges). No class weighting is
# passed to this model, unlike the other tabular baselines in this notebook.
mlp_model = MLPClassifier(
    hidden_layer_sizes=(256, 128, 64),
    max_iter=100,
    random_state=42,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=10
)

mlp_model.fit(X_train, y_train)

# Positive-class probabilities on the test split
mlp_probs = mlp_model.predict_proba(X_test)[:, 1]

# Threshold-free ranking metrics
mlp_pr_auc = average_precision_score(y_test, mlp_probs)
mlp_roc_auc = roc_auc_score(y_test, mlp_probs)

# F1: pick the threshold that maximises F1 on validation, then apply it to test
val_probs = mlp_model.predict_proba(X_val)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_val, val_probs)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)  # epsilon avoids 0/0
best_idx = np.argmax(f1_scores)
# Clamp the index so it always addresses a valid entry of `thresholds`
best_threshold = thresholds[min(best_idx, len(thresholds)-1)]
mlp_preds = (mlp_probs >= best_threshold).astype(int)
mlp_f1 = f1_score(y_test, mlp_preds)

# Recall@1%: share of all test positives found among the top-1% highest scores.
# k is computed here rather than reused from another cell, so this cell gives
# the right answer when run on its own or after y_test changes.
k = max(1, int(0.01 * len(y_test)))
top_k_indices = np.argsort(mlp_probs)[-k:]
mlp_recall_1pct = y_test[top_k_indices].sum() / y_test.sum()

print(f"\n‚úÖ MLP Results:")
print(f"   PR-AUC:      {mlp_pr_auc:.4f}")
print(f"   ROC-AUC:     {mlp_roc_auc:.4f}")
print(f"   F1 Score:    {mlp_f1:.4f}")
print(f"   Recall@1%:   {mlp_recall_1pct:.4f}")

## 3. Compare All Models (Tabular + GNN)

In [None]:
# GNN results (from M3/M4)
gnn_results = {
    'GCN': {'pr_auc': 0.1976, 'roc_auc': 0.7627, 'f1': 0.2487, 'recall_1pct': 0.0613},
    'GraphSAGE': {'pr_auc': 0.4483, 'roc_auc': 0.8210, 'f1': 0.4527, 'recall_1pct': 0.1478},
    'GAT': {'pr_auc': 0.1839, 'roc_auc': 0.7942, 'f1': 0.2901, 'recall_1pct': 0.0126}
}

# Tabular results measured in this notebook, keyed like gnn_results so both
# families can be turned into table rows by the same code.
tabular_results = {
    'Logistic Regression': {'pr_auc': lr_pr_auc, 'roc_auc': lr_roc_auc,
                            'f1': lr_f1, 'recall_1pct': lr_recall_1pct},
    'Random Forest': {'pr_auc': rf_pr_auc, 'roc_auc': rf_roc_auc,
                      'f1': rf_f1, 'recall_1pct': rf_recall_1pct},
    'XGBoost': {'pr_auc': xgb_pr_auc, 'roc_auc': xgb_roc_auc,
                'f1': xgb_f1, 'recall_1pct': xgb_recall_1pct},
    'MLP': {'pr_auc': mlp_pr_auc, 'roc_auc': mlp_roc_auc,
            'f1': mlp_f1, 'recall_1pct': mlp_recall_1pct},
}

# (table column header, metrics-dict key), in output column order
metric_columns = [
    ('PR-AUC', 'pr_auc'),
    ('ROC-AUC', 'roc_auc'),
    ('F1 Score', 'f1'),
    ('Recall@1%', 'recall_1pct'),
]

# One (model name, model family, metrics) entry per row: tabular first, then GNN
labeled_models = [(name, 'Tabular', scores) for name, scores in tabular_results.items()]
labeled_models += [(name, 'GNN', scores) for name, scores in gnn_results.items()]

# Create comparison dataframe (column-oriented dict, one list per column)
results = {
    'Model': [name for name, _, _ in labeled_models],
    'Type': [family for _, family, _ in labeled_models],
}
for column, key in metric_columns:
    results[column] = [scores[key] for _, _, scores in labeled_models]

df_results = pd.DataFrame(results)
df_results = df_results.sort_values('PR-AUC', ascending=False)

print("\n" + "="*80)
print("FINAL RESULTS: ALL MODELS")
print("="*80)
print(df_results.to_string(index=False))
print("="*80)

# Save; create the output directory first so a fresh checkout does not fail here
reports_dir = Path('../reports')
reports_dir.mkdir(parents=True, exist_ok=True)
df_results.to_csv(reports_dir / 'all_models_comparison.csv', index=False)
print("\n‚úÖ Saved: reports/all_models_comparison.csv")

## 4. Visualization

In [None]:
from matplotlib.patches import Patch

# 2x2 grid of horizontal bar charts, one panel per metric, bars colored by model family
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

colors = ['blue' if t == 'Tabular' else 'green' for t in df_results['Type']]

# (df_results column, panel title) in row-major panel order
panels = [
    ('PR-AUC', 'PR-AUC Comparison (Higher = Better)'),
    ('ROC-AUC', 'ROC-AUC Comparison'),
    ('F1 Score', 'F1 Score Comparison'),
    ('Recall@1%', 'Recall@1% (Fraud Detection Rate)'),
]

for ax, (metric, title) in zip(axes.flat, panels):
    ax.barh(df_results['Model'], df_results[metric], color=colors, alpha=0.7)
    ax.set_xlabel(metric, fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.grid(axis='x', alpha=0.3)

# GraphSAGE reference line on the PR-AUC panel. The value is read from the
# results table instead of being hard-coded, so it cannot drift from the data.
graphsage_pr_auc = df_results.loc[df_results['Model'] == 'GraphSAGE', 'PR-AUC'].iloc[0]
axes[0, 0].axvline(
    x=graphsage_pr_auc,
    color='red',
    linestyle='--',
    label=f'GraphSAGE ({graphsage_pr_auc:.4f})',
)
axes[0, 0].legend()

# Figure-level legend explaining the bar colors
legend_elements = [
    Patch(facecolor='blue', alpha=0.7, label='Tabular (No Graph)'),
    Patch(facecolor='green', alpha=0.7, label='GNN (With Graph)')
]
fig.legend(handles=legend_elements, loc='upper center', ncol=2, fontsize=12)

# Leave the top 4% of the figure free for the figure-level legend
plt.tight_layout(rect=[0, 0, 1, 0.96])

# Create the output directory first so savefig does not fail on a fresh checkout
plots_dir = Path('../reports/plots')
plots_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(plots_dir / 'all_models_comparison.png', dpi=150, bbox_inches='tight')
print("\n‚úÖ Saved: reports/plots/all_models_comparison.png")
plt.show()

## 5. Analysis & Conclusion

In [None]:
# Find the best model of each family by PR-AUC. idxmax is used so the result
# does not depend on df_results having been sorted beforehand.
tabular_rows = df_results[df_results['Type'] == 'Tabular']
gnn_rows = df_results[df_results['Type'] == 'GNN']
best_tabular = tabular_rows.loc[tabular_rows['PR-AUC'].idxmax()]
best_gnn = gnn_rows.loc[gnn_rows['PR-AUC'].idxmax()]

tabular_name = best_tabular['Model']
gnn_name = best_gnn['Model']

print("\n" + "="*80)
print("ANALYSIS")
print("="*80)

print(f"\nüìä Best Tabular Model: {tabular_name}")
print(f"   PR-AUC: {best_tabular['PR-AUC']:.4f}")

print(f"\nüß† Best GNN Model: {gnn_name}")
print(f"   PR-AUC: {best_gnn['PR-AUC']:.4f}")

# Relative PR-AUC gain of the best GNN over the best tabular model, in percent
gap = ((best_gnn['PR-AUC'] - best_tabular['PR-AUC']) / best_tabular['PR-AUC']) * 100

print(f"\nüéØ Graph Structure Impact:")
print(f"   PR-AUC Improvement: {gap:+.1f}%")

# The recommendation names whichever models actually scored best, rather than
# assuming XGBoost and GraphSAGE are the winners of their families.
if gap > 20:
    conclusion = "üèÜ Graph is ESSENTIAL! GNNs win decisively."
    recommendation = f"Deploy {gnn_name} for best fraud detection."
elif gap > 5:
    conclusion = "‚úÖ Graph helps moderately. GNNs worth the complexity."
    recommendation = f"{gnn_name} recommended, but {tabular_name} viable for simpler deployment."
elif gap > -5:
    conclusion = "‚öñÔ∏è  Roughly equal. Choose based on operational requirements."
    recommendation = f"{tabular_name} for simplicity, {gnn_name} for marginal gain."
else:
    conclusion = "‚ö†Ô∏è  Tabular wins! Graph adds noise."
    recommendation = f"Deploy {tabular_name}. GNNs not justified."

print(f"\n{conclusion}")
print(f"\nüí° Recommendation: {recommendation}")
print("\n" + "="*80)

## 6. Save Individual Model Metrics

In [None]:
# Save one JSON metrics file per tabular model.
# Create the output directory first so open() does not fail on a fresh checkout.
reports_dir = Path('../reports')
reports_dir.mkdir(parents=True, exist_ok=True)

# (file-name stem, PR-AUC, ROC-AUC, F1, Recall@1%) per model
models_data = [
    ('logistic_regression', lr_pr_auc, lr_roc_auc, lr_f1, lr_recall_1pct),
    ('random_forest', rf_pr_auc, rf_roc_auc, rf_f1, rf_recall_1pct),
    ('xgboost', xgb_pr_auc, xgb_roc_auc, xgb_f1, xgb_recall_1pct),
    ('mlp', mlp_pr_auc, mlp_roc_auc, mlp_f1, mlp_recall_1pct)
]

for model_name, pr_auc, roc_auc, f1, recall_1pct in models_data:
    # float() converts each metric to a built-in float before JSON serialization
    metrics = {
        'test_pr_auc': float(pr_auc),
        'test_roc_auc': float(roc_auc),
        'test_f1': float(f1),
        'test_recall_1pct': float(recall_1pct)
    }

    with open(reports_dir / f'{model_name}_metrics.json', 'w', encoding='utf-8') as f:
        json.dump(metrics, f, indent=2)
    print(f"‚úÖ Saved: reports/{model_name}_metrics.json")

print("\n‚úÖ M5 COMPLETE! All artifacts saved.")