# M5: Tabular ML Baselines - Does Graph Help?

**Goal:** Train ML models WITHOUT graph structure and compare to GNN models.

**Models:**
- Logistic Regression
- Random Forest
- XGBoost
- MLP (Neural Net)

**Compare with:**
- GraphSAGE (best GNN): PR-AUC = 0.4483

**Dataset:** Elliptic++ (the same 182 features used by the GNN models; graph edges are ignored)

**Expected Runtime:** 15-20 minutes on CPU

## Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pathlib import Path

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    average_precision_score,
    roc_auc_score,
    f1_score,
    precision_recall_curve,
    roc_curve
)

import xgboost as xgb
import warnings
# Suppress every warning category for the rest of the session so library
# notices do not clutter the cell output.
# NOTE(review): this presumably also hides convergence warnings from the
# models trained below - confirm that is intended.
warnings.filterwarnings('ignore')

# Set random seed
# SEED is passed as `random_state` to the models further down; seeding
# NumPy's global generator covers anything that draws from `np.random`.
SEED = 42
np.random.seed(SEED)

print("‚úÖ Libraries loaded")

## 1. Load Dataset

In [None]:
# Auto-detect dataset path
# Both CSVs must live in the same directory somewhere under Kaggle's
# input mount.
FEATURES_FILE = 'elliptic_txs_features.csv'
CLASSES_FILE = 'elliptic_txs_classes.csv'

kaggle_input = Path('/kaggle/input')
dataset_folders = list(kaggle_input.glob('*'))

print(f"Available datasets: {[f.name for f in dataset_folders]}")

# Find dataset folder
# Search recursively instead of only looking at direct children: the CSVs
# may sit in a sub-directory of the attached dataset. Sorting makes the
# choice deterministic when several copies exist, and requiring the
# classes file avoids picking a directory that would fail on the second
# read_csv below.
data_dir = None
for features_path in sorted(kaggle_input.rglob(FEATURES_FILE)):
    if (features_path.parent / CLASSES_FILE).exists():
        data_dir = features_path.parent
        break

if data_dir is None:
    raise FileNotFoundError("‚ùå Dataset not found! Add 'elliptic-fraud-detection' in Kaggle data panel.")

print(f"‚úÖ Using: {data_dir.name}")

# Load CSVs
# The features file is read without a header row; column names are
# assigned in the preprocessing step.
features_df = pd.read_csv(data_dir / FEATURES_FILE, header=None)
classes_df = pd.read_csv(data_dir / CLASSES_FILE)

print(f"Features shape: {features_df.shape}")
print(f"Classes shape: {classes_df.shape}")

## 2. Preprocess Data

In [None]:
# Rename columns
# Layout: transaction id, time step, then 182 anonymous features AF1..AF182.
feature_cols = ['txId', 'Time step'] + [f'AF{i}' for i in range(1, 183)]
features_df.columns = feature_cols

# Merge features + classes
# Left join keeps every feature row; rows without a class entry get NaN.
df = features_df.merge(classes_df, on='txId', how='left')

# Filter LABELED transactions only
# Elliptic class codes: '1' = illicit (fraud), '2' = licit; any other value
# is unlabeled and dropped here.
df_labeled = df[df['class'].isin(['1', '2'])].copy()
if df_labeled.empty:
    raise ValueError(
        "No labeled transactions found: expected the 'class' column to "
        "contain the strings '1' and '2'."
    )

# Binary target: 1 = fraud (illicit, class '1'), 0 = licit (class '2').
# The previous mapping marked class '2' as 1, which made the licit majority
# the "fraud" class and inverted every downstream metric.
df_labeled['label'] = (df_labeled['class'] == '1').astype(int)

print(f"Total transactions: {len(df)}")
print(f"Labeled transactions: {len(df_labeled)}")
print(f"Fraud percentage: {df_labeled['label'].mean()*100:.2f}%")
print(f"Time range: {df_labeled['Time step'].min()} to {df_labeled['Time step'].max()}")

## 3. Create Temporal Splits (Same as GNN Models)

In [None]:
# Order rows by time step so that earlier transactions fall into earlier
# splits, then renumber the index from zero.
df_labeled = df_labeled.sort_values('Time step').reset_index(drop=True)

# Row-count based cut points: first 60% train, next 20% validation, the
# remainder test. The cuts are positional, so a single time step can
# straddle a boundary.
n = len(df_labeled)
train_size = int(0.6 * n)
val_size = int(0.2 * n)
val_end = train_size + val_size

train_df, val_df, test_df = (
    df_labeled.iloc[:train_size],
    df_labeled.iloc[train_size:val_end],
    df_labeled.iloc[val_end:],
)

# One summary line per split: size, share of all labeled rows, fraud rate.
print(f"\nüìä Temporal Splits:")
for tag, part in (("Train:", train_df), ("Val:  ", val_df), ("Test: ", test_df)):
    print(f"{tag} {len(part):>6} ({len(part)/n*100:.1f}%) | Fraud: {part['label'].mean()*100:.2f}%")

## 4. Prepare Features

In [None]:
# Model inputs are the 182 anonymous feature columns AF1..AF182; the
# transaction id and time step are deliberately left out.
feature_names = [f'AF{idx}' for idx in range(1, 183)]

# Pull plain NumPy arrays (features, labels) out of each split.
X_train, y_train = train_df[feature_names].values, train_df['label'].values
X_val, y_val = val_df[feature_names].values, val_df['label'].values
X_test, y_test = test_df[feature_names].values, test_df['label'].values

# Learn mean/variance on the training split only, then apply the same
# transform to all three splits so no validation/test statistics leak in.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(block) for block in (X_train, X_val, X_test)
)

print(f"‚úÖ Features prepared")
for x_tag, y_tag, feats, labels in (
    ("X_train:", "y_train:", X_train, y_train),
    ("X_val:  ", "y_val:  ", X_val, y_val),
    ("X_test: ", "y_test: ", X_test, y_test),
):
    print(f"{x_tag} {feats.shape}, {y_tag} {labels.shape}")

## 5. Helper Functions

In [None]:
def evaluate_model(y_true, y_proba):
    """Compute ranking and threshold metrics for binary classification scores.

    Args:
        y_true: 1-D array-like of 0/1 ground-truth labels.
        y_proba: 1-D array-like of positive-class scores aligned with
            ``y_true``.

    Returns:
        dict with the keys ``'pr_auc'`` (average precision), ``'roc_auc'``,
        ``'f1_score'`` (scores thresholded at 0.5) and ``'recall_at_1pct'``
        (share of all positives that appear among the top 1% highest-scored
        samples).
    """
    # Accept lists / pandas Series as well as arrays: the fancy indexing
    # below needs positional NumPy semantics.
    y_true = np.asarray(y_true)
    y_proba = np.asarray(y_proba)

    pr_auc = average_precision_score(y_true, y_proba)
    roc_auc = roc_auc_score(y_true, y_proba)

    # F1 with threshold 0.5
    y_pred = (y_proba >= 0.5).astype(int)
    f1 = f1_score(y_true, y_pred)

    # Recall@1%
    # Keep at least one sample: with fewer than 100 rows int(n * 0.01) is 0
    # and a slice of [-0:] would select the whole array.
    k = max(1, int(len(y_proba) * 0.01))
    top_k_idx = np.argsort(y_proba)[-k:]
    # Recall divides the positives found in the top k by ALL positives
    # (dividing by k instead would give precision@k).
    n_positive = y_true.sum()
    if n_positive > 0:
        recall_at_1 = float(y_true[top_k_idx].sum() / n_positive)
    else:
        recall_at_1 = 0.0

    return {
        'pr_auc': pr_auc,
        'roc_auc': roc_auc,
        'f1_score': f1,
        'recall_at_1pct': recall_at_1
    }

def print_results(model_name, metrics):
    """Print a boxed, fixed-width summary of one model's metrics.

    Args:
        model_name: Heading shown at the top of the box.
        metrics: Mapping with the keys 'pr_auc', 'roc_auc', 'f1_score' and
            'recall_at_1pct'; each value is rendered with four decimals.
    """
    rule = '=' * 50
    # (caption, metrics key) pairs in display order; captions are padded to
    # 14 characters so the values line up in one column.
    rows = (
        ('PR-AUC:', 'pr_auc'),
        ('ROC-AUC:', 'roc_auc'),
        ('F1 Score:', 'f1_score'),
        ('Recall@1%:', 'recall_at_1pct'),
    )

    print(f"\n{rule}")
    print(f"  {model_name}")
    print(rule)
    for caption, key in rows:
        print(f"  {caption:<14}{metrics[key]:.4f}")
    print(rule)

# Storage for all results
# Maps a model's display name to the metrics dict returned by
# evaluate_model; each model cell below adds one entry.
all_results = {}

print("‚úÖ Helper functions ready")

## 6. Model 1: Logistic Regression

In [None]:
print("üîµ Training Logistic Regression...")

# Calculate class weight
fraud_ratio = y_train.mean()
class_weight = {0: 1.0, 1: (1 - fraud_ratio) / fraud_ratio}

lr = LogisticRegression(
    max_iter=1000,
    class_weight=class_weight,
    random_state=SEED,
    n_jobs=-1
)

lr.fit(X_train, y_train)
y_proba = lr.predict_proba(X_test)[:, 1]

lr_metrics = evaluate_model(y_test, y_proba)
all_results['Logistic Regression'] = lr_metrics
print_results('Logistic Regression', lr_metrics)

## 7. Model 2: Random Forest

In [None]:
print("üå≤ Training Random Forest...")

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_split=10,
    min_samples_leaf=4,
    class_weight='balanced',
    random_state=SEED,
    n_jobs=-1
)

rf.fit(X_train, y_train)
y_proba = rf.predict_proba(X_test)[:, 1]

rf_metrics = evaluate_model(y_test, y_proba)
all_results['Random Forest'] = rf_metrics
print_results('Random Forest', rf_metrics)

## 8. Model 3: XGBoost (Expected Best)

In [None]:
print("‚ö° Training XGBoost...")

# Calculate scale_pos_weight
# Derive the negative/positive ratio from y_train right here instead of
# reusing `fraud_ratio` from the Logistic Regression cell, so this cell no
# longer fails with a NameError when it is run on its own.
pos_rate = y_train.mean()
scale_pos_weight = (1 - pos_rate) / pos_rate

xgb_model = xgb.XGBClassifier(
    n_estimators=300,
    max_depth=10,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    random_state=SEED,
    tree_method='hist',
    eval_metric='aucpr',
    n_jobs=-1
)

# The validation split is only monitored (PR-AUC per boosting round); no
# early stopping is configured, so all 300 trees are always fitted.
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)

# Column 1 of predict_proba holds the score for label 1.
y_proba = xgb_model.predict_proba(X_test)[:, 1]

xgb_metrics = evaluate_model(y_test, y_proba)
all_results['XGBoost'] = xgb_metrics
print_results('XGBoost', xgb_metrics)

## 9. Model 4: MLP (Neural Network)

In [None]:
print("üß† Training MLP...")

mlp = MLPClassifier(
    hidden_layer_sizes=(256, 128, 64),
    activation='relu',
    solver='adam',
    learning_rate='adaptive',
    max_iter=100,
    early_stopping=True,
    validation_fraction=0.1,
    random_state=SEED
)

mlp.fit(X_train, y_train)
y_proba = mlp.predict_proba(X_test)[:, 1]

mlp_metrics = evaluate_model(y_test, y_proba)
all_results['MLP'] = mlp_metrics
print_results('MLP', mlp_metrics)

## 10. Compare All Models (Including GNNs)

In [None]:
# Add GNN results for comparison
# These numbers are entered by hand, not computed in this notebook.
# NOTE(review): presumably taken from the earlier GNN runs on the same test
# split - confirm before drawing conclusions from the comparison.
gnn_results = {
    'GCN': {'pr_auc': 0.1976, 'roc_auc': 0.7627, 'f1_score': 0.2487, 'recall_at_1pct': 0.0613},
    'GAT': {'pr_auc': 0.1839, 'roc_auc': 0.7942, 'f1_score': 0.2901, 'recall_at_1pct': 0.0126},
    'GraphSAGE': {'pr_auc': 0.4483, 'roc_auc': 0.8210, 'f1_score': 0.4527, 'recall_at_1pct': 0.1478}
}

all_results.update(gnn_results)

# Create comparison DataFrame (one row per model, one column per metric)
comparison_df = pd.DataFrame(all_results).T
# Tag each row by membership in gnn_results rather than by position, so the
# labels stay correct (and the assignment does not fail on a length
# mismatch) when an ML cell was skipped or the cells ran in another order.
comparison_df['type'] = [
    'GNN' if model in gnn_results else 'ML' for model in comparison_df.index
]
comparison_df = comparison_df.sort_values('pr_auc', ascending=False)

print("\n" + "="*80)
print("  FINAL RESULTS - ALL MODELS")
print("="*80)
print(comparison_df.to_string())
print("="*80)

# Find winners
# Rows are already sorted by PR-AUC (descending), so the first row of each
# group is that group's best model.
ml_rows = comparison_df[comparison_df['type'] == 'ML']
if ml_rows.empty:
    raise RuntimeError(
        "No tabular ML results found - run at least one model cell before comparing."
    )
best_ml = ml_rows.iloc[0]
best_gnn = comparison_df[comparison_df['type'] == 'GNN'].iloc[0]

print(f"\nü•á Best ML Model:  {best_ml.name} (PR-AUC: {best_ml['pr_auc']:.4f})")
print(f"ü•à Best GNN Model: {best_gnn.name} (PR-AUC: {best_gnn['pr_auc']:.4f})")

# Relative PR-AUC difference of the best ML model versus the best GNN, in
# percent of the GNN score.
gap = (best_ml['pr_auc'] - best_gnn['pr_auc']) / best_gnn['pr_auc'] * 100
print(f"\nüìä Gap: ML models are {gap:+.1f}% {'better' if gap > 0 else 'worse'} than GNN")

## 11. Visualization

In [None]:
# One horizontal bar chart per metric, all models on every chart.
metrics = ['pr_auc', 'roc_auc', 'f1_score', 'recall_at_1pct']
titles = ['PR-AUC', 'ROC-AUC', 'F1 Score', 'Recall@1%']

fig, axes = plt.subplots(1, 4, figsize=(20, 5))

# The plotted table and bar colours are the same for every panel, so build
# them once. reset_index() exposes the model names as the 'index' column.
data = comparison_df.reset_index()
colors = [{'ML': 'blue'}.get(kind, 'green') for kind in data['type']]

for ax, (metric, title) in zip(axes, zip(metrics, titles)):
    ax.barh(data['index'], data[metric], color=colors, alpha=0.7)
    ax.set_xlabel(title, fontsize=12)
    ax.set_title(f'{title} Comparison', fontsize=14, fontweight='bold')
    ax.grid(axis='x', alpha=0.3)

    # Write each value just to the right of its bar.
    for row, value in enumerate(data[metric]):
        ax.text(value + 0.01, row, f'{value:.3f}', va='center', fontsize=9)

plt.tight_layout()
plt.savefig('all_models_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

print("‚úÖ Visualization saved: all_models_comparison.png")

## 12. Save Results

In [None]:
# Write one JSON file per tabular model that actually produced results;
# the hand-entered GNN rows are only part of the comparison CSV.
for model_name in ('Logistic Regression', 'Random Forest', 'XGBoost', 'MLP'):
    if model_name not in all_results:
        continue
    # "Random Forest" -> "random_forest_metrics.json"
    filename = '_'.join(model_name.lower().split(' ')) + '_metrics.json'
    with open(filename, 'w') as handle:
        json.dump(all_results[model_name], handle, indent=2)
    print(f"‚úÖ Saved: {filename}")

# Full comparison table (ML + GNN rows), model names as the index column.
comparison_df.to_csv('all_models_comparison.csv')
print(f"‚úÖ Saved: all_models_comparison.csv")

# Fixed checklist of the artefacts this notebook is expected to produce.
banner = "=" * 80
downloads = (
    "logistic_regression_metrics.json",
    "random_forest_metrics.json",
    "xgboost_metrics.json",
    "mlp_metrics.json",
    "all_models_comparison.csv",
    "all_models_comparison.png",
)
print("\n" + banner)
print("  üì¶ FILES TO DOWNLOAD:")
print(banner)
for position, artefact in enumerate(downloads, start=1):
    print(f"  {position}. {artefact}")
print(banner)

## 13. Analysis & Conclusion

In [None]:
# Final verdict: compares the best tabular model with the best GNN (both
# chosen in the comparison cell) and prints a canned interpretation.
print("\n" + "="*80)
print("  üìä KEY FINDINGS")
print("="*80)

# Share of positive labels among the labeled transactions, taken from the
# data instead of being hard-coded in the explanation text.
fraud_pct = df_labeled['label'].mean() * 100

if best_ml['pr_auc'] > best_gnn['pr_auc']:
    print("\n‚úÖ RESULT: Tabular ML models OUTPERFORM GNNs!")
    # A stray "$" used to be printed in front of the model name here.
    print(f"\nBest ML ({best_ml.name}):   PR-AUC = {best_ml['pr_auc']:.4f}")
    # Use the actual winner's name rather than assuming it is GraphSAGE.
    print(f"Best GNN ({best_gnn.name}): PR-AUC = {best_gnn['pr_auc']:.4f}")
    print(f"\nGap: {abs(gap):.1f}% better")
    
    print("\nüîç WHY?")
    print(f"  1. Labeled data is heavily imbalanced ({fraud_pct:.1f}% fraud labels)")
    print("  2. Features AF94-182 already contain neighbor aggregations")
    print("  3. GNNs propagate wrong labels from fraud-heavy neighborhoods")
    print("  4. Tabular models handle imbalance better with class weights")
    
    print("\nüí° RECOMMENDATION:")
    print(f"  ‚Üí Deploy {best_ml.name} for production")
    print("  ‚Üí Simpler, faster, no GPU needed")
    print("  ‚Üí Better performance than complex GNNs")
    
else:
    print("\n‚úÖ RESULT: GNNs OUTPERFORM Tabular ML!")
    print(f"\nBest GNN ({best_gnn.name}): PR-AUC = {best_gnn['pr_auc']:.4f}")
    print(f"Best ML ({best_ml.name}):  PR-AUC = {best_ml['pr_auc']:.4f}")
    print(f"\nGap: {abs(gap):.1f}% better")
    
    print("\nüîç WHY?")
    print("  ‚Üí Graph structure captures fraud patterns")
    print("  ‚Üí Message passing leverages network effects")
    print("  ‚Üí GNNs learn optimal neighbor aggregations")
    
    print("\nüí° RECOMMENDATION:")
    print(f"  ‚Üí Deploy {best_gnn.name} for best performance")
    print("  ‚Üí Graph structure adds significant value")

print("\n" + "="*80)
print("  ‚úÖ M5 COMPLETE")
print("="*80)