# 05 - Tabular Baselines (Kaggle)

**Goal:** Train traditional ML models on node features ONLY (no graph structure).

**Question:** Does the graph actually help fraud detection?

**Models:**
1. Logistic Regression - Linear baseline
2. Random Forest - Tree ensemble
3. XGBoost - Gradient boosting (expected best)
4. MLP - Neural network without graph

**Setup:**
- Same temporal splits as GNN models
- Same evaluation metrics (PR-AUC, ROC-AUC, F1, Recall@1%)
- Compare with GraphSAGE (PR-AUC: 0.4483)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    average_precision_score,
    roc_auc_score,
    precision_recall_curve,
    roc_curve,
    f1_score,
    classification_report
)
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

## 1. Load Data & Features

In [None]:
# Locate and load the Elliptic transaction tables from the Kaggle input mount.
# Files expected inside whichever dataset folder is attached:
#   - elliptic_txs_features.csv
#   - elliptic_txs_classes.csv

import os
kaggle_input = Path('/kaggle/input')

# List everything directly under /kaggle/input so the user can see what is attached
dataset_folders = list(kaggle_input.glob('*'))
print(f"Available datasets: {[f.name for f in dataset_folders]}")

# Use the first listed folder that contains the features file (None if there is none)
data_dir = next(
    (
        candidate
        for candidate in dataset_folders
        if (candidate / 'elliptic_txs_features.csv').exists()
    ),
    None,
)

if data_dir is None:
    raise FileNotFoundError(
        "Dataset not found! Please add 'elliptic-fraud-detection' dataset in Kaggle.\n"
        "Go to: Add Data → Search 'elliptic' → Add dataset"
    )

print(f"Using dataset: {data_dir.name}")
print(f"Files: {[f.name for f in data_dir.glob('*.csv')]}")

# Read the two CSV tables
print("\nLoading Elliptic++ dataset...")
features_path = data_dir / 'elliptic_txs_features.csv'
classes_path = data_dir / 'elliptic_txs_classes.csv'
features_df = pd.read_csv(features_path)
classes_df = pd.read_csv(classes_path)

print(f"Features shape: {features_df.shape}")
print(f"Classes shape: {classes_df.shape}")

# Left join on txId: every feature row is kept, and rows without a matching
# class entry get NaN in the 'class' column.
df = pd.merge(features_df, classes_df, on='txId', how='left')
print(f"Merged shape: {df.shape}")
print(f"\nSample columns: {list(df.columns[:10])}...")
print(f"Unique class values: {sorted(df['class'].dropna().unique())}")

## 2. Prepare Features & Labels

In [None]:
# Resolve which column carries the time step: prefer 'Time step', fall back
# to an already-present 'timestamp' column, otherwise fail loudly.
if 'Time step' in df.columns:
    time_col = 'Time step'
else:
    time_col = 'timestamp'
    if time_col not in df.columns:
        raise ValueError(f"No timestamp column found! Available: {df.columns.tolist()}")

# Expose the time step under one canonical name for the rest of the notebook
df['timestamp'] = df[time_col]

# Normalise class values to stripped strings, then keep only rows whose class
# is '1' or '2' (everything else is treated as unlabeled and dropped).
df['class'] = df['class'].astype(str).str.strip()
labeled_mask = df['class'].isin(['1', '2'])
df_labeled = df.loc[labeled_mask].copy()

if df_labeled.empty:
    raise ValueError(f"No labeled data! Available classes: {df['class'].value_counts().to_dict()}")

# The less frequent of the two labeled classes is taken to be fraud
class_counts = df_labeled['class'].value_counts().to_dict()
fraud_class = min(class_counts, key=lambda cls: class_counts[cls])
if fraud_class == '2':
    legit_class = '1'
else:
    legit_class = '2'

# Binary target: 1 = fraud (minority class), 0 = legit
df_labeled['label'] = df_labeled['class'].eq(fraud_class).astype(int)

fraud_pct = df_labeled['label'].mean() * 100
print(f"Labeled transactions: {len(df_labeled)}")
print(f"Fraud class: {fraud_class} (minority); Legit: {legit_class}")
print(f"Fraud %: {fraud_pct:.2f}%")
print(f"Timestamp range: {df_labeled['timestamp'].min()} to {df_labeled['timestamp'].max()}")

## 3. Create Temporal Splits (Same as GNN Models)

In [None]:
# Order rows chronologically so that positional slices are temporal slices
df_labeled = df_labeled.sort_values('timestamp').reset_index(drop=True)

# 60% train / 20% val / remainder test, cut by row position.
# NOTE: the cut is by row count, not by timestamp value, so rows sharing a
# timestamp can land on both sides of a boundary.
n = len(df_labeled)
train_size = int(0.6 * n)
val_size = int(0.2 * n)
val_end = train_size + val_size

train_df = df_labeled.iloc[:train_size]
val_df = df_labeled.iloc[train_size:val_end]
test_df = df_labeled.iloc[val_end:]

# Report size, share of the labeled data and fraud rate for each split
print(f"\nTemporal Splits:")
for split_name, split_df in (("Train: ", train_df), ("Val:   ", val_df), ("Test:  ", test_df)):
    share = len(split_df) / n * 100
    fraud_rate = split_df['label'].mean() * 100
    print(f"{split_name}{len(split_df)} ({share:.1f}%) | Fraud: {fraud_rate:.2f}%")

## 4. Feature Engineering

In [None]:
# Everything except identifiers, time columns and targets is a model feature
non_feature_cols = {'txId', 'timestamp', 'Time step', 'class', 'label'}
feature_cols = [c for c in df_labeled.columns if c not in non_feature_cols]
print(f"Using {len(feature_cols)} features")

# Targets per split
y_train = train_df['label'].to_numpy()
y_val = val_df['label'].to_numpy()
y_test = test_df['label'].to_numpy()

# Raw feature matrices per split, with NaN / +inf / -inf all replaced by 0.0
cleaned = {
    split_name: np.nan_to_num(
        split_df[feature_cols].to_numpy(), nan=0.0, posinf=0.0, neginf=0.0
    )
    for split_name, split_df in (('train', train_df), ('val', val_df), ('test', test_df))
}

# Standardize: statistics are fitted on the training split only and then
# re-used for val/test, so no information flows from the later splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(cleaned['train'])
X_val = scaler.transform(cleaned['val'])
X_test = scaler.transform(cleaned['test'])

print(f"\nTrain: X={X_train.shape}, y fraud={y_train.mean()*100:.2f}%")
for split_name, X_part, y_part in (("Val:   ", X_val, y_val), ("Test:  ", X_test, y_test)):
    print(f"{split_name}X={X_part.shape}, y fraud={y_part.mean()*100:.2f}%")

## 5. Evaluation Function

In [None]:
def evaluate_model(y_true, y_probs, name):
    """Compute and print the test metrics reported for a single model.

    Args:
        y_true: Binary ground-truth labels (1 = positive/fraud), array-like.
        y_probs: Predicted positive-class scores, array-like of the same length.
        name: Display name; printed and stored under the ``'model'`` key.

    Returns:
        dict with keys ``model``, ``test_pr_auc``, ``test_roc_auc``,
        ``test_f1`` and ``test_recall_at_1pct``:

        * ``test_f1`` is the maximum F1 over all thresholds of the
          precision-recall curve computed on the data passed in (so it is an
          upper bound, not the F1 at a pre-selected threshold).
        * ``test_recall_at_1pct`` is the fraction of all positives that fall
          within the top 1% highest-scored samples.
    """
    # Coerce to arrays so positional fancy-indexing below works for lists and
    # pandas Series as well (a Series would otherwise index by label).
    y_true = np.asarray(y_true)
    y_probs = np.asarray(y_probs)

    pr_auc = average_precision_score(y_true, y_probs)
    roc_auc = roc_auc_score(y_true, y_probs)

    # Best achievable F1 along the PR curve; the epsilon avoids 0/0 at the
    # curve end-point where precision + recall can be 0.
    precision, recall, _ = precision_recall_curve(y_true, y_probs)
    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
    best_f1 = np.max(f1_scores)

    # Recall within the top 1% of scores (NOT recall at 1% FPR).
    # At least one sample is always selected: with fewer than 100 samples the
    # raw count would be 0 and a `[-0:]` slice would select *every* sample.
    top_k = max(1, int(0.01 * len(y_true)))
    top_indices = np.argsort(y_probs)[-top_k:]
    n_positive = y_true.sum()
    # Recall = positives captured in the top-k / all positives. (The mean of
    # the top-k labels would be precision@k, not recall.)
    if n_positive > 0:
        recall_at_1pct = y_true[top_indices].sum() / n_positive
    else:
        recall_at_1pct = 0.0

    metrics = {
        'model': name,
        'test_pr_auc': float(pr_auc),
        'test_roc_auc': float(roc_auc),
        'test_f1': float(best_f1),
        'test_recall_at_1pct': float(recall_at_1pct)
    }

    print(f"\n{name} Results:")
    print(f"  PR-AUC:      {pr_auc:.4f}")
    print(f"  ROC-AUC:     {roc_auc:.4f}")
    print(f"  F1 Score:    {best_f1:.4f}")
    print(f"  Recall@1%:   {recall_at_1pct:.4f}")

    return metrics

## 6. Model 1: Logistic Regression

In [None]:
print("Training Logistic Regression...")

# Linear baseline; 'balanced' class weights compensate for the rare positive class
lr_params = {
    'max_iter': 1000,
    'class_weight': 'balanced',
    'random_state': 42,
    'n_jobs': -1,
}
lr = LogisticRegression(**lr_params).fit(X_train, y_train)

# Second predict_proba column = score for the second entry of lr.classes_
# (label 1 when both classes occur in y_train)
test_proba_lr = lr.predict_proba(X_test)
y_probs_lr = test_proba_lr[:, 1]
metrics_lr = evaluate_model(y_test, y_probs_lr, 'Logistic Regression')

## 7. Model 2: Random Forest

In [None]:
print("Training Random Forest...")

# Tree-ensemble baseline with depth / split-size limits and balanced class weights
rf_params = {
    'n_estimators': 200,
    'max_depth': 20,
    'min_samples_split': 10,
    'class_weight': 'balanced',
    'random_state': 42,
    'n_jobs': -1,
    'verbose': 0,
}
rf = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# Second predict_proba column = score for the second entry of rf.classes_
# (label 1 when both classes occur in y_train)
test_proba_rf = rf.predict_proba(X_test)
y_probs_rf = test_proba_rf[:, 1]
metrics_rf = evaluate_model(y_test, y_probs_rf, 'Random Forest')

## 8. Model 3: XGBoost (Expected Best)

In [None]:
print("Training XGBoost...")

# Weight positives by the negative:positive ratio of the training labels
n_negative = np.sum(y_train == 0)
n_positive = np.sum(y_train == 1)
scale_pos_weight = n_negative / n_positive

xgb_params = {
    'n_estimators': 300,
    'max_depth': 8,
    'learning_rate': 0.05,
    'scale_pos_weight': scale_pos_weight,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'n_jobs': -1,
    'tree_method': 'hist',
}
xgb_model = xgb.XGBClassifier(**xgb_params)

# NOTE: the validation split is passed as eval_set only; no early-stopping
# option is set in this cell, so all n_estimators rounds are requested.
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

# Second predict_proba column is used as the fraud score
test_proba_xgb = xgb_model.predict_proba(X_test)
y_probs_xgb = test_proba_xgb[:, 1]
metrics_xgb = evaluate_model(y_test, y_probs_xgb, 'XGBoost')

## 9. Model 4: MLP (Neural Network, No Graph)

In [None]:
print("Training MLP...")

# Feed-forward network on the same tabular features (no graph input).
# NOTE: early stopping is driven by validation_fraction of X_train; the
# temporal X_val split is not passed to this model.
mlp_params = {
    'hidden_layer_sizes': (256, 128, 64),
    'activation': 'relu',
    'solver': 'adam',
    'alpha': 0.001,
    'batch_size': 256,
    'learning_rate': 'adaptive',
    'learning_rate_init': 0.001,
    'max_iter': 100,
    'early_stopping': True,
    'validation_fraction': 0.1,
    'random_state': 42,
    'verbose': False,
}
mlp = MLPClassifier(**mlp_params).fit(X_train, y_train)

# Second predict_proba column is used as the fraud score
test_proba_mlp = mlp.predict_proba(X_test)
y_probs_mlp = test_proba_mlp[:, 1]
metrics_mlp = evaluate_model(y_test, y_probs_mlp, 'MLP')

## 10. Compare All Models

In [None]:
# One row per tabular model, in training order
all_metrics = [metrics_lr, metrics_rf, metrics_xgb, metrics_mlp]
comparison_df = pd.DataFrame(all_metrics)

divider = "=" * 60
print("\n" + divider)
print("TABULAR MODELS COMPARISON")
print(divider)
print(comparison_df.to_string(index=False))

# Highlight the model with the highest test PR-AUC
print("\nBest Model (by PR-AUC):")
best_idx = comparison_df['test_pr_auc'].idxmax()
best_row = comparison_df.iloc[best_idx]
print(f"  {best_row['model']}: {best_row['test_pr_auc']:.4f}")

## 11. Compare with GNN Models

In [None]:
# Hard-coded GNN test metrics (values entered by hand, not computed here)
metric_keys = ['model', 'test_pr_auc', 'test_roc_auc', 'test_f1', 'test_recall_at_1pct']
gnn_rows = [
    ('GCN', 0.1976, 0.7627, 0.2487, 0.0613),
    ('GraphSAGE', 0.4483, 0.8210, 0.4527, 0.1478),
    ('GAT', 0.1839, 0.7942, 0.2901, 0.0126),
]
gnn_results = [dict(zip(metric_keys, row)) for row in gnn_rows]

# Stack GNN and tabular rows, best PR-AUC first
all_models_df = (
    pd.concat([pd.DataFrame(gnn_results), comparison_df], ignore_index=True)
    .sort_values('test_pr_auc', ascending=False)
    .reset_index(drop=True)
)

rule = "=" * 60
print("\n" + rule)
print("ALL MODELS COMPARISON (GNN + Tabular)")
print(rule)
print(all_models_df.to_string(index=False))

# Headline comparison: best tabular model vs. the GraphSAGE entry above
best_tabular_pr = comparison_df['test_pr_auc'].max()
graphsage_pr = next(
    entry['test_pr_auc'] for entry in gnn_results if entry['model'] == 'GraphSAGE'
)

print("\n" + rule)
print("DOES THE GRAPH HELP?")
print(rule)
print(f"Best Tabular (features only): {best_tabular_pr:.4f}")
print(f"GraphSAGE (with graph):       {graphsage_pr:.4f}")

# Relative PR-AUC gain of GraphSAGE over the best tabular model, in percent
improvement = ((graphsage_pr - best_tabular_pr) / best_tabular_pr) * 100
print(f"\nGraph improvement: {improvement:+.1f}%")

# Verdict thresholds: >20% essential, >5% helps, otherwise features suffice
if improvement > 20:
    verdict = "\nConclusion: Graph is ESSENTIAL for fraud detection!"
elif improvement > 5:
    verdict = "\nConclusion: Graph HELPS fraud detection."
else:
    verdict = "\nConclusion: Features alone are sufficient."
print(verdict)

## 12. Visualization

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Green bars for the graph models, blue bars for the tabular models
colors = [
    '#2ecc71' if ('Graph' in model_name or model_name in ['GCN', 'GAT']) else '#3498db'
    for model_name in all_models_df['model']
]

# One horizontal bar chart per metric:
# (axis, metric column, x label, title, target line position, target legend label)
# A target of None means the panel gets no reference line and no legend.
panels = [
    (axes[0, 0], 'test_pr_auc', 'PR-AUC', 'PR-AUC Comparison (Higher = Better)', 0.4, 'Target: 0.40'),
    (axes[0, 1], 'test_roc_auc', 'ROC-AUC', 'ROC-AUC Comparison', 0.8, 'Target: 0.80'),
    (axes[1, 0], 'test_f1', 'F1 Score', 'F1 Score Comparison', 0.3, 'Target: 0.30'),
    (axes[1, 1], 'test_recall_at_1pct', 'Recall@1% FPR', 'Recall@1% Comparison', None, None),
]

for ax, metric_col, x_label, panel_title, target, target_label in panels:
    ax.barh(all_models_df['model'], all_models_df[metric_col], color=colors)
    ax.set_xlabel(x_label)
    ax.set_title(panel_title)
    if target is not None:
        ax.axvline(target, color='red', linestyle='--', alpha=0.5, label=target_label)
        ax.legend()

# Write the figure to disk before showing it
plt.tight_layout()
plt.savefig('all_models_comparison.png', dpi=150, bbox_inches='tight')
plt.show()
print("\nVisualization saved: all_models_comparison.png")

## 13. Save Results

In [None]:
# One JSON file per tabular model, named after the lower-cased model name
# with spaces replaced by underscores
for metrics in all_metrics:
    filename = f"{metrics['model'].lower().replace(' ', '_')}_metrics.json"
    Path(filename).write_text(json.dumps(metrics, indent=2))
    print(f"Saved: {filename}")

# Combined GNN + tabular table as CSV
all_models_df.to_csv('all_models_comparison.csv', index=False)
print("\nSaved: all_models_comparison.csv")

# Closing banner and the list of artefacts to download
closing_rule = "=" * 60
print("\n" + closing_rule)
print("M5 COMPLETE!")
print(closing_rule)
print("\nFiles to download:")
for entry in (
    "  1. logistic_regression_metrics.json",
    "  2. random_forest_metrics.json",
    "  3. xgboost_metrics.json",
    "  4. mlp_metrics.json",
    "  5. all_models_comparison.csv",
    "  6. all_models_comparison.png",
):
    print(entry)