# 05 - Tabular Baselines (Local CPU)

**Goal:** Train traditional ML models on node features ONLY (no graph structure).

**Question:** Does the graph actually help fraud detection?

**Models:**
1. Logistic Regression - Linear baseline
2. Random Forest - Tree ensemble
3. XGBoost - Gradient boosting (expected best)
4. MLP - Neural network without graph

**Setup:**
- Same temporal splits as GNN models
- Same evaluation metrics (PR-AUC, ROC-AUC, F1, Recall@1%)
- Compare with GraphSAGE (PR-AUC: 0.4483)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    average_precision_score,
    roc_auc_score,
    precision_recall_curve,
    roc_curve,
    f1_score,
    classification_report
)
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

## 1. Load Data & Features

In [None]:
# Load the Elliptic++ dataset from the local directory
data_dir = Path(r'C:\Users\oumme\OneDrive\Desktop\FRAUD-DETECTION-GNN\Elliptic++ Dataset')

print(f"Loading from: {data_dir}")
print(f"Files: {[f.name for f in data_dir.glob('*.csv')]}")

# Load features and classes
print("\nLoading Elliptic++ dataset...")
features_df = pd.read_csv(data_dir / 'elliptic_txs_features.csv')
classes_df = pd.read_csv(data_dir / 'elliptic_txs_classes.csv')

print(f"Features shape: {features_df.shape}")
print(f"Classes shape: {classes_df.shape}")

# Left-merge so every feature row is kept even if it has no class entry
df = features_df.merge(classes_df, on='txId', how='left')
print(f"Merged shape: {df.shape}")

# Identify the timestamp column (could be 'Time step' or 'timestamp')
time_col = next(
    (col for col in ['timestamp', 'Time step', 'time_step'] if col in df.columns),
    None,
)

if time_col is None:
    raise ValueError(f"No timestamp column found! Columns: {df.columns.tolist()[:10]}...")

# Normalize the column name so downstream cells can rely on 'timestamp'
if time_col != 'timestamp':
    df['timestamp'] = df[time_col]

print(f"\nTimestamp column: '{time_col}' → 'timestamp'")
print(f"Timestamp range: {df['timestamp'].min()} to {df['timestamp'].max()}")

# Coerce class codes to numbers so that both integer (1/2/3) and string
# ('1'/'2'/'unknown') encodings are handled; anything non-numeric becomes NaN
# and is therefore treated as unlabeled.
class_codes = pd.to_numeric(df['class'], errors='coerce')

# Filter to labeled transactions only (codes 1 and 2)
df_labeled = df[class_codes.isin([1, 2])].copy()

if df_labeled.empty:
    raise ValueError(
        f"No labeled transactions found! Class values: {df['class'].unique().tolist()[:10]}"
    )

# Map class to the binary target: 1=illicit -> label 1, 2=licit -> label 0.
# NOTE(review): this follows the published Elliptic/Elliptic++ encoding, where
# illicit is the minority class; a fraud percentage above 50% printed below
# would mean the files use a different encoding.
df_labeled['label'] = (class_codes.loc[df_labeled.index] == 1).astype(int)

print(f"\nLabeled transactions: {len(df_labeled):,}")
print(f"Fraud percentage: {df_labeled['label'].mean()*100:.2f}%")

## 2. Create Temporal Splits

In [None]:
# Sort chronologically. Many rows share the same timestamp, so use a stable
# sort (mergesort): tied rows keep their original relative order and the split
# membership is reproducible instead of depending on the sort implementation.
df_labeled = df_labeled.sort_values('timestamp', kind='mergesort')

# Split: 60% train, 20% val, 20% test (temporal).
# NOTE(review): the cut points are row counts, so a single timestamp can end
# up on both sides of a boundary - confirm this matches the GNN splits.
n = len(df_labeled)
train_size = int(0.6 * n)
val_size = int(0.2 * n)

train_df = df_labeled.iloc[:train_size]
val_df = df_labeled.iloc[train_size:train_size+val_size]
test_df = df_labeled.iloc[train_size+val_size:]  # remainder, absorbs rounding

print(f"\nTemporal Splits:")
print(f"Train: {len(train_df):,} ({len(train_df)/n*100:.1f}%) | Fraud: {train_df['label'].mean()*100:.2f}%")
print(f"Val:   {len(val_df):,} ({len(val_df)/n*100:.1f}%) | Fraud: {val_df['label'].mean()*100:.2f}%")
print(f"Test:  {len(test_df):,} ({len(test_df)/n*100:.1f}%) | Fraud: {test_df['label'].mean()*100:.2f}%")

## 3. Prepare Features

In [None]:
# Everything except identifiers, time markers, and targets is a model input
exclude_cols = ['txId', 'timestamp', 'Time step', 'time_step', 'class', 'label']
feature_cols = [name for name in df_labeled.columns if name not in exclude_cols]

print(f"Number of features: {len(feature_cols)}")
print(f"Feature columns: {feature_cols[:5]}...")

# Pull the feature matrix and target vector out of each split
X_train, y_train = train_df[feature_cols].values, train_df['label'].values
X_val, y_val = val_df[feature_cols].values, val_df['label'].values
X_test, y_test = test_df[feature_cols].values, test_df['label'].values

# Report how many non-finite entries each split holds before cleaning
nan_train, nan_val, nan_test = (np.isnan(m).sum() for m in (X_train, X_val, X_test))
print(f"\nNaN values: Train={nan_train}, Val={nan_val}, Test={nan_test}")
inf_train, inf_val, inf_test = (np.isinf(m).sum() for m in (X_train, X_val, X_test))
print(f"Inf values: Train={inf_train}, Val={inf_val}, Test={inf_test}")


def _zero_out_non_finite(matrix):
    """Return a copy of `matrix` with NaN, +inf and -inf replaced by 0."""
    return np.nan_to_num(matrix, nan=0.0, posinf=0.0, neginf=0.0)


X_train, X_val, X_test = map(_zero_out_non_finite, (X_train, X_val, X_test))

# Standardize: statistics are learned from the training split only and then
# applied unchanged to validation and test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled, X_test_scaled = (scaler.transform(m) for m in (X_val, X_test))

print(f"\nData shapes:")
for split_name, features, targets in (
    ("Train:", X_train_scaled, y_train),
    ("Val:  ", X_val_scaled, y_val),
    ("Test: ", X_test_scaled, y_test),
):
    print(f"{split_name} X={features.shape}, y={targets.shape}")

## 4. Helper Functions

In [None]:
def evaluate_model(y_true, y_pred_proba, model_name):
    """Evaluate a binary classifier with the same metrics as the GNN models.

    Args:
        y_true: Ground-truth binary labels (0/1), array-like.
        y_pred_proba: Predicted scores for the positive class, array-like,
            same length as `y_true`.
        model_name: Display name stored in the result and used in the printout.

    Returns:
        dict with keys 'model', 'pr_auc', 'roc_auc', 'f1_score' and
        'recall_at_1pct' (all metric values as Python floats).
    """
    # Work on plain arrays so positional indexing below is well-defined even
    # when a pandas Series (label-indexed) is passed in.
    y_true = np.asarray(y_true)
    y_pred_proba = np.asarray(y_pred_proba)

    pr_auc = average_precision_score(y_true, y_pred_proba)
    roc_auc = roc_auc_score(y_true, y_pred_proba)

    # F1 score (threshold at 0.5)
    y_pred = (y_pred_proba >= 0.5).astype(int)
    f1 = f1_score(y_true, y_pred)

    # Recall@1%: share of ALL positives that appear among the top 1% highest
    # scores. (Averaging the labels in the top 1% would be precision@1%.)
    # Always inspect at least one row so tiny inputs do not yield NaN.
    top_k = max(1, int(len(y_pred_proba) * 0.01))
    top_1pct_idx = np.argsort(y_pred_proba)[::-1][:top_k]
    n_positives = y_true.sum()
    if n_positives > 0:
        recall_at_1pct = y_true[top_1pct_idx].sum() / n_positives
    else:
        recall_at_1pct = 0.0  # recall is undefined without positives; report 0

    metrics = {
        'model': model_name,
        'pr_auc': float(pr_auc),
        'roc_auc': float(roc_auc),
        'f1_score': float(f1),
        'recall_at_1pct': float(recall_at_1pct)
    }

    print(f"\n{model_name} Results:")
    print(f"  PR-AUC:      {pr_auc:.4f}")
    print(f"  ROC-AUC:     {roc_auc:.4f}")
    print(f"  F1 Score:    {f1:.4f}")
    print(f"  Recall@1%:   {recall_at_1pct:.4f}")

    return metrics

def save_metrics(metrics, filename, reports_dir=None):
    """Save a metrics dictionary to a JSON file.

    Args:
        metrics: JSON-serializable dictionary of metric values.
        filename: Name of the file to create inside the reports directory.
        reports_dir: Directory to write into. Defaults to the project's
            `reports` folder when omitted.
    """
    if reports_dir is None:
        reports_dir = Path(r'C:\Users\oumme\OneDrive\Desktop\FRAUD-DETECTION-GNN\reports')
    reports_dir = Path(reports_dir)
    # parents=True so a missing parent folder does not abort the save
    reports_dir.mkdir(parents=True, exist_ok=True)

    filepath = reports_dir / filename
    # Explicit encoding keeps the output identical across platforms
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(metrics, f, indent=2)
    print(f"  Saved to: {filepath}")

print("Helper functions defined!")

## 5. Model 1: Logistic Regression

In [None]:
print("Training Logistic Regression...")

# Report the weights that class_weight='balanced' actually applies:
# n_samples / (n_classes * samples_in_class). These are informational only;
# the model below computes the same values internally.
class_counts = np.bincount(y_train, minlength=2)
class_weight = {
    cls: float(len(y_train) / (2 * count))
    for cls, count in enumerate(class_counts)
    if count > 0  # skip absent classes to avoid dividing by zero
}
print(f"Class weights: {class_weight}")

lr_model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

lr_model.fit(X_train_scaled, y_train)

# Predict on test set (column 1 = probability of the fraud class)
lr_proba = lr_model.predict_proba(X_test_scaled)[:, 1]

# Evaluate
lr_metrics = evaluate_model(y_test, lr_proba, 'Logistic Regression')
save_metrics(lr_metrics, 'logistic_regression_metrics.json')

## 6. Model 2: Random Forest

In [None]:
print("Training Random Forest...")

# Hyperparameters collected in one place so they are easy to scan and tweak
rf_params = {
    'n_estimators': 200,
    'max_depth': 20,
    'min_samples_split': 10,
    'min_samples_leaf': 4,
    'class_weight': 'balanced',
    'random_state': 42,
    'n_jobs': -1,
    'verbose': 1,
}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train_scaled, y_train)

# Score the held-out test split; keep only the positive-class column
rf_test_probabilities = rf_model.predict_proba(X_test_scaled)
rf_proba = rf_test_probabilities[:, 1]

# Compute the shared metric set and persist it
rf_metrics = evaluate_model(y_test, rf_proba, 'Random Forest')
save_metrics(rf_metrics, 'random_forest_metrics.json')

## 7. Model 3: XGBoost

In [None]:
print("Training XGBoost...")

# Weight applied to the positive class: total rows per positive row, minus one
positive_count = y_train.sum()
scale_pos_weight = len(y_train) / positive_count - 1
print(f"Scale pos weight: {scale_pos_weight:.2f}")

# Booster configuration gathered in a single mapping
xgb_params = {
    'n_estimators': 300,
    'max_depth': 10,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'scale_pos_weight': scale_pos_weight,
    'random_state': 42,
    'n_jobs': -1,
    'tree_method': 'hist',
    'verbosity': 1,
}
xgb_model = xgb.XGBClassifier(**xgb_params)

# The validation split is passed as eval_set; progress is logged every 50 rounds
validation_sets = [(X_val_scaled, y_val)]
xgb_model.fit(X_train_scaled, y_train, eval_set=validation_sets, verbose=50)

# Score the held-out test split; keep only the positive-class column
xgb_test_probabilities = xgb_model.predict_proba(X_test_scaled)
xgb_proba = xgb_test_probabilities[:, 1]

# Compute the shared metric set and persist it
xgb_metrics = evaluate_model(y_test, xgb_proba, 'XGBoost')
save_metrics(xgb_metrics, 'xgboost_metrics.json')

## 8. Model 4: MLP (Neural Network)

In [None]:
print("Training MLP...")

# Network and optimizer settings gathered in a single mapping
mlp_params = {
    'hidden_layer_sizes': (256, 128, 64),
    'activation': 'relu',
    'solver': 'adam',
    'alpha': 0.0001,
    'batch_size': 256,
    'learning_rate': 'adaptive',
    'learning_rate_init': 0.001,
    'max_iter': 100,
    'random_state': 42,
    'verbose': True,
    # early_stopping with validation_fraction=0.1 is configured here
    'early_stopping': True,
    'validation_fraction': 0.1,
}
mlp_model = MLPClassifier(**mlp_params)
mlp_model.fit(X_train_scaled, y_train)

# Score the held-out test split; keep only the positive-class column
mlp_test_probabilities = mlp_model.predict_proba(X_test_scaled)
mlp_proba = mlp_test_probabilities[:, 1]

# Compute the shared metric set and persist it
mlp_metrics = evaluate_model(y_test, mlp_proba, 'MLP')
save_metrics(mlp_metrics, 'mlp_metrics.json')

## 9. Compare All Models

In [None]:
# GNN reference numbers, copied from the earlier notebooks for comparison
gnn_results = [
    {'model': 'GCN', 'pr_auc': 0.1976, 'roc_auc': 0.7627, 'f1_score': 0.2487, 'recall_at_1pct': 0.0613},
    {'model': 'GraphSAGE', 'pr_auc': 0.4483, 'roc_auc': 0.8210, 'f1_score': 0.4527, 'recall_at_1pct': 0.1478},
    {'model': 'GAT', 'pr_auc': 0.1839, 'roc_auc': 0.7942, 'f1_score': 0.2901, 'recall_at_1pct': 0.0126}
]

# Tabular results from this notebook first, then the GNN references
all_results = [lr_metrics, rf_metrics, xgb_metrics, mlp_metrics, *gnn_results]

# One row per model, best PR-AUC on top
df_results = pd.DataFrame(all_results).sort_values('pr_auc', ascending=False)

banner = "=" * 80
print(f"\n{banner}")
print("FINAL RESULTS - ALL MODELS")
print(banner)
print(df_results.to_string(index=False))

# Persist the comparison table next to the per-model JSON reports
reports_dir = Path(r'C:\Users\oumme\OneDrive\Desktop\FRAUD-DETECTION-GNN\reports')
csv_path = reports_dir / 'all_models_comparison.csv'
df_results.to_csv(csv_path, index=False)
print(f"\nSaved comparison to: {csv_path}")

## 10. Visualization

In [None]:
# One horizontal bar chart per metric, laid out on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

metrics_to_plot = ['pr_auc', 'roc_auc', 'f1_score', 'recall_at_1pct']
titles = ['PR-AUC (Higher is Better)', 'ROC-AUC (Higher is Better)', 
          'F1 Score (Higher is Better)', 'Recall@1% (Higher is Better)']

gnn_model_names = {'GCN', 'GraphSAGE', 'GAT'}

# axes.flat walks the grid row by row, matching the order of the metric list
for ax, metric, title in zip(axes.flat, metrics_to_plot, titles):
    # Ascending order puts the best model at the top of the horizontal chart
    df_sorted = df_results.sort_values(metric, ascending=True)

    # Green bars mark graph models, blue bars mark tabular models
    colors = []
    for name in df_sorted['model']:
        colors.append('green' if name in gnn_model_names else 'blue')

    ax.barh(df_sorted['model'], df_sorted[metric], color=colors, alpha=0.7)
    ax.set_xlabel(metric.upper())
    ax.set_title(title)
    ax.grid(axis='x', alpha=0.3)

    # Write each value just past the end of its bar
    for row_pos, value in enumerate(df_sorted[metric]):
        ax.text(value, row_pos, f' {value:.4f}', va='center', fontsize=9)

plt.tight_layout()

# Save the figure under reports/plots before displaying it
plots_dir = reports_dir / 'plots'
plots_dir.mkdir(exist_ok=True)
plot_path = plots_dir / 'all_models_comparison.png'
plt.savefig(plot_path, dpi=150, bbox_inches='tight')
print(f"Saved plot to: {plot_path}")
plt.show()

## 11. Analysis: Does Graph Help?

In [None]:
# Separate tabular rows from graph-model rows and pick the top tabular PR-AUC
graph_model_names = ['GCN', 'GraphSAGE', 'GAT']
is_graph_model = df_results['model'].isin(graph_model_names)
tabular_models = df_results[~is_graph_model]
best_tabular = tabular_models.loc[tabular_models['pr_auc'].idxmax()]

# GraphSAGE is the graph model used as the point of comparison
graphsage = df_results[df_results['model'] == 'GraphSAGE'].iloc[0]

# Relative PR-AUC change of GraphSAGE over the best tabular model, in percent
pr_auc_gap = graphsage['pr_auc'] - best_tabular['pr_auc']
improvement = (pr_auc_gap / best_tabular['pr_auc']) * 100

separator = "=" * 80
print(f"\n{separator}")
print("GRAPH STRUCTURE VALUE ANALYSIS")
print(separator)
print(f"\nBest Tabular Model:  {best_tabular['model']}")
print(f"  PR-AUC: {best_tabular['pr_auc']:.4f}")
print(f"\nBest Graph Model:    GraphSAGE")
print(f"  PR-AUC: {graphsage['pr_auc']:.4f}")
print(f"\nImprovement: {improvement:+.1f}%")

# Choose the verdict: above 20% is significant, above 5% moderate, else minimal
if improvement > 20:
    conclusion_lines = (
        "\n✅ CONCLUSION: Graph structure adds SIGNIFICANT value!",
        "   → GNNs are essential for fraud detection on this dataset.",
    )
elif improvement > 5:
    conclusion_lines = (
        "\n✅ CONCLUSION: Graph structure helps moderately.",
        "   → Consider ensemble of GNN + tabular models.",
    )
else:
    conclusion_lines = (
        "\n⚠️  CONCLUSION: Graph structure adds minimal value.",
        "   → Tabular models may be sufficient.",
    )

for conclusion_line in conclusion_lines:
    print(conclusion_line)

print(f"\n{separator}")

## ✅ M5 Complete!

**Deliverables:**
- ✅ 4 tabular models trained and evaluated
- ✅ Metrics saved to `reports/*.json`
- ✅ Comparison CSV: `reports/all_models_comparison.csv`
- ✅ Visualization: `reports/plots/all_models_comparison.png`
- ✅ Graph value analysis complete

**Next:** M6 - Final verification and documentation