# M5 — Tabular Baselines (Local)

**Goal:** Train strong tabular models (LR, RF, XGBoost, MLP) on the corrected Elliptic++ labels to answer *"Does graph structure help?"*

## Notebook TODO (auto-discipline)
- [ ] Load real Elliptic++ dataset from `data/Elliptic++ Dataset/`
- [ ] Set deterministic seeds via `src.utils.seed`
- [ ] Train LR/RF/XGBoost/MLP end-to-end (no graph features)
- [ ] Save metrics JSON + comparison CSV + plots under `reports/`
- [ ] Print artifact paths + best metrics in the final cell
- [ ] Clear this TODO checklist before marking M5 done


## 1. Setup

In [None]:
from pathlib import Path
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Use the directory the kernel was started in as the project root and make
# sure it is on the import path, so the project-local imports below resolve.
PROJECT_ROOT = Path.cwd().resolve()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# Project-local modules; these must stay below the sys.path adjustment.
from src.utils.seed import set_all_seeds
from scripts import run_m5_tabular as m5

# Output locations first, then the dataset location.
REPORTS_DIR = PROJECT_ROOT / "reports"
PLOTS_DIR = REPORTS_DIR / "plots"
DATA_DIR = PROJECT_ROOT.joinpath("data", "Elliptic++ Dataset")

# Stop before any seeding or plotting setup when the dataset is absent.
if not DATA_DIR.exists():
    raise FileNotFoundError(f"Missing Elliptic++ dataset at {DATA_DIR}")

set_all_seeds(42)
np.set_printoptions(precision=4, suppress=True)

# Matplotlib style sheet first, seaborn theme second (same order as before).
plt.style.use('seaborn-v0_8')
sns.set_theme(style='whitegrid')

# One print call, one line per entry.
print(
    f"Project root : {PROJECT_ROOT}",
    f"Data dir     : {DATA_DIR}",
    f"Reports dir  : {REPORTS_DIR}",
    sep="\n",
)


## 2. Load Elliptic++ dataset

In [None]:
# Load the dataset through the M5 script module's loader.
tab_data = m5.load_tabular_dataset(DATA_DIR)

# Wrap the loader's `stats` in a one-column frame whose column is "value";
# the bare name on the last line renders it as the cell output.
stats = pd.Series(tab_data.stats, name='value').to_frame()
stats

### Peek at training records

In [None]:
# Preview the leading rows of `tab_data.train_df` (rendered as the cell output).
tab_data.train_df.head()

## 3. Train tabular models

In [None]:
# Run the logistic-regression trainer from the M5 script module on the loaded data.
lr_result = m5.train_logistic_regression(tab_data)
# Show the result's `metrics` as a Series (rendered as the cell output).
pd.Series(lr_result.metrics)


In [None]:
# Run the random-forest trainer from the M5 script module on the loaded data.
rf_result = m5.train_random_forest(tab_data)
# Show the result's `metrics` as a Series (rendered as the cell output).
pd.Series(rf_result.metrics)


In [None]:
# Run the XGBoost trainer from the M5 script module on the loaded data.
xgb_result = m5.train_xgboost(tab_data)
# Show the result's `metrics` as a Series (rendered as the cell output).
pd.Series(xgb_result.metrics)


In [None]:
# Run the MLP trainer from the M5 script module on the loaded data.
mlp_result = m5.train_mlp(tab_data)
# Show the result's `metrics` as a Series (rendered as the cell output).
pd.Series(mlp_result.metrics)


## 4. Aggregate + compare

In [None]:
# Collect the four per-model results in a fixed order: LR, RF, XGBoost, MLP.
results = [lr_result, rf_result, xgb_result, mlp_result]
# Build the comparison table via the project helper.
# NOTE(review): the summary cell treats row 0 as the best model, so this is
# presumably sorted best-first — confirm in `m5.summarize_results`.
summary_df = m5.summarize_results(results)
# Bare name on the last line renders the table as the cell output.
summary_df

## 5. Persist artifacts

In [None]:
# Hand the results and the test labels to the project's artifact writer.
m5.save_artifacts(results, tab_data.y_test, REPORTS_DIR)

# List the expected artifact locations: one metrics JSON per model, followed
# by the comparison CSV and the comparison plot.
print("Saved:")
for path in [
    *(
        REPORTS_DIR / f"{stem}_metrics.json"
        for stem in ("logistic_regression", "random_forest", "xgboost", "mlp")
    ),
    REPORTS_DIR / 'all_models_comparison.csv',
    REPORTS_DIR / 'plots' / 'all_models_comparison.png',
]:
    print(f"  - {path}")


## 6. Summary

In [None]:
# Report the headline test metrics from the first row of the comparison table.
# NOTE(review): this assumes `m5.summarize_results` orders rows best-first —
# confirm against that helper.
best_row = summary_df.iloc[0]
print(f"Best tabular model : {best_row['model']}")
print(f"Test PR-AUC        : {best_row['pr_auc']:.4f}")
print(f"Test ROC-AUC       : {best_row['roc_auc']:.4f}")
print(f"Test F1 (val thr)  : {best_row['f1']:.4f}")
print(f"Recall@1%          : {best_row['recall@1.0%']:.4f}")
# The leading "\n" prints a blank separator line. It must be written as an
# escape sequence: a regular (non-triple-quoted) string literal cannot contain
# a raw line break, and the previous form was a SyntaxError.
print("\nNext step: compare against retrained GraphSAGE/GCN/GAT baselines once available.")
