In [1]:
import json
from pathlib import Path
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)

In [2]:
REPO_ROOT = Path(__file__).resolve().parents[1]  # backend/
RAW_PATH = REPO_ROOT / "data" / "raw" / "creditcard.csv"
OUT_PATH = REPO_ROOT / "data" / "processed" / "full_test_metrics.json"

NameError: name '__file__' is not defined

In [None]:
def summarize_model(name: str, model, X: np.ndarray, y: np.ndarray):
    model.fit(X, y)
    y_pred = model.predict(X)
    y_proba = model.predict_proba(X)[:, 1] if hasattr(model, "predict_proba") else None

    tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
    precision = float(precision_score(y, y_pred, zero_division=0))
    recall = float(recall_score(y, y_pred, zero_division=0))
    f1 = float(f1_score(y, y_pred, zero_division=0))
    roc_auc = float(roc_auc_score(y, y_proba)) if y_proba is not None else None

    return {
        "model": name,
        "predicted_fraud": int(np.sum(y_pred)),
        "tp": int(tp),
        "fp": int(fp),
        "fn": int(fn),
        "tn": int(tn),
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "roc_auc": roc_auc,
    }

In [None]:
def main():
    if not RAW_PATH.exists():
        raise FileNotFoundError(f"creditcard.csv not found: {RAW_PATH}")

    df = pd.read_csv(RAW_PATH)
    total_cases = int(len(df))
    true_fraud = int(df["Class"].sum())

    X = df.drop(["Time", "Class"], axis=1)
    y = df["Class"].astype(int).to_numpy()

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    models = [
        ("LogisticRegression", LogisticRegression(max_iter=1000, random_state=42)),
        ("RandomForest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ]

    results = {}
    for name, mdl in models:
        results[name] = summarize_model(name, mdl, X_scaled, y)

    summary = {
        "timestamp": datetime.utcnow().isoformat(),
        "total_cases": total_cases,
        "true_fraud": true_fraud,
        "models": results,
    }

    OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    print(f"Wrote summary -> {OUT_PATH}")
    print(json.dumps(summary, indent=2))

In [None]:
if __name__ == "__main__":
    main()