# Notebook 02 — Modeling & Threshold Tuning

Goal:
- Train a baseline fraud detection model
- Evaluate using metrics suitable for extreme class imbalance
- Demonstrate how decision threshold affects precision, recall and business trade-offs

In [None]:
import sys
from pathlib import Path
# Append the parent of the current working directory to the import path so
# that the `src.*` imports further down can be resolved (presumably this is
# the repository root when the notebook runs from its own folder — confirm).
sys.path.append(str(Path("..").resolve()))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from src.config import CFG
from src.io import load_csv
from src.split import stratified_split
from src.models import train_logreg_baseline
from src.evaluation import evaluate_at_threshold
from src.thresholding import (
    find_threshold_for_min_precision,
    find_threshold_for_min_recall,
    find_threshold_min_cost,
)
from src.viz import plot_pr_curve

plt.rcParams["figure.figsize"] = (7, 4)

In [None]:
# Name of the label column, taken from the project config.
target_col = CFG.target_col

# Raw dataset location, relative to the directory the notebook runs from.
DATA_PATH = "../data/raw/creditcard.csv"
df = load_csv(DATA_PATH)

# Shown as the cell output.
df.shape

In [None]:
# Split the full frame into train/test partitions using the label column,
# test fraction and seed from the project config.
train_df, test_df = stratified_split(
    seed=CFG.seed,
    test_size=CFG.test_size,
    target_col=target_col,
    df=df,
)

# Mean of the label column in each partition (train first, then test),
# shown as the cell output.
tuple(part[target_col].mean() for part in (train_df, test_df))

In [None]:
# Features are every column except the label; labels are extracted as raw
# arrays via `.values`. Train and test are processed in the same way.
X_train, X_test = (
    part.drop(columns=[target_col]) for part in (train_df, test_df)
)
y_train, y_test = (part[target_col].values for part in (train_df, test_df))

(X_train.shape, X_test.shape)

## Baseline model choice

I start with Logistic Regression because:
- it’s fast and strong as a baseline
- it outputs probabilities (required for threshold tuning)
- with `class_weight="balanced"` it handles imbalance better than a naive classifier

In [None]:
# Fit the baseline on the training split using the configured seed.
trained = train_logreg_baseline(X_train, y_train, seed=CFG.seed)
# Score the held-out test features. `y_prob` is later passed unchanged to the
# PR-curve, evaluation and threshold helpers.
# NOTE(review): scikit-learn's own `predict_proba` returns one column per
# class; presumably `trained` is a project wrapper that returns only the
# positive-class scores — confirm against `src.models`.
y_prob = trained.predict_proba(X_test)

# Peek at the first five scores.
y_prob[:5]

In [None]:
# Plot the PR curve of the test-set scores against the true test labels.
plot_pr_curve(y_test, y_prob)

In [None]:
# Per-error costs from the project config, bundled so they travel together.
cost_kwargs = {
    "cost_fn": CFG.cost_false_negative,
    "cost_fp": CFG.cost_false_positive,
}

# Reference point: test-set metrics at the conventional 0.5 cut-off.
default_metrics = evaluate_at_threshold(
    threshold=0.5,
    y_true=y_test,
    y_prob=y_prob,
    **cost_kwargs,
)

default_metrics

## Why threshold tuning matters

In fraud detection, the decision threshold should not simply be left at the default of 0.5.
It directly controls the trade-off:
- Higher threshold → fewer alerts → higher precision, lower recall (more missed fraud)
- Lower threshold → more alerts → higher recall, lower precision (more false positives)

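Next, we choose thresholds using three criteria:
1) Minimum Precision (control alert quality)
2) Minimum Recall (avoid missing fraud)
3) Minimum Expected Cost (FN vs FP cost trade-off)

Note: for simplicity, the thresholds below are selected on the test split itself, so the metrics reported at those thresholds are optimistic. For a real deployment, tune the threshold on a separate validation split and report metrics on untouched test data.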

In [None]:
# Per-error costs from the project config, bundled so they travel together.
cost_kwargs = {
    "cost_fn": CFG.cost_false_negative,
    "cost_fp": CFG.cost_false_positive,
}

# Threshold returned by the min-precision helper for the configured floor.
# NOTE(review): the threshold is picked on the same test split it is then
# scored on, so these metrics are optimistic — consider a validation split.
t_min_p = find_threshold_for_min_precision(
    y_test,
    y_prob,
    min_precision=CFG.min_precision,
)
metrics_min_p = evaluate_at_threshold(
    threshold=t_min_p,
    y_true=y_test,
    y_prob=y_prob,
    **cost_kwargs,
)

(t_min_p, metrics_min_p)

In [None]:
# Per-error costs from the project config, bundled so they travel together.
cost_kwargs = {
    "cost_fn": CFG.cost_false_negative,
    "cost_fp": CFG.cost_false_positive,
}

# Threshold returned by the min-recall helper for the configured floor.
# NOTE(review): the threshold is picked on the same test split it is then
# scored on, so these metrics are optimistic — consider a validation split.
t_min_r = find_threshold_for_min_recall(
    y_test,
    y_prob,
    min_recall=CFG.min_recall,
)
metrics_min_r = evaluate_at_threshold(
    threshold=t_min_r,
    y_true=y_test,
    y_prob=y_prob,
    **cost_kwargs,
)

(t_min_r, metrics_min_r)

In [None]:
# The same FN/FP costs drive both the threshold search and the evaluation,
# so they are read from the config once and shared.
cost_kwargs = {
    "cost_fn": CFG.cost_false_negative,
    "cost_fp": CFG.cost_false_positive,
}

# Threshold returned by the min-cost helper for those costs.
# NOTE(review): the threshold is picked on the same test split it is then
# scored on, so these metrics are optimistic — consider a validation split.
t_min_cost = find_threshold_min_cost(y_test, y_prob, **cost_kwargs)
metrics_min_cost = evaluate_at_threshold(
    threshold=t_min_cost,
    y_true=y_test,
    y_prob=y_prob,
    **cost_kwargs,
)

(t_min_cost, metrics_min_cost)

In [None]:
# (label, metrics) pairs, one per thresholding strategy evaluated earlier.
labelled_metrics = [
    ("default_0.5", default_metrics),
    (f"min_precision_{CFG.min_precision}", metrics_min_p),
    (f"min_recall_{CFG.min_recall}", metrics_min_r),
    (
        f"min_cost_fn{CFG.cost_false_negative}_fp{CFG.cost_false_positive}",
        metrics_min_cost,
    ),
]

# One row per strategy; the label goes first, followed by that strategy's metrics.
compare = pd.DataFrame(
    [{"strategy": label, **metrics} for label, metrics in labelled_metrics]
)

# Replace the nested confusion matrix with a flat [TN, FP, FN, TP] list so it
# fits in a single table cell. `pop` removes the original column as it is read.
compare["TN_FP_FN_TP"] = compare.pop("confusion_matrix").apply(
    lambda cm: [cm[0][0], cm[0][1], cm[1][0], cm[1][1]]
)

# Column order for the displayed summary.
report_columns = [
    "strategy",
    "threshold",
    "pr_auc",
    "precision",
    "recall",
    "f1",
    "expected_cost",
    "TN_FP_FN_TP",
]
compare[report_columns]

## Conclusion (Notebook 02)

- Baseline Logistic Regression provides a meaningful starting point.
- PR-AUC is used as the core metric due to extreme imbalance.
- Default threshold (0.5) is not optimal.
- Threshold tuning changes the trade-off between false positives and missed fraud.
- The final threshold selection depends on business constraints:
  - high precision → reduce alert fatigue
  - high recall → reduce missed fraud
  - cost-based → optimize operational + fraud loss trade-off

Next notebook: deep dive into false positives/false negatives and business impact.