In [2]:
import numpy as np
import pandas as pd

from tqdm.auto import tqdm

from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split, ParameterSampler

import joblib

try:
    import lightgbm as lgb
    HAS_LGB = True
except Exception:
    HAS_LGB = False


In [2]:

LABEL = "fraud"

TRAIN_PATH = "../../DATA/dataset/TRAIN_stage2"
TEST_PATH  = "../../DATA/dataset/TEST_stage2"

OUT_DIR = "artifacts/stage2_models"
OUT_DIR_METRICS = "artifacts/stage2_metrics"


In [3]:

from pathlib import Path

Path(OUT_DIR).mkdir(parents=True, exist_ok=True)
Path(OUT_DIR_METRICS).mkdir(parents=True, exist_ok=True)

In [4]:

def load_stage_df(path: str, label: str = LABEL):
    df = pd.read_parquet(path)
    if label not in df.columns:
        raise KeyError(f"Missing label column: {label}")
    X = df.drop(columns=[label])
    y = df[label].astype(np.int8).to_numpy()
    X = X.replace([np.inf, -np.inf], np.nan).fillna(0)
    return X, y


In [5]:
def topk_metrics(y_true, score, top_pct_list=(0.001, 0.002, 0.005, 0.01)):
    y = np.asarray(y_true).astype(int)
    s = np.asarray(score).astype(float)
    n = len(y)
    base_rate = float(y.mean()) if n else np.nan
    order = np.argsort(-s)
    y_sorted = y[order]

    rows = []
    for p in top_pct_list:
        k = max(int(np.ceil(n * p)), 1)
        top_y = y_sorted[:k]
        prec = float(top_y.mean())
        rec = float(top_y.sum() / max(y.sum(), 1))
        lift = float(prec / base_rate) if base_rate and base_rate > 0 else np.nan
        rows.append({"top_pct": p, "k": k, "precision": prec, "recall": rec, "lift": lift, "base_rate": base_rate})
    return pd.DataFrame(rows)


def evaluate_metrics(y_true, score):
    return {
        "auc": float(roc_auc_score(y_true, score)),
        "prauc": float(average_precision_score(y_true, score)),
        "base_rate": float(np.mean(y_true)),
    }


In [6]:

def fit_logit(X_tr, y_tr):
    model = Pipeline([
        ("scaler", StandardScaler(with_mean=False)),
        ("clf", LogisticRegression(
            solver="lbfgs",
            max_iter=2000,
            class_weight="balanced",
        ))
    ])
    model.fit(X_tr, y_tr)
    return model


def fit_hgb(X_tr, y_tr):
    model = HistGradientBoostingClassifier(
        learning_rate=0.05,
        max_depth=6,
        max_iter=400,
        min_samples_leaf=200,
        l2_regularization=0.0,
        random_state=42,
    )
    model.fit(X_tr, y_tr)
    return model


def fit_lgb_small(X_tr, y_tr):
    if not HAS_LGB:
        raise RuntimeError("lightgbm is not available in this environment.")
    model = lgb.LGBMClassifier(
        objective="binary",
        n_estimators=600,
        learning_rate=0.05,
        num_leaves=31,
        max_depth=6,
        min_data_in_leaf=300,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1,
    )
    model.fit(X_tr, y_tr)
    return model


def predict_score(model, X):
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X)[:, 1]
    if hasattr(model, "decision_function"):
        s = model.decision_function(X)
        return 1 / (1 + np.exp(-s))
    return model.predict(X)


In [7]:
X_tr, y_tr = load_stage_df(TRAIN_PATH)
X_te, y_te = load_stage_df(TEST_PATH)

print("train:", X_tr.shape, "test:", X_te.shape)
print("base_rate train:", float(y_tr.mean()), "test:", float(y_te.mean()))


train: (5312525, 17) test: (928867, 17)
base_rate train: 0.0014471461310770302 test: 0.002624702998384053


In [8]:

from sklearn.metrics import classification_report

candidates = [
    ("logit", fit_logit),
    ("hgb", fit_hgb),
]
if HAS_LGB:
    candidates.append(("lgb_small", fit_lgb_small))

results = []
topk_all = {}
reports = {}

for name, fit_fn in tqdm(candidates, desc="Training Stage1 models"):
    model = fit_fn(X_tr, y_tr)
    score_te = predict_score(model, X_te)

    m = evaluate_metrics(y_te, score_te)
    m["model"] = name
    results.append(m)

    topk = topk_metrics(y_te, score_te)
    topk_all[name] = topk
    topk.to_csv(Path(OUT_DIR_METRICS) / f"{name}_topk.csv", index=False)

    joblib.dump(model, Path(OUT_DIR) / f"{name}.joblib")
    np.save(Path(OUT_DIR_METRICS) / f"{name}_test_scores.npy", score_te)

    thr = np.quantile(score_te, 0.99)
    y_pred = (score_te >= thr).astype(int)

    rep_txt = classification_report(y_te, y_pred, digits=4)
    reports[name] = rep_txt

    print("\n" + "=" * 80)
    print(f"[{name}] threshold=quantile(0.99) -> top 1% as positive")
    print(rep_txt)

results_df = pd.DataFrame(results).sort_values(["prauc", "auc"], ascending=False).reset_index(drop=True)
results_df


Training Stage1 models:   0%|          | 0/3 [00:00<?, ?it/s]


[logit] threshold=quantile(0.99) -> top 1% as positive
              precision    recall  f1-score   support

           0     0.9989    0.9915    0.9952    926429
           1     0.1571    0.5984    0.2488      2438

    accuracy                         0.9905    928867
   macro avg     0.5780    0.7950    0.6220    928867
weighted avg     0.9967    0.9905    0.9933    928867


[hgb] threshold=quantile(0.99) -> top 1% as positive
              precision    recall  f1-score   support

           0     0.9985    0.9911    0.9948    926429
           1     0.1159    0.4446    0.1839      2438

    accuracy                         0.9896    928867
   macro avg     0.5572    0.7179    0.5894    928867
weighted avg     0.9962    0.9896    0.9927    928867

[LightGBM] [Info] Number of positive: 7688, number of negative: 5304837
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.081949 seconds.
You can set `force_row_wise=true` to remove the overhead.
An

Unnamed: 0,auc,prauc,base_rate,model
0,0.945793,0.434644,0.002625,lgb_small
1,0.802043,0.280938,0.002625,hgb
2,0.947069,0.252247,0.002625,logit


---

stage1에서 걸러진 데이터만 2차로 거른다면

In [10]:
import pandas as pd

PASS_PATH = "../transaction/artifacts/stage1_pass_ids_test.parquet"
TRAIN2_PATH = "../../DATA/dataset/TRAIN_stage2"
TEST2_PATH  = "../../DATA/dataset/TEST_stage2"

pass_ids = pd.read_parquet(PASS_PATH)["id"].astype("int64")

train2 = pd.read_parquet(TRAIN2_PATH)
test2  = pd.read_parquet(TEST2_PATH)

test2_pass = test2[test2["id"].isin(pass_ids)].copy()

print("train2:", train2.shape)
print("test2:", test2.shape)
print("test2_pass:", test2_pass.shape)
print("pass_rate_in_test2:", len(test2_pass) / len(test2))


train2: (5312525, 19)
test2: (928867, 19)
test2_pass: (9146, 19)
pass_rate_in_test2: 0.009846404275316058


In [22]:
OUT_DIR = "artifacts/stage2_models"
OUT_DIR_METRICS = "artifacts/stage2_metrics"

In [23]:
from pathlib import Path 
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)
Path(OUT_DIR_METRICS).mkdir(parents=True, exist_ok=True)

In [12]:
LABEL = "fraud"

def load_stage_df(df, label: str = LABEL, id_col: str = "id"):
    df = df.copy()

    if label not in df.columns:
        raise KeyError(f"Missing label column: {label}")

    drop_cols = [label]
    if id_col in df.columns:
        drop_cols.append(id_col)

    y = df[label].astype(np.int8).to_numpy()
    X = df.drop(columns=drop_cols)

    X = X.replace([np.inf, -np.inf], np.nan).fillna(0)

    return X, y


In [13]:
def topk_metrics(y_true, score, top_pct_list=(0.001, 0.002, 0.005, 0.01)):
    y = np.asarray(y_true).astype(int)
    s = np.asarray(score).astype(float)
    n = len(y)
    base_rate = float(y.mean()) if n else np.nan
    order = np.argsort(-s)
    y_sorted = y[order]

    rows = []
    for p in top_pct_list:
        k = max(int(np.ceil(n * p)), 1)
        top_y = y_sorted[:k]
        prec = float(top_y.mean())
        rec = float(top_y.sum() / max(y.sum(), 1))
        lift = float(prec / base_rate) if base_rate and base_rate > 0 else np.nan
        rows.append({"top_pct": p, "k": k, "precision": prec, "recall": rec, "lift": lift, "base_rate": base_rate})
    return pd.DataFrame(rows)


def evaluate_metrics(y_true, score):
    return {
        "auc": float(roc_auc_score(y_true, score)),
        "prauc": float(average_precision_score(y_true, score)),
        "base_rate": float(np.mean(y_true)),
    }


In [14]:

def fit_logit(X_tr, y_tr):
    model = Pipeline([
        ("scaler", StandardScaler(with_mean=False)),
        ("clf", LogisticRegression(
            solver="lbfgs",
            max_iter=2000,
            class_weight="balanced",
        ))
    ])
    model.fit(X_tr, y_tr)
    return model


def fit_hgb(X_tr, y_tr):
    model = HistGradientBoostingClassifier(
        learning_rate=0.05,
        max_depth=6,
        max_iter=400,
        min_samples_leaf=200,
        l2_regularization=0.0,
        random_state=42,
    )
    model.fit(X_tr, y_tr)
    return model


def fit_lgb_small(X_tr, y_tr):
    if not HAS_LGB:
        raise RuntimeError("lightgbm is not available in this environment.")
    model = lgb.LGBMClassifier(
        objective="binary",
        n_estimators=600,
        learning_rate=0.05,
        num_leaves=31,
        max_depth=6,
        min_data_in_leaf=300,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1,
    )
    model.fit(X_tr, y_tr)
    return model


def predict_score(model, X):
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X)[:, 1]
    if hasattr(model, "decision_function"):
        s = model.decision_function(X)
        return 1 / (1 + np.exp(-s))
    return model.predict(X)


In [17]:
X_tr, y_tr = load_stage_df(train2)
X_te, y_te = load_stage_df(test2_pass)

print("train:", X_tr.shape, "test:", X_te.shape)
print("base_rate train:", float(y_tr.mean()), "test:", float(y_te.mean()))


train: (5312525, 17) test: (9146, 17)
base_rate train: 0.0014471461310770302 test: 0.15908593920839711


In [24]:

from sklearn.metrics import classification_report

candidates = [
    ("logit", fit_logit),
    ("hgb", fit_hgb),
]
if HAS_LGB:
    candidates.append(("lgb_small", fit_lgb_small))

results = []
topk_all = {}
reports = {}

for name, fit_fn in tqdm(candidates, desc="Training Stage1 models"):
    model = fit_fn(X_tr, y_tr)
    score_te = predict_score(model, X_te)

    m = evaluate_metrics(y_te, score_te)
    m["model"] = name
    results.append(m)

    topk = topk_metrics(y_te, score_te)
    topk_all[name] = topk
    topk.to_csv(Path(OUT_DIR_METRICS) / f"{name}_topk.csv", index=False)

    joblib.dump(model, Path(OUT_DIR) / f"{name}.joblib")
    np.save(Path(OUT_DIR_METRICS) / f"{name}_test_scores.npy", score_te)

    thr = np.quantile(score_te, 0.99)
    y_pred = (score_te >= thr).astype(int)

    rep_txt = classification_report(y_te, y_pred, digits=4)
    reports[name] = rep_txt

    print("\n" + "=" * 80)
    print(f"[{name}] threshold=quantile(0.99) -> top 1% as positive")
    print(rep_txt)

results_df = pd.DataFrame(results).sort_values(["prauc", "auc"], ascending=False).reset_index(drop=True)
results_df


Training Stage1 models:   0%|          | 0/3 [00:00<?, ?it/s]


[logit] threshold=quantile(0.99) -> top 1% as positive
              precision    recall  f1-score   support

           0     0.8461    0.9961    0.9150      7691
           1     0.6739    0.0426    0.0802      1455

    accuracy                         0.8444      9146
   macro avg     0.7600    0.5194    0.4976      9146
weighted avg     0.8187    0.8444    0.7822      9146


[hgb] threshold=quantile(0.99) -> top 1% as positive
              precision    recall  f1-score   support

           0     0.9219    0.9555    0.9384      7691
           1     0.7087    0.5718    0.6329      1455

    accuracy                         0.8945      9146
   macro avg     0.8153    0.7637    0.7857      9146
weighted avg     0.8879    0.8945    0.8898      9146

[LightGBM] [Info] Number of positive: 7688, number of negative: 5304837
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.085054 seconds.
You can set `force_row_wise=true` to remove the overhead.
An

Unnamed: 0,auc,prauc,base_rate,model
0,0.891103,0.716675,0.159086,lgb_small
1,0.691538,0.510391,0.159086,hgb
2,0.842729,0.447422,0.159086,logit


In [26]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import precision_recall_curve

def fit_hgb_with_params(X_tr, y_tr, params):
    clf = HistGradientBoostingClassifier(
        loss="log_loss",
        random_state=42,
        **params,
    )
    clf.fit(X_tr, y_tr)
    return clf

def predict_score_hgb(model, X):
    return model.predict_proba(X)[:, 1]

def best_precision_under_min_recall(y_true, score, min_recall=0.60):
    y_true = np.asarray(y_true).astype(int)
    score = np.asarray(score).astype(float)

    prec, rec, thr = precision_recall_curve(y_true, score)
    thr = np.r_[thr, 1.0]

    ok = rec >= min_recall
    if not np.any(ok):
        return None

    idx = np.argmax(np.where(ok, prec, -1.0))
    return {
        "threshold": float(thr[idx]),
        "precision": float(prec[idx]),
        "recall": float(rec[idx]),
    }

def sample_hgb_params(rng):
    max_depth = rng.choice([3, 5, 7, None])
    max_depth = None if max_depth is None else int(max_depth)

    return {
        "learning_rate": float(rng.choice([0.02, 0.05, 0.1])),
        "max_depth": max_depth,
        "max_leaf_nodes": int(rng.choice([15, 31, 63, 127])),
        "min_samples_leaf": int(rng.choice([20, 50, 100, 200])),
        "l2_regularization": float(rng.choice([0.0, 0.1, 1.0, 5.0, 10.0])),
        "max_bins": int(rng.choice([128, 255])),
    }


def tune_hgb_recall_driven(X_tr, y_tr, X_te, y_te, n_trials=30, min_recall=0.60, seed=42):
    rng = np.random.default_rng(seed)

    rows = []
    best = None

    for t in tqdm(range(n_trials), desc="HGB tuning"):
        params = sample_hgb_params(rng)
        model = fit_hgb_with_params(X_tr, y_tr, params)
        score_te = predict_score_hgb(model, X_te)

        best_row = best_precision_under_min_recall(y_te, score_te, min_recall=min_recall)

        row = {
            "trial": t,
            "ok": best_row is not None,
            "precision": np.nan if best_row is None else best_row["precision"],
            "recall": np.nan if best_row is None else best_row["recall"],
            "threshold": np.nan if best_row is None else best_row["threshold"],
            **params,
        }
        rows.append(row)

        if best_row is not None:
            if (best is None) or (best_row["precision"] > best["precision"]):
                best = {
                    "precision": best_row["precision"],
                    "recall": best_row["recall"],
                    "threshold": best_row["threshold"],
                    "params": params,
                    "model": model,
                }

    trials_df = pd.DataFrame(rows).sort_values(["ok", "precision"], ascending=[False, False]).reset_index(drop=True)
    return best, trials_df

best_hgb, hgb_trials = tune_hgb_recall_driven(
    X_tr, y_tr, X_te, y_te,
    n_trials=40,
    min_recall=0.60,
)

hgb_trials.head(10)


HGB tuning:   0%|          | 0/40 [00:00<?, ?it/s]

Unnamed: 0,trial,ok,precision,recall,threshold,learning_rate,max_depth,max_leaf_nodes,min_samples_leaf,l2_regularization,max_bins
0,17,True,0.849661,0.602062,0.827425,0.02,,127,200,5.0,255
1,29,True,0.848544,0.600687,0.886483,0.05,7.0,63,20,10.0,255
2,25,True,0.84593,0.6,0.865247,0.02,,127,50,1.0,255
3,26,True,0.841851,0.6,0.899208,0.05,3.0,63,200,10.0,255
4,9,True,0.836207,0.6,0.815635,0.02,5.0,63,20,5.0,255
5,37,True,0.835407,0.6,0.749201,0.02,7.0,63,50,10.0,255
6,30,True,0.834608,0.6,0.854034,0.02,,31,100,1.0,128
7,12,True,0.834601,0.603436,0.889215,0.02,5.0,63,50,0.1,128
8,28,True,0.831181,0.619244,0.833018,0.02,5.0,15,20,5.0,255
9,5,True,0.829224,0.624055,0.735924,0.02,5.0,15,100,10.0,128


In [27]:
from sklearn.metrics import classification_report

thr = best_hgb["threshold"]
score_te = predict_score_hgb(best_hgb["model"], X_te)
y_pred = (score_te >= thr).astype(int)

print("best precision under recall constraint")
print("precision:", best_hgb["precision"], "recall:", best_hgb["recall"], "thr:", thr)
print(classification_report(y_te, y_pred, digits=4))


best precision under recall constraint
precision: 0.8496605237633366 recall: 0.6020618556701031 thr: 0.8274246241907796
              precision    recall  f1-score   support

           0     0.9287    0.9798    0.9536      7691
           1     0.8497    0.6021    0.7047      1455

    accuracy                         0.9197      9146
   macro avg     0.8892    0.7910    0.8292      9146
weighted avg     0.9161    0.9197    0.9140      9146



---

# Stage1 / Stage2 모델 성능 정리

## 1. Stage1 모델 (LightGBM)

**Threshold 선택 기준**

* Recall ≥ 0.50 조건 하에서 Precision 최대화

**선택된 Threshold**

* threshold = 0.990359
* top_pct = 0.002193

### Classification Report

| Class | Precision | Recall | F1-score | Support |
| ----- | --------- | ------ | -------- | ------- |
| 0     | 0.9987    | 0.9991 | 0.9989   | 930,314 |
| 1     | 0.6070    | 0.5074 | 0.5527   | 2,448   |

**요약**

* Precision (Fraud) = **0.6070**
* Recall (Fraud) = **0.5074**
* Accuracy = 0.9978

Stage1은 전체 거래 중 극히 일부(top 0.2%)만 후보로 추출하는 1차 필터 역할을 수행한다.
Recall을 50% 이상 확보하면서 Precision 60% 수준을 유지하는 구조이다.

---

## 2. Stage2 모델 (HGB, Recall-Driven Tuning)

**튜닝 기준**

* Recall ≥ 0.60 조건 하에서 Precision 최대화

**선택 결과**

* threshold = 0.8274246241907796
* Precision = 0.8497
* Recall = 0.6021

### Classification Report

| Class | Precision | Recall | F1-score | Support |
| ----- | --------- | ------ | -------- | ------- |
| 0     | 0.9287    | 0.9798 | 0.9536   | 7,691   |
| 1     | 0.8497    | 0.6021 | 0.7047   | 1,455   |

**요약**

* Precision (Fraud) = **0.8497**
* Recall (Fraud) = **0.6021**
* Accuracy = 0.9197

Stage2는 Stage1 통과 후보에 대해 정밀 판별을 수행하며,
높은 Precision(85%)을 유지하면서 Recall 60% 이상을 확보하였다.

---

# 3. 2-Stage 구조 전체 해석

Stage1 Recall × Stage2 Recall:

[
0.5074 \times 0.6021 \approx 0.305
]

즉, 전체 사기 거래 중 약 **30% 수준을 최종적으로 탐지**하는 구조이다.

다만 Stage2에서 Precision이 크게 상승하므로
최종 Alert 품질은 상당히 높게 유지된다.

---

# 4. 구조적 의미

| 단계     | 목적       | 특징           |
| ------ | -------- | ------------ |
| Stage1 | 고속 1차 필터 | Recall 중심    |
| Stage2 | 정밀 판별    | Precision 중심 |

현재 구조는:

* Stage1에서 후보 압축
* Stage2에서 고정밀 판별

이라는 설계 목적에 부합하는 성능을 보인다.

---