In [1]:
# ====== [0] CONFIG ======
ZIP_ONLINE = "online_data-20260209T062301Z-1-001.zip"   # 네 Colab에 업로드된 파일명
EXTRACT_DIR = "online_data_unzipped"                    # 풀어놓을 폴더
TARGET_COL = "fraud"                                    # label 컬럼명(다르면 자동탐색)
TIME_COL_CANDIDATES = ["transaction_time", "date", "tx_datetime", "datetime", "timestamp"]  # 시간 후보

TARGET_RECALL = 0.70   # 운영정책: recall>=0.70에서 precision 최대
RANDOM_SEED = 42

In [2]:
import os, zipfile, shutil

assert os.path.exists(ZIP_ONLINE), f"Missing zip: {ZIP_ONLINE}"

# (중요) 이전에 깨진 extract가 남아있을 수 있으니 항상 깨끗이 지우고 다시 푼다
if os.path.exists(EXTRACT_DIR):
    shutil.rmtree(EXTRACT_DIR)
os.makedirs(EXTRACT_DIR, exist_ok=True)

# zip 열기 테스트(업로드 중 실행하면 BadZipFile 나옴)
with open(ZIP_ONLINE, "rb") as f:
    head = f.read(4)
assert head[:2] == b"PK", f"Not a zip file header. head={head}"

with zipfile.ZipFile(ZIP_ONLINE, "r") as z:
    z.testzip()  # 내부 손상 검사
    z.extractall(EXTRACT_DIR)

print("Unzip done ->", EXTRACT_DIR)

Unzip done -> online_data_unzipped


In [3]:
import glob
import pandas as pd

def find_base_dir(extract_dir: str) -> str:
    # online_data 폴더를 찾는다 (zip마다 경로가 한 겹 더 들어갈 수 있음)
    cands = []
    for root, dirs, files in os.walk(extract_dir):
        for d in dirs:
            if d.lower() == "online_data":
                cands.append(os.path.join(root, d))
    if not cands:
        raise FileNotFoundError("Could not find folder named 'online_data' after unzip.")
    # 가장 깊은 경로 우선(대개 실제 데이터 위치)
    cands = sorted(cands, key=lambda p: p.count(os.sep), reverse=True)
    return cands[0]

BASE = find_base_dir(EXTRACT_DIR)
print("BASE:", BASE)

def load_split(name: str) -> pd.DataFrame:
    """
    split이
    (A) BASE/train 처럼 '확장자 없는 parquet 파일'로 있든
    (B) BASE/train/ 아래 parquet 여러개로 있든
    둘 다 로딩
    """
    p = os.path.join(BASE, name)

    # (A) 파일로 존재 (확장자 없어도 parquet면 read_parquet 가능)
    if os.path.isfile(p):
        print(f"[{name}] as FILE:", p)
        return pd.read_parquet(p)

    # (B) 폴더로 존재
    if os.path.isdir(p):
        print(f"[{name}] as DIR :", p)
        cands = sorted(glob.glob(os.path.join(p, "**", "*.parquet"), recursive=True))
        if not cands:
            raise FileNotFoundError(f"No parquet under dir: {p}")
        dfs = [pd.read_parquet(fp) for fp in cands]
        return pd.concat(dfs, ignore_index=True)

    raise FileNotFoundError(f"Split not found as file/dir: {p}")

df_train = load_split("train")
df_test  = load_split("test")
df_check = load_split("check")

print("train:", df_train.shape)
print("test :", df_test.shape)
print("check:", df_check.shape)

BASE: online_data_unzipped/online_data
[train] as FILE: online_data_unzipped/online_data/train
[test] as FILE: online_data_unzipped/online_data/test
[check] as FILE: online_data_unzipped/online_data/check
train: (609655, 60)
test : (114209, 60)
check: (166904, 60)


In [4]:
import numpy as np

def pick_target_col(df: pd.DataFrame, target_hint: str) -> str:
    if target_hint in df.columns:
        return target_hint
    # 0/1 이진 후보 자동탐색
    cands = []
    for c in df.columns:
        s = df[c]
        if pd.api.types.is_numeric_dtype(s):
            vals = set(pd.Series(s.dropna().unique()).head(10).tolist())
            if vals.issubset({0,1}) and s.nunique(dropna=True) <= 2:
                cands.append(c)
    if len(cands) == 1:
        return cands[0]
    if len(cands) > 1:
        # 이름에 fraud/label/target 우선
        for key in ["fraud","label","target","y"]:
            for c in cands:
                if key in c.lower():
                    return c
        return cands[0]
    raise KeyError("Could not detect target column. Set TARGET_COL manually.")

def pick_time_col(df: pd.DataFrame, cands) -> str | None:
    for c in cands:
        if c in df.columns:
            return c
    # datetime dtype 컬럼 탐색
    for c in df.columns:
        if np.issubdtype(df[c].dtype, np.datetime64):
            return c
    return None

TARGET = pick_target_col(df_train, TARGET_COL)
TIME_COL = pick_time_col(df_train, TIME_COL_CANDIDATES)

print("TARGET:", TARGET)
print("TIME_COL:", TIME_COL)

# 시간컬럼 있으면 datetime으로 강제
if TIME_COL is not None:
    df_train[TIME_COL] = pd.to_datetime(df_train[TIME_COL], errors="coerce")

# valid split: train의 마지막 20%를 valid로 (시간컬럼 있으면 시간정렬 후)
def time_valid_split(df: pd.DataFrame, time_col: str | None, valid_ratio=0.2):
    if time_col is not None and time_col in df.columns and df[time_col].notna().any():
        d = df.sort_values(time_col).reset_index(drop=True)
    else:
        # 시간컬럼 없으면 shuffle은 위험하니 "원래 순서" 기준으로 뒤를 valid로
        d = df.reset_index(drop=True)

    n = len(d)
    cut = int(n * (1 - valid_ratio))
    train_part = d.iloc[:cut].copy()
    valid_part = d.iloc[cut:].copy()
    return train_part, valid_part

train_part, valid_part = time_valid_split(df_train, TIME_COL, valid_ratio=0.2)

print("train_part:", train_part.shape, "pos:", train_part[TARGET].sum())
print("valid_part:", valid_part.shape, "pos:", valid_part[TARGET].sum())

TARGET: fraud
TIME_COL: date
train_part: (487724, 60) pos: 4734
valid_part: (121931, 60) pos: 1864


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier

# 옵션 모델(설치돼 있으면 사용)
HAS_XGB = False
HAS_LGBM = False
try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except:
    pass
try:
    from lightgbm import LGBMClassifier
    HAS_LGBM = True
except:
    pass

def build_preprocess(df: pd.DataFrame, target: str, time_col: str | None):
    drop_cols = [target]
    if time_col is not None and time_col in df.columns:
        # datetime 그대로 모델에 넣지 않음 (파생변수가 이미 있다는 전제)
        drop_cols.append(time_col)

    X = df.drop(columns=[c for c in drop_cols if c in df.columns])
    # dtype 기반 자동 분리
    num_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
    cat_cols = [c for c in X.columns if c not in num_cols]

    numeric_pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=False)),  # sparse 대응
    ])
    categorical_pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore"))
    ])

    pre = ColumnTransformer(
        transformers=[
            ("num", numeric_pipe, num_cols),
            ("cat", categorical_pipe, cat_cols),
        ],
        remainder="drop"
    )
    return pre, num_cols, cat_cols

preprocess, num_cols, cat_cols = build_preprocess(train_part, TARGET, TIME_COL)
print("#num:", len(num_cols), "#cat:", len(cat_cols), "total:", len(num_cols)+len(cat_cols))

models = {
    "logit_l2_balanced": LogisticRegression(max_iter=200, class_weight="balanced", n_jobs=None),
    "hgb": HistGradientBoostingClassifier(max_depth=6, learning_rate=0.1, random_state=RANDOM_SEED),
}

if HAS_XGB:
    models["xgb"] = XGBClassifier(
        n_estimators=400, max_depth=6, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8,
        eval_metric="logloss", random_state=RANDOM_SEED, n_jobs=-1
    )
if HAS_LGBM:
    models["lgbm"] = LGBMClassifier(
        n_estimators=800, learning_rate=0.05, num_leaves=64,
        subsample=0.8, colsample_bytree=0.8,
        random_state=RANDOM_SEED, n_jobs=-1
    )

print("Models:", list(models.keys()))

#num: 58 #cat: 0 total: 58
Models: ['logit_l2_balanced', 'hgb', 'xgb', 'lgbm']


In [6]:
from sklearn.metrics import average_precision_score, roc_auc_score, precision_recall_curve, confusion_matrix

def choose_threshold_under_recall(y_true, scores, target_recall: float):
    prec, rec, thr = precision_recall_curve(y_true, scores)
    # precision_recall_curve는 thr 길이가 prec/rec보다 1 짧음
    prec2, rec2 = prec[:-1], rec[:-1]
    ok = rec2 >= target_recall
    if not np.any(ok):
        # 불가능하면 가장 recall 큰 지점으로
        idx = np.argmax(rec2)
        return float(thr[idx]), float(prec2[idx]), float(rec2[idx])
    # recall 조건 만족 중 precision 최대
    idx = np.argmax(prec2[ok])
    thr_ok = thr[ok][idx]
    return float(thr_ok), float(prec2[ok][idx]), float(rec2[ok][idx])

def ops_at_threshold(y_true, scores, thr):
    y_hat = (scores >= thr).astype(int)
    cm = confusion_matrix(y_true, y_hat, labels=[0,1])
    tn, fp, fn, tp = cm.ravel()
    precision = tp / (tp + fp) if (tp+fp)>0 else 0.0
    recall = tp / (tp + fn) if (tp+fn)>0 else 0.0
    alert_rate = (tp + fp) / len(y_true)
    return dict(precision=precision, recall=recall, alert_rate=alert_rate, tp=tp, fp=fp, fn=fn, tn=tn)

def fit_score(model, X_train, y_train, X_eval):
    model.fit(X_train, y_train)
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X_eval)[:,1]
    return model.decision_function(X_eval)

# 데이터 분리
y_train = train_part[TARGET].astype(int).values
y_valid = valid_part[TARGET].astype(int).values
y_test  = df_test[TARGET].astype(int).values
y_check = df_check[TARGET].astype(int).values

X_train_df = train_part.drop(columns=[TARGET])
X_valid_df = valid_part.drop(columns=[TARGET])
X_test_df  = df_test.drop(columns=[TARGET])
X_check_df = df_check.drop(columns=[TARGET])

# 시간컬럼이 있으면 drop (전처리에서 이미 drop했어도 안전하게)
for X in [X_train_df, X_valid_df, X_test_df, X_check_df]:
    if TIME_COL is not None and TIME_COL in X.columns:
        X.drop(columns=[TIME_COL], inplace=True)

rows = []

for name, clf in models.items():
    pipe = Pipeline(steps=[("pre", preprocess), ("clf", clf)])
    # fit
    pipe.fit(X_train_df, y_train)

    # scores
    s_valid = pipe.predict_proba(X_valid_df)[:,1] if hasattr(pipe, "predict_proba") else pipe.decision_function(X_valid_df)
    s_test  = pipe.predict_proba(X_test_df)[:,1]  if hasattr(pipe, "predict_proba") else pipe.decision_function(X_test_df)
    s_check = pipe.predict_proba(X_check_df)[:,1] if hasattr(pipe, "predict_proba") else pipe.decision_function(X_check_df)

    # aucs
    valid_pr = average_precision_score(y_valid, s_valid)
    valid_roc = roc_auc_score(y_valid, s_valid)
    test_pr = average_precision_score(y_test, s_test)
    test_roc = roc_auc_score(y_test, s_test)

    # threshold는 valid에서 고정
    thr, vprec, vrec = choose_threshold_under_recall(y_valid, s_valid, TARGET_RECALL)

    test_ops = ops_at_threshold(y_test, s_test, thr)
    check_ops = ops_at_threshold(y_check, s_check, thr)

    rows.append({
        "model": name,
        "valid_pr_auc": valid_pr,
        "valid_roc_auc": valid_roc,
        "thr": thr,
        "test_pr_auc": test_pr,
        "test_roc_auc": test_roc,
        "test_precision@thr": test_ops["precision"],
        "test_recall@thr": test_ops["recall"],
        "test_alert_rate@thr": test_ops["alert_rate"],
        "test_fp": test_ops["fp"],
        "test_fn": test_ops["fn"],
        "check_alert_rate@thr": check_ops["alert_rate"],
        "check_pos_cnt": int(y_check.sum()),
    })

result = pd.DataFrame(rows).sort_values("test_pr_auc", ascending=False)
result

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 4734, number of negative: 482990
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.247181 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4546
[LightGBM] [Info] Number of data points in the train set: 487724, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009706 -> initscore=-4.625225
[LightGBM] [Info] Start training from score -4.625225




Unnamed: 0,model,valid_pr_auc,valid_roc_auc,thr,test_pr_auc,test_roc_auc,test_precision@thr,test_recall@thr,test_alert_rate@thr,test_fp,test_fn,check_alert_rate@thr,check_pos_cnt
3,lgbm,0.904295,0.986584,0.124173,0.896638,0.986834,0.995888,0.693225,0.012775,6,643,4.2e-05,0
2,xgb,0.870575,0.984836,0.156822,0.867098,0.984813,0.985417,0.677004,0.012608,21,677,0.000174,0
1,hgb,0.84357,0.981823,0.171177,0.838697,0.975181,0.973538,0.666985,0.012573,38,698,0.000473,0
0,logit_l2_balanced,0.24422,0.909248,0.66553,0.238519,0.879739,0.119023,0.614027,0.094677,9526,809,0.079489,0


In [7]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    average_precision_score, roc_auc_score,
    precision_recall_curve, confusion_matrix
)

def _pick_first(globs, keys):
    for k in keys:
        if k in globs and globs[k] is not None:
            return globs[k], k
    return None, None

def resolve_artifacts(globs=None):
    """
    v3 코드 뒤에 붙여도 돌아가도록,
    X/y split, preprocess, models를 globals()에서 자동 탐색.
    """
    globs = globs or globals()

    # 후보 키들 (너희가 이전에 썼던 이름들 포함)
    X_train, kXt = _pick_first(globs, ["X_train_df","X_train","train_X","Xtr","X_train0"])
    y_train, kyt = _pick_first(globs, ["y_train","train_y","ytr","y_train0"])

    X_valid, kXv = _pick_first(globs, ["X_valid_df","X_valid","valid_X","Xva","X_val","X_valid0"])
    y_valid, kyv = _pick_first(globs, ["y_valid","valid_y","yva","y_val","y_valid0"])

    X_test,  kXte = _pick_first(globs, ["X_test_df","X_test","test_X","Xte","X_test0"])
    y_test,  kyte = _pick_first(globs, ["y_test","test_y","yte","y_test0"])

    X_check, kXc = _pick_first(globs, ["X_check_df","X_check","check_X","Xck","X_check0"])
    y_check, kyc = _pick_first(globs, ["y_check","check_y","yck","y_check0"])

    preprocess, kpre = _pick_first(globs, ["preprocess","preprocessor","ct","column_transformer"])
    models, kmod = _pick_first(globs, ["models","MODEL_ZOO","model_dict"])

    missing = []
    for name, obj in [
        ("X_train",X_train),("y_train",y_train),
        ("X_valid",X_valid),("y_valid",y_valid),
        ("X_test",X_test),("y_test",y_test),
        ("X_check",X_check),("y_check",y_check),
        ("preprocess",preprocess),("models",models),
    ]:
        if obj is None:
            missing.append(name)

    if missing:
        raise RuntimeError(
            f"[resolve_artifacts] Missing: {missing}\n"
            f"현재 globals()에 있는 키 일부: {sorted(list(globs.keys()))[:40]} ..."
        )

    meta = {
        "X_train_key": kXt, "y_train_key": kyt,
        "X_valid_key": kXv, "y_valid_key": kyv,
        "X_test_key": kXte, "y_test_key": kyte,
        "X_check_key": kXc, "y_check_key": kyc,
        "preprocess_key": kpre, "models_key": kmod
    }

    return (X_train, y_train, X_valid, y_valid, X_test, y_test, X_check, y_check, preprocess, models, meta)

def choose_threshold_under_recall(y_true, scores, target_recall: float):
    prec, rec, thr = precision_recall_curve(y_true, scores)
    # thr는 len-1이므로 맞춰줌
    prec2, rec2 = prec[:-1], rec[:-1]
    ok = rec2 >= target_recall
    if not np.any(ok):
        # 목표 recall을 달성 못하면, 달성 가능한 최대 recall 지점 선택
        idx = int(np.argmax(rec2))
        return float(thr[idx]), float(prec2[idx]), float(rec2[idx])
    idx = int(np.argmax(prec2[ok]))
    return float(thr[ok][idx]), float(prec2[ok][idx]), float(rec2[ok][idx])

def ops_at_threshold(y_true, scores, thr):
    y_hat = (scores >= thr).astype(int)
    cm = confusion_matrix(y_true, y_hat, labels=[0,1])
    tn, fp, fn, tp = cm.ravel()
    precision = tp / (tp + fp) if (tp+fp)>0 else 0.0
    recall    = tp / (tp + fn) if (tp+fn)>0 else 0.0
    alert_rate = (tp + fp) / len(y_true) if len(y_true)>0 else 0.0
    return {
        "precision": float(precision),
        "recall": float(recall),
        "alert_rate": float(alert_rate),
        "tp": int(tp), "fp": int(fp), "fn": int(fn), "tn": int(tn)
    }

def get_scores(pipe, X):
    if hasattr(pipe, "predict_proba"):
        return pipe.predict_proba(X)[:,1]
    if hasattr(pipe, "decision_function"):
        return pipe.decision_function(X)
    # 최후: predict(0/1)밖에 없으면 사실상 sweep 의미 없음
    return pipe.predict(X).astype(float)

X_train, y_train, X_valid, y_valid, X_test, y_test, X_check, y_check, preprocess, models, META = resolve_artifacts()
print("[OK] resolved:", META)
print("shapes:",
      getattr(X_train,"shape",None), getattr(X_valid,"shape",None),
      getattr(X_test,"shape",None), getattr(X_check,"shape",None))

[OK] resolved: {'X_train_key': 'X_train_df', 'y_train_key': 'y_train', 'X_valid_key': 'X_valid_df', 'y_valid_key': 'y_valid', 'X_test_key': 'X_test_df', 'y_test_key': 'y_test', 'X_check_key': 'X_check_df', 'y_check_key': 'y_check', 'preprocess_key': 'preprocess', 'models_key': 'models'}
shapes: (487724, 58) (121931, 58) (114209, 58) (166904, 58)


# Recall sweep 코드 (valid에서 임계값 고정 → test/check 평가)

In [8]:
RECALL_GRID = [0.50, 0.60, 0.70, 0.80, 0.90]
rows = []
for mname, clf in models.items():
    pipe = Pipeline([("pre", preprocess), ("clf", clf)])
    pipe.fit(X_train, y_train)

    s_valid = get_scores(pipe, X_valid)
    s_test  = get_scores(pipe, X_test)
    s_check = get_scores(pipe, X_check)

    # policy-independent 지표(참고용)
    valid_pr_auc = float(average_precision_score(y_valid, s_valid))
    valid_roc_auc = float(roc_auc_score(y_valid, s_valid)) if len(np.unique(y_valid))>1 else np.nan
    test_pr_auc = float(average_precision_score(y_test, s_test))
    test_roc_auc = float(roc_auc_score(y_test, s_test)) if len(np.unique(y_test))>1 else np.nan

    for target_recall in RECALL_GRID:
        thr, vprec, vrec = choose_threshold_under_recall(y_valid, s_valid, target_recall)

        test_ops  = ops_at_threshold(y_test, s_test, thr)
        check_ops = ops_at_threshold(y_check, s_check, thr)

        rows.append({
            "model": mname,
            "target_recall": target_recall,
            "thr(valid)": thr,

            "valid_precision@thr": float(vprec),
            "valid_recall@thr": float(vrec),

            "test_pr_auc": test_pr_auc,
            "test_roc_auc": test_roc_auc,
            "test_precision@thr": test_ops["precision"],
            "test_recall@thr": test_ops["recall"],
            "test_alert_rate@thr": test_ops["alert_rate"],
            "test_fp": test_ops["fp"],
            "test_fn": test_ops["fn"],

            "check_alert_rate@thr": check_ops["alert_rate"],
            "check_pos_cnt": int(np.sum(y_check==1)),
            "valid_pr_auc": valid_pr_auc,
            "valid_roc_auc": valid_roc_auc,
        })

sweep_df = pd.DataFrame(rows)

# 보기 편하게: 각 target_recall에서 test_precision 높은 순 정렬
sweep_df = sweep_df.sort_values(["target_recall","test_precision@thr"], ascending=[True,False]).reset_index(drop=True)

# (옵션) 너희 기본 정책이 0.7이면, 그 필터만 뽑아 공유하기 쉬움
sweep_r70 = sweep_df[sweep_df["target_recall"]==0.70].copy().sort_values("test_precision@thr", ascending=False)

print("=== sweep summary (r=0.70) ===")
display(sweep_r70[[
    "model","thr(valid)","valid_precision@thr","valid_recall@thr",
    "test_pr_auc","test_roc_auc","test_precision@thr","test_recall@thr",
    "test_alert_rate@thr","test_fp","test_fn","check_alert_rate@thr","check_pos_cnt"
]])

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 4734, number of negative: 482990
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.540849 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4546
[LightGBM] [Info] Number of data points in the train set: 487724, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009706 -> initscore=-4.625225
[LightGBM] [Info] Start training from score -4.625225




=== sweep summary (r=0.70) ===


Unnamed: 0,model,thr(valid),valid_precision@thr,valid_recall@thr,test_pr_auc,test_roc_auc,test_precision@thr,test_recall@thr,test_alert_rate@thr,test_fp,test_fn,check_alert_rate@thr,check_pos_cnt
8,lgbm,0.124173,0.997784,0.724785,0.896638,0.986834,0.995888,0.693225,0.012775,6,643,4.2e-05,0
9,xgb,0.156822,0.98716,0.70118,0.867098,0.984813,0.985417,0.677004,0.012608,21,677,0.000174,0
10,hgb,0.171177,0.966001,0.70118,0.838697,0.975181,0.973538,0.666985,0.012573,38,698,0.000473,0
11,logit_l2_balanced,0.66553,0.10621,0.700107,0.238519,0.879739,0.119023,0.614027,0.094677,9526,809,0.079489,0


# 안정성 테스트 코드 (2종 세트)

2-A) Split 간 성능/운영지표 안정성 (valid→test→check)

In [9]:
TARGET_RECALL = 0.70

rows = []
for mname, clf in models.items():
    pipe = Pipeline([("pre", preprocess), ("clf", clf)])
    pipe.fit(X_train, y_train)

    s_valid = get_scores(pipe, X_valid)
    s_test  = get_scores(pipe, X_test)
    s_check = get_scores(pipe, X_check)

    thr, vprec, vrec = choose_threshold_under_recall(y_valid, s_valid, TARGET_RECALL)

    v_ops = ops_at_threshold(y_valid, s_valid, thr)
    t_ops = ops_at_threshold(y_test,  s_test,  thr)
    c_ops = ops_at_threshold(y_check, s_check, thr)

    rows.append({
        "model": mname,
        "thr(valid)": thr,

        "valid_precision": v_ops["precision"],
        "valid_recall": v_ops["recall"],
        "valid_alert_rate": v_ops["alert_rate"],

        "test_precision": t_ops["precision"],
        "test_recall": t_ops["recall"],
        "test_alert_rate": t_ops["alert_rate"],

        "check_precision": c_ops["precision"],
        "check_recall": c_ops["recall"],
        "check_alert_rate": c_ops["alert_rate"],

        # drift indicator(간단): alert_rate 변화량
        "delta_alert_rate(test-valid)": t_ops["alert_rate"] - v_ops["alert_rate"],
        "delta_alert_rate(check-valid)": c_ops["alert_rate"] - v_ops["alert_rate"],
    })

stability_split = pd.DataFrame(rows).sort_values("test_precision", ascending=False).reset_index(drop=True)
display(stability_split)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 4734, number of negative: 482990
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.258038 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4546
[LightGBM] [Info] Number of data points in the train set: 487724, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009706 -> initscore=-4.625225
[LightGBM] [Info] Start training from score -4.625225




Unnamed: 0,model,thr(valid),valid_precision,valid_recall,valid_alert_rate,test_precision,test_recall,test_alert_rate,check_precision,check_recall,check_alert_rate,delta_alert_rate(test-valid),delta_alert_rate(check-valid)
0,lgbm,0.124173,0.997784,0.724785,0.011105,0.995888,0.693225,0.012775,0.0,0.0,4.2e-05,0.00167,-0.011063
1,xgb,0.156822,0.98716,0.70118,0.010859,0.985417,0.677004,0.012608,0.0,0.0,0.000174,0.00175,-0.010685
2,hgb,0.171177,0.966001,0.70118,0.011096,0.973538,0.666985,0.012573,0.0,0.0,0.000473,0.001477,-0.010623
3,logit_l2_balanced,0.66553,0.10621,0.700107,0.10077,0.119023,0.614027,0.094677,0.0,0.0,0.079489,-0.006093,-0.021281


2-B) 임계값(정책) 민감도 안정성 (Recall grid에서 “랭킹이 바뀌는지”)

In [10]:
# sweep_df가 이미 있으면 그대로 사용
if "sweep_df" not in globals():
    raise RuntimeError("먼저 Recall sweep 셀을 실행해서 sweep_df를 만들어줘.")

rank_rows = []
for r in sorted(sweep_df["target_recall"].unique()):
    tmp = sweep_df[sweep_df["target_recall"]==r].copy()
    tmp = tmp.sort_values("test_precision@thr", ascending=False).reset_index(drop=True)
    tmp["rank_at_r"] = np.arange(1, len(tmp)+1)
    rank_rows.append(tmp[["model","target_recall","rank_at_r","test_precision@thr","test_alert_rate@thr"]])

rank_df = pd.concat(rank_rows, axis=0).reset_index(drop=True)

# 모델별 rank 변동폭
rank_pivot = rank_df.pivot_table(index="model", columns="target_recall", values="rank_at_r", aggfunc="min")
rank_pivot["rank_range"] = rank_pivot.max(axis=1) - rank_pivot.min(axis=1)
rank_pivot = rank_pivot.sort_values("rank_range", ascending=True)

display(rank_pivot)

target_recall,0.5,0.6,0.7,0.8,0.9,rank_range
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
logit_l2_balanced,4,4,4,4,4,0
hgb,3,2,3,3,3,1
xgb,1,1,2,2,2,1
lgbm,2,3,1,1,1,2


# 3) 📌 Feature importance (모델 타입별로 자동)

3-A) 공통: 전처리 후 feature name 뽑기(가능한 경우)

In [11]:
def try_get_feature_names(preprocess, X_example):
    """
    ColumnTransformer/OneHotEncoder 기반이면 get_feature_names_out 가능.
    안되면 None 반환.
    """
    try:
        preprocess.fit(X_example)
        if hasattr(preprocess, "get_feature_names_out"):
            names = preprocess.get_feature_names_out()
            return [str(n) for n in names]
    except Exception:
        return None
    return None

feat_names = try_get_feature_names(preprocess, X_train)
print("feature names:", "OK" if feat_names is not None else "N/A (will fallback)")

feature names: OK


3-B) 트리 모델(예: LGBM/XGB/HGB 등) 내장 중요도

In [12]:
def get_native_importance(pipe, feat_names=None):
    clf = pipe.named_steps["clf"]
    if hasattr(clf, "feature_importances_"):
        imp = clf.feature_importances_
        imp = np.array(imp, dtype=float)
        if feat_names is None:
            feat_names = [f"f{i}" for i in range(len(imp))]
        return pd.DataFrame({"feature": feat_names, "importance": imp}).sort_values("importance", ascending=False)
    return None

# 대표 모델 1~2개만 뽑아도 되고, 전부 돌려도 됨
native_imps = {}
for mname, clf in models.items():
    pipe = Pipeline([("pre", preprocess), ("clf", clf)])
    pipe.fit(X_train, y_train)

    # 전처리 결과 컬럼 수에 맞는 이름이 필요
    # feat_names가 None이면 f0,f1..로라도 보여줌
    # (feat_names가 있을 때만 사람이 읽기 쉬움)
    df_imp = get_native_importance(pipe, feat_names=feat_names)
    if df_imp is not None:
        native_imps[mname] = df_imp
        print(f"[native FI] {mname}: top10")
        display(df_imp.head(10))

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[native FI] xgb: top10


Unnamed: 0,feature,importance
42,num__mccg_Industrial_/_Manufacturing,0.246711
33,num__mccg_Transport_Travel,0.128692
36,num__mccg_Retail,0.083108
3,num__merchant_id,0.064657
32,num__mccg_Food_Daily,0.061945
39,num__mccg_Automotive_Home,0.042749
40,num__mccg_Utilities_Government,0.032289
34,num__mccg_Digital_Online,0.027248
35,num__mccg_Financial,0.0271
55,num__log_abs_amount,0.025841


[LightGBM] [Info] Number of positive: 4734, number of negative: 482990
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.448538 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4546
[LightGBM] [Info] Number of data points in the train set: 487724, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009706 -> initscore=-4.625225
[LightGBM] [Info] Start training from score -4.625225
[native FI] lgbm: top10


Unnamed: 0,feature,importance
3,num__merchant_id,5842.0
47,num__months_from_account,2937.0
23,num__months_to_expire,2905.0
1,num__card_id,2195.0
8,num__credit_score,2189.0
26,num__tx_day,2099.0
0,num__client_id,2048.0
27,num__tx_hour,1994.0
25,num__tx_month,1957.0
55,num__log_abs_amount,1879.0
