In [None]:
# ===== [1] Install (최초 1회) =====
!pip -q install pyarrow lightgbm xgboost

# ===== [2] Imports =====
import os, zipfile, shutil
import numpy as np
import pandas as pd

from sklearn.model_selection import TimeSeriesSplit
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    average_precision_score, roc_auc_score,
    precision_recall_curve, confusion_matrix
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier

import lightgbm as lgb
import xgboost as xgb

In [None]:
# ===== [3] Paths: only the uploaded zip file name needs to be adjusted here =====
ZIP_ONLINE = "online_data-20260209T062301Z-1-001.zip"
EXTRACT_DIR = "online_data_unzipped"

# Raise explicitly instead of using `assert`: assertions are stripped when
# Python runs with -O, which would let a missing archive surface later as a
# less helpful zipfile error.
if not os.path.exists(ZIP_ONLINE):
    raise FileNotFoundError(f"❌ 업로드된 zip이 없음: {ZIP_ONLINE}")

# ===== [4] Remove any previous extraction so a re-run starts from a clean folder =====
if os.path.exists(EXTRACT_DIR):
    shutil.rmtree(EXTRACT_DIR)
os.makedirs(EXTRACT_DIR, exist_ok=True)

with zipfile.ZipFile(ZIP_ONLINE, "r") as z:
    z.extractall(EXTRACT_DIR)

# The archive is expected to contain a top-level "online_data" folder; the
# error message lists what was actually extracted to help diagnose a mismatch.
base = os.path.join(EXTRACT_DIR, "online_data")
if not os.path.exists(base):
    raise FileNotFoundError(
        f"❌ zip 내부에 online_data 폴더가 없음. 실제 구조 확인 필요: {os.listdir(EXTRACT_DIR)}"
    )

# ===== [5] Key point: train/test/check may each be a plain file OR a folder, so handle both =====
def resolve_split_path(split_name: str, base_dir=None):
    """Return the path of the data file backing the split *split_name*.

    Args:
        split_name: Name of the split entry to look up (e.g. "train").
        base_dir: Directory that contains the split entries. When omitted,
            the module-level ``base`` directory is used, so existing
            one-argument calls behave as before.

    Returns:
        * the entry itself when ``<base_dir>/<split_name>`` is a regular file
          (this covers extension-less files);
        * the largest file found anywhere below it when the entry is a
          directory;
        * ``None`` when the entry does not exist or the directory holds no
          files.
    """
    root_dir = base if base_dir is None else base_dir
    p = os.path.join(root_dir, split_name)

    if os.path.isfile(p):
        return p

    if os.path.isdir(p):
        # Collect every file below the folder regardless of extension.
        candidates = [
            os.path.join(root, fname)
            for root, _, fnames in os.walk(p)
            for fname in fnames
        ]
        if not candidates:
            return None
        # Heuristic: the largest file is taken to be the data file.
        return max(candidates, key=os.path.getsize)

    return None

# Resolve every split in one pass; "check" is optional and may come back as None.
train_path, test_path, check_path = (
    resolve_split_path(split) for split in ("train", "test", "check")
)

print("Picked paths:")
for tag, picked in ((" train:", train_path), (" test :", test_path), (" check:", check_path)):
    print(tag, picked)

# train and test are mandatory; stop here if either could not be located.
assert train_path and test_path, "❌ train/test를 찾지 못함. zip 구조가 바뀐 상태"

Picked paths:
 train: online_data_unzipped/online_data/train
 test : online_data_unzipped/online_data/test
 check: online_data_unzipped/online_data/check


In [None]:
# ===== [6] Load parquet (files are read even though they carry no extension) =====
df_train, df_test = (pd.read_parquet(path) for path in (train_path, test_path))

# The check split is optional: keep None when no path was resolved for it.
df_check = None
if check_path:
    df_check = pd.read_parquet(check_path)

check_shape = df_check.shape if df_check is not None else None
print("Shapes:", df_train.shape, df_test.shape, check_shape)

# ===== [7] Auto-detect the label column =====
def find_label_col(df):
    """Return the name of the binary target column of *df*.

    Well-known names ("fraud", "is_fraud", "label", "target", "y") are tried
    first, in that order. If none is present, the function falls back to
    scanning for columns whose non-null values are drawn only from {0, 1}
    (booleans included) and returns the column only when exactly one such
    candidate exists.

    Columns with no non-null value at all are not candidates: an empty value
    set is trivially a subset of {0, 1}, so counting them would turn an
    otherwise unambiguous match into an error. Columns holding unhashable
    cells (lists, arrays) are skipped rather than aborting the scan.

    Raises:
        ValueError: when the fallback finds zero or several candidates.
    """
    for name in ("fraud", "is_fraud", "label", "target", "y"):
        if name in df.columns:
            return name

    # Last-resort fallback: look for columns made up solely of 0/1 values.
    bin_cols = []
    for col in df.columns:
        try:
            distinct = df[col].dropna().unique()
            if not 0 < len(distinct) <= 2:
                continue
            values = set(distinct.tolist())
        except TypeError:
            # Unhashable cell values cannot form a 0/1 label column.
            continue
        # True/False compare (and hash) equal to 1/0, so {0, 1} covers bools too.
        if values <= {0, 1}:
            bin_cols.append(col)

    if len(bin_cols) == 1:
        return bin_cols[0]
    raise ValueError(f"❌ 라벨 컬럼을 확정 못함. 후보: {bin_cols[:20]} ... / columns 확인 필요")

# Resolve the target column once, from the training frame only; the cells
# below refer to it through LABEL.
LABEL = find_label_col(df_train)
print("LABEL =", LABEL)

# ===== [8] Auto-detect the time column (fall back to tx_year-style derived parts) =====
def find_time_col(df):
    """Return the time column name of *df*, a sentinel, or None.

    The first of "transaction_time", "trans_time", "datetime", "date",
    "tx_time" present in the frame wins. Otherwise "__ymd__" is returned when
    tx_year/tx_month/tx_day all exist, "__ym__" when only tx_year/tx_month
    exist, and None when no usable time information is found.
    """
    present = df.columns
    named = next(
        (
            candidate
            for candidate in ("transaction_time", "trans_time", "datetime", "date", "tx_time")
            if candidate in present
        ),
        None,
    )
    if named is not None:
        return named

    # Without a real timestamp, year/month(/day) parts can serve as a sort key.
    if "tx_year" not in present or "tx_month" not in present:
        return None
    return "__ymd__" if "tx_day" in present else "__ym__"

# Detected from the training frame only: a real column name, one of the
# "__ymd__" / "__ym__" sentinels, or None when no time information exists.
TIME_COL = find_time_col(df_train)
print("TIME_COL =", TIME_COL)

# Sentinel distinguishing "argument omitted" from an explicit time_col=None,
# because None is itself a meaningful value (no time information available).
_TIME_COL_FROM_GLOBAL = object()


def add_time_key(df, time_col=_TIME_COL_FROM_GLOBAL):
    """Return a copy of *df* with an integer ``__time_key__`` sort column.

    Args:
        df: Input frame; it is not modified.
        time_col: How to derive the key. Defaults to the module-level
            ``TIME_COL`` so existing one-argument calls behave as before.
            * ``"__ymd__"``: tx_year*10000 + tx_month*100 + tx_day
            * ``"__ym__"``: tx_year*100 + tx_month
            * ``None``: the current row position (a weak stand-in when no
              time information exists)
            * any other value: name of a column parsed with
              ``pd.to_datetime(..., errors="coerce")`` and stored as its
              int64 representation.

    Returns:
        The copied frame with ``__time_key__`` added.
    """
    if time_col is _TIME_COL_FROM_GLOBAL:
        time_col = TIME_COL

    df = df.copy()
    if time_col == "__ymd__":
        df["__time_key__"] = (df["tx_year"].astype(int) * 10000
                              + df["tx_month"].astype(int) * 100
                              + df["tx_day"].astype(int))
    elif time_col == "__ym__":
        df["__time_key__"] = (df["tx_year"].astype(int) * 100
                              + df["tx_month"].astype(int))
    elif time_col is None:
        # No time information: row order is the only available proxy.
        df["__time_key__"] = np.arange(len(df))
    else:
        # Series.view() is deprecated in recent pandas and emits a FutureWarning
        # on every call; astype("int64") yields the same integer timestamps.
        # Unparseable values become NaT; as int64 these are expected to map to
        # the minimum int64 and therefore sort first.
        df["__time_key__"] = pd.to_datetime(df[time_col], errors="coerce").astype("int64")
    return df

def _keyed_and_sorted(frame):
    """Attach ``__time_key__`` to *frame* and return it ordered by that key."""
    return add_time_key(frame).sort_values("__time_key__").reset_index(drop=True)


# Chronological order matters: the validation slice taken later must be the
# "future" part of the training data.
df_train = _keyed_and_sorted(df_train)
df_test = _keyed_and_sorted(df_test)
if df_check is not None:
    df_check = _keyed_and_sorted(df_check)

Shapes: (609655, 60) (114209, 60) (166904, 60)
LABEL = fraud
TIME_COL = date


  df["__time_key__"] = pd.to_datetime(df[TIME_COL], errors="coerce").view("int64")
  df["__time_key__"] = pd.to_datetime(df[TIME_COL], errors="coerce").view("int64")
  df["__time_key__"] = pd.to_datetime(df[TIME_COL], errors="coerce").view("int64")


In [None]:
# ===== [9] Split train into (fit / valid) along the time axis =====
# Rule: valid is the trailing ("future") 20% of the time-sorted training frame.
valid_frac = 0.2
cut = int(len(df_train) * (1 - valid_frac))

df_fit, df_valid = (part.copy() for part in (df_train.iloc[:cut], df_train.iloc[cut:]))

print("fit/valid sizes:", df_fit.shape, df_valid.shape)
pos_rates = [frame[LABEL].mean() for frame in (df_fit, df_valid, df_test)]
print("pos rate fit/valid/test:", *pos_rates)

fit/valid sizes: (487724, 61) (121931, 61)
pos rate fit/valid/test: 0.009706309306082949 0.015287334640083326 0.018352318994124806


In [None]:
# ===== [10] Separate features from the target =====
DROP_ALWAYS = {LABEL, "__time_key__"}   # the label and the sort key are never features


def _feature_frame(frame):
    """Return *frame* without whichever DROP_ALWAYS columns it contains."""
    return frame.drop(columns=[c for c in DROP_ALWAYS if c in frame.columns])


X_fit, y_fit = _feature_frame(df_fit), df_fit[LABEL].astype(int)
X_valid, y_valid = _feature_frame(df_valid), df_valid[LABEL].astype(int)
X_test, y_test = _feature_frame(df_test), df_test[LABEL].astype(int)

# The check split is optional and may come without a label column.
X_check, y_check = None, None
if df_check is not None:
    X_check = _feature_frame(df_check)
    if LABEL in df_check.columns:
        y_check = df_check[LABEL].astype(int)

# ===== [11] Split columns by type =====
# - numeric: scaled, which keeps linear models such as logistic regression stable
# - categorical: one-hot encoded
# - datetime: never fed to the models directly (clashes with imputers/models);
#   dropped by the caller or replaced with derived features
def _is_categorical_like(dtype) -> bool:
    """Tell whether *dtype* holds label-like data (object, string or category)."""
    return (
        pd.api.types.is_object_dtype(dtype)
        or isinstance(dtype, (pd.CategoricalDtype, pd.StringDtype))
    )


def split_columns(X: pd.DataFrame):
    """Partition the columns of *X* into numeric, categorical and datetime lists.

    Detection relies on pandas dtype helpers rather than ``np.issubdtype``,
    which raises ``TypeError`` on pandas extension dtypes (category, tz-aware
    datetime, string). As a result tz-aware datetimes count as datetime, and
    ``category`` / ``string`` columns count as categorical instead of ending
    up in the numeric group.

    Returns:
        ``(num_cols, cat_cols, dt_cols)``, each in the column order of *X*.
        ``num_cols`` is every column that is neither datetime nor categorical.
    """
    dt_cols = [c for c in X.columns if pd.api.types.is_datetime64_any_dtype(X[c].dtype)]
    dt_set = set(dt_cols)
    cat_cols = [
        c for c in X.columns
        if c not in dt_set and _is_categorical_like(X[c].dtype)
    ]
    taken = dt_set | set(cat_cols)
    num_cols = [c for c in X.columns if c not in taken]
    return num_cols, cat_cols, dt_cols

num_cols, cat_cols, dt_cols = split_columns(X_fit)
print("n_num / n_cat / n_dt =", len(num_cols), len(cat_cols), len(dt_cols))
if dt_cols:
    print("Dropping datetime cols:", dt_cols[:20])

# Raw datetime columns are removed from every split; time signal is expected
# to come from derived parts such as tx_year / tx_month instead.
X_fit, X_valid, X_test = (
    frame.drop(columns=dt_cols, errors="ignore")
    for frame in (X_fit, X_valid, X_test)
)
if X_check is not None:
    X_check = X_check.drop(columns=dt_cols, errors="ignore")

# Recompute the column lists against the reduced feature frame.
num_cols, cat_cols = split_columns(X_fit)[:2]

# ===== [12] Preprocessing pipeline =====
# Numeric branch: median-impute so missing values cannot break the estimators,
# then standardise for the benefit of logistic/linear models.
_numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_pipe = Pipeline(steps=_numeric_steps)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen at fit time are ignored at transform time.
_categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
]
categorical_pipe = Pipeline(steps=_categorical_steps)

# Columns not listed in either branch are dropped.
_branches = [
    ("num", numeric_pipe, num_cols),
    ("cat", categorical_pipe, cat_cols),
]
preprocess = ColumnTransformer(transformers=_branches, remainder="drop")

n_num / n_cat / n_dt = 58 0 1
Dropping datetime cols: ['date']


In [None]:
# ===== [13] Threshold policy: on valid, pick the threshold with the highest precision subject to Recall >= TARGET_RECALL =====
TARGET_RECALL = 0.70


def choose_threshold_under_recall(y_true, scores, target_recall=0.70):
    """Pick a decision threshold from the precision-recall curve.

    Among the thresholds whose recall is at least *target_recall*, the one
    with the highest precision is chosen (first such index on ties). If no
    threshold reaches the target, the threshold with the highest recall is
    used instead.

    Returns:
        ``(threshold, info)`` where ``info`` has the keys ``"note"``
        ("picked_best_precision_under_recall" or "fallback_max_recall"),
        ``"precision"`` and ``"recall"`` measured at the chosen threshold.
    """
    prec_full, rec_full, thr = precision_recall_curve(y_true, scores)
    # The precision/recall arrays carry one more entry than the thresholds;
    # trim them so index i lines up with thr[i].
    n_thr = len(thr)
    precision, recall = prec_full[:n_thr], rec_full[:n_thr]

    eligible = np.flatnonzero(recall >= target_recall)
    if eligible.size:
        k = eligible[np.argmax(precision[eligible])]
        note = "picked_best_precision_under_recall"
    else:
        # Target recall is unreachable: fall back to the max-recall point.
        k = int(np.argmax(recall))
        note = "fallback_max_recall"

    return thr[k], {"note": note, "precision": float(precision[k]), "recall": float(recall[k])}

def confusion_ops(y_true, y_hat):
    """Summarise binary predictions as operating metrics.

    Args:
        y_true: Ground-truth labels (0/1).
        y_hat: Predicted labels (0/1).

    Returns:
        A dict with ``precision``, ``recall``, ``alert_rate`` (share of rows
        flagged positive) as Python floats and ``tp``/``fp``/``fn``/``tn`` as
        Python ints. Ratios whose denominator is zero are reported as 0.0.
        Builtin types are returned throughout so that printed and serialised
        output is free of ``np.float64(...)`` / ``np.int64(...)`` wrappers and
        the zero-denominator branch has the same type as the normal one.
    """
    # labels=[0, 1] pins the matrix to 2x2 even if a class is absent.
    cm = confusion_matrix(y_true, y_hat, labels=[0, 1])
    tn, fp, fn, tp = (int(v) for v in cm.ravel())

    flagged = tp + fp
    actual_pos = tp + fn
    n_rows = len(y_true)

    return {
        "precision": tp / flagged if flagged else 0.0,
        "recall": tp / actual_pos if actual_pos else 0.0,
        "alert_rate": flagged / n_rows if n_rows else 0.0,
        "tp": tp, "fp": fp, "fn": fn, "tn": tn,
    }

# ===== [14] Model set (Stage 1: estimators that can emit a probability-like score) =====
# Fixed seed so the learners that sample rows/columns give repeatable numbers.
RANDOM_STATE = 42

models = {
    "logit_l2_balanced": LogisticRegression(max_iter=2000, class_weight="balanced", n_jobs=None),
    "hgb": HistGradientBoostingClassifier(
        max_depth=6, learning_rate=0.08, max_iter=300,
        random_state=RANDOM_STATE,
    ),
    "lgbm": lgb.LGBMClassifier(
        n_estimators=600, learning_rate=0.05, num_leaves=63,
        # LightGBM applies row subsampling only when subsample_freq > 0;
        # with the default of 0 the subsample=0.8 setting is silently ignored.
        subsample=0.8, subsample_freq=1, colsample_bytree=0.8,
        class_weight="balanced",
        random_state=RANDOM_STATE,
    ),
    "xgb": xgb.XGBClassifier(
        n_estimators=600, learning_rate=0.05, max_depth=6,
        subsample=0.8, colsample_bytree=0.8,
        eval_metric="logloss",
        random_state=RANDOM_STATE,
        # NOTE: scale_pos_weight is NOT set here, so XGBoost trains without
        # class re-weighting. To reflect the imbalance, pass e.g.
        # n_negative / n_positive of the fit split, and reduce it if it
        # proves excessive.
    ),
}

def get_score(estimator, X):
    """Return one score per row of *X* from *estimator*.

    Uses the positive-class column of ``predict_proba`` when the estimator
    exposes that method; otherwise returns ``decision_function(X)`` as is.
    """
    try:
        predict_proba = estimator.predict_proba
    except AttributeError:
        # No probability output available: use the raw decision scores.
        return estimator.decision_function(X)
    return predict_proba(X)[:, 1]

# Fit every candidate, choose its threshold on valid, then report test/check
# behaviour at that fixed threshold. One summary row is collected per model.
rows = []

for name, clf in models.items():
    print(f"\n=== {name} ===")

    pipe = Pipeline(steps=[("prep", preprocess), ("model", clf)])
    pipe.fit(X_fit, y_fit)

    # The threshold is chosen from valid scores only, so test stays untouched.
    s_valid = get_score(pipe, X_valid)
    thr, thr_info = choose_threshold_under_recall(y_valid, s_valid, TARGET_RECALL)

    # Validation quality (ranking metrics, independent of the threshold).
    valid_pr_auc = average_precision_score(y_valid, s_valid)
    valid_roc_auc = roc_auc_score(y_valid, s_valid)

    # Test is evaluated with the threshold frozen above.
    s_test = get_score(pipe, X_test)
    test_pr_auc = average_precision_score(y_test, s_test)
    test_roc_auc = roc_auc_score(y_test, s_test)
    yhat_test = (s_test >= thr).astype(int)
    test_ops = confusion_ops(y_test, yhat_test)

    # Check simulates live operation: labels may be missing, so the alert rate
    # is the main figure; the positive count is reported when labels exist.
    check_alert_rate = None
    check_pos_cnt = None
    if X_check is not None:
        s_check = get_score(pipe, X_check)
        yhat_check = (s_check >= thr).astype(int)
        check_alert_rate = float(yhat_check.mean())
        if y_check is not None:
            check_pos_cnt = int(y_check.sum())

    print(f"VALID PR-AUC={valid_pr_auc:.4f} ROC-AUC={valid_roc_auc:.4f} | thr={thr:.6f} ({thr_info['note']}) "
          f"| valid_prec={thr_info['precision']:.4f} valid_rec={thr_info['recall']:.4f}")
    print(f"TEST  PR-AUC={test_pr_auc:.4f} ROC-AUC={test_roc_auc:.4f} | test_ops={test_ops}")
    if X_check is not None:
        print(f"CHECK alert_rate={check_alert_rate:.6f} | check_pos_cnt={check_pos_cnt}")

    rows.append({
        "model": name,
        "valid_pr_auc": valid_pr_auc,
        "valid_roc_auc": valid_roc_auc,
        "thr": thr,
        "test_pr_auc": test_pr_auc,
        "test_roc_auc": test_roc_auc,
        "test_precision@thr": test_ops["precision"],
        "test_recall@thr": test_ops["recall"],
        "test_alert_rate@thr": test_ops["alert_rate"],
        "test_fp": test_ops["fp"],
        "test_fn": test_ops["fn"],
        "check_alert_rate@thr": check_alert_rate,
        "check_pos_cnt": check_pos_cnt,
    })

# Rank by validation metrics: ordering (and hence picking) models by their
# test scores would leak the test set into model selection, which the
# threshold policy above deliberately avoids. Test columns stay in the table
# as the unbiased report for whichever model is chosen.
result = (
    pd.DataFrame(rows)
    .sort_values(["valid_pr_auc", "valid_roc_auc"], ascending=False)
    .reset_index(drop=True)
)
result


=== logit_l2_balanced ===
VALID PR-AUC=0.2457 ROC-AUC=0.9108 | thr=0.507934 (picked_best_precision_under_recall) | valid_prec=0.1102 valid_rec=0.7028
TEST  PR-AUC=0.2382 ROC-AUC=0.8811 | test_ops={'precision': np.float64(0.13423136003781166), 'recall': np.float64(0.5419847328244275), 'alert_rate': np.float64(0.07410099028973198), 'tp': np.int64(1136), 'fp': np.int64(7327), 'fn': np.int64(960), 'tn': np.int64(104786)}
CHECK alert_rate=0.042953 | check_pos_cnt=0

=== hgb ===
VALID PR-AUC=0.8636 ROC-AUC=0.9837 | thr=0.222162 (picked_best_precision_under_recall) | valid_prec=0.9438 valid_rec=0.7028
TEST  PR-AUC=0.8525 ROC-AUC=0.9801 | test_ops={'precision': np.float64(0.9362126245847177), 'recall': np.float64(0.6722328244274809), 'alert_rate': np.float64(0.013177595460953165), 'tp': np.int64(1409), 'fp': np.int64(96), 'fn': np.int64(687), 'tn': np.int64(112017)}
CHECK alert_rate=0.001696 | check_pos_cnt=0

=== lgbm ===
[LightGBM] [Info] Number of positive: 4734, number of negative: 482990



VALID PR-AUC=0.8856 ROC-AUC=0.9865 | thr=0.500342 (picked_best_precision_under_recall) | valid_prec=0.9924 valid_rec=0.7001
TEST  PR-AUC=0.8878 ROC-AUC=0.9879 | test_ops={'precision': np.float64(0.9880868955851436), 'recall': np.float64(0.6727099236641222), 'alert_rate': np.float64(0.012494637025103101), 'tp': np.int64(1410), 'fp': np.int64(17), 'fn': np.int64(686), 'tn': np.int64(112096)}
CHECK alert_rate=0.000072 | check_pos_cnt=0

=== xgb ===
VALID PR-AUC=0.8678 ROC-AUC=0.9851 | thr=0.112758 (picked_best_precision_under_recall) | valid_prec=0.9813 valid_rec=0.7049
TEST  PR-AUC=0.8698 ROC-AUC=0.9861 | test_ops={'precision': np.float64(0.9836956521739131), 'recall': np.float64(0.6908396946564885), 'alert_rate': np.float64(0.012888651507324291), 'tp': np.int64(1448), 'fp': np.int64(24), 'fn': np.int64(648), 'tn': np.int64(112089)}
CHECK alert_rate=0.000258 | check_pos_cnt=0


Unnamed: 0,model,valid_pr_auc,valid_roc_auc,thr,test_pr_auc,test_roc_auc,test_precision@thr,test_recall@thr,test_alert_rate@thr,test_fp,test_fn,check_alert_rate@thr,check_pos_cnt
0,lgbm,0.885586,0.98649,0.500342,0.887805,0.987888,0.988087,0.67271,0.012495,17,686,7.2e-05,0
1,xgb,0.867819,0.985068,0.112758,0.86983,0.986051,0.983696,0.69084,0.012889,24,648,0.000258,0
2,hgb,0.863617,0.983674,0.222162,0.852512,0.980132,0.936213,0.672233,0.013178,96,687,0.001696,0
3,logit_l2_balanced,0.24566,0.910797,0.507934,0.238209,0.881105,0.134231,0.541985,0.074101,7327,960,0.042953,0
