In [2]:
import os

# Move to the project root so the `OFFLINE` package and the relative
# `data/` / `artifacts/` paths used below resolve.
# Guarded on the package directory so that re-running this cell does not
# climb one directory higher on every execution.
if not os.path.isdir("OFFLINE"):
    os.chdir("..")

In [3]:
pwd

'/home/nakyung/projects/BDAIFin'

In [30]:
import importlib
import OFFLINE.models.stage1 as stage1
import OFFLINE.features.client_state as client_state
# Re-execute the project modules so that edits made on disk after the first
# import are picked up without restarting the kernel.
# NOTE(review): client_state is reloaded before stage1, presumably because
# stage1 depends on it — confirm against stage1's own imports.
importlib.reload(client_state)
importlib.reload(stage1)

<module 'OFFLINE.models.stage1' from '/home/nakyung/projects/BDAIFin/OFFLINE/models/stage1.py'>

In [18]:
import pandas as pd

from OFFLINE.features.client_state import add_client_state_feature

# Raw training transactions.
df = pd.read_parquet("data/online/train_oss.parquet")


class Cfg:
    # Minimal config object: only the client key column is set here.
    client_col = "client_id"


# Client-state features computed over the whole frame in one pass.
df_feat = add_client_state_feature(df, Cfg)
print("After client_state:", df_feat.shape)

# Columns worth eyeballing: anything prefixed "client_" plus a few named extras.
extra_cols = {
    "is_new_client",
    "amount_mean",
    "amount_diff",
    "client_hour_mode_prev",
    "client_top_mccg_prev",
}
new_cols = [
    name
    for name in df_feat.columns
    if name.startswith("client_") or name in extra_cols
]
print("New cols sample:", new_cols[:20])


After client_state: (708101, 77)
New cols sample: ['client_id', 'is_new_client', 'client_hour_mode_prev', 'client_weekend_rate_prev', 'amount_mean', 'amount_diff', 'client_mccg_Food_Daily_cnt_prev', 'client_mccg_Transport_Travel_cnt_prev', 'client_mccg_Digital_Online_cnt_prev', 'client_mccg_Financial_cnt_prev', 'client_mccg_Retail_cnt_prev', 'client_mccg_Medical_cnt_prev', 'client_mccg_Entertainment_cnt_prev', 'client_mccg_Automotive_Home_cnt_prev', 'client_mccg_Utilities_Government_cnt_prev', 'client_mccg_Professional_Services_cnt_prev', 'client_mccg_Industrial_/_Manufacturing_cnt_prev', 'client_top_mccg_prev']


In [19]:
import pandas as pd

from OFFLINE.models.stage1 import Stage1Config, train_stage1, save_stage1_artifacts

MODELS = ["logit", "hgb", "xgb", "lgbm"]
outs = {}

# Train every candidate on the same feature frame, print its validation
# report, and persist model / schema / threshold under artifacts/<model>.
# NOTE(review): df_feat was built from the full frame before train_stage1
# splits it, so the features may have seen validation rows — confirm before
# trusting these metrics.
for m in MODELS:
    cfg = Stage1Config(
        label_col="fraud",
        drop_cols=("client_id", "card_id", "merchant_id"),
        valid_ratio=0.3,
        target_recall=0.70,
        model_name=m,
    )
    out = outs[m] = train_stage1(df_feat, cfg)
    report = out["report"]

    print(f"\n=== {m.upper()} REPORT ===")
    print("n_train:", report["n_train"], "n_valid:", report["n_valid"])
    print("pos_rate_train:", report["pos_rate_train"], "pos_rate_valid:", report["pos_rate_valid"])
    print("PR-AUC(valid):", report["pr_auc_valid"])
    print("ROC-AUC(valid):", report["roc_auc_valid"])
    print("threshold:", report["threshold"])
    print("confusion_matrix(valid):", report["confusion_matrix_valid"])

    artifact_dir = f"artifacts/{m}"
    save_stage1_artifacts(
        model=out["model"],
        feature_schema=out["feature_schema"],
        threshold=out["threshold"],
        out_dir=artifact_dir,
        model_name=f"stage1_{m}.joblib",
        schema_name="feature_schema.json",
        threshold_name="threshold.json",
    )
    print("Saved to:", artifact_dir)


def _summary_row(name, rep):
    """Flatten one model's validation report into a single summary-table row."""
    picked = rep["threshold"]
    matrix = rep["confusion_matrix_valid"]
    return {
        "model": name,
        "pr_auc": rep["pr_auc_valid"],
        "roc_auc": rep["roc_auc_valid"],
        "thr": picked["threshold"],
        "thr_prec": picked["precision"],
        "thr_rec": picked["recall"],
        "fp": matrix[0][1],  # row 0, column 1 of the confusion matrix
        "fn": matrix[1][0],  # row 1, column 0 of the confusion matrix
    }


# One row per model, best PR-AUC first (ROC-AUC breaks ties).
summary = pd.DataFrame(
    [_summary_row(name, result["report"]) for name, result in outs.items()]
).sort_values(["pr_auc", "roc_auc"], ascending=False)

summary


Stage1(logit):   0%|          | 0/6 [00:00<?, ?it/s]


=== LOGIT REPORT ===
n_train: 495670 n_valid: 212431
pos_rate_train: 0.009550709141162466 pos_rate_valid: 0.018057628123955543
PR-AUC(valid): 1.0
ROC-AUC(valid): 1.0
threshold: {'threshold': 3.758255624270608e-25, 'precision': 1.0, 'recall': 1.0, 'note': 'picked_best_precision_under_recall_constraint'}
confusion_matrix(valid): [[208594, 1], [0, 3836]]
Saved to: artifacts/logit


Stage1(hgb):   0%|          | 0/6 [00:00<?, ?it/s]


=== HGB REPORT ===
n_train: 495670 n_valid: 212431
pos_rate_train: 0.009550709141162466 pos_rate_valid: 0.018057628123955543
PR-AUC(valid): 1.0
ROC-AUC(valid): 1.0
threshold: {'threshold': 0.9999982965542009, 'precision': 1.0, 'recall': 0.7155891553701773, 'note': 'picked_best_precision_under_recall_constraint'}
confusion_matrix(valid): [[208595, 0], [1088, 2748]]
Saved to: artifacts/hgb


Stage1(xgb):   0%|          | 0/6 [00:00<?, ?it/s]


=== XGB REPORT ===
n_train: 495670 n_valid: 212431
pos_rate_train: 0.009550709141162466 pos_rate_valid: 0.018057628123955543
PR-AUC(valid): 1.0
ROC-AUC(valid): 1.0
threshold: {'threshold': 0.9999889135360718, 'precision': 1.0, 'recall': 0.7033368091762252, 'note': 'picked_best_precision_under_recall_constraint'}
confusion_matrix(valid): [[208595, 0], [1114, 2722]]
Saved to: artifacts/xgb


Stage1(lgbm):   0%|          | 0/6 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 4734, number of negative: 490936
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.291082 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4952
[LightGBM] [Info] Number of data points in the train set: 495670, number of used features: 67
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000





=== LGBM REPORT ===
n_train: 495670 n_valid: 212431
pos_rate_train: 0.009550709141162466 pos_rate_valid: 0.018057628123955543
PR-AUC(valid): 1.0
ROC-AUC(valid): 1.0
threshold: {'threshold': 0.999999962566552, 'precision': 1.0, 'recall': 0.700208550573514, 'note': 'picked_best_precision_under_recall_constraint'}
confusion_matrix(valid): [[208595, 0], [1149, 2687]]
Saved to: artifacts/lgbm


Unnamed: 0,model,pr_auc,roc_auc,thr,thr_prec,thr_rec,fp,fn
0,logit,1.0,1.0,3.7582560000000003e-25,1.0,1.0,1,0
1,hgb,1.0,1.0,0.9999983,1.0,0.715589,0,1088
2,xgb,1.0,1.0,0.9999889,1.0,0.703337,0,1114
3,lgbm,1.0,1.0,1.0,1.0,0.700209,0,1149


In [20]:
df.columns

Index(['client_id', 'card_id', 'amount', 'merchant_id', 'current_age',
       'retirement_age', 'birth_year', 'birth_month', 'latitude', 'longitude',
       'per_capita_income', 'yearly_income', 'total_debt', 'credit_score',
       'num_credit_cards', 'has_chip', 'num_cards_issued', 'credit_limit',
       'year_pin_last_changed', 'is_online', 'has_error',
       'err_bad_card_number', 'err_bad_expiration', 'err_bad_cvv',
       'err_bad_pin', 'err_bad_zipcode', 'err_insufficient_balance',
       'err_technical_glitch', 'acct_open_year', 'acct_open_month',
       'expires_year', 'expires_month', 'months_to_expire', 'tx_year',
       'tx_month', 'tx_day', 'tx_hour', 'is_weekend', 'is_credit',
       'is_prepaid', 'male', 'mccg_Food_Daily', 'mccg_Transport_Travel',
       'mccg_Digital_Online', 'mccg_Financial', 'mccg_Retail', 'mccg_Medical',
       'mccg_Entertainment', 'mccg_Automotive_Home',
       'mccg_Utilities_Government', 'mccg_Professional_Services',
       'mccg_Industrial_/_Man

In [21]:
from OFFLINE.models.stage1 import Stage1Config, train_stage1

# Baseline: logistic model on the raw frame `df` (not df_feat), i.e. without
# the client-state features.
# NOTE(review): valid_ratio is 0.21 here while the other runs in this notebook
# use 0.3 — confirm this is intentional.
cfg = Stage1Config(model_name="logit", valid_ratio=0.21, target_recall=0.70)
out = train_stage1(df, cfg)

baseline_report = out["report"]
print(baseline_report["pr_auc_valid"], baseline_report["roc_auc_valid"])
print(baseline_report["confusion_matrix_valid"])


Stage1(logit):   0%|          | 0/6 [00:00<?, ?it/s]

0.2535016489555549 0.8797603081974581
[[130552, 15124], [907, 2119]]


In [23]:
cols = ["tx_year","tx_month","tx_day","tx_hour"]

# Build the per-client slice in this cell: in notebook order nothing assigns
# `tmp` before this point, so the check raised NameError on a fresh kernel.
# NOTE(review): reconstructed as "all rows of the first client in df_feat",
# matching how a later cell builds `tmp` — confirm this is the slice that was
# originally inspected.
cid = df_feat["client_id"].iloc[0]
tmp = df_feat[df_feat["client_id"] == cid].copy()

# True only when the rows already appear in (year, month, day, hour) order.
is_sorted = (tmp[cols].to_numpy() == tmp[cols].sort_values(cols).to_numpy()).all()
print(is_sorted)


False


In [25]:
import numpy as np

cols = ["tx_year","tx_month","tx_day","tx_hour"]

# Pack the four calendar parts into one sortable integer key (YYYYMMDDHH).
t = (
    tmp["tx_hour"].astype("int64")
    + 100 * tmp["tx_day"].astype("int64")
    + 10000 * tmp["tx_month"].astype("int64")
    + 1000000 * tmp["tx_year"].astype("int64")
)

# Positions i where row i+1 carries an earlier key than row i.
bad = np.flatnonzero(np.diff(t.to_numpy()) < 0)
print("n_inversions:", len(bad))

if bad.size:
    # Show the first offending pair next to the amount features.
    i = bad[0]
    pair_labels = [tmp.index[i], tmp.index[i + 1]]
    shown_cols = cols + ["amount", "amount_mean", "amount_diff"]
    display(tmp.loc[pair_labels, shown_cols])


n_inversions: 418


Unnamed: 0,tx_year,tx_month,tx_day,tx_hour,amount,amount_mean,amount_diff
8694,2010,1,29,0,27.48,27.190588,-0.289412
13238,2010,1,2,15,23.219999,9.103333,-14.116666


In [28]:
# Integer time key (YYYYMMDDHH) assembled from a column -> multiplier table.
key_weights = {"tx_year": 1000000, "tx_month": 10000, "tx_day": 100, "tx_hour": 1}
t = sum(tmp[col].astype("int64") * weight for col, weight in key_weights.items())

print("is_monotonic_increasing:", t.is_monotonic_increasing)


is_monotonic_increasing: False


In [29]:
cols = ["tx_year","tx_month","tx_day","tx_hour"]

# All rows of the first client, ordered chronologically with a fresh index.
# NOTE(review): pandas documents `kind` as applying only to single-column
# sorts, so "mergesort" may have no effect on this four-column sort; it is
# kept as written.
cid = df_feat["client_id"].iloc[0]
tmp = (
    df_feat.loc[df_feat["client_id"] == cid]
    .copy()
    .sort_values(cols, kind="mergesort")
    .reset_index(drop=True)
)

# YYYYMMDDHH key: each calendar part scaled into its own decimal slot.
multipliers = (1000000, 10000, 100, 1)
t = sum(tmp[col].astype("int64") * mult for col, mult in zip(cols, multipliers))

print("is_monotonic_increasing:", t.is_monotonic_increasing)


is_monotonic_increasing: True


In [31]:
import pandas as pd
import numpy as np

# Re-read the raw training transactions so the split-first experiment below
# starts from an untouched frame.
TRAIN_PATH = "data/online/train_oss.parquet"
df = pd.read_parquet(TRAIN_PATH)

print("Loaded:", df.shape)
print(df.dtypes.iloc[:10])

Loaded: (708101, 58)
client_id           int64
card_id             int64
amount            float32
merchant_id         int64
current_age         int64
retirement_age      int64
birth_year          int64
birth_month          int8
latitude          float64
longitude         float64
dtype: object


In [32]:
from OFFLINE.models.stage1 import Stage1Config, time_split, train_stage1, save_stage1_artifacts
from OFFLINE.features.client_state import add_client_state_feature

# Minimal config object handed to add_client_state_feature; only the client
# key column is set.
class Cfg:
    client_col = "client_id"

# Shared experiment settings, passed into every Stage1Config below.
VALID_RATIO = 0.30      # forwarded as Stage1Config.valid_ratio
TARGET_RECALL = 0.70    # forwarded as Stage1Config.target_recall
DROP_COLS = ("client_id", "card_id", "merchant_id")  # forwarded as Stage1Config.drop_cols

# Model names iterated over in the training loop.
MODELS = ["logit", "hgb", "xgb", "lgbm"]

In [33]:
# Config used only to drive time_split.
# NOTE(review): model_name looks like a placeholder here — confirm time_split
# ignores it.
split_cfg = Stage1Config(
    model_name="logit",
    target_recall=TARGET_RECALL,
    valid_ratio=VALID_RATIO,
    drop_cols=DROP_COLS,
    label_col="fraud",
)

# Split the raw frame first, before any feature engineering.
train_df, valid_df = time_split(df, split_cfg)

for label, part in (("train_df:", train_df), ("valid_df:", valid_df)):
    print(label, part.shape, "pos_rate:", part["fraud"].mean())


train_df: (495670, 58) pos_rate: 0.009550709141162466
valid_df: (212431, 58) pos_rate: 0.018057628123955543


In [34]:
# Build client-state features independently on each split (train first, then
# valid).
# NOTE(review): valid_feat is computed from valid_df alone, so any per-client
# history presumably restarts at the split boundary — confirm that is acceptable.
train_feat, valid_feat = (
    add_client_state_feature(part, Cfg) for part in (train_df, valid_df)
)

print("train_feat:", train_feat.shape)
print("valid_feat:", valid_feat.shape)

# Quick sanity check: one client's rows in train_feat should be in time order.
cid = train_feat["client_id"].iloc[0]
tmp = train_feat.loc[train_feat["client_id"] == cid].copy()

parts = tmp[["tx_year", "tx_month", "tx_day", "tx_hour"]].astype("int64")
t = (
    parts["tx_year"] * 1000000
    + parts["tx_month"] * 10000
    + parts["tx_day"] * 100
    + parts["tx_hour"]
)
print("train client monotonic:", t.is_monotonic_increasing)

train_feat: (495670, 77)
valid_feat: (212431, 77)
train client monotonic: True


In [35]:
from OFFLINE.models.stage1 import build_feature_columns, make_xy, build_model_pipeline, choose_threshold_by_recall
from sklearn.metrics import average_precision_score, roc_auc_score, confusion_matrix, classification_report

def train_stage1_presplit(train_df_feat: pd.DataFrame, valid_df_feat: pd.DataFrame, cfg: Stage1Config):
    """Fit and evaluate a stage-1 model on frames that are already split.

    Unlike ``train_stage1`` this does no splitting: the caller supplies the
    training and validation frames (with features already added).

    Parameters
    ----------
    train_df_feat : pd.DataFrame
        Training rows. The feature column list and recorded dtypes are
        derived from this frame only.
    valid_df_feat : pd.DataFrame
        Validation rows. Read using the feature columns selected from the
        training frame.
    cfg : Stage1Config
        Supplies ``label_col``, ``drop_cols``, ``model_name``,
        ``target_recall``, ``min_threshold`` and ``max_threshold``. It is
        passed to ``dataclasses.asdict``, so it must be a dataclass instance.

    Returns
    -------
    dict
        ``model`` (fitted pipeline), ``feature_cols``, ``feature_schema``,
        ``threshold`` (dict returned by ``choose_threshold_by_recall``) and
        ``report`` (sizes, positive rates, PR-AUC, ROC-AUC, threshold,
        confusion matrix and classification report on the validation frame).

    Notes
    -----
    The threshold is chosen from the validation labels and the confusion
    matrix / classification report are computed on that same validation
    frame, so those thresholded numbers are optimistic.
    """
    # Imported locally so the function does not depend on some other cell
    # having pulled `asdict` into the notebook namespace.
    from dataclasses import asdict

    # Feature schema (fit on train schema)
    feature_cols = build_feature_columns(train_df_feat, cfg)
    X_tr, y_tr = make_xy(train_df_feat, feature_cols, cfg.label_col)
    X_va, y_va = make_xy(valid_df_feat, feature_cols, cfg.label_col)

    # Build + fit
    pipe = build_model_pipeline(feature_cols, cfg, y_train=y_tr)
    pipe.fit(X_tr, y_tr)

    # Predict: column 1 of predict_proba is used as the score.
    proba_va = pipe.predict_proba(X_va)[:, 1]

    # Threshold-free ranking metrics
    pr_auc = float(average_precision_score(y_va, proba_va))
    roc_auc = float(roc_auc_score(y_va, proba_va))

    # Operating threshold, selected from the validation scores
    thr_info = choose_threshold_by_recall(
        y_true=y_va,
        y_proba=proba_va,
        target_recall=cfg.target_recall,
        min_threshold=cfg.min_threshold,
        max_threshold=cfg.max_threshold,
    )
    thr = float(thr_info["threshold"])
    yhat_va = (proba_va >= thr).astype(np.int8)

    # .tolist() / output_dict=True give plain Python containers.
    cm = confusion_matrix(y_va, yhat_va).tolist()
    cls = classification_report(y_va, yhat_va, digits=4, output_dict=True)

    feature_schema = {
        "label_col": cfg.label_col,
        "drop_cols": list(cfg.drop_cols),
        "feature_cols": feature_cols,
        "dtypes": {c: str(train_df_feat[c].dtype) for c in feature_cols},
        "model_name": cfg.model_name,
        "cfg": asdict(cfg),
    }

    report = {
        "model_name": cfg.model_name,
        "n_train": int(len(train_df_feat)),
        "n_valid": int(len(valid_df_feat)),
        "pos_rate_train": float(train_df_feat[cfg.label_col].mean()),
        "pos_rate_valid": float(valid_df_feat[cfg.label_col].mean()),
        "pr_auc_valid": pr_auc,
        "roc_auc_valid": roc_auc,
        "threshold": thr_info,
        "confusion_matrix_valid": cm,
        "classification_report_valid": cls,
    }

    return {
        "model": pipe,
        "feature_cols": feature_cols,
        "feature_schema": feature_schema,
        "threshold": thr_info,
        "report": report,
    }

print("Ready.")


Ready.


In [37]:
from dataclasses import asdict

outs = {}

# Fit each candidate on the pre-split, separately-featurized frames, print its
# validation report, and persist the artifacts under artifacts/<model>.
for m in MODELS:
    cfg = Stage1Config(
        label_col="fraud",
        drop_cols=DROP_COLS,
        valid_ratio=VALID_RATIO,        # not used here (the split is already done)
        target_recall=TARGET_RECALL,
        model_name=m,
    )

    out = outs[m] = train_stage1_presplit(train_feat, valid_feat, cfg)
    report = out["report"]

    print(f"\n=== {m.upper()} REPORT ===")
    print("n_train:", report["n_train"], "n_valid:", report["n_valid"])
    print("pos_rate_train:", report["pos_rate_train"], "pos_rate_valid:", report["pos_rate_valid"])
    print("PR-AUC(valid):", report["pr_auc_valid"])
    print("ROC-AUC(valid):", report["roc_auc_valid"])
    print("threshold:", report["threshold"])
    print("confusion_matrix(valid):", report["confusion_matrix_valid"])

    artifact_dir = f"artifacts/{m}"
    save_stage1_artifacts(
        model=out["model"],
        feature_schema=out["feature_schema"],
        threshold=out["threshold"],
        out_dir=artifact_dir,
        model_name=f"stage1_{m}.joblib",
        schema_name="feature_schema.json",
        threshold_name="threshold.json",
    )
    print("Saved to:", artifact_dir)

# Collect one flat record per model, then rank by PR-AUC (ROC-AUC breaks ties).
summary_rows = []
for name, result in outs.items():
    rep = result["report"]
    picked = rep["threshold"]
    matrix = rep["confusion_matrix_valid"]
    summary_rows.append(
        {
            "model": name,
            "pr_auc": rep["pr_auc_valid"],
            "roc_auc": rep["roc_auc_valid"],
            "thr": picked["threshold"],
            "thr_prec": picked["precision"],
            "thr_rec": picked["recall"],
            "fp": matrix[0][1],
            "fn": matrix[1][0],
        }
    )

summary = pd.DataFrame(summary_rows).sort_values(["pr_auc", "roc_auc"], ascending=False)

summary



=== LOGIT REPORT ===
n_train: 495670 n_valid: 212431
pos_rate_train: 0.009550709141162466 pos_rate_valid: 0.018057628123955543
PR-AUC(valid): 0.24059408144393668
ROC-AUC(valid): 0.888504258880252
threshold: {'threshold': 0.752992843796272, 'precision': 0.11328553353015605, 'recall': 0.700208550573514, 'note': 'picked_best_precision_under_recall_constraint'}
confusion_matrix(valid): [[187570, 21025], [1150, 2686]]
Saved to: artifacts/logit

=== HGB REPORT ===
n_train: 495670 n_valid: 212431
pos_rate_train: 0.009550709141162466 pos_rate_valid: 0.018057628123955543
PR-AUC(valid): 0.25206485358809616
ROC-AUC(valid): 0.8557758508493727
threshold: {'threshold': 6.324776538287238e-05, 'precision': 0.06684079601990049, 'recall': 0.7004692387904067, 'note': 'picked_best_precision_under_recall_constraint'}
confusion_matrix(valid): [[171081, 37514], [1149, 2687]]
Saved to: artifacts/hgb

=== XGB REPORT ===
n_train: 495670 n_valid: 212431
pos_rate_train: 0.009550709141162466 pos_rate_valid: 0.018




=== LGBM REPORT ===
n_train: 495670 n_valid: 212431
pos_rate_train: 0.009550709141162466 pos_rate_valid: 0.018057628123955543
PR-AUC(valid): 0.19219342858158733
ROC-AUC(valid): 0.7568630167558555
threshold: {'threshold': 6.568256269069635e-06, 'precision': 0.03834677707188236, 'recall': 0.700208550573514, 'note': 'picked_best_precision_under_recall_constraint'}
confusion_matrix(valid): [[141235, 67360], [1150, 2686]]
Saved to: artifacts/lgbm


Unnamed: 0,model,pr_auc,roc_auc,thr,thr_prec,thr_rec,fp,fn
1,hgb,0.252065,0.855776,6.3e-05,0.066841,0.700469,37514,1149
0,logit,0.240594,0.888504,0.752993,0.113286,0.700209,21025,1150
2,xgb,0.233509,0.822238,0.000236,0.058419,0.700469,43309,1149
3,lgbm,0.192193,0.756863,7e-06,0.038347,0.700209,67360,1150


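Precision at the chosen threshold (recall ≈ 0.70), with validation false positives:

LOGIT: precision 0.113 (FP 21,025) => 제일 균형 좋음 (best balance)

HGB: precision 0.0668 (FP 37,514)

XGB: precision 0.0584 (FP 43,309)

LGBM: precision 0.0383 (FP 67,360)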