In [73]:
import numpy as np
import pandas as pd

import statsmodels.api as sm
from scipy.stats import ks_2samp, mannwhitneyu
from sklearn.metrics import average_precision_score

from tqdm.auto import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    average_precision_score,
    roc_auc_score,
    precision_recall_curve,
    classification_report
)
pd.set_option("display.max_rows", None)
from statsmodels.tools.sm_exceptions import PerfectSeparationError

LABEL = "fraud"

%matplotlib inline

In [2]:
pwd

'/home/nakyung/projects/BDAIFin/EDA&FEATURE'

In [3]:
import pandas as pd
df = pd.read_parquet("../DATA/dataset/train_stage1")

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5332979 entries, 0 to 5332978
Data columns (total 23 columns):
 #   Column                    Dtype         
---  ------                    -----         
 0   id                        int64         
 1   date                      datetime64[ns]
 2   client_id                 int64         
 3   card_id                   int64         
 4   amount                    float32       
 5   merchant_id               int64         
 6   mcc                       object        
 7   fraud                     int8          
 8   has_error                 int8          
 9   err_bad_card_number       int8          
 10  err_bad_expiration        int8          
 11  err_bad_cvv               int8          
 12  err_bad_pin               int8          
 13  err_bad_zipcode           int8          
 14  err_insufficient_balance  int8          
 15  err_technical_glitch      int8          
 16  tx_year                   int16         
 17  tx_month

*refund_high_amount 

In [5]:
thr = df["log_abs_amount"].quantile(0.9) # 위의 과정 바탕 0.9 사용
df["refund_high_amount"] = (
    (df["is_refund"] == 1) & (df["log_abs_amount"] > thr)
).astype("int8")

In [None]:
df["client_fraud_cum_prev"] = (
    df.groupby("client_id")["fraud"]
      .cumsum()
      .shift(1)
      .fillna(0)
      .astype("int32")
)

df["card_fraud_cum_prev"] = (
    df.groupby("card_id")["fraud"]
      .cumsum()
      .shift(1)
      .fillna(0)
      .astype("int32")
)


In [8]:
g = df.groupby("client_id")["has_error"]
df["client_error_last1"] = g.shift(1).fillna(0).astype("int8")

e1 = g.shift(1).fillna(0).astype("int8")
e2 = g.shift(2).fillna(0).astype("int8")
e3 = g.shift(3).fillna(0).astype("int8")
df["client_error_last3"] = (e1 + e2 + e3).astype("int8")

e1 = g.shift(1).fillna(0).astype("int8")
e2 = g.shift(2).fillna(0).astype("int8")
e3 = g.shift(3).fillna(0).astype("int8")
e4 = g.shift(4).fillna(0).astype("int8")
e5 = g.shift(5).fillna(0).astype("int8")
df["client_error_last5"] = (e1 + e2 + e3 + e4 + e5).astype("int8")

In [None]:
df["client_fraud_cum_prev"] = (
    df.groupby("client_id")["fraud"]
      .cumsum()
      .shift(1)
      .fillna(0)
      .astype("int32")
)

df["card_fraud_cum_prev"] = (
    df.groupby("card_id")["fraud"]
      .cumsum()
      .shift(1)
      .fillna(0)
      .astype("int32")
)


In [10]:
g = df.groupby("card_id")["has_error"]
df["card_error_last1"] = g.shift(1).fillna(0).astype("int8")

e1 = g.shift(1).fillna(0).astype("int8")
e2 = g.shift(2).fillna(0).astype("int8")
e3 = g.shift(3).fillna(0).astype("int8")
df["card_error_last3"] = (e1 + e2 + e3).astype("int8")

e1 = g.shift(1).fillna(0).astype("int8")
e2 = g.shift(2).fillna(0).astype("int8")
e3 = g.shift(3).fillna(0).astype("int8")
e4 = g.shift(4).fillna(0).astype("int8")
e5 = g.shift(5).fillna(0).astype("int8")
df["card_error_last5"] = (e1 + e2 + e3 + e4 + e5).astype("int8")

In [11]:
df["client_has_fraud_history"] = (
    df["client_fraud_cum_prev"] > 0
).astype("int8")
df["card_has_fraud_history"] = (
    df["card_fraud_cum_prev"] > 0
).astype("int8")

In [12]:
df["client_hist_x_error"] = (
    df["client_has_fraud_history"] * df["has_error"]
)
df["card_hist_x_error"] = (
    df["card_has_fraud_history"] * df["card_error_last1"]
)

In [13]:
error_cols = [
    "err_bad_card_number",
    "err_bad_expiration",
    "err_bad_cvv",
    "err_bad_pin",
    "err_bad_zipcode",
    "err_insufficient_balance",
    "err_technical_glitch"
]

df["error_count"] = df[error_cols].sum(axis=1)

In [None]:
df["error_count"] = (
    df["error_count"] * df["log_abs_amount"]
)

In [15]:
threshold = df["log_abs_amount"].quantile(0.9)

df["high_amount"] = (df["log_abs_amount"] >= threshold).astype("int8")

df["error_high_amount"] = (
    (df["error_count"] >= 2) &
    (df["high_amount"] == 1)
).astype("int8")

In [16]:
df["card_err1_x_amount"] = (
    df["card_error_last1"] * df["log_abs_amount"]
)

df["card_hist_err_x_amount"] = (
    df["card_hist_x_error"] * df["log_abs_amount"]
)

In [17]:
df["hour_sin"] = np.sin(2 * np.pi * df["tx_hour"] / 24).astype("float32")

In [18]:
df["hour_cos"] = np.cos(2 * np.pi * df["tx_hour"] / 24).astype("float32")

In [19]:
df = df.sort_values(["client_id", "date"])

df["client_weekday_prev"] = (
    df.groupby("client_id")["weekday"]
      .shift(1)
)

df["client_weekday_match_last1"] = (
    df["weekday"] == df["client_weekday_prev"]
).astype("int8")


In [20]:
df["client_weekday_prev"] = df.groupby("client_id")["weekday"].shift(1)

df["client_weekday_prev"] = df["client_weekday_prev"].fillna(df["weekday"]).astype(df["weekday"].dtype)

df["client_weekday_match_last1"] = (df["weekday"] == df["client_weekday_prev"]).astype("int8")


In [None]:
df["client_weekday_match_last1"] = (
    df.groupby(["client_id", "weekday"])
      .cumcount()
)

df["client_weekday_is_new"] = (
    df["client_weekday_prior_count"] == 0
).astype("int8")

In [22]:
# 1-1) shift (과거값)
df["sin_shift"] = (
    df.groupby("client_id")["hour_sin"]
      .shift(1)
      .fillna(df["hour_sin"])
)

df["cos_shift"] = (
    df.groupby("client_id")["hour_cos"]
      .shift(1)
      .fillna(df["hour_cos"])
)
# 1-2) 누적합 (첫 값 NaN이면 cumsum도 NaN이 될 수 있으니 0으로 채우고 누적)
df["sin_cumsum"] = df["sin_shift"].fillna(0).groupby(df["client_id"]).cumsum()
df["cos_cumsum"] = df["cos_shift"].fillna(0).groupby(df["client_id"]).cumsum()

# 1-3) 과거 개수 (첫 거래는 0)
df["cnt_past"] = df.groupby("client_id").cumcount()

# 1-4) 과거 평균 (0으로 나누기 방지)
df["client_sin_mean_past"] = np.where(
    df["cnt_past"] > 0,
    df["sin_cumsum"] / df["cnt_past"],
    np.nan
)
df["client_cos_mean_past"] = np.where(
    df["cnt_past"] > 0,
    df["cos_cumsum"] / df["cnt_past"],
    np.nan
)

# 1-5) 첫 거래 결측 채우기:
# 첫 거래는 과거가 없으니 "현재 값 = 자기 자신"으로 채우면 distance=0이 됨 (중립 처리)
df["client_sin_mean_past"] = df["client_sin_mean_past"].fillna(df["hour_sin"])
df["client_cos_mean_past"] = df["client_cos_mean_past"].fillna(df["hour_cos"])

# 1-6) 원형 거리
df["hour_circular_distance"] = np.sqrt(
    (df["hour_sin"] - df["client_sin_mean_past"])**2 +
    (df["hour_cos"] - df["client_cos_mean_past"])**2
)


In [23]:
high_risk_days = [0, 4, 6]

df["is_highrisk_weekday"] = df["weekday"].isin(high_risk_days).astype("int8")

In [24]:
# 하드코딩 버려야 함!!! 

def add_is_highrisk_mcc(
    df: pd.DataFrame,
    y_col: str = "fraud",
    mcc_col: str = "mcc",
    year_col: str = "tx_year",
    cutoff_year: int = 2015,   
    top_pct: float = 0.95,   
    min_tx: int = 200       
):
    # 1) 정의용 구간(=train 역할)만 사용
    base = df[df[year_col] <= cutoff_year].copy()

    # 2) mcc별 통계 (거래수, fraud_rate)
    stats = (
        base.groupby(mcc_col)[y_col]
        .agg(tx_count="size", fraud_rate="mean")
        .sort_values("fraud_rate", ascending=False)
    )

    # 3) 안정성: 표본 너무 적은 mcc 제거
    stats = stats[stats["tx_count"] >= min_tx].copy()

    if len(stats) == 0:
        raise ValueError("min_tx 조건 때문에 유효 mcc가 0개")

    # 4) fraud_rate 상위 top_pct 기준으로 highrisk 선정
    thr = stats["fraud_rate"].quantile(top_pct)
    highrisk_mcc_list = stats.loc[stats["fraud_rate"] >= thr].index.tolist()

    # 5) df 전체에 적용 (정의는 train 기준, 적용은 전체)
    df = df.copy()
    df["is_highrisk_mcc"] = df[mcc_col].isin(highrisk_mcc_list).astype("int8")

    # 참고용: 리턴값으로 리스트/통계 같이 반환
    return df, highrisk_mcc_list, stats, thr

cands = [0.90, 0.95, 0.97, 0.99]
out = []

for tp in cands:
    tmp_df, lst, _, _ = add_is_highrisk_mcc(df, cutoff_year=2015, top_pct=tp, min_tx=200)
    out.append([tp, len(lst), tmp_df["is_highrisk_mcc"].mean()])

pd.DataFrame(out, columns=["top_pct", "n_mcc", "highrisk_share"]).sort_values("top_pct", ascending=False)

Unnamed: 0,top_pct,n_mcc,highrisk_share
3,0.99,1,0.000532
2,0.97,3,0.000954
1,0.95,5,0.001515
0,0.9,10,0.002653


In [25]:
base_rate = df["fraud"].mean()

mcc_stats = (
    df.groupby("mcc")["fraud"]
      .agg(["mean", "count"])
      .rename(columns={"mean":"fraud_rate", "count":"tx_count"})
)

highrisk_mcc = mcc_stats[
    (mcc_stats["tx_count"] >= 1000) &
    (mcc_stats["fraud_rate"] >= base_rate * 3)
].index.tolist()

df["mcc_highrisk_90"] = df["mcc"].isin(highrisk_mcc).astype("int8")

In [26]:
df = df.sort_values(["client_id", "date"]).copy()
df["mcc"] = df["mcc"].astype("category")
g = df.groupby("client_id")["mcc"]

df["client_mcc_prev"] = g.shift(1)
df["client_mcc_match_last1"] = (df["mcc"] == df["client_mcc_prev"]).fillna(False).astype("int8")

In [27]:
# 최근 3/5거래에서 현재 mcc가 등장했는지 
m1 = (df["mcc"] == g.shift(1))
m2 = (df["mcc"] == g.shift(2))
m3 = (df["mcc"] == g.shift(3))
m4 = (df["mcc"] == g.shift(4))
m5 = (df["mcc"] == g.shift(5))

df["client_mcc_seen_last3"] = (m1 | m2 | m3).fillna(False).astype("int8")
df["client_mcc_seen_last5"] = (m1 | m2 | m3 | m4 | m5).fillna(False).astype("int8")

# 최근 3/5거래에서 현재 mcc가 몇 번 반복됐는지 (0~3 / 0~5)
df["client_mcc_repeat_cnt_last3"] = (m1.fillna(False).astype("int8")
                                   + m2.fillna(False).astype("int8")
                                   + m3.fillna(False).astype("int8"))

df["client_mcc_repeat_cnt_last5"] = (m1.fillna(False).astype("int8")
                                   + m2.fillna(False).astype("int8")
                                   + m3.fillna(False).astype("int8")
                                   + m4.fillna(False).astype("int8")
                                   + m5.fillna(False).astype("int8"))

# 비율 버전 (0~1)
df["client_mcc_repeat_ratio_last3"] = (df["client_mcc_repeat_cnt_last3"] / 3).astype("float32")
df["client_mcc_repeat_ratio_last5"] = (df["client_mcc_repeat_cnt_last5"] / 5).astype("float32")


In [28]:
df["client_mcc_prior_count"] = df.groupby(["client_id", "mcc"]).cumcount()
# 과거에 없었다 = 첫 등장(현재 포함) 
df["client_mcc_is_new"] = (df["client_mcc_prior_count"] == 0).astype("int8")

  df["client_mcc_prior_count"] = df.groupby(["client_id", "mcc"]).cumcount()


In [29]:
prev = g.shift(1)
prev2 = g.shift(2)
prev3 = g.shift(3)
prev4 = g.shift(4)
prev5 = g.shift(5)

# 최근 5개에서 바뀐 횟수
# (prev5->prev4, prev4->prev3, prev3->prev2, prev2->prev1) 중 바뀐 횟수
chg1 = (prev  != prev2)
chg2 = (prev2 != prev3)
chg3 = (prev3 != prev4)
chg4 = (prev4 != prev5)

df["client_mcc_change_cnt_last5"] = (chg1.fillna(False).astype("int8")
                                  + chg2.fillna(False).astype("int8")
                                  + chg3.fillna(False).astype("int8")
                                  + chg4.fillna(False).astype("int8"))

In [30]:
g = df.groupby("card_id")["mcc"]

df["card_mcc_prev"] = g.shift(1)
df["card_mcc_match_last1"] = (df["mcc"] == df["card_mcc_prev"]).fillna(False).astype("int8")

In [31]:
# 최근 3/5거래에서 현재 mcc가 등장했는지 
m1 = (df["mcc"] == g.shift(1))
m2 = (df["mcc"] == g.shift(2))
m3 = (df["mcc"] == g.shift(3))
m4 = (df["mcc"] == g.shift(4))
m5 = (df["mcc"] == g.shift(5))

df["card_mcc_seen_last3"] = (m1 | m2 | m3).fillna(False).astype("int8")
df["card_mcc_seen_last5"] = (m1 | m2 | m3 | m4 | m5).fillna(False).astype("int8")

# 최근 3/5거래에서 '현재 mcc'가 몇 번 반복됐는지 (0~3 / 0~5)
df["card_mcc_repeat_cnt_last3"] = (m1.fillna(False).astype("int8")
                                   + m2.fillna(False).astype("int8")
                                   + m3.fillna(False).astype("int8"))

df["card_mcc_repeat_cnt_last5"] = (m1.fillna(False).astype("int8")
                                   + m2.fillna(False).astype("int8")
                                   + m3.fillna(False).astype("int8")
                                   + m4.fillna(False).astype("int8")
                                   + m5.fillna(False).astype("int8"))

# 비율 버전 (0~1)
df["card_mcc_repeat_ratio_last3"] = (df["card_mcc_repeat_cnt_last3"] / 3).astype("float32")
df["card_mcc_repeat_ratio_last5"] = (df["card_mcc_repeat_cnt_last5"] / 5).astype("float32")


In [32]:
df["card_mcc_prior_count"] = df.groupby(["card_id", "mcc"]).cumcount()
df["card_mcc_is_new"] = (df["card_mcc_prior_count"] == 0).astype("int8")

  df["card_mcc_prior_count"] = df.groupby(["card_id", "mcc"]).cumcount()


In [33]:
prev = g.shift(1)
prev2 = g.shift(2)
prev3 = g.shift(3)
prev4 = g.shift(4)
prev5 = g.shift(5)

chg1 = (prev  != prev2)
chg2 = (prev2 != prev3)
chg3 = (prev3 != prev4)
chg4 = (prev4 != prev5)

df["card_mcc_change_cnt_last5"] = (chg1.fillna(False).astype("int8")
                                  + chg2.fillna(False).astype("int8")
                                  + chg3.fillna(False).astype("int8")
                                  + chg4.fillna(False).astype("int8"))


In [34]:
df["client_merchant_is_new"] = (
    df.groupby(["client_id", "merchant_id"], sort=False).cumcount().eq(0).astype("int8")
)

df["card_merchant_is_new"] = (
    df.groupby(["card_id", "merchant_id"], sort=False).cumcount().eq(0).astype("int8")
)

In [35]:
prev_merchant = df.groupby("card_id", sort=False)["merchant_id"].shift(1)

df["merchant_changed"] = (
    df["merchant_id"].ne(prev_merchant)      # 직전과 다르면 True
    .fillna(True)                            # 첫 거래는 변경
    .astype("int8")
)

# 2) 최근 5건에서 변경 횟수
df["merchant_change_cnt_last5"] = (
    df.groupby("card_id", sort=False)["merchant_changed"]
      .rolling(window=5, min_periods=1)
      .sum()
      .reset_index(level=0, drop=True)
      .astype("int8")
)

df.drop(columns=["merchant_changed"], inplace=True)

In [None]:
# merchant_is_new 정의(카드 기준)
df["merchant_is_new"] = df["card_merchant_is_new"].astype("int8")

# merchant_is_new × mcc_is_new 
if "card_mcc_is_new" in df.columns:
    df["merchant_is_new_x_mcc_is_new"] = (
        df["merchant_is_new"].astype("int8") * df["card_mcc_is_new"].astype("int8")
    ).astype("int8")
else:
    # 없으면 대체: card 단위로 mcc 첫 등장 여부 생성
    df["card_mcc_is_new"] = (
        df.groupby(["card_id", "mcc"], sort=False).cumcount().eq(0).astype("int8")
    )
    df["merchant_is_new_x_mcc_is_new"] = (
        df["merchant_is_new"] * df["card_mcc_is_new"]
    ).astype("int8")

# merchant_is_new × has_error
df["merchant_is_new_x_has_error"] = (
    df["merchant_is_new"].astype("int8") * df["has_error"].astype("int8")
).astype("int8")

In [37]:
# 1) 이전 거래 시점
df["prev_tx_time"] = df.groupby("client_id")["date"].shift(1)

# 2) 초 단위 간격
df["seconds_since_prev_tx"] = (
    (df["date"] - df["prev_tx_time"]).dt.total_seconds()
)

# 첫 거래는 간격 없음 → 큰 값으로 처리 (중립)
df["seconds_since_prev_tx"] = df["seconds_since_prev_tx"].fillna(0)

# 로그 변환
df["log_interval"] = np.log1p(df["seconds_since_prev_tx"])

In [38]:
# 과거값 shift
df["log_interval_shift"] = df.groupby("client_id")["log_interval"].shift(1)

# 누적합
df["interval_cumsum"] = (
    df["log_interval_shift"].fillna(0)
      .groupby(df["client_id"])
      .cumsum()
)

# 과거 개수
df["interval_cnt_past"] = df.groupby("client_id").cumcount()

# 과거 평균
df["client_avg_interval_prev"] = np.where(
    df["interval_cnt_past"] > 0,
    df["interval_cumsum"] / df["interval_cnt_past"],
    df["log_interval"]  # 첫 거래는 자기 자신으로 중립 처리
)


In [39]:
# ratio
df["interval_ratio"] = (
    df["log_interval"] /
    (df["client_avg_interval_prev"] + 1e-6)
)

# deviation (z-score 방식)
df["log_interval_dev"] = (
    df["log_interval"] -
    df["client_avg_interval_prev"]
)

In [40]:
df = df.sort_values(["client_id", "date"]).reset_index(drop=True)

# numpy index 준비
n = len(df)

client_tx_1h = np.zeros(n, dtype=np.int32)
card_tx_1h = np.zeros(n, dtype=np.int32)

for cid, idx in df.groupby("client_id").groups.items():
    g = df.loc[idx]
    times = g["date"].values.astype("datetime64[s]").astype("int64")
    
    # 누적 거래 번호
    cum = np.arange(len(g))
    
    # 1시간 전 timestamp
    t_minus_1h = times - 3600
    
    left = np.searchsorted(times, t_minus_1h)
    
    client_tx_1h[idx] = cum - left + 1

In [41]:
df["client_tx_1h"] = client_tx_1h

In [42]:
df["client_tx_1h_shift"] = df.groupby("client_id")["client_tx_1h"].shift(1)

df["client_tx_1h_cumsum"] = (
    df["client_tx_1h_shift"].fillna(0)
      .groupby(df["client_id"])
      .cumsum()
)

df["client_tx_cnt_past"] = df.groupby("client_id").cumcount()

df["client_tx_1h_avg_prev"] = np.where(
    df["client_tx_cnt_past"] > 0,
    df["client_tx_1h_cumsum"] / df["client_tx_cnt_past"],
    df["client_tx_1h"]
)

In [43]:
df["velocity_spike_ratio"] = (
    df["client_tx_1h"] /
    (df["client_tx_1h_avg_prev"] + 1e-6)
)

In [44]:
for cid, idx in df.groupby("card_id").groups.items():
    g = df.loc[idx]
    times = g["date"].values.astype("datetime64[s]").astype("int64")
    
    cum = np.arange(len(g))
    t_minus_1h = times - 3600
    
    left = np.searchsorted(times, t_minus_1h)
    
    card_tx_1h[idx] = cum - left + 1

df["card_tx_1h"] = card_tx_1h

In [45]:
df["card_tx_1h_shift"] = df.groupby("card_id")["card_tx_1h"].shift(1)

df["card_tx_1h_cumsum"] = (
    df["card_tx_1h_shift"].fillna(0)
      .groupby(df["card_id"])
      .cumsum()
)

df["card_tx_cnt_past"] = df.groupby("card_id").cumcount()

df["card_tx_1h_avg_prev"] = np.where(
    df["card_tx_cnt_past"] > 0,
    df["card_tx_1h_cumsum"] / df["card_tx_cnt_past"],
    df["card_tx_1h"]
)

df["card_velocity_spike_ratio"] = (
    df["card_tx_1h"] /
    (df["card_tx_1h_avg_prev"] + 1e-6)
)

In [46]:
df = df.sort_values(["client_id", "date"]).reset_index(drop=True)
AMT = "log_abs_amount"

df["amt_shift"] = df.groupby("client_id")[AMT].shift(1)

df["amt_cumsum"] = (
    df["amt_shift"].fillna(0)
      .groupby(df["client_id"])
      .cumsum()
)

df["amt_cnt_past"] = df.groupby("client_id").cumcount()

df["client_avg_amt_prev"] = np.where(
    df["amt_cnt_past"] > 0,
    df["amt_cumsum"] / df["amt_cnt_past"],
    df[AMT]  # 첫 거래는 중립 처리
).astype("float32")


In [None]:
eps = 1e-6

df["amount_vs_client_avg_ratio"] = (
    df[AMT] / (df["client_avg_amt_prev"] + eps)
).astype("float32")

df["amount_vs_client_avg_diff"] = (
    df[AMT] - df["client_avg_amt_prev"]
).astype("float32")

# 로그 ratio도 추천 (분포 안정)
df["log_amount_vs_client_avg_ratio"] = np.log1p(df["amount_vs_client_avg_ratio"]).astype("float32")


In [48]:
K = 10  # 최근 10건 기준 
df["client_recent_avg_amt"] = (
    df.groupby("client_id")[AMT]
      .shift(1)
      .rolling(K, min_periods=1)
      .mean()
      .reset_index(level=0, drop=True)
).astype("float32")

# 첫 거래는 NaN → 중립 처리
df["client_recent_avg_amt"] = df["client_recent_avg_amt"].fillna(df["client_avg_amt_prev"]).astype("float32")

df["amount_vs_recent_window_avg"] = (
    df[AMT] / (df["client_recent_avg_amt"] + eps)
).astype("float32")

df["log_amount_vs_recent_window_avg"] = np.log1p(df["amount_vs_recent_window_avg"]).astype("float32")


In [49]:
KQ = 50 

df["client_q95_prev"] = (
    df.groupby("client_id")[AMT]
      .shift(1)
      .rolling(KQ, min_periods=10)   # 최소 10건 이상일 때만 의미
      .quantile(0.95)
      .reset_index(level=0, drop=True)
).astype("float32")

df["client_q99_prev"] = (
    df.groupby("client_id")[AMT]
      .shift(1)
      .rolling(KQ, min_periods=10)
      .quantile(0.99)
      .reset_index(level=0, drop=True)
).astype("float32")

# NaN(거래 부족)일 때는 avg 기반으로 대체 (중립)
df["client_q95_prev"] = df["client_q95_prev"].fillna(df["client_avg_amt_prev"])
df["client_q99_prev"] = df["client_q99_prev"].fillna(df["client_avg_amt_prev"])

# ratio 형태
df["amount_vs_client_quantile_q95"] = (df[AMT] / (df["client_q95_prev"] + eps)).astype("float32")
df["amount_vs_client_quantile_q99"] = (df[AMT] / (df["client_q99_prev"] + eps)).astype("float32")

# flag 형태(추천)
df["amt_over_q95"] = (df[AMT] > df["client_q95_prev"]).astype("int8")
df["amt_over_q99"] = (df[AMT] > df["client_q99_prev"]).astype("int8")


In [50]:
df["client_fraud_last1"] = (
    df.groupby("client_id")["fraud"]
      .shift(1)
      .fillna(0)
      .astype("int8")
)
df["card_fraud_last1"] = (
    df.groupby("card_id")["fraud"]
      .shift(1)
      .fillna(0)
      .astype("int8")
)
f1 = df.groupby("client_id")["fraud"].shift(1)
f2 = df.groupby("client_id")["fraud"].shift(2)
f3 = df.groupby("client_id")["fraud"].shift(3)

df["client_fraud_last3"] = (
    f1.fillna(0).astype("int8") +
    f2.fillna(0).astype("int8") +
    f3.fillna(0).astype("int8")
)
f1 = df.groupby("card_id")["fraud"].shift(1)
f2 = df.groupby("card_id")["fraud"].shift(2)
f3 = df.groupby("card_id")["fraud"].shift(3)

df["card_fraud_last3"] = (
    f1.fillna(0).astype("int8") +
    f2.fillna(0).astype("int8") +
    f3.fillna(0).astype("int8")
)

In [51]:
client_amt_mean = df.groupby("client_id")["log_abs_amount"].transform("mean")
client_amt_std  = df.groupby("client_id")["log_abs_amount"].transform("std")

df["amount_deviation"] = (
    (df["log_abs_amount"] - client_amt_mean) / (client_amt_std + 1e-6)
)

In [52]:
from IPython.display import HTML, display
import pandas as pd

cols_df = pd.DataFrame({"column_name": df.columns})

html = cols_df.to_html(index=False)
display(HTML(f"""
<div style="max-height:320px; overflow:auto; border:1px solid #ddd;">
{html}
</div>
"""))


column_name
id
date
client_id
card_id
amount
merchant_id
mcc
fraud
has_error
err_bad_card_number


In [53]:
df["mccnew_x_error"] = df["client_mcc_is_new"] * df["has_error"]
df["mccnew_x_velocity"] = df["client_mcc_is_new"] * df["velocity_spike_ratio"]

In [None]:
DEV = "log_amount_vs_recent_window_avg" 

# 1) amount_deviation × mcc_is_new
df["dev_x_mccnew"] = (
    df[DEV] * df["client_mcc_is_new"]
).astype("float32")

# 2) amount_deviation × velocity_spike_ratio
df["dev_x_velocity"] = (
    df[DEV] * df["card_velocity_spike_ratio"]
).astype("float32")

# 3) amount_deviation × has_error
df["dev_x_error"] = (
    df[DEV] * df["has_error"]
).astype("float32")

In [61]:
features = ["refund_high_a mount", "client_error_last1", "client_error_last3", "client_error_last5", "client_fraud_cum_prev", "card_fraud_cum_prev", "card_error_last1", "card_error_last3", "card_error_last5", "client_has_fraud_history", "card_has_fraud_history", "client_hist_x_error", "card_hist_x_error", "error_count", "high_amount", "error_high_amount", "card_err1_x_amount", "card_hist_err_x_amount", "hour_sin", "hour_cos", "client_weekday_prev", "client_weekday_match_last1", "client_weekday_prev", "client_weekday_match_last1", "client_weekday_is_new", "sin_shift", "cos_shift", "sin_cumsum", "cos_cumsum", "cnt_past", "client_sin_mean_past", "client_cos_mean_past", "hour_circular_distance", "is_highrisk_weekday", "is_highrisk_mcc", "mcc_highrisk_90", "client_mcc_prev", "client_mcc_match_last1", "client_mcc_seen_last3", "client_mcc_seen_last5", "client_mcc_repeat_cnt_last3", "client_mcc_repeat_cnt_last5", "client_mcc_repeat_ratio_last3", "client_mcc_repeat_ratio_last5", "client_mcc_prior_count", "client_mcc_is_new", "client_mcc_change_cnt_last5", "card_mcc_prev", "card_mcc_match_last1", "card_mcc_seen_last3", "card_mcc_seen_last5", "card_mcc_repeat_cnt_last3", "card_mcc_repeat_cnt_last5", "card_mcc_repeat_ratio_last3", "card_mcc_repeat_ratio_last5", "card_mcc_prior_count", "card_mcc_is_new", "card_mcc_change_cnt_last5", "client_merchant_is_new", "card_merchant_is_new", "merchant_changed", "merchant_change_cnt_last5", "merchant_is_new", "merchant_is_new_x_has_error", "prev_tx_time", "seconds_since_prev_tx", "seconds_since_prev_tx", "log_interval", "log_interval_shift", "interval_cumsum", "interval_cnt_past", "client_avg_interval_prev", "interval_ratio", "log_interval_dev", "client_tx_1h", "client_tx_1h_shift", "client_tx_1h_cumsum", "client_tx_cnt_past", "client_tx_1h_avg_prev", "velocity_spike_ratio", "card_tx_1h", "card_tx_1h_shift", "card_tx_1h_cumsum", "card_tx_cnt_past", "card_tx_1h_avg_prev", "card_velocity_spike_ratio", "amt_shift", "amt_cumsum", "amt_cnt_past", "client_avg_amt_prev", "amount_vs_client_avg_ratio", "amount_vs_client_avg_diff", "log_amount_vs_client_avg_ratio", "client_recent_avg_amt", "amount_vs_recent_window_avg", "log_amount_vs_recent_window_avg", "client_q95_prev", "client_q99_prev", "client_q95_prev", "client_q99_prev", "amount_vs_client_quantile_q95", "amount_vs_client_quantile_q99", "amt_over_q95", "amt_over_q99", "client_fraud_last1", "card_fraud_last1", "client_fraud_last3", "card_fraud_last3", "amount_deviation", "dev_x_mccnew", "dev_x_velocity", "dev_x_error"]

In [90]:
BASELINE = [
    # Amount
    "log_abs_amount",
    
    # Error
    "has_error",
    
    # Time
    "tx_hour",
    "weekday",
    
    # Refund
    "is_refund",
]

ERROR_FEATURES = [
    "error_count",
    "client_error_last1",
    "client_error_last3",
    "client_error_last5",
    "card_error_last1",
    "card_error_last3",
    "card_error_last5",
    "client_hist_x_error",
    "card_hist_x_error",
    "error_high_amount",
    "card_err1_x_amount",
    "card_hist_err_x_amount",
]

FRAUD_HISTORY_FEATURES = [
    "client_fraud_last1",
    "client_fraud_last3",
    "card_fraud_last1",
    "card_fraud_last3",
    "client_fraud_cum_prev",
    "card_fraud_cum_prev",
    "client_has_fraud_history",
    "card_has_fraud_history",
]

AMOUNT_FEATURES = [
    "high_amount",
    "amount_deviation",
    "amt_shift",
    "amt_cumsum",
    "amt_cnt_past",
    "client_avg_amt_prev",
    "amount_vs_client_avg_ratio",
    "amount_vs_client_avg_diff",
    "log_amount_vs_client_avg_ratio",
    "client_recent_avg_amt",
    "amount_vs_recent_window_avg",
    "log_amount_vs_recent_window_avg",
    "client_q95_prev",
    "client_q99_prev",
    "amount_vs_client_quantile_q95",
    "amount_vs_client_quantile_q99",
    "amt_over_q95",
    "amt_over_q99",
    "dev_x_mccnew",
]

VELOCITY_FEATURES = [
    "seconds_since_prev_tx",
    "log_interval",
    "log_interval_shift",
    "interval_cumsum",
    "interval_cnt_past",
    "client_avg_interval_prev",
    "interval_ratio",
    "log_interval_dev",
    "client_tx_1h",
    "client_tx_1h_shift",
    "client_tx_1h_cumsum",
    "client_tx_cnt_past",
    "client_tx_1h_avg_prev",
    "velocity_spike_ratio",
    "card_tx_1h",
    "card_tx_1h_shift",
    "card_tx_1h_cumsum",
    "card_tx_cnt_past",
    "card_tx_1h_avg_prev",
    "card_velocity_spike_ratio",
    "dev_x_velocity",
]

MCC_FEATURES = [
    "mcc_highrisk_90",
    "client_mcc_match_last1",
    "client_mcc_seen_last3",
    "client_mcc_seen_last5",
    "client_mcc_repeat_cnt_last3",
    "client_mcc_repeat_cnt_last5",
    "client_mcc_repeat_ratio_last3",
    "client_mcc_repeat_ratio_last5",
    "client_mcc_prior_count",
    "client_mcc_is_new",
    "client_mcc_change_cnt_last5",
    "card_mcc_match_last1",
    "card_mcc_seen_last3",
    "card_mcc_seen_last5",
    "card_mcc_repeat_cnt_last3",
    "card_mcc_repeat_cnt_last5",
    "card_mcc_repeat_ratio_last3",
    "card_mcc_repeat_ratio_last5",
    "card_mcc_prior_count",
    "card_mcc_is_new",
    "card_mcc_change_cnt_last5",
]

MERCHANT_FEATURES = [
    "client_merchant_is_new",
    "card_merchant_is_new",
    "merchant_is_new",
    "merchant_change_cnt_last5",
    "merchant_is_new_x_has_error",
]

TIME_PATTERN_FEATURES = [
    "hour_sin",
    "hour_cos",
    "sin_shift",
    "cos_shift",
    "sin_cumsum",
    "cos_cumsum",
    "hour_circular_distance",
    "is_highrisk_weekday",
    "client_weekday_prev",
    "client_weekday_match_last1",
    "client_weekday_is_new",
]

ALL = BASELINE + ERROR_FEATURES + FRAUD_HISTORY_FEATURES + AMOUNT_FEATURES + VELOCITY_FEATURES + MCC_FEATURES + MERCHANT_FEATURES + TIME_PATTERN_FEATURES

### BASELINE

In [None]:
results = []

for col in BASELINE:

    X = sm.add_constant(df[[col]].dropna())
    y = df.loc[X.index, "fraud"]

    model = sm.Logit(y, X).fit(disp=0)

    coef = model.params[col]
    pval = model.pvalues[col]
    or_val = np.exp(coef)

    results.append([col, coef, or_val, pval])

pd.DataFrame(results, columns=["feature","coef","OR","p_value"])

Unnamed: 0,feature,coef,OR,p_value
0,log_abs_amount,0.662108,1.938874,0.0
1,has_error,1.047494,2.850498,5.016368e-79
2,tx_hour,-0.061113,0.940717,4.402438e-158
3,weekday,0.073358,1.076116,1.162421e-37
4,is_refund,-0.202315,0.816838,0.0004063756


- log_abs_amount
- has_error

### ERROR_FEATURES

In [76]:
results = []

for col in ERROR_FEATURES:

    X = sm.add_constant(df[[col]].dropna())
    y = df.loc[X.index, "fraud"]

    model = sm.Logit(y, X).fit(disp=0)

    coef = model.params[col]
    pval = model.pvalues[col]
    or_val = np.exp(coef)

    results.append([col, coef, or_val, pval])

pd.DataFrame(results, columns=["feature","coef","OR","p_value"])

Unnamed: 0,feature,coef,OR,p_value
0,error_count,1.038275,2.824341,1.555756e-80
1,client_error_last1,0.525087,1.690607,1.274717e-13
2,client_error_last3,0.446943,1.563526,3.3166830000000003e-28
3,client_error_last5,0.377435,1.458539,5.652507e-32
4,card_error_last1,0.72218,2.058917,5.547411e-29
5,card_error_last3,0.632964,1.883185,1.3244389999999999e-65
6,card_error_last5,0.533612,1.70508,9.016821e-73
7,client_hist_x_error,1.043104,2.838011,9.553913e-32
8,card_hist_x_error,1.079705,2.943812,5.216327e-17
9,error_high_amount,3.415494,30.431964,7.106049e-09


In [93]:
# 1) 유틸: 모델 학습 + 점수
def fit_logit_and_score(df, features, y_col=LABEL):
    use_cols = features + [y_col]
    tmp = df[use_cols].dropna(subset=use_cols).copy()

    X = sm.add_constant(tmp[features], has_constant="add")
    y = tmp[y_col].astype(int)

    X = X.apply(pd.to_numeric, errors="raise")

    model = sm.Logit(y, X).fit(disp=0)
    score = model.predict(X)
    return model, y, score

def top_decile_lift(y_true, score, q=0.90):
    y_true = pd.Series(y_true).astype(int)
    score = pd.Series(score)

    base_rate = y_true.mean()
    thr = score.quantile(q)

    top_mask = score >= thr
    top_rate = y_true[top_mask].mean()
    lift = top_rate / base_rate if base_rate > 0 else np.nan

    return {
        "base_rate": float(base_rate),
        "top_decile_rate": float(top_rate),
        "top_decile_lift": float(lift),
        "thr": float(thr),
        "top_n": int(top_mask.sum()),
    }

# 2) Baseline 고정 성능
base_model, y_base, s_base = fit_logit_and_score(df, BASELINE, y_col=LABEL)
base_lift = top_decile_lift(y_base, s_base)

print("=== BASELINE ===")
print(base_lift)

# 3) 시간 피처 add-one 다변량 검증
rows = []

for tcol in ERROR_FEATURES:
    feats = BASELINE + [tcol]

    model, y_aligned, score = fit_logit_and_score(df, feats, y_col=LABEL)
    lift = top_decile_lift(y_aligned, score)

    # 추가된 피처의 coef/OR/p-value만 뽑기
    coef = model.params.get(tcol, np.nan)
    pval = model.pvalues.get(tcol, np.nan)
    or_val = float(np.exp(coef)) if pd.notnull(coef) else np.nan

    # baseline 대비 lift 개선량
    delta_lift = lift["top_decile_lift"] - base_lift["top_decile_lift"]
    delta_rate = lift["top_decile_rate"] - base_lift["top_decile_rate"]

    rows.append({
        "added_time_feature": tcol,
        "coef": float(coef) if pd.notnull(coef) else np.nan,
        "OR": or_val,
        "p_value": float(pval) if pd.notnull(pval) else np.nan,
        "base_rate": lift["base_rate"],
        "top_decile_rate": lift["top_decile_rate"],
        "top_decile_lift": lift["top_decile_lift"],
        "delta_top_rate_vs_baseline": delta_rate,
        "delta_lift_vs_baseline": delta_lift,
        "top_n": lift["top_n"],
    })

inc_df = (
    pd.DataFrame(rows)
      .sort_values("delta_lift_vs_baseline", ascending=False)
      .reset_index(drop=True)
)

display(inc_df)


=== BASELINE ===
{'base_rate': 0.0014481587120444314, 'top_decile_rate': 0.005934768178391819, 'top_decile_lift': 4.098147619478418, 'thr': 0.003006617717321854, 'top_n': 533298}


Unnamed: 0,added_time_feature,coef,OR,p_value,base_rate,top_decile_rate,top_decile_lift,delta_top_rate_vs_baseline,delta_lift_vs_baseline,top_n
0,card_error_last3,0.560707,1.75191,1.4628229999999998e-51,0.001448,0.006012,4.151228,7.7e-05,0.05308,533299
1,card_error_last5,0.472084,1.603332,3.549086e-57,0.001448,0.005995,4.139582,6e-05,0.041435,533298
2,client_error_last3,0.369865,1.44754,1.110511e-19,0.001448,0.005963,4.117591,2.8e-05,0.019443,533463
3,client_error_last5,0.316123,1.371799,7.937132000000001e-23,0.001448,0.00596,4.115772,2.6e-05,0.017624,533531
4,client_error_last1,0.369213,1.446596,2.068263e-07,0.001448,0.005954,4.111096,1.9e-05,0.012948,533298
5,card_hist_x_error,0.986081,2.680709,2.214503e-14,0.001448,0.00595,4.108499,1.5e-05,0.010351,533299
6,card_hist_err_x_amount,0.221443,1.247876,3.624427e-14,0.001448,0.005946,4.105917,1.1e-05,0.007769,533298
7,card_err1_x_amount,0.132756,1.141971,2.5875029999999996e-19,0.001448,0.005946,4.105863,1.1e-05,0.007715,533305
8,card_error_last1,0.601566,1.824975,1.7187159999999998e-20,0.001448,0.005942,4.103327,8e-06,0.005179,533298
9,client_hist_x_error,0.035756,1.036403,0.7500035,0.001448,0.005937,4.099442,2e-06,0.001295,533298


[High]

| feature            | OR   | p_value  | Δ Lift vs Baseline | 해석                          |
| ------------------ | ---- | -------- | ------------------ | --------------------------- |
| card_error_last3   | 1.75 | 1.46e-51 | +0.053             | Card 최근 에러 3회 – 가장 강한 독립 신호 |
| card_error_last5   | 1.60 | 3.55e-57 | +0.041             | last3와 유사, 대체 후보            |
| client_error_last3 | 1.44 | 1.11e-19 | +0.019             | Client 최근 에러 대표 변수          |
| client_error_last5 | 1.37 | 7.94e-23 | +0.017             | last3와 경쟁                   |
| card_hist_x_error  | 2.68 | 2.21e-14 | +0.010             | 과거이력 × 현재에러 interaction     |

실제 다변량에 포함시킬 1차 후보군

[Mid]

| feature                | OR   | p_value  | Δ Lift vs Baseline | 해석                     |
| ---------------------- | ---- | -------- | ------------------ | ---------------------- |
| card_err1_x_amount     | 1.14 | 2.59e-19 | +0.0077            | 에러 × 금액 보조 interaction |
| card_hist_err_x_amount | 1.25 | 3.62e-14 | +0.0077            | 누적에러 × 금액 interaction  |
| client_error_last1     | 1.44 | 2.07e-07 | +0.013             | last3에 비해 정보 중복 가능     |
| card_error_last1       | 1.82 | 1.72e-20 | +0.005             | last3에 흡수될 가능성 높음      |

다변량에서 공선성 확인 후 선택

[Low]

| feature             | OR   | p_value | Δ Lift vs Baseline | 해석                  |
| ------------------- | ---- | ------- | ------------------ | ------------------- |
| client_hist_x_error | 1.03 | 0.75    | +0.001             | 통계적 유의성 없음          |
| error_count         | 2.27 | 0.10    | -0.001             | baseline 대비 성능 저하   |
| error_high_amount   | 5.46 | 0.004   | -0.0015            | OR는 크지만 실질 성능 개선 없음 |

제거 권장

### FRAUD_HISTORY_FEATURES

In [78]:
results = []

for col in FRAUD_HISTORY_FEATURES:

    X = sm.add_constant(df[[col]].dropna())
    y = df.loc[X.index, "fraud"]

    model = sm.Logit(y, X).fit(disp=0)

    coef = model.params[col]
    pval = model.pvalues[col]
    or_val = np.exp(coef)

    results.append([col, coef, or_val, pval])

pd.DataFrame(results, columns=["feature","coef","OR","p_value"])

Unnamed: 0,feature,coef,OR,p_value
0,client_fraud_last1,7.465741,1747.149894,0.0
1,client_fraud_last3,3.365209,28.939545,0.0
2,card_fraud_last1,8.344178,4205.623203,0.0
3,card_fraud_last3,3.455528,31.674994,0.0
4,client_fraud_cum_prev,0.001049,1.00105,0.607202
5,card_fraud_cum_prev,0.003078,1.003083,0.378596
6,client_has_fraud_history,-0.010302,0.989751,0.662148
7,card_has_fraud_history,0.060065,1.061906,0.043916


In [101]:
rows = []

for tcol in FRAUD_HISTORY_FEATURES:
    feats = BASELINE + [tcol]

    model, y_aligned, score = fit_logit_and_score(df, feats, y_col=LABEL)
    lift = top_decile_lift(y_aligned, score)

    # 추가된 피처의 coef/OR/p-value만 뽑기
    coef = model.params.get(tcol, np.nan)
    pval = model.pvalues.get(tcol, np.nan)
    or_val = float(np.exp(coef)) if pd.notnull(coef) else np.nan

    # baseline 대비 lift 개선량
    delta_lift = lift["top_decile_lift"] - base_lift["top_decile_lift"]
    delta_rate = lift["top_decile_rate"] - base_lift["top_decile_rate"]

    rows.append({
        "added_time_feature": tcol,
        "coef": float(coef) if pd.notnull(coef) else np.nan,
        "OR": or_val,
        "p_value": float(pval) if pd.notnull(pval) else np.nan,
        "base_rate": lift["base_rate"],
        "top_decile_rate": lift["top_decile_rate"],
        "top_decile_lift": lift["top_decile_lift"],
        "delta_top_rate_vs_baseline": delta_rate,
        "delta_lift_vs_baseline": delta_lift,
        "top_n": lift["top_n"],
    })

inc_df = (
    pd.DataFrame(rows)
      .sort_values("delta_lift_vs_baseline", ascending=False)
      .reset_index(drop=True)
)

display(inc_df)

Unnamed: 0,added_time_feature,coef,OR,p_value,base_rate,top_decile_rate,top_decile_lift,delta_top_rate_vs_baseline,delta_lift_vs_baseline,top_n
0,card_fraud_last3,3.486559,32.673324,0.0,0.001448,0.012762,8.812636,0.006827327,4.714488,533298
1,client_fraud_last3,3.414534,30.402771,0.0,0.001448,0.012198,8.422891,0.006262915,4.324743,533298
2,card_fraud_last1,8.373637,4331.359832,0.0,0.001448,0.011691,8.073256,0.005756588,3.975108,533300
3,client_fraud_last1,7.48121,1774.387069,0.0,0.001448,0.010579,7.30545,0.004644683,3.207302,533298
4,card_fraud_cum_prev,0.00436,1.00437,0.212562,0.001448,0.005952,4.109801,1.687612e-05,0.011654,533298
5,client_has_fraud_history,1.8e-05,1.000018,0.999402,0.001448,0.005935,4.098132,-2.225677e-08,-1.5e-05,533300
6,client_fraud_cum_prev,0.002167,1.002169,0.289292,0.001448,0.005933,4.096853,-1.875124e-06,-0.001295,533298
7,card_has_fraud_history,0.071007,1.073589,0.017318,0.001448,0.005922,4.089084,-1.312587e-05,-0.009064,533298


[High]

| feature            | OR      | p_value  | Δ Lift vs Baseline | 해석                           |
| ------------------ | ------- | -------- | ------------------ | ---------------------------- |
| card_fraud_last3   | 32.67   | 0.000000 | +4.714488          | 카드 최근 3회 사기 이력, 가장 강력한 재발 신호 |
| client_fraud_last3 | 30.40   | 0.000000 | +4.324743          | 고객 최근 3회 사기 이력               |
| card_fraud_last1   | 4331.36 | 0.000000 | +3.975108          | 직전 카드 사기 여부, 극단적 재발 패턴       |
| client_fraud_last1 | 1774.39 | 0.000000 | +3.207302          | 직전 고객 사기 여부                  |

이 네 변수는 fraud 재발 구조를 직접적으로 설명하며 baseline 대비 lift를 대폭 증가시키는 핵심 변수들이다.

[Mid]

| feature             | OR    | p_value  | Δ Lift vs Baseline | 해석                   |
| ------------------- | ----- | -------- | ------------------ | -------------------- |
| card_fraud_cum_prev | 1.004 | 0.212562 | +0.011654          | 누적 사기 횟수, 추가 정보는 제한적 |

통계적으로 유의하지는 않지만 lift는 소폭 증가한다. 다변량에서 다른 fraud_last 변수에 흡수될 가능성이 높다.

[Low]

| feature                  | OR    | p_value  | Δ Lift vs Baseline | 해석                    |
| ------------------------ | ----- | -------- | ------------------ | --------------------- |
| client_fraud_cum_prev    | 1.002 | 0.289292 | -0.001295          | 누적 사기 횟수, 독립 정보 약함    |
| client_has_fraud_history | 1.000 | 0.999402 | -0.000015          | 구분력 없음                |
| card_has_fraud_history   | 1.073 | 0.017318 | -0.009064          | 통계적 유의성은 있으나 성능 개선 없음 |

이 그룹은 baseline 모델에 추가했을 때 실질적인 성능 개선이 없거나 감소한다. Fraud History 블록에서는 last1, last3 계열만 유지하는 것이 가장 합리적이다.


### AMOUNT_FEATURES 

In [79]:
results = []

for col in AMOUNT_FEATURES:

    X = sm.add_constant(df[[col]].dropna())
    y = df.loc[X.index, "fraud"]

    model = sm.Logit(y, X).fit(disp=0)

    coef = model.params[col]
    pval = model.pvalues[col]
    or_val = np.exp(coef)

    results.append([col, coef, or_val, pval])

pd.DataFrame(results, columns=["feature","coef","OR","p_value"])

Unnamed: 0,feature,coef,OR,p_value
0,high_amount,1.874949,6.520487,0.0
1,amount_deviation,0.7610561,2.140536,0.0
2,amt_shift,0.2889845,1.335071,3.338472e-182
3,amt_cumsum,-2.871023e-05,0.999971,2.8458429999999997e-65
4,amt_cnt_past,-0.0001034369,0.999897,9.979317e-70
5,client_avg_amt_prev,0.06005547,1.061895,0.01104424
6,amount_vs_client_avg_ratio,-4.351283e-07,1.0,0.9690294
7,amount_vs_client_avg_diff,0.6747618,1.963565,0.0
8,log_amount_vs_client_avg_ratio,3.633503,37.845155,0.0
9,client_recent_avg_amt,0.5911425,1.806051,1.440543e-183


In [102]:
rows = []

for tcol in AMOUNT_FEATURES:
    feats = BASELINE + [tcol]

    model, y_aligned, score = fit_logit_and_score(df, feats, y_col=LABEL)
    lift = top_decile_lift(y_aligned, score)

    # 추가된 피처의 coef/OR/p-value만 뽑기
    coef = model.params.get(tcol, np.nan)
    pval = model.pvalues.get(tcol, np.nan)
    or_val = float(np.exp(coef)) if pd.notnull(coef) else np.nan

    # baseline 대비 lift 개선량
    delta_lift = lift["top_decile_lift"] - base_lift["top_decile_lift"]
    delta_rate = lift["top_decile_rate"] - base_lift["top_decile_rate"]

    rows.append({
        "added_time_feature": tcol,
        "coef": float(coef) if pd.notnull(coef) else np.nan,
        "OR": or_val,
        "p_value": float(pval) if pd.notnull(pval) else np.nan,
        "base_rate": lift["base_rate"],
        "top_decile_rate": lift["top_decile_rate"],
        "top_decile_lift": lift["top_decile_lift"],
        "delta_top_rate_vs_baseline": delta_rate,
        "delta_lift_vs_baseline": delta_lift,
        "top_n": lift["top_n"],
    })

inc_df = (
    pd.DataFrame(rows)
      .sort_values("delta_lift_vs_baseline", ascending=False)
      .reset_index(drop=True)
)

display(inc_df)

Unnamed: 0,added_time_feature,coef,OR,p_value,base_rate,top_decile_rate,top_decile_lift,delta_top_rate_vs_baseline,delta_lift_vs_baseline,top_n
0,dev_x_mccnew,2.348035,10.464988,0.0,0.001448,0.006702,4.627734,0.000767,0.529587,533298
1,amount_deviation,0.5150832,1.673778,5.375027e-77,0.001448,0.006355,4.388189,0.00042,0.290042,533298
2,client_avg_amt_prev,-0.4404779,0.643729,1.825592e-74,0.001448,0.006233,4.304026,0.000298,0.205879,533298
3,amount_vs_client_avg_diff,0.4404779,1.553449,1.825592e-74,0.001448,0.006233,4.304026,0.000298,0.205879,533298
4,high_amount,1.282358,3.605131,2.988199e-309,0.001448,0.00622,4.294962,0.000285,0.196815,533298
5,log_amount_vs_client_avg_ratio,0.5977076,1.817947,5.052232e-10,0.001448,0.006,4.143467,6.6e-05,0.045319,533298
6,amt_cumsum,-3.195992e-05,0.999968,1.429539e-80,0.001448,0.005985,4.133108,5.1e-05,0.034961,533298
7,amount_vs_client_quantile_q99,-3.918318,0.019874,2.064915e-161,0.001448,0.00598,4.129224,4.5e-05,0.031076,533298
8,client_q99_prev,0.5991402,1.820553,3.102073e-206,0.001448,0.005974,4.125339,3.9e-05,0.027192,533298
9,amount_vs_client_avg_ratio,-7.70291e-07,0.999999,0.9771607,0.001448,0.005935,4.098148,0.0,0.0,533298


[High]

| feature                   | OR    | p_value   | Δ Lift vs Baseline | 해석                                  |
| ------------------------- | ----- | --------- | ------------------ | ----------------------------------- |
| dev_x_mccnew              | 10.46 | 0.000000  | +0.529587          | 금액 이상치 × MCC 신규성 결합, 구조적으로 가장 강한 신호 |
| amount_deviation          | 1.67  | 5.38e-77  | +0.290042          | 고객 평균 대비 이탈도, 안정적 핵심 변수             |
| client_avg_amt_prev       | 0.64  | 1.83e-74  | +0.205879          | 고객 평균 금액 레벨 자체의 구분력                 |
| amount_vs_client_avg_diff | 1.55  | 1.83e-74  | +0.205879          | 평균 대비 차이, deviation 계열 대표           |
| high_amount               | 3.61  | 2.99e-309 | +0.196815          | 절대 고액 거래 신호                         |

금액 블록에서 baseline 대비 실질적인 lift 개선이 뚜렷한 핵심 변수들이다.

[Mid]

| feature                        | OR       | p_value   | Δ Lift vs Baseline | 해석              |
| ------------------------------ | -------- | --------- | ------------------ | --------------- |
| log_amount_vs_client_avg_ratio | 1.82     | 5.05e-10  | +0.045319          | 로그 비율 기반 이상 신호  |
| amt_cumsum                     | 0.999968 | 1.43e-80  | +0.034961          | 누적 금액 규모 효과     |
| amount_vs_client_quantile_q99  | 0.0199   | 2.06e-161 | +0.031076          | 상위 분위수 대비 초과 여부 |
| client_q99_prev                | 1.82     | 3.10e-206 | +0.027192          | 고객 고액 기준치       |
| amount_vs_client_avg_ratio     | 1.00     | 0.977     | 0.000000           | 정보 거의 없음 (경계선)  |

통계적으로는 유의하나 High 그룹 대비 lift 개선폭이 제한적이거나 일부 변수와 정보 중복 가능성이 있다.

[Low]

| feature                         | OR       | p_value   | Δ Lift vs Baseline | 해석                 |
| ------------------------------- | -------- | --------- | ------------------ | ------------------ |
| amt_cnt_past                    | 0.999901 | 7.67e-65  | -0.018128          | 거래 횟수 누적, 독립 신호 약함 |
| client_q95_prev                 | 1.78     | 2.49e-123 | -0.023307          | q99 변수에 흡수 가능      |
| amount_vs_client_quantile_q95   | 0.052    | 1.18e-85  | -0.024602          | q95 계열 중복          |
| amt_shift                       | 1.22     | 5.14e-91  | -0.045320          | 단기 금액 변화는 효과 약함    |
| amount_vs_recent_window_avg     | 0.63     | 5.81e-20  | -0.059562          | 최근 평균 대비는 불안정      |
| client_recent_avg_amt           | 1.33     | 8.30e-43  | -0.089344          | 정보 중복              |
| log_amount_vs_recent_window_avg | 0.07     | 1.38e-91  | -0.128189          | lift 감소            |
| amt_over_q99                    | 2.17     | 4.59e-89  | -0.145021          | 단순 초과 여부는 약함       |
| amt_over_q95                    | 2.64     | 7.19e-196 | -0.170918          | q95 초과는 오히려 성능 저하  |

이 그룹은 단변량에서는 강해 보일 수 있으나 baseline 모델에 추가했을 때 실질적인 성능 개선이 없거나 감소한다. 금액 블록에서는 High 그룹 중심으로 구성하는 것이 가장 합리적이다.


### VELOCITY_FEATURES

In [80]:
results = []

for col in VELOCITY_FEATURES:

    X = sm.add_constant(df[[col]].dropna())
    y = df.loc[X.index, "fraud"]

    model = sm.Logit(y, X).fit(disp=0)

    coef = model.params[col]
    pval = model.pvalues[col]
    or_val = np.exp(coef)

    results.append([col, coef, or_val, pval])

pd.DataFrame(results, columns=["feature","coef","OR","p_value"])

  return 1/(1+np.exp(-X))


Unnamed: 0,feature,coef,OR,p_value
0,seconds_since_prev_tx,-1.187449e-05,0.999988,5.884344e-211
1,log_interval,-0.1841944,0.831774,0.0
2,log_interval_shift,-0.1251072,0.882402,2.709475e-134
3,interval_cumsum,-1.099502e-05,0.999989,4.252667e-65
4,interval_cnt_past,-0.0001034369,0.999897,9.979317e-70
5,client_avg_interval_prev,0.6183259,1.855819,3.33216e-148
6,interval_ratio,-4.227181e-08,1.0,0.6494763
7,log_interval_dev,-0.2246354,0.798807,0.0
8,client_tx_1h,0.338873,1.403365,2.7044089999999997e-146
9,client_tx_1h_shift,0.2505815,1.284772,1.725882e-67


In [103]:
rows = []

for tcol in VELOCITY_FEATURES:
    feats = BASELINE + [tcol]

    model, y_aligned, score = fit_logit_and_score(df, feats, y_col=LABEL)
    lift = top_decile_lift(y_aligned, score)

    # 추가된 피처의 coef/OR/p-value만 뽑기
    coef = model.params.get(tcol, np.nan)
    pval = model.pvalues.get(tcol, np.nan)
    or_val = float(np.exp(coef)) if pd.notnull(coef) else np.nan

    # baseline 대비 lift 개선량
    delta_lift = lift["top_decile_lift"] - base_lift["top_decile_lift"]
    delta_rate = lift["top_decile_rate"] - base_lift["top_decile_rate"]

    rows.append({
        "added_time_feature": tcol,
        "coef": float(coef) if pd.notnull(coef) else np.nan,
        "OR": or_val,
        "p_value": float(pval) if pd.notnull(pval) else np.nan,
        "base_rate": lift["base_rate"],
        "top_decile_rate": lift["top_decile_rate"],
        "top_decile_lift": lift["top_decile_lift"],
        "delta_top_rate_vs_baseline": delta_rate,
        "delta_lift_vs_baseline": delta_lift,
        "top_n": lift["top_n"],
    })

inc_df = (
    pd.DataFrame(rows)
      .sort_values("delta_lift_vs_baseline", ascending=False)
      .reset_index(drop=True)
)

display(inc_df)

  return 1/(1+np.exp(-X))


Unnamed: 0,added_time_feature,coef,OR,p_value,base_rate,top_decile_rate,top_decile_lift,delta_top_rate_vs_baseline,delta_lift_vs_baseline,top_n
0,seconds_since_prev_tx,-1.24762e-05,0.999988,7.890978e-223,0.001448,0.006422,4.434804,0.000488,0.336657,533298
1,card_tx_1h_cumsum,-0.0003075423,0.999693,5.8792989999999994e-161,0.001448,0.006325,4.367473,0.00039,0.269325,533298
2,card_tx_cnt_past,-0.000405952,0.999594,4.26449e-177,0.001448,0.006287,4.341576,0.000353,0.243429,533298
3,log_interval_dev,-0.2135703,0.807695,0.0,0.001448,0.006265,4.326038,0.00033,0.227891,533298
4,client_avg_interval_prev,0.6179583,1.855136,1.442732e-150,0.001448,0.006124,4.228926,0.000189,0.130778,533298
5,card_tx_1h_avg_prev,-2.004659,0.134706,9.172178e-81,0.001448,0.006119,4.225041,0.000184,0.126894,533298
6,log_interval_shift,-0.1222726,0.884907,5.129228e-129,0.001448,0.006058,4.182311,0.000123,0.084163,533177
7,card_velocity_spike_ratio,0.7345118,2.084464,0.0,0.001448,0.006053,4.179722,0.000118,0.081575,533298
8,card_tx_1h_shift,0.3391067,1.403693,3.750279e-102,0.001449,0.006053,4.177132,0.000119,0.078985,532920
9,dev_x_velocity,0.7404319,2.096841,4.143045e-214,0.001448,0.006042,4.171953,0.000107,0.073806,533298


[High]

| feature                  | OR       | p_value   | Δ Lift vs Baseline | 해석                        |
| ------------------------ | -------- | --------- | ------------------ | ------------------------- |
| seconds_since_prev_tx    | 0.999988 | 7.89e-223 | +0.336657          | 직전 거래 간격 자체가 강한 속도 리스크 신호 |
| card_tx_1h_cumsum        | 0.999693 | 5.88e-161 | +0.269325          | 카드 기준 단기 누적 활동 강도         |
| card_tx_cnt_past         | 0.999594 | 4.26e-177 | +0.243429          | 카드 전체 거래 규모 효과            |
| log_interval_dev         | 0.807695 | 0.000000  | +0.227891          | 평균 대비 간격 이탈도              |
| client_avg_interval_prev | 1.855136 | 1.44e-150 | +0.130778          | 고객 평균 거래 간격 수준            |
| card_tx_1h_avg_prev      | 0.134706 | 9.17e-81  | +0.126894          | 카드 시간당 평균 활동 강도           |

속도 블록에서 baseline 대비 명확한 lift 개선을 보이는 핵심 변수들이다. 단기 활동 강도와 간격 이탈 구조가 핵심 축이다.

[Mid]

| feature                   | OR       | p_value   | Δ Lift vs Baseline | 해석           |
| ------------------------- | -------- | --------- | ------------------ | ------------ |
| log_interval_shift        | 0.884907 | 5.13e-129 | +0.084163          | 직전 간격 변화     |
| card_velocity_spike_ratio | 2.084464 | 0.000000  | +0.081575          | 카드 급증 비율     |
| card_tx_1h_shift          | 1.403693 | 3.75e-102 | +0.078985          | 카드 단기 변화     |
| dev_x_velocity            | 2.096841 | 4.14e-214 | +0.073806          | 금액 × 속도 결합   |
| log_interval              | 0.840175 | 2.64e-300 | +0.067331          | 로그 간격 자체     |
| client_tx_1h_avg_prev     | 0.252074 | 5.59e-60  | +0.051793          | 고객 시간당 평균 활동 |
| client_tx_1h_shift        | 1.275773 | 2.80e-63  | +0.050498          | 고객 단기 활동 변화  |
| velocity_spike_ratio      | 1.794875 | 3.63e-229 | +0.016833          | 고객 급증 비율     |
| client_tx_1h_cumsum       | 0.999932 | 1.02e-63  | +0.012948          | 고객 누적 활동량    |

통계적으로 매우 유의하지만 High 그룹에 비해 lift 개선폭이 작거나 정보가 중첩될 가능성이 있는 보조 변수들이다.

[Low]

| feature            | OR       | p_value   | Δ Lift vs Baseline | 해석                  |
| ------------------ | -------- | --------- | ------------------ | ------------------- |
| card_tx_1h         | 1.531767 | 2.13e-194 | +0.001295          | 단순 1시간 거래 수는 정보 제한적 |
| interval_ratio     | 1.000000 | 0.553     | -0.001295          | 구분력 없음              |
| client_tx_cnt_past | 0.999901 | 7.67e-65  | -0.018128          | 거래 횟수 누적 중복 신호      |
| interval_cnt_past  | 0.999901 | 7.67e-65  | -0.018128          | 누적 횟수 정보 약함         |
| interval_cumsum    | 0.999990 | 2.77e-59  | -0.028486          | 누적 간격 정보 중복         |
| client_tx_1h       | 1.361186 | 7.92e-115 | -0.041435          | 단순 활동량은 독립 효과 약함    |

이 그룹은 단변량에서는 유의해 보일 수 있으나 baseline 대비 lift 개선이 없거나 감소한다. 속도 블록은 High 그룹 중심으로 구성하는 것이 가장 합리적이다.


### MCC_FEATURES

In [88]:
results = []

for col in MCC_FEATURES:

    X = sm.add_constant(df[[col]].dropna())
    y = df.loc[X.index, "fraud"]

    model = sm.Logit(y, X).fit(disp=0)

    coef = model.params[col]
    pval = model.pvalues[col]
    or_val = np.exp(coef)

    results.append([col, coef, or_val, pval])

pd.DataFrame(results, columns=["feature","coef","OR","p_value"])

Unnamed: 0,feature,coef,OR,p_value
0,mcc_highrisk_90,2.585485,13.269723,0.0
1,client_mcc_match_last1,-1.082264,0.338828,3.787448e-133
2,client_mcc_seen_last3,-1.108433,0.330076,1.553979e-251
3,client_mcc_seen_last5,-1.164985,0.311927,0.0
4,client_mcc_repeat_cnt_last3,-0.950111,0.386698,6.81102e-258
5,client_mcc_repeat_cnt_last5,-0.883079,0.413508,0.0
6,client_mcc_repeat_ratio_last3,-2.850333,0.057825,6.811008999999999e-258
7,client_mcc_repeat_ratio_last5,-4.415394,0.01209,0.0
8,client_mcc_prior_count,-0.009979,0.99007,0.0
9,client_mcc_is_new,3.06801,21.499069,0.0


In [104]:
rows = []

for tcol in MCC_FEATURES:
    feats = BASELINE + [tcol]

    model, y_aligned, score = fit_logit_and_score(df, feats, y_col=LABEL)
    lift = top_decile_lift(y_aligned, score)

    # 추가된 피처의 coef/OR/p-value만 뽑기
    coef = model.params.get(tcol, np.nan)
    pval = model.pvalues.get(tcol, np.nan)
    or_val = float(np.exp(coef)) if pd.notnull(coef) else np.nan

    # baseline 대비 lift 개선량
    delta_lift = lift["top_decile_lift"] - base_lift["top_decile_lift"]
    delta_rate = lift["top_decile_rate"] - base_lift["top_decile_rate"]

    rows.append({
        "added_time_feature": tcol,
        "coef": float(coef) if pd.notnull(coef) else np.nan,
        "OR": or_val,
        "p_value": float(pval) if pd.notnull(pval) else np.nan,
        "base_rate": lift["base_rate"],
        "top_decile_rate": lift["top_decile_rate"],
        "top_decile_lift": lift["top_decile_lift"],
        "delta_top_rate_vs_baseline": delta_rate,
        "delta_lift_vs_baseline": delta_lift,
        "top_n": lift["top_n"],
    })

inc_df = (
    pd.DataFrame(rows)
      .sort_values("delta_lift_vs_baseline", ascending=False)
      .reset_index(drop=True)
)

display(inc_df)

Unnamed: 0,added_time_feature,coef,OR,p_value,base_rate,top_decile_rate,top_decile_lift,delta_top_rate_vs_baseline,delta_lift_vs_baseline,top_n
0,mcc_highrisk_90,2.31596,10.134651,0.0,0.001448,0.008513,5.878544,0.002578,1.780396,533298
1,card_mcc_prior_count,-0.023119,0.977146,0.0,0.001448,0.00811,5.600154,0.002175,1.502007,533298
2,client_mcc_prior_count,-0.008391,0.991644,0.0,0.001448,0.007684,5.306227,0.001749,1.20808,533298
3,card_mcc_is_new,2.29163,9.891045,0.0,0.001448,0.007425,5.127232,0.00149,1.029085,533330
4,client_mcc_is_new,2.563397,12.979838,0.0,0.001448,0.007028,4.853026,0.001093,0.754879,533299
5,card_mcc_change_cnt_last5,0.526591,1.69315,4.862403e-216,0.001448,0.006265,4.326038,0.00033,0.227891,533298
6,client_mcc_change_cnt_last5,0.445864,1.561839,1.374197e-164,0.001448,0.006246,4.31309,0.000311,0.214942,533298
7,client_mcc_repeat_ratio_last5,-4.004353,0.018236,1.487588e-291,0.001448,0.006184,4.27036,0.000249,0.172213,533298
8,client_mcc_repeat_cnt_last5,-0.800871,0.448938,1.487585e-291,0.001448,0.006184,4.27036,0.000249,0.172213,533298
9,card_mcc_repeat_ratio_last5,-3.551473,0.028682,1.3548990000000001e-268,0.001448,0.006113,4.221157,0.000178,0.123009,533298


[High]

| feature                | OR    | p_value  | Δ Lift vs Baseline | 해석                            |
| ---------------------- | ----- | -------- | ------------------ | ----------------------------- |
| mcc_highrisk_90        | 10.13 | 0.000000 | +1.780396          | 고위험 MCC 여부, 가장 강력한 카테고리 리스크 축 |
| card_mcc_prior_count   | 0.977 | 0.000000 | +1.502007          | 카드 기준 MCC 누적 경험 수             |
| client_mcc_prior_count | 0.992 | 0.000000 | +1.208080          | 고객 기준 MCC 누적 경험 수             |
| card_mcc_is_new        | 9.89  | 0.000000 | +1.029085          | 카드 기준 신규 MCC 진입               |
| client_mcc_is_new      | 12.98 | 0.000000 | +0.754879          | 고객 기준 신규 MCC 진입               |

MCC 블록의 핵심은 “고위험 카테고리”와 “신규 진입/경험 부족” 구조다. lift 개선폭이 명확하게 크다.

[Mid]

| feature                       | OR    | p_value   | Δ Lift vs Baseline | 해석              |
| ----------------------------- | ----- | --------- | ------------------ | --------------- |
| card_mcc_change_cnt_last5     | 1.69  | 4.86e-216 | +0.227891          | 최근 MCC 변경 빈도    |
| client_mcc_change_cnt_last5   | 1.56  | 1.37e-164 | +0.214942          | 고객 최근 변경 빈도     |
| client_mcc_repeat_ratio_last5 | 0.018 | 1.49e-291 | +0.172213          | 반복 비율 (높을수록 안정) |
| client_mcc_repeat_cnt_last5   | 0.449 | 1.49e-291 | +0.172213          | 최근 반복 횟수        |
| card_mcc_repeat_ratio_last5   | 0.029 | 1.35e-268 | +0.123009          | 카드 반복 비율        |
| card_mcc_repeat_cnt_last5     | 0.491 | 1.35e-268 | +0.123009          | 카드 반복 횟수        |
| client_mcc_seen_last5         | 0.355 | 7.19e-273 | +0.118468          | 최근 5회 내 관측 여부   |
| client_mcc_repeat_cnt_last3   | 0.411 | 3.74e-223 | +0.094523          | 단기 반복 횟수        |
| client_mcc_repeat_ratio_last3 | 0.069 | 3.74e-223 | +0.094523          | 단기 반복 비율        |
| card_mcc_repeat_cnt_last3     | 0.452 | 1.01e-207 | +0.094507          | 카드 단기 반복        |
| card_mcc_repeat_ratio_last3   | 0.092 | 1.01e-207 | +0.094507          | 카드 단기 반복 비율     |
| client_mcc_match_last1        | 0.325 | 6.19e-139 | +0.090630          | 직전 MCC 일치 여부    |
| client_mcc_seen_last3         | 0.359 | 5.59e-209 | +0.082869          | 최근 3회 관측 여부     |
| card_mcc_match_last1          | 0.362 | 3.03e-125 | +0.078977          | 카드 직전 일치 여부     |
| card_mcc_seen_last5           | 0.391 | 7.76e-241 | +0.067331          | 카드 최근 5회 관측     |
| card_mcc_seen_last3           | 0.396 | 5.24e-187 | +0.046614          | 카드 최근 3회 관측     |

반복·관측·일치 계열은 모두 통계적으로 매우 유의하지만, lift 개선폭은 High 그룹에 비해 제한적이다. 정보 중복 가능성이 높아 다변량에서 일부만 선택하는 것이 적절하다.

[Low]

이번 결과에서는 MCC 블록에서 baseline 대비 lift가 감소하는 변수는 없으며, 대부분이 유의하고 양의 ΔLift를 보인다. 다만 prior_count, repeat, seen, match 계열은 구조적으로 강한 상관관계를 가질 가능성이 높으므로 전부 사용하는 것은 권장되지 않는다. High 그룹을 축으로 하고, Mid 그룹에서 대표 변수만 선별하는 전략이 합리적이다.


### MERCHANT_FEATURES

In [91]:
results = []

for col in MERCHANT_FEATURES:

    X = sm.add_constant(df[[col]].dropna())
    y = df.loc[X.index, "fraud"]

    model = sm.Logit(y, X).fit(disp=0)

    coef = model.params[col]
    pval = model.pvalues[col]
    or_val = np.exp(coef)

    results.append([col, coef, or_val, pval])

pd.DataFrame(results, columns=["feature","coef","OR","p_value"])

Unnamed: 0,feature,coef,OR,p_value
0,client_merchant_is_new,3.291007,26.869903,0.0
1,card_merchant_is_new,3.028324,20.662583,0.0
2,merchant_is_new,3.028324,20.662583,0.0
3,merchant_change_cnt_last5,0.676117,1.966229,2.51794e-308
4,merchant_is_new_x_has_error,3.22214,25.081745,0.0


In [105]:
rows = []

for tcol in MERCHANT_FEATURES:
    feats = BASELINE + [tcol]

    model, y_aligned, score = fit_logit_and_score(df, feats, y_col=LABEL)
    lift = top_decile_lift(y_aligned, score)

    # 추가된 피처의 coef/OR/p-value만 뽑기
    coef = model.params.get(tcol, np.nan)
    pval = model.pvalues.get(tcol, np.nan)
    or_val = float(np.exp(coef)) if pd.notnull(coef) else np.nan

    # baseline 대비 lift 개선량
    delta_lift = lift["top_decile_lift"] - base_lift["top_decile_lift"]
    delta_rate = lift["top_decile_rate"] - base_lift["top_decile_rate"]

    rows.append({
        "added_time_feature": tcol,
        "coef": float(coef) if pd.notnull(coef) else np.nan,
        "OR": or_val,
        "p_value": float(pval) if pd.notnull(pval) else np.nan,
        "base_rate": lift["base_rate"],
        "top_decile_rate": lift["top_decile_rate"],
        "top_decile_lift": lift["top_decile_lift"],
        "delta_top_rate_vs_baseline": delta_rate,
        "delta_lift_vs_baseline": delta_lift,
        "top_n": lift["top_n"],
    })

inc_df = (
    pd.DataFrame(rows)
      .sort_values("delta_lift_vs_baseline", ascending=False)
      .reset_index(drop=True)
)

display(inc_df)

Unnamed: 0,added_time_feature,coef,OR,p_value,base_rate,top_decile_rate,top_decile_lift,delta_top_rate_vs_baseline,delta_lift_vs_baseline,top_n
0,card_merchant_is_new,2.862509,17.505387,0.0,0.001448,0.009936,6.861322,0.004002,2.763174,533298
1,merchant_is_new,2.862509,17.505387,0.0,0.001448,0.009936,6.861322,0.004002,2.763174,533298
2,client_merchant_is_new,3.101376,22.228507,0.0,0.001448,0.009666,6.674678,0.003731,2.576531,533313
3,merchant_change_cnt_last5,0.69976,2.01327,0.0,0.001448,0.006548,4.521524,0.000613,0.423377,533302
4,merchant_is_new_x_has_error,2.873222,17.693939,1.012139e-142,0.001448,0.00609,4.205603,0.000156,0.107455,533300


[High]

| feature                | OR    | p_value  | Δ Lift vs Baseline | 해석              |
| ---------------------- | ----- | -------- | ------------------ | --------------- |
| card_merchant_is_new   | 17.51 | 0.000000 | +2.763174          | 카드 기준 신규 가맹점 진입 |
| merchant_is_new        | 17.51 | 0.000000 | +2.763174          | 전체 기준 신규 가맹점 여부 |
| client_merchant_is_new | 22.23 | 0.000000 | +2.576531          | 고객 기준 신규 가맹점 진입 |

가맹점 신규 진입 여부는 baseline 대비 lift를 2.5 이상 끌어올리는 강력한 리스크 축이다. Merchant Novelty 블록의 핵심 변수들이다. card_merchant_is_new와 merchant_is_new는 동일 정보일 가능성이 높아 하나만 선택하는 것이 적절하다.

[Mid]

| feature                     | OR    | p_value   | Δ Lift vs Baseline | 해석             |
| --------------------------- | ----- | --------- | ------------------ | -------------- |
| merchant_change_cnt_last5   | 2.01  | 0.000000  | +0.423377          | 최근 가맹점 변경 빈도   |
| merchant_is_new_x_has_error | 17.69 | 1.01e-142 | +0.107455          | 신규 가맹점 × 에러 결합 |

변경 빈도는 신규 여부보다는 약하지만 독립적인 리스크 신호를 제공한다. interaction 변수는 효과는 있으나 신규 여부 변수에 일부 흡수될 가능성이 있다.

[Low]

이번 Merchant 블록에서는 baseline 대비 lift가 감소하는 변수는 없다. 다만 merchant_is_new와 card_merchant_is_new는 정보 중복 가능성이 매우 높으므로 동시에 사용하는 것은 권장되지 않는다. High 그룹에서 대표 변수 하나를 선택하고, Mid 그룹에서 보조 변수 한두 개를 추가하는 구조가 가장 합리적이다.


### TIME_PATTERN_FEATURES

In [92]:
results = []

for col in TIME_PATTERN_FEATURES:

    X = sm.add_constant(df[[col]].dropna())
    y = df.loc[X.index, "fraud"]

    model = sm.Logit(y, X).fit(disp=0)

    coef = model.params[col]
    pval = model.pvalues[col]
    or_val = np.exp(coef)

    results.append([col, coef, or_val, pval])

pd.DataFrame(results, columns=["feature","coef","OR","p_value"])

Unnamed: 0,feature,coef,OR,p_value
0,hour_sin,0.412225,1.510175,2.082158e-139
1,hour_cos,-0.748895,0.472889,1.7864290000000002e-208
2,sin_shift,0.377274,1.458303,4.842799e-118
3,cos_shift,-0.355924,0.700526,5.225513999999999e-63
4,sin_cumsum,3e-06,1.000003,0.8233802
5,cos_cumsum,0.00017,1.00017,8.947422999999999e-48
6,hour_circular_distance,0.467859,1.596571,1.0443769999999999e-38
7,is_highrisk_weekday,0.60184,1.825474,4.069455e-150
8,client_weekday_prev,0.078684,1.081862,5.488508e-43
9,client_weekday_match_last1,0.717364,2.049024,5.4963130000000006e-160


In [106]:
rows = []

for tcol in TIME_PATTERN_FEATURES:
    feats = BASELINE + [tcol]

    model, y_aligned, score = fit_logit_and_score(df, feats, y_col=LABEL)
    lift = top_decile_lift(y_aligned, score)

    # 추가된 피처의 coef/OR/p-value만 뽑기
    coef = model.params.get(tcol, np.nan)
    pval = model.pvalues.get(tcol, np.nan)
    or_val = float(np.exp(coef)) if pd.notnull(coef) else np.nan

    # baseline 대비 lift 개선량
    delta_lift = lift["top_decile_lift"] - base_lift["top_decile_lift"]
    delta_rate = lift["top_decile_rate"] - base_lift["top_decile_rate"]

    rows.append({
        "added_time_feature": tcol,
        "coef": float(coef) if pd.notnull(coef) else np.nan,
        "OR": or_val,
        "p_value": float(pval) if pd.notnull(pval) else np.nan,
        "base_rate": lift["base_rate"],
        "top_decile_rate": lift["top_decile_rate"],
        "top_decile_lift": lift["top_decile_lift"],
        "delta_top_rate_vs_baseline": delta_rate,
        "delta_lift_vs_baseline": delta_lift,
        "top_n": lift["top_n"],
    })

inc_df = (
    pd.DataFrame(rows)
      .sort_values("delta_lift_vs_baseline", ascending=False)
      .reset_index(drop=True)
)

display(inc_df)

Unnamed: 0,added_time_feature,coef,OR,p_value,base_rate,top_decile_rate,top_decile_lift,delta_top_rate_vs_baseline,delta_lift_vs_baseline,top_n
0,hour_cos,-0.918885,0.398963,3.880064e-269,0.001448,0.00636,4.391729,0.000425,0.293581,533340
1,client_weekday_match_last1,1.143326,3.137186,0.0,0.001448,0.006315,4.360999,0.000381,0.262851,533298
2,cos_shift,-0.448179,0.63879,3.71454e-95,0.001448,0.006087,4.202974,0.000152,0.104826,533305
3,is_highrisk_weekday,0.572853,1.77332,5.965197e-132,0.001448,0.006085,4.201734,0.00015,0.103587,533298
4,hour_circular_distance,0.20471,1.22717,1.097739e-08,0.001448,0.005976,4.126634,4.1e-05,0.028486,533298
5,sin_cumsum,-4.8e-05,0.999952,8.453364e-05,0.001448,0.005954,4.111096,1.9e-05,0.012948,533298
6,client_weekday_is_new,-0.690091,0.50153,0.06829369,0.001448,0.005933,4.096853,-2e-06,-0.001295,533298
7,sin_shift,0.332871,1.394967,6.496249999999999e-91,0.001448,0.005901,4.074841,-3.4e-05,-0.023307,533298
8,client_weekday_prev,0.056412,1.058033,1.797205e-14,0.001448,0.005897,4.072251,-3.8e-05,-0.025897,533298
9,hour_sin,0.361351,1.435268,5.406156e-35,0.001448,0.005884,4.063187,-5.1e-05,-0.034961,533298


[High]

| feature                    | OR    | p_value   | Δ Lift vs Baseline | 해석                            |
| -------------------------- | ----- | --------- | ------------------ | ----------------------------- |
| hour_cos                   | 0.399 | 3.88e-269 | +0.293581          | 특정 시간대(야간/새벽) 리스크 구조를 가장 잘 설명 |
| client_weekday_match_last1 | 3.14  | 0.000000  | +0.262851          | 직전 요일 패턴 반복 여부                |
| cos_shift                  | 0.639 | 3.71e-95  | +0.104826          | 시간대 이동(shift) 신호              |
| is_highrisk_weekday        | 1.77  | 5.97e-132 | +0.103587          | 특정 요일 고위험 구조                  |

시간 블록에서 baseline 대비 의미 있는 lift 개선을 보이는 핵심 변수들이다. 특히 hour_cos가 가장 강한 단일 시간대 구조 신호다.

[Mid]

| feature                | OR       | p_value  | Δ Lift vs Baseline | 해석             |
| ---------------------- | -------- | -------- | ------------------ | -------------- |
| hour_circular_distance | 1.23     | 1.10e-08 | +0.028486          | 특정 기준 시간과의 거리  |
| sin_cumsum             | 0.999952 | 8.45e-05 | +0.012948          | 누적 시간 패턴       |
| hour_sin               | 1.44     | 5.41e-35 | -0.034961          | cos와 중복 가능성 높음 |
| client_weekday_prev    | 1.06     | 1.80e-14 | -0.025897          | 단순 요일 효과       |
| cos_cumsum             | 1.000165 | 3.05e-45 | -0.041435          | 누적 효과 중복       |
| sin_shift              | 1.39     | 6.50e-91 | -0.023307          | shift 계열 중복 가능 |

통계적으로 유의하지만 lift 개선폭이 작거나 High 변수들과 정보가 중복될 가능성이 있다.

[Low]

| feature               | OR   | p_value | Δ Lift vs Baseline | 해석                  |
| --------------------- | ---- | ------- | ------------------ | ------------------- |
| client_weekday_is_new | 0.50 | 0.068   | -0.001295          | 통계적 유의성 부족, lift 감소 |

시간 블록에서는 hour_cos와 요일 반복 구조(client_weekday_match_last1)를 중심으로 구성하는 것이 가장 합리적이다.


---

# Stage1 (경량 버전b리스트)

```python

---

# Stage1에서 제거한 것 (너무 heavy)

### MCC rolling / repeat / change 계열

* `client_mcc_repeat_cnt_last5`
* `client_mcc_repeat_ratio_last5`
* `card_mcc_repeat_cnt_last5`
* `card_mcc_repeat_ratio_last5`
* `client_mcc_seen_last5`
* `card_mcc_seen_last5`
* `client_mcc_seen_last3`
* `card_mcc_seen_last3`
* `client_mcc_match_last1`
* `card_mcc_match_last1`
* `card_mcc_change_cnt_last5`
* `client_mcc_change_cnt_last5`
* `card_mcc_prior_count`
* `client_mcc_prior_count`

이건 Stage2로.

---

### 대규모 누적/rolling 기반

* `card_tx_1h_cumsum`
* `card_tx_cnt_past`
* `client_tx_1h_shift`
* `client_tx_1h_avg_prev`
* `card_tx_1h_avg_prev`
* `log_interval_dev`
* `log_interval_shift`
* `log_interval`
* `velocity_spike_ratio` (client쪽)
* `dev_x_velocity`

Stage2가 맞음.

---

### 중복/과도한 파생

* `card_error_last3`
* `card_error_last5`
* `client_error_last3`
* `client_error_last5`
* `client_mcc_repeat_cnt_last3`
* `client_mcc_repeat_ratio_last3`
* `card_mcc_repeat_cnt_last3`
* `card_mcc_repeat_ratio_last3`
* `hour_circular_distance`
* `cos_shift`
* `dev_x_mccnew`

Stage1에서까지는 필요 없음.

---

이 구성은:

* 피처 수: 약 20개 내외
* 계산 복잡도: 낮음
* state 요구: 최근 1~3개 값 정도
* MCC heavy rolling 제거
* 대규모 cumsum 제거

즉,

> ✔ 빠름
> ✔ 유지보수 쉬움
> ✔ 설명력 충분
> ✔ Stage2와 역할 분리 명확

---