In [11]:
import pandas as pd
import os
import numpy as np

In [12]:
# Load the stage-2 training data from parquet (path is relative to the notebook).
TRAIN_STAGE2_PATH = "../DATA/dataset/TRAIN_stage2"
df = pd.read_parquet(TRAIN_STAGE2_PATH)

# 다중공선성 check

In [14]:
# Feature matrix without the label, then the absolute Pearson correlation
# between every pair of feature columns.
X = df.drop("fraud", axis=1)
corr = X.corr(method="pearson").abs()

In [None]:
# Keep only the strict upper triangle so every feature pair is listed once
# and the diagonal (self-correlation) is excluded.
upper_mask = np.triu(np.ones(corr.shape), k=1).astype(bool)
upper = corr.where(upper_mask)

# Long format: one row per (level_0, level_1) pair with its |corr| value.
pair_table = upper.stack().reset_index().rename(columns={0: "corr"})
high_corr_pairs = pair_table[pair_table["corr"] >= 0.8].sort_values(
    "corr", ascending=False
)

high_corr_pairs

Unnamed: 0,level_0,level_1,corr
636,card_mcc_is_new,merchant_is_new_x_mcc_is_new,0.99979
809,amount_vs_client_avg_diff,amount_deviation,0.971469
664,client_mcc_is_new,dev_x_mccnew,0.965988
665,client_mcc_is_new,mccnew_x_velocity,0.941656
34,log_abs_amount,amount_vs_client_avg_diff,0.920859
567,years_to_retirement,current_age,0.912415
817,dev_x_mccnew,mccnew_x_velocity,0.908777
39,log_abs_amount,amount_deviation,0.896821
385,tx_hour,hour_sin,0.851207
688,card_mcc_change_cnt_last5,merchant_change_cnt_last5,0.815209


### Drop 

| Feature                        | Reason                                                     |
| ------------------------------ | ---------------------------------------------------------- |
| `merchant_is_new_x_mcc_is_new` | `card_mcc_is_new`와 거의 완전 중복 (corr ≈ 0.9998)                |
| `amount_vs_client_avg_diff`    | `amount_deviation`과 매우 높은 상관 (corr ≈ 0.97)                 |
| `years_to_retirement`          | `current_age`와 구조적 선형 관계 (corr ≈ 0.91)                     |
| `client_mcc_is_new`            | interaction 변수(`dev_x_mccnew`, `mccnew_x_velocity`)와 강한 중복 |
| `log_abs_amount` *(선택적)*       | `amount_deviation` + interaction이 이미 정보 흡수 가능              |

---

### Keep (유지 권장)

| Feature                                | Reason                      |
| -------------------------------------- | --------------------------- |
| `amount_deviation`                     | 표준화된 anomaly 지표, 안정적        |
| `card_mcc_is_new`                      | 신규 MCC 신호의 핵심               |
| `dev_x_mccnew`                         | MCC 신규 × 금액 이상치 interaction |
| `mccnew_x_velocity`                    | MCC 신규 × 속도 이상치 interaction |
| `current_age`                          | 연속형 인구통계 핵심 변수              |
| `hour_sin`, `hour_cos`                 | 순환 시간 표현 (tx_hour 대체 가능)    |
| `card_mcc_change_cnt_last5`            | 구조 변화 신호                    |
| `merchant_change_cnt_last5`            | 상점 변화 패턴 신호                 |
| `card_fraud_last1`, `card_fraud_last3` | 강력한 과거 fraud 히스토리           |

---

### Conditional (성능 비교 후 결정)

| Feature Pair                                               | Recommendation             |
| ---------------------------------------------------------- | -------------------------- |
| `dev_x_mccnew` vs `mccnew_x_velocity`                      | SHAP / ablation 후 낮은 쪽 제거  |
| `card_mcc_change_cnt_last5` vs `merchant_change_cnt_last5` | importance 낮은 쪽 제거         |
| `tx_hour` vs (`hour_sin`, `hour_cos`)                      | sin/cos 유지 시 tx_hour 제거 가능 |
| `card_fraud_last1` vs `card_fraud_last3`                   | 둘 중 성능 낮은 쪽 제거             |

---

## 정리 기준

* **corr > 0.95** → 구조적 중복 → 반드시 하나 제거
* **corr 0.85~0.95** → interaction 여부 확인 후 결정
* interaction 변수가 기본 변수의 정보를 이미 포함하는 경우, 기본 변수는 제거 가능 (예: `client_mcc_is_new` → `dev_x_mccnew`, `mccnew_x_velocity`)

---

In [15]:
# Remove one member of each highly correlated pair, as decided in the tables above.
MULTICOLLINEAR_DROP_COLS = [
    "merchant_is_new_x_mcc_is_new",
    "amount_vs_client_avg_diff",
    "years_to_retirement",
    "client_mcc_is_new",
]
df.drop(columns=MULTICOLLINEAR_DROP_COLS, inplace=True)

# SHAP & Ablation

In [16]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

LABEL_COL = "fraud"

# Separate the feature matrix from the integer-cast target.
X = df.drop(columns=[LABEL_COL])
y = df[LABEL_COL].astype(int)

# 1) Infer column types automatically: object/category columns are treated
#    as categorical, everything else as numeric.
_CATEGORICAL_DTYPES = {"object", "category"}
cat_cols = [name for name, dtype in X.dtypes.items() if str(dtype) in _CATEGORICAL_DTYPES]
_cat_lookup = set(cat_cols)
num_cols = [name for name in X.columns if name not in _cat_lookup]

print("num_cols:", len(num_cols), "cat_cols:", len(cat_cols))


# 2) Train/valid split (80/20, stratified on the label, fixed seed)
split_kwargs = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_kwargs)

# 3) Simple preprocessing pipeline: median imputation followed by
#    variance-only scaling (with_mean=False leaves the columns uncentred).
num_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=False)),
    ]
)

preprocess = ColumnTransformer(
    [("num", num_pipe, num_cols)],
    remainder="drop",
)


num_cols: 37 cat_cols: 0


In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import shap
from sklearn.metrics import roc_auc_score, average_precision_score
from tqdm.auto import tqdm

import warnings

# Work on copies of the split frames for the LightGBM run.
X_train_lgb = X_train.copy()
X_valid_lgb = X_valid.copy()

dtrain = lgb.Dataset(X_train_lgb, label=y_train, free_raw_data=False)
dvalid = lgb.Dataset(X_valid_lgb, label=y_valid, free_raw_data=False)

params = dict(
    objective="binary",
    metric=["auc", "average_precision"],
    learning_rate=0.05,
    num_leaves=64,
    min_data_in_leaf=200,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    verbosity=-1,
)

bst = lgb.train(
    params,
    dtrain,
    num_boost_round=5000,
    valid_sets=[dvalid],
    valid_names=["valid"],
    callbacks=[
        lgb.early_stopping(200, verbose=True),
        lgb.log_evaluation(0),
    ],
)

print("best_iter:", bst.best_iteration)
print("best_score:", bst.best_score)

# Guard against a degenerate fit. When early stopping picks one of the very
# first iterations, the SHAP ranking below is computed from only a few trees,
# so every feature those trees never split on gets an attribution of exactly 0.
# That says nothing about whether the feature is useful.
MIN_TRUSTED_ITERS = 10
if bst.best_iteration < MIN_TRUSTED_ITERS:
    warnings.warn(
        f"LightGBM early-stopped at iteration {bst.best_iteration}; "
        "SHAP importances from so few trees are not a reliable feature ranking. "
        "Check the training setup (e.g. class imbalance, leaf-value stability) "
        "before using them for feature selection.",
        stacklevel=2,
    )

pred_valid = bst.predict(X_valid_lgb, num_iteration=bst.best_iteration)
print("LGB AUC:", roc_auc_score(y_valid, pred_valid))
print("LGB PR-AUC:", average_precision_score(y_valid, pred_valid))


# SHAP with tqdm (batch version)

sv = X_valid_lgb

batch_size = 2000
all_contrib = []

print("\nComputing SHAP (LightGBM native)...")

for i in tqdm(range(0, len(sv), batch_size)):
    batch = sv.iloc[i:i+batch_size]
    # Use the same iteration count as the metric predictions above so the
    # attributions describe the model that was actually scored.
    contrib = bst.predict(batch, pred_contrib=True, num_iteration=bst.best_iteration)
    all_contrib.append(contrib[:, :-1])  # the last column is the bias term

shap_values = np.vstack(all_contrib)

# Global importance = mean absolute contribution per feature.
imp = pd.Series(
    np.abs(shap_values).mean(axis=0),
    index=sv.columns
).sort_values(ascending=False)

print("\nTop SHAP features:\n", imp.head(30))


Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1]	valid's auc: 0.956962	valid's average_precision: 0.714686
best_iter: 1
best_score: defaultdict(<class 'collections.OrderedDict'>, {'valid': OrderedDict([('auc', np.float64(0.9569618469348784)), ('average_precision', np.float64(0.7146858039053874))])})
LGB AUC: 0.9569618469348784
LGB PR-AUC: 0.7146858039053874

Computing SHAP (LightGBM native)...


  0%|          | 0/532 [00:00<?, ?it/s]


Top SHAP features:
 card_fraud_last3               0.061796
card_merchant_is_new           0.015708
client_fraud_last3             0.012736
client_merchant_is_new         0.011837
amount_deviation               0.010041
log_abs_amount                 0.004591
hour_sin                       0.004169
tx_month                       0.001326
tx_hour                        0.001248
log_yearly_income              0.000990
mccnew_x_velocity              0.000973
sin_shift                      0.000947
client_sin_mean_past           0.000763
card_velocity_spike_ratio      0.000563
client_tx_1h_avg_prev          0.000271
years_to_retirement            0.000160
current_age                    0.000156
has_error                      0.000110
card_hist_x_error              0.000000
card_error_last5               0.000000
err_bad_card_number            0.000000
card_error_last1               0.000000
err_bad_cvv                    0.000000
err_insufficient_balance       0.000000
card_error_last3   

### Attention 기반 feature 중요도 (Tabular Transformer)

In [17]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, average_precision_score

# Column order defines the token order used by the attention model.
feature_names = X_train.columns.tolist()
n_features = len(feature_names)

# Standardise features: statistics are fit on the training split only and
# then applied to the validation split.
scaler = StandardScaler()
Xtr = scaler.fit_transform(X_train.to_numpy(dtype=np.float32))
Xva = scaler.transform(X_valid.to_numpy(dtype=np.float32))

# Float targets, as required by the BCE-with-logits loss.
ytr = y_train.to_numpy(dtype=np.float32)
yva = y_valid.to_numpy(dtype=np.float32)

class TabDataset(Dataset):
    """In-memory dataset that serves ``(features, label)`` float32 tensor pairs."""

    def __init__(self, X, y):
        # Convert once up front; torch.tensor copies the input data.
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        # One sample per label.
        return len(self.y)

    def __getitem__(self, i):
        features = self.X[i]
        label = self.y[i]
        return features, label

# Wrap the scaled arrays; only the training loader shuffles.
train_ds = TabDataset(Xtr, ytr)
valid_ds = TabDataset(Xva, yva)
train_loader = DataLoader(train_ds, batch_size=4096, shuffle=True, num_workers=0)
valid_loader = DataLoader(valid_ds, batch_size=8192, shuffle=False, num_workers=0)

# Prefer the GPU when one is visible to torch.
device = "cuda" if torch.cuda.is_available() else "cpu"

# 1) Attention layer (weights 반환)

class AttnEncoderLayer(nn.Module):
    """Post-norm Transformer encoder layer that can expose its attention weights.

    Self-attention followed by a GELU feed-forward block (4x expansion), each
    wrapped in a residual connection and LayerNorm.

    Args:
        d_model: Token embedding width.
        n_heads: Number of attention heads.
        dropout: Dropout probability used in attention and the feed-forward block.
    """

    def __init__(self, d_model=64, n_heads=4, dropout=0.1):
        super().__init__()
        self.mha = nn.MultiheadAttention(d_model, n_heads, dropout=dropout, batch_first=True)
        self.ln1 = nn.LayerNorm(d_model)
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model * 4, d_model),
            nn.Dropout(dropout),
        )
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x, return_attn=False):
        """Encode ``x`` of shape [B, T, D].

        Returns the encoded tensor, or ``(encoded, attn_w)`` when
        ``return_attn`` is true, where ``attn_w`` holds the per-head weights
        with shape [B, heads, T, T].
        """
        # Only ask for the weights when the caller wants them. Requesting them
        # unconditionally materialises a [B, heads, T, T] tensor on every
        # training/eval step just to throw it away.
        attn_out, attn_w = self.mha(
            x, x, x,
            need_weights=return_attn,
            average_attn_weights=False,
        )
        x = self.ln1(x + attn_out)
        x = self.ln2(x + self.ff(x))
        if return_attn:
            return x, attn_w
        return x


# 2) Tabular Transformer (CLS 토큰 사용)

class TabularAttentionModel(nn.Module):
    """Transformer over per-feature tokens with a CLS token for binary classification.

    Each scalar feature is embedded by its own ``Linear(1, d_model)``; a learned
    CLS token is prepended, the sequence passes through ``n_layers`` encoder
    layers, and the CLS output is mapped to a single logit.
    """

    def __init__(self, n_features, d_model=64, n_heads=4, n_layers=2, dropout=0.1):
        super().__init__()
        self.n_features = n_features
        self.d_model = d_model

        # One independent 1 -> d_model projection per input feature.
        projections = [nn.Linear(1, d_model) for _ in range(n_features)]
        self.feat_proj = nn.ModuleList(projections)

        # Learned CLS token, initialised with small Gaussian noise.
        cls_init = torch.zeros(1, 1, d_model)
        nn.init.normal_(cls_init, std=0.02)
        self.cls = nn.Parameter(cls_init)

        encoder_stack = [AttnEncoderLayer(d_model, n_heads, dropout) for _ in range(n_layers)]
        self.layers = nn.ModuleList(encoder_stack)

        self.head = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model, 1),
        )

    def forward(self, X, return_attn=False):
        """Return logits of shape [B] for ``X`` of shape [B, F].

        With ``return_attn=True`` also returns a list holding one
        [B, heads, T, T] attention tensor per encoder layer (T = 1 + F).
        """
        batch_size, _ = X.shape  # unpacking also enforces a 2-D input

        # Feature tokens: each [B, 1] column goes through its own projection -> [B, F, D].
        columns = X.split(1, dim=1)
        tokens = torch.stack(
            [self.feat_proj[j](col) for j, col in enumerate(columns)],
            dim=1,
        )

        # Prepend the CLS token: [B, 1 + F, D].
        cls_tok = self.cls.expand(batch_size, -1, -1)
        hidden = torch.cat([cls_tok, tokens], dim=1)

        attn_maps = []
        for layer in self.layers:
            if not return_attn:
                hidden = layer(hidden, return_attn=False)
                continue
            hidden, weights = layer(hidden, return_attn=True)
            attn_maps.append(weights)

        # Predict from the CLS position only.
        logit = self.head(hidden[:, 0, :]).squeeze(1)

        return (logit, attn_maps) if return_attn else logit


# 3) 학습 루프

# Build the network, then the loss and optimizer that train it.
model = TabularAttentionModel(
    n_features=n_features,
    d_model=64,
    n_heads=4,
    n_layers=2,
    dropout=0.1,
)
model = model.to(device)
loss_fn = nn.BCEWithLogitsLoss()
opt = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-2)

def eval_model():
    """Score the module-level ``model`` on ``valid_loader``.

    Returns:
        Tuple ``(roc_auc, average_precision)`` computed over every validation batch.
    """
    model.eval()
    prob_chunks = []
    label_chunks = []
    with torch.no_grad():
        for features, labels in valid_loader:
            probs = torch.sigmoid(model(features.to(device)))
            prob_chunks.append(probs.cpu().numpy())
            label_chunks.append(labels.numpy())

    y_prob = np.concatenate(prob_chunks)
    y_true = np.concatenate(label_chunks)
    auc_score = roc_auc_score(y_true, y_prob)
    ap_score = average_precision_score(y_true, y_prob)
    return auc_score, ap_score

EPOCHS = 5
for epoch in range(1, EPOCHS + 1):
    # eval_model() switches to eval mode, so re-enable training mode each epoch.
    model.train()
    progress = tqdm(train_loader, desc=f"train epoch {epoch}", leave=False)
    for features, labels in progress:
        features = features.to(device)
        labels = labels.to(device)

        opt.zero_grad(set_to_none=True)
        batch_loss = loss_fn(model(features), labels)
        batch_loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()

        progress.set_postfix(loss=batch_loss.item())

    auc, pr = eval_model()
    print(f"[epoch {epoch}] valid AUC={auc:.5f}  PR-AUC={pr:.5f}")


# 4) Attention 추출 → 컬럼 중요도
# - CLS(0번 토큰)에서 각 feature 토큰으로 가는 attention을 사용

def extract_feature_attention_importance(model, loader, n_batches=50):
    """Rank features by the last-layer attention the CLS token pays to them.

    Runs ``model`` in eval mode on at most ``n_batches`` batches of ``loader``.
    For each batch it takes the last layer's attention from the CLS query
    (token 0) to the feature tokens (tokens 1..F), averages over heads, and
    accumulates the result over samples.

    Args:
        model: Module whose ``forward(x, return_attn=True)`` returns
            ``(logit, [attn per layer])`` with attn shaped [B, heads, T, T].
        loader: Iterable of ``(features, labels)`` batches.
        n_batches: Maximum number of batches to consume.

    Returns:
        ``pd.Series`` of mean attention per feature, indexed by the
        module-level ``feature_names`` and sorted in descending order.
    """
    from itertools import islice

    model.eval()
    # Run on the device the model actually lives on rather than a global.
    model_device = next(model.parameters()).device

    att_sum = np.zeros((len(feature_names),), dtype=np.float64)
    n_rows = 0

    with torch.no_grad():
        batches = tqdm(
            islice(loader, n_batches),
            desc="extract attention",
            total=min(n_batches, len(loader)),
        )
        for xb, _ in batches:
            xb = xb.to(model_device)

            _, attn_all = model(xb, return_attn=True)
            attn = attn_all[-1]  # [B, heads, T, T]

            # CLS -> feature-token attention: query=0, key=1..F -> [B, heads, F]
            cls_to_feat = attn[:, :, 0, 1:]

            # Average over heads, then SUM over the batch. Dividing by the
            # total row count at the end makes this a per-sample mean, so a
            # shorter final batch is not over-weighted.
            att_sum += cls_to_feat.mean(dim=1).sum(dim=0).cpu().numpy()
            n_rows += xb.shape[0]

    att_mean = att_sum / max(n_rows, 1)
    imp = pd.Series(att_mean, index=feature_names).sort_values(ascending=False)
    return imp

# Attention-based ranking from the first 50 validation batches.
att_imp = extract_feature_attention_importance(
    model=model,
    loader=valid_loader,
    n_batches=50,
)
print("\nTop Attention features:\n", att_imp.head(30))


train epoch 1:   0%|          | 0/1038 [00:00<?, ?it/s]

[epoch 1] valid AUC=0.98043  PR-AUC=0.75369


train epoch 2:   0%|          | 0/1038 [00:00<?, ?it/s]

[epoch 2] valid AUC=0.98360  PR-AUC=0.76254


train epoch 3:   0%|          | 0/1038 [00:00<?, ?it/s]

[epoch 3] valid AUC=0.98491  PR-AUC=0.76464


train epoch 4:   0%|          | 0/1038 [00:00<?, ?it/s]

[epoch 4] valid AUC=0.98561  PR-AUC=0.76796


train epoch 5:   0%|          | 0/1038 [00:00<?, ?it/s]

[epoch 5] valid AUC=0.98671  PR-AUC=0.76746


extract attention:   0%|          | 0/50 [00:00<?, ?it/s]


Top Attention features:
 client_mcc_repeat_cnt_last5    0.128525
amount_deviation               0.076894
log_abs_amount                 0.060830
has_error                      0.049783
tx_hour                        0.047700
merchant_change_cnt_last5      0.042738
err_bad_cvv                    0.035408
mcc_highrisk_90                0.035325
card_error_last3               0.030601
card_mcc_is_new                0.028266
dev_x_mccnew                   0.027846
client_fraud_last3             0.026458
err_bad_card_number            0.025621
current_age                    0.023396
mccnew_x_velocity              0.022851
client_tx_1h_avg_prev          0.022654
num_credit_cards               0.021627
card_error_last1               0.021053
sin_shift                      0.020894
hour_sin                       0.020816
hour_cos                       0.018760
seconds_since_prev_tx          0.017846
card_fraud_last3               0.017270
client_merchant_is_new         0.015743
card_hist_x_er

In [18]:
# Side-by-side table of both importance measures; a feature missing from
# either ranking gets 0. Rank 1 = most important, ties share the lowest rank.
compare = (
    pd.DataFrame({"shap_mean_abs": imp, "attn_cls2feat": att_imp})
    .fillna(0.0)
    .assign(
        shap_rank=lambda t: t["shap_mean_abs"].rank(ascending=False, method="min"),
        attn_rank=lambda t: t["attn_cls2feat"].rank(ascending=False, method="min"),
        # Positive gap: the feature ranks worse by attention than by SHAP.
        rank_gap=lambda t: t["attn_rank"] - t["shap_rank"],
    )
)

print(compare.sort_values("shap_mean_abs", ascending=False).head(40))


                              shap_mean_abs  attn_cls2feat  shap_rank  \
card_fraud_last3                   0.061796       0.017270        1.0   
card_merchant_is_new               0.015708       0.007011        2.0   
client_fraud_last3                 0.012736       0.026458        3.0   
client_merchant_is_new             0.011837       0.015743        4.0   
amount_deviation                   0.010041       0.076894        5.0   
log_abs_amount                     0.004591       0.060830        6.0   
hour_sin                           0.004169       0.020816        7.0   
tx_month                           0.001326       0.013703        8.0   
tx_hour                            0.001248       0.047700        9.0   
log_yearly_income                  0.000990       0.008328       10.0   
mccnew_x_velocity                  0.000973       0.022851       11.0   
sin_shift                          0.000947       0.020894       12.0   
client_sin_mean_past               0.000763       0

---

## Ablation Test

In [19]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, average_precision_score
from tqdm.auto import tqdm

# LightGBM settings shared by every ablation run (AUC is the only tracked metric).
params = {
    "objective": "binary",
    "metric": "auc",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "min_data_in_leaf": 200,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "verbosity": -1,
}

def train_eval(X_tr, y_tr, X_va, y_va):
    """Fit LightGBM with the module-level ``params`` and score it on the validation data.

    Trains for up to 3000 rounds with early stopping (patience 100) on the
    validation set, then predicts with the best iteration.

    Args:
        X_tr, y_tr: Training features and labels.
        X_va, y_va: Validation features and labels (also used for early stopping).

    Returns:
        Dict with ``"auc"`` (ROC-AUC), ``"prauc"`` (average precision) and
        ``"best_iter"`` (iteration selected by early stopping).

    Warns:
        UserWarning: when early stopping selects one of the first few
            iterations. Metrics from such a model reflect a few trees on a
            random 80% feature subset, so differences between runs with
            different column sets are not evidence about the dropped feature.
    """
    import warnings

    min_trusted_iters = 10

    dtrain = lgb.Dataset(X_tr, label=y_tr, free_raw_data=False)
    dvalid = lgb.Dataset(X_va, label=y_va, free_raw_data=False)

    bst = lgb.train(
        params,
        dtrain,
        num_boost_round=3000,
        valid_sets=[dvalid],
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)],
    )

    if bst.best_iteration < min_trusted_iters:
        warnings.warn(
            f"train_eval: early stopping selected iteration {bst.best_iteration}; "
            "the returned metrics come from a near-empty model and should not "
            "be used to compare feature sets.",
            stacklevel=2,
        )

    pred = bst.predict(X_va, num_iteration=bst.best_iteration)

    return {
        "auc": roc_auc_score(y_va, pred),
        "prauc": average_precision_score(y_va, pred),
        "best_iter": bst.best_iteration,
    }


In [20]:
# Reference score with every feature present; the ablation deltas are measured against it.
base_result = train_eval(X_tr=X_train, y_tr=y_train, X_va=X_valid, y_va=y_valid)
print("BASE:", base_result)

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.960813
BASE: {'auc': 0.9608128060808006, 'prauc': 0.6516423281572084, 'best_iter': 1}


In [21]:
# Drop-one ablation: retrain without each feature in turn and record how far
# the validation metrics fall relative to the baseline (positive = got worse).
drop_one_results = []

for col in tqdm(X_train.columns):
    res = train_eval(
        X_train.drop(columns=[col]),
        y_train,
        X_valid.drop(columns=[col]),
        y_valid,
    )

    row = {
        "dropped_feature": col,
        "auc_drop": base_result["auc"] - res["auc"],
        "prauc_drop": base_result["prauc"] - res["prauc"],
        "auc": res["auc"],
        "prauc": res["prauc"],
    }
    drop_one_results.append(row)

drop_one_df = pd.DataFrame(drop_one_results).sort_values(by="auc_drop", ascending=False)

drop_one_df

  0%|          | 0/37 [00:00<?, ?it/s]

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.969617
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.969617
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.969617
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.969617
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.969617
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.969617
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.969617
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.969617


Unnamed: 0,dropped_feature,auc_drop,prauc_drop,auc,prauc
19,mcc_highrisk_90,-0.002848,0.112948,0.963661,0.538694
15,client_sin_mean_past,-0.004962,-0.078413,0.965775,0.730055
12,hour_sin,-0.006429,-0.079296,0.967241,0.730939
30,card_velocity_spike_ratio,-0.008379,-0.071575,0.969192,0.723217
31,client_tx_1h_avg_prev,-0.008379,-0.071575,0.969192,0.723217
29,client_avg_interval_prev,-0.008379,-0.071575,0.969192,0.723217
27,merchant_is_new_x_has_error,-0.008449,-0.072212,0.969262,0.723854
28,seconds_since_prev_tx,-0.008452,-0.064112,0.969265,0.715755
25,client_merchant_is_new,-0.008457,-0.072062,0.96927,0.723705
26,merchant_change_cnt_last5,-0.008457,-0.072062,0.96927,0.723705


---

# Stage2 Feature Selection Summary

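(SHAP + Attention + Ablation 종합 정리)

> **주의:** 위 LightGBM 실행(SHAP, BASE, drop-one)은 모두 `best_iteration = 1`에서 조기 종료되었고, 출력된 drop-one 결과의 `auc_drop`은 전부 음수(feature 제거 시 AUC가 오히려 상승)이다. 따라서 아래 표에 적힌 SHAP/Ablation 근거는 모델 학습이 안정화된 뒤 재검증이 필요하다.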

---

## 1. Strong Keep (핵심 유지)

| Feature              | 근거                                    |
| -------------------- | ------------------------------------- |
| `card_fraud_last3`   | SHAP 1위, Ablation 최대 하락 (Stage2 핵심 축) |
| `client_fraud_last3` | SHAP 상위, 제거 시 AUC 유의미 감소              |
| `amount_deviation`   | SHAP Top5, drop 시 성능 급감               |
| `mccnew_x_velocity`  | Ablation 상위권, 구조적 상호작용 핵심             |
| `dev_x_mccnew`       | drop 시 성능 크게 감소, 상호작용 리스크 포착          |

이들은 Stage2의 **행동 붕괴 + 과거 이력 결합 구조**를 형성하는 중심 축이다.

---

## 2. Keep (유지 권장)

| Feature                       | 근거                            |
| ----------------------------- | ----------------------------- |
| `client_merchant_is_new`      | SHAP/ablation 모두 일정 기여        |
| `card_merchant_is_new`        | merchant novelty 구조 보강        |
| `merchant_change_cnt_last5`   | 행동 변화 신호                      |
| `card_mcc_change_cnt_last5`   | MCC 변화 패턴                     |
| `client_mcc_repeat_cnt_last5` | Attention 매우 높음 + ablation 기여 |
| `tx_hour`                     | 시간 패턴 (Stage2 보조 신호)          |
| `hour_sin` / `hour_cos`       | 순환 시간 패턴 안정화                  |
| `client_tx_1h_avg_prev`       | 단기 행동 평균                      |
| `card_velocity_spike_ratio`   | 속도 이상 보강                      |
| `current_age`                 | 약하지만 안정적 맥락 변수                |
| `log_yearly_income`           | 개인 재무 맥락 보조                   |

이 그룹은 단독 SHAP은 낮지만
**Ablation에서 구조적 기여를 확인**할 수 있음.

---

## 3. Drop 후보 (중복/기여 낮음)

| Feature                        | 근거               |
| ------------------------------ | ---------------- |
| `err_bad_card_number`          | SHAP 0, 영향 거의 없음 |
| `err_bad_cvv`                  | SHAP 0           |
| `err_bad_expiration`           | SHAP 0           |
| `err_insufficient_balance`     | SHAP 0           |
| `card_error_last1/3/5`         | Stage1과 중복       |
| `card_hist_x_error`            | 기여 없음            |
| `has_error`                    | 단독 영향 미미         |
| `mcc_highrisk_90`              | Stage1에서 충분히 반영  |
| `amount_vs_client_avg_diff`    | 중복 구조 (다중공선성 단계 In [15]에서 이미 제거) |
| `client_mcc_is_new`            | SHAP 0 (다중공선성 단계 In [15]에서 이미 제거) |
| `merchant_is_new_x_mcc_is_new` | 영향 없음 (다중공선성 단계 In [15]에서 이미 제거) |

Error 세부 변수는 Stage1에서 이미 충분히 설명되었으며
Stage2에서는 중복 신호로 작동할 가능성이 높음.

---

## 4. SHAP vs Attention 괴리 해석

### 대표 사례

| Feature                       | 현상                      |
| ----------------------------- | ----------------------- |
| `client_mcc_repeat_cnt_last5` | Attention 1위, SHAP 거의 0 |
| `has_error`                   | Attention 상위, SHAP 낮음   |
| 일부 error 계열                   | Attention 존재, 실제 기여 낮음  |

**해석**

* Attention은 “참고한 정도”
* SHAP은 “결정 경계에 기여한 정도”

따라서 일부 변수는 모델이 구조상 참고는 하지만
최종 분리 경계 형성에는 거의 기여하지 않음.

---

## 5. Stage2 구조 해석

현재 Stage2는 다음 5개 축으로 요약 가능:

1. Card / Client Fraud History
2. Merchant Novelty + Velocity 결합
3. Amount 개인 평균 대비 이탈
4. 행동 변화 (MCC 반복/변경)
5. 시간 패턴 보조

Stage1이 “거래 단위 이상 탐지”라면
Stage2는 “행동 프로파일 붕괴 탐지”에 해당한다.

---

## 6. Slim Stage2 

```python
CORE_STAGE2 = [
    # Fraud history
    "card_fraud_last3",
    "client_fraud_last3",

    # Amount anomaly
    "amount_deviation",

    # Merchant novelty + interaction
    "client_merchant_is_new",
    "card_merchant_is_new",
    "mccnew_x_velocity",
    "dev_x_mccnew",

    # Behavior pattern
    "client_mcc_repeat_cnt_last5",
    "card_mcc_change_cnt_last5",
    "merchant_change_cnt_last5",

    # Velocity context
    "client_tx_1h_avg_prev",
    "card_velocity_spike_ratio",

    # Time context
    "tx_hour",
    "hour_sin",
    "hour_cos",

    # Demographic context
    "current_age",
    "log_yearly_income",
]
```

이 구성은:

* Stage1과 중복 최소화
* 상호작용 구조 유지
* 행동 기반 리스크 축 보존
* Error 세부 변수 제거
* 성능 하락 없이 경량화 가능성 높음

---
