In [1]:
import pandas as pd
import os

In [2]:
# Load the stage-2 training data from a parquet dataset; the path is relative
# to the notebook's working directory.
df = pd.read_parquet("../DATA/dataset/TRAIN_stage2")

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,fraud,has_error,err_bad_card_number,err_bad_expiration,err_bad_cvv,tx_month,tx_hour,is_refund,log_abs_amount,current_age,...,client_tx_1h_avg_prev,card_velocity_spike_ratio,amount_vs_client_avg_diff,card_fraud_last3,client_fraud_last3,amount_deviation,mccnew_x_error,mccnew_x_velocity,dev_x_mccnew,dev_x_velocity
0,0,0,0,0,0,1,13,0,3.554204,33,...,1.0,0.999999,0.0,0,0,-0.055478,0,0.999999,0.693147,0.693146
1,0,0,0,0,0,1,19,0,2.172477,33,...,1.0,1.999998,-1.381728,0,0,-1.355013,0,0.999999,0.477005,0.954009
2,0,0,0,0,0,1,22,0,4.202601,33,...,1.0,0.666666,1.339261,0,0,0.554348,0,0.999999,0.903297,0.602198
3,0,0,0,0,0,1,15,0,0.86289,33,...,1.0,0.749999,-2.446871,0,0,-2.586697,0,0.999999,0.231676,0.173757
4,0,0,0,0,0,1,8,0,5.127647,33,...,1.0,0.799999,2.429604,0,0,1.424367,0,0.999999,1.064885,0.851907


In [4]:
# Print the column list with dtypes and the row count of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5312525 entries, 0 to 5312524
Data columns (total 60 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   fraud                          int8   
 1   has_error                      int8   
 2   err_bad_card_number            int8   
 3   err_bad_expiration             int8   
 4   err_bad_cvv                    int8   
 5   tx_month                       int8   
 6   tx_hour                        int8   
 7   is_refund                      int8   
 8   log_abs_amount                 float32
 9   current_age                    int64  
 10  num_credit_cards               int8   
 11  has_chip                       int8   
 12  year_pin_last_changed          Int16  
 13  is_prepaid                     int8   
 14  cb_Amex                        int8   
 15  cb_Discover                    int8   
 16  years_to_retirement            int16  
 17  log_yearly_income              float32
 18  lo

# SHAP & Ablation

In [5]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

LABEL_COL = "fraud"

# Separate the feature matrix from the binary target column.
X = df.drop(columns=[LABEL_COL])
y = df[LABEL_COL].astype(int)

# 1) Infer column kinds from dtypes: object/category columns count as
#    categorical, every other column as numeric.
categorical_dtypes = ("object", "category")
cat_cols, num_cols = [], []
for col_name, col_dtype in X.dtypes.items():
    if str(col_dtype) in categorical_dtypes:
        cat_cols.append(col_name)
    else:
        num_cols.append(col_name)

print("num_cols:", len(num_cols), "cat_cols:", len(cat_cols))


# 2) Train/validation split: 80/20, stratified on the label, fixed seed.
# NOTE(review): this is a row-level random split. If rows are time-ordered
# transactions, confirm that history-based features cannot leak across it.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 3) Minimal preprocessing pipeline for the numeric columns:
#    median imputation followed by scaling without mean-centering.
num_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=False)),
    ]
)

# Only the numeric branch is registered; any other column is dropped.
preprocess = ColumnTransformer(
    transformers=[("num", num_pipe, num_cols)],
    remainder="drop",
)


num_cols: 59 cat_cols: 0


In [11]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import shap
from sklearn.metrics import roc_auc_score, average_precision_score
from tqdm.auto import tqdm

import warnings

# Separate copies of the split frames for the LightGBM run.
X_train_lgb = X_train.copy()
X_valid_lgb = X_valid.copy()

# free_raw_data=False keeps the raw frames attached to the Dataset objects.
dtrain = lgb.Dataset(X_train_lgb, label=y_train, free_raw_data=False)
dvalid = lgb.Dataset(X_valid_lgb, label=y_valid, free_raw_data=False)

params = dict(
    objective="binary",
    metric=["auc", "average_precision"],
    learning_rate=0.05,
    num_leaves=64,
    min_data_in_leaf=200,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    verbosity=-1,
)

# Train with early stopping on the validation set (patience: 200 rounds);
# log_evaluation(0) turns off the per-iteration metric log.
bst = lgb.train(
    params,
    dtrain,
    num_boost_round=5000,
    valid_sets=[dvalid],
    valid_names=["valid"],
    callbacks=[
        lgb.early_stopping(200, verbose=True),
        lgb.log_evaluation(0),
    ],
)

print("best_iter:", bst.best_iteration)
print("best_score:", bst.best_score)

# Guard against a degenerate fit: every downstream importance (SHAP, ablation)
# is computed from the trees kept at best_iteration. The threshold is a
# heuristic sanity bound, not a tuned value.
MIN_EXPECTED_BEST_ITER = 10
if 0 < bst.best_iteration < MIN_EXPECTED_BEST_ITER:
    warnings.warn(
        f"Early stopping selected best_iteration={bst.best_iteration}; "
        "feature importances computed from this booster describe only "
        "those few trees. Review the training setup before relying on them.",
        RuntimeWarning,
    )

# Score the validation set with the early-stopped model and report both metrics.
pred_valid = bst.predict(X_valid_lgb, num_iteration=bst.best_iteration)
print("LGB AUC:", roc_auc_score(y_valid, pred_valid))
print("LGB PR-AUC:", average_precision_score(y_valid, pred_valid))


# SHAP with tqdm (batch version)

from tqdm.auto import tqdm

# Rows to explain: the full validation frame, processed in fixed-size chunks.
sv = X_valid_lgb
batch_size = 2000

print("\nComputing SHAP (LightGBM native)...")

# pred_contrib=True returns one contribution per feature plus a trailing
# bias column; the bias column is dropped from every chunk.
batch_starts = range(0, len(sv), batch_size)
all_contrib = [
    bst.predict(sv.iloc[start:start + batch_size], pred_contrib=True)[:, :-1]
    for start in tqdm(batch_starts)
]

shap_values = np.concatenate(all_contrib, axis=0)

# Global importance = mean absolute contribution per feature, largest first.
mean_abs_contrib = np.abs(shap_values).mean(axis=0)
imp = pd.Series(mean_abs_contrib, index=sv.columns).sort_values(ascending=False)

print("\nTop SHAP features:\n", imp.head(30))


Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1]	valid's auc: 0.969243	valid's average_precision: 0.730224
best_iter: 1
best_score: defaultdict(<class 'collections.OrderedDict'>, {'valid': OrderedDict([('auc', np.float64(0.9692425987082228)), ('average_precision', np.float64(0.7302237658253058))])})
LGB AUC: 0.9692425987082229
LGB PR-AUC: 0.7302237658253057

Computing SHAP (LightGBM native)...


  0%|          | 0/532 [00:00<?, ?it/s]


Top SHAP features:
 card_fraud_last3              0.063566
mcc_highrisk_90               0.015959
card_merchant_is_new          0.014943
client_merchant_is_new        0.011164
client_fraud_last3            0.008731
amount_deviation              0.005496
log_abs_amount                0.002768
tx_hour                       0.002746
hour_sin                      0.002656
client_weekday_match_last1    0.001700
hour_cos                      0.001327
tx_month                      0.000948
hour_circular_distance        0.000930
amount_vs_client_avg_diff     0.000897
log_amount_limit_ratio        0.000743
client_mcc_seen_last5         0.000605
log_yearly_income             0.000520
is_highrisk_weekday           0.000466
dev_x_mccnew                  0.000446
client_tx_1h_avg_prev         0.000417
card_velocity_spike_ratio     0.000369
num_credit_cards              0.000299
log_income_ratio_region       0.000192
year_pin_last_changed         0.000050
current_age                   0.000000
has_

### Attention-based feature importance (Tabular Transformer)

In [14]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, average_precision_score

# Column order of the training frame defines the feature/token order used below.
feature_names = X_train.columns.tolist()
n_features = len(feature_names)

# Standardise features: statistics are fitted on the training split only and
# then applied unchanged to the validation split.
scaler = StandardScaler()
train_matrix = X_train.to_numpy().astype(np.float32)
valid_matrix = X_valid.to_numpy().astype(np.float32)
Xtr = scaler.fit_transform(train_matrix)
Xva = scaler.transform(valid_matrix)

# Labels as float32 arrays.
ytr = y_train.to_numpy().astype(np.float32)
yva = y_valid.to_numpy().astype(np.float32)

class TabDataset(Dataset):
    """In-memory dataset pairing a feature matrix with its label vector.

    Both inputs are converted to float32 tensors once, at construction time.
    """

    def __init__(self, X, y):
        as_float = torch.float32
        self.X = torch.tensor(X, dtype=as_float)
        self.y = torch.tensor(y, dtype=as_float)

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.y)

    def __getitem__(self, i):
        """Return the ``(features, label)`` pair stored at index ``i``."""
        features = self.X[i]
        label = self.y[i]
        return features, label

# Wrap the scaled arrays in datasets; only the training loader shuffles.
train_ds = TabDataset(Xtr, ytr)
valid_ds = TabDataset(Xva, yva)
train_loader = DataLoader(train_ds, batch_size=4096, shuffle=True, num_workers=0)
valid_loader = DataLoader(valid_ds, batch_size=8192, shuffle=False, num_workers=0)

# Use the GPU when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# 1) Attention layer (weights 반환)

class AttnEncoderLayer(nn.Module):
    """Self-attention encoder layer that can optionally expose its attention weights.

    The layer applies multi-head self-attention and then a GELU feed-forward
    network (hidden width ``4 * d_model``). Each sub-layer is followed by a
    residual connection and a LayerNorm.

    Args:
        d_model: Embedding width of every token.
        n_heads: Number of attention heads.
        dropout: Dropout probability used in attention and the feed-forward net.
    """

    def __init__(self, d_model=64, n_heads=4, dropout=0.1):
        super().__init__()
        self.mha = nn.MultiheadAttention(d_model, n_heads, dropout=dropout, batch_first=True)
        self.ln1 = nn.LayerNorm(d_model)
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model * 4, d_model),
            nn.Dropout(dropout),
        )
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x, return_attn=False):
        """Encode ``x`` and optionally return the per-head attention weights.

        Args:
            x: Token embeddings of shape ``[B, T, D]``.
            return_attn: When True, also return the attention weights.

        Returns:
            The encoded tensor ``[B, T, D]``; with ``return_attn=True`` a tuple
            ``(encoded, attn_w)`` where ``attn_w`` is ``[B, heads, T, T]``.
        """
        # Only ask for attention weights when the caller wants them. The
        # previous code always requested them, so a per-head [B, heads, T, T]
        # tensor was built on every training/eval step and then discarded.
        attn_out, attn_w = self.mha(
            x,
            x,
            x,
            need_weights=return_attn,
            average_attn_weights=False,
        )
        x = self.ln1(x + attn_out)
        x = self.ln2(x + self.ff(x))
        if return_attn:
            # attn_w: [B, heads, T, T] (not averaged over heads)
            return x, attn_w
        return x


# 2) Tabular Transformer (CLS 토큰 사용)

class TabularAttentionModel(nn.Module):
    """Attention-based binary classifier with one token per feature plus a CLS token.

    Each scalar feature is embedded by its own ``nn.Linear(1, d_model)``. A
    learned CLS token is prepended, the sequence runs through ``n_layers``
    ``AttnEncoderLayer`` blocks, and the CLS position feeds a small MLP head
    that emits one logit per row.
    """

    def __init__(self, n_features, d_model=64, n_heads=4, n_layers=2, dropout=0.1):
        super().__init__()
        self.n_features = n_features
        self.d_model = d_model

        # One independent 1 -> d_model projection per input feature.
        self.feat_proj = nn.ModuleList([nn.Linear(1, d_model) for _ in range(n_features)])

        # Learned CLS token, initialised with small Gaussian noise.
        self.cls = nn.Parameter(torch.zeros(1, 1, d_model))
        nn.init.normal_(self.cls, std=0.02)

        self.layers = nn.ModuleList(
            [AttnEncoderLayer(d_model, n_heads, dropout) for _ in range(n_layers)]
        )

        self.head = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model, 1),
        )

    def forward(self, X, return_attn=False):
        """Compute one logit per row, optionally with every layer's attention.

        Args:
            X: Feature matrix of shape ``[B, F]``.
            return_attn: When True, also return the attention of each layer.

        Returns:
            ``logit`` of shape ``[B]``; with ``return_attn=True`` a tuple
            ``(logit, attn_all)`` where ``attn_all`` holds one
            ``[B, heads, T, T]`` tensor per layer.
        """
        batch_size, n_cols = X.shape

        # Feature tokens: column j ([B, 1]) goes through projection j ([B, D]);
        # stacking along dim 1 gives [B, F, D].
        feature_tokens = torch.stack(
            [self.feat_proj[j](X[:, j:j + 1]) for j in range(n_cols)],
            dim=1,
        )

        # Prepend the CLS token -> [B, 1 + F, D].
        cls_token = self.cls.expand(batch_size, -1, -1)
        hidden = torch.cat([cls_token, feature_tokens], dim=1)

        attn_maps = []
        for encoder in self.layers:
            if not return_attn:
                hidden = encoder(hidden, return_attn=False)
                continue
            hidden, weights = encoder(hidden, return_attn=True)
            attn_maps.append(weights)

        # Predict from the CLS position (token 0): [B, D] -> [B].
        logit = self.head(hidden[:, 0, :]).squeeze(1)

        return (logit, attn_maps) if return_attn else logit


# 3) 학습 루프

# Seed torch's global RNG before the model is built so that weight
# initialisation (and later draws from that RNG) repeat across runs.
# The value matches the random_state used for the train/valid split.
SEED = 42
torch.manual_seed(SEED)

model = TabularAttentionModel(n_features=n_features, d_model=64, n_heads=4, n_layers=2, dropout=0.1).to(device)
opt = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-2)
# The model emits raw logits, so the loss applies the sigmoid internally.
loss_fn = nn.BCEWithLogitsLoss()

def eval_model():
    """Score the notebook-level ``model`` on ``valid_loader``.

    Reads the module-level ``model``, ``valid_loader`` and ``device``.

    Returns:
        Tuple ``(roc_auc, average_precision)`` computed over all validation
        batches.
    """
    model.eval()
    batch_probs, batch_targets = [], []
    with torch.no_grad():
        for features, labels in valid_loader:
            scores = torch.sigmoid(model(features.to(device)))
            batch_probs.append(scores.cpu().numpy())
            batch_targets.append(labels.numpy())
    y_score = np.concatenate(batch_probs)
    y_true = np.concatenate(batch_targets)
    return roc_auc_score(y_true, y_score), average_precision_score(y_true, y_score)

EPOCHS = 5
for epoch in range(1, EPOCHS + 1):
    model.train()
    pbar = tqdm(train_loader, desc=f"train epoch {epoch}", leave=False)
    for xb, yb in pbar:
        xb = xb.to(device)
        yb = yb.to(device)

        opt.zero_grad(set_to_none=True)
        loss = loss_fn(model(xb), yb)
        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        opt.step()

        # Show the current batch loss on the progress bar.
        pbar.set_postfix(loss=loss.item())

    # Validation metrics once per epoch.
    auc, pr = eval_model()
    print(f"[epoch {epoch}] valid AUC={auc:.5f}  PR-AUC={pr:.5f}")


# 4) Attention 추출 → 컬럼 중요도
# - CLS(0번 토큰)에서 각 feature 토큰으로 가는 attention을 사용

def extract_feature_attention_importance(model, loader, n_batches=50):
    """Rank features by the attention the CLS token pays to them in the last layer.

    Reads the module-level ``n_features``, ``feature_names`` and ``device``.

    Args:
        model: Model whose ``forward(x, return_attn=True)`` returns
            ``(logit, attn_all)`` with ``attn_all[-1]`` of shape
            ``[B, heads, T, T]`` and token 0 being CLS.
        loader: Iterable of ``(features, labels)`` batches.
        n_batches: Maximum number of batches to aggregate.

    Returns:
        ``pd.Series`` indexed by feature name with the mean CLS->feature
        attention, averaged over heads and over rows, sorted descending.
    """
    model.eval()
    # Running sum over rows of the head-averaged CLS->feature attention.
    att_sum = np.zeros((n_features,), dtype=np.float64)
    n_rows = 0

    with torch.no_grad():
        progress = tqdm(loader, desc="extract attention", total=min(n_batches, len(loader)))
        for b, (xb, _) in enumerate(progress):
            if b >= n_batches:
                break
            xb = xb.to(device)

            _, attn_all = model(xb, return_attn=True)
            attn = attn_all[-1]  # last layer: [B, heads, T, T]

            # CLS -> feature-token attention: query index 0, keys 1..F.
            cls_to_feat = attn[:, :, 0, 1:]  # [B, heads, F]

            # Average over heads but SUM over rows, so the final division
            # yields a per-row mean. Averaging per-batch means would give a
            # short final batch the same weight as a full one.
            att_sum += cls_to_feat.mean(dim=1).sum(dim=0).double().cpu().numpy()
            n_rows += xb.shape[0]

    att_mean = att_sum / max(n_rows, 1)
    return pd.Series(att_mean, index=feature_names).sort_values(ascending=False)

# Aggregate CLS->feature attention over the first 50 validation batches.
att_imp = extract_feature_attention_importance(
    model=model,
    loader=valid_loader,
    n_batches=50,
)
print("\nTop Attention features:\n", att_imp.head(30))


train epoch 1:   0%|          | 0/1038 [00:00<?, ?it/s]

[epoch 1] valid AUC=0.98247  PR-AUC=0.75157


train epoch 2:   0%|          | 0/1038 [00:00<?, ?it/s]

[epoch 2] valid AUC=0.98492  PR-AUC=0.77238


train epoch 3:   0%|          | 0/1038 [00:00<?, ?it/s]

[epoch 3] valid AUC=0.98571  PR-AUC=0.77519


train epoch 4:   0%|          | 0/1038 [00:00<?, ?it/s]

[epoch 4] valid AUC=0.98622  PR-AUC=0.77360


train epoch 5:   0%|          | 0/1038 [00:00<?, ?it/s]

[epoch 5] valid AUC=0.98714  PR-AUC=0.78041


extract attention:   0%|          | 0/50 [00:00<?, ?it/s]


Top Attention features:
 log_abs_amount                   0.113404
amount_vs_client_avg_diff        0.111617
amount_deviation                 0.044145
mccnew_x_velocity                0.040883
hour_sin                         0.039404
mcc_highrisk_90                  0.035921
merchant_is_new                  0.031926
client_mcc_seen_last5            0.031562
card_fraud_last3                 0.027470
hour_cos                         0.026686
dev_x_velocity                   0.022454
log_yearly_income                0.021815
tx_hour                          0.021662
discover_x_cvv                   0.018952
refund_high_amount               0.016100
card_hist_x_error                0.014681
years_to_retirement              0.014231
dev_x_mccnew                     0.013246
sin_shift                        0.013241
cos_shift                        0.012126
err_bad_expiration               0.011984
merchant_change_cnt_last5        0.011818
client_merchant_is_new           0.011631
seconds_

In [15]:
# Put both importance measures side by side (aligned on feature name),
# fill features missing from either side with 0, then rank each measure
# (1 = most important) and take the rank difference.
compare = (
    pd.DataFrame({"shap_mean_abs": imp, "attn_cls2feat": att_imp})
    .fillna(0.0)
    .assign(
        shap_rank=lambda d: d["shap_mean_abs"].rank(ascending=False, method="min"),
        attn_rank=lambda d: d["attn_cls2feat"].rank(ascending=False, method="min"),
        rank_gap=lambda d: d["attn_rank"] - d["shap_rank"],
    )
)

print(compare.sort_values("shap_mean_abs", ascending=False).head(40))


                               shap_mean_abs  attn_cls2feat  shap_rank  \
card_fraud_last3                    0.063566       0.027470        1.0   
mcc_highrisk_90                     0.015959       0.035921        2.0   
card_merchant_is_new                0.014943       0.005576        3.0   
client_merchant_is_new              0.011164       0.011631        4.0   
client_fraud_last3                  0.008731       0.008492        5.0   
amount_deviation                    0.005496       0.044145        6.0   
log_abs_amount                      0.002768       0.113404        7.0   
tx_hour                             0.002746       0.021662        8.0   
hour_sin                            0.002656       0.039404        9.0   
client_weekday_match_last1          0.001700       0.005577       10.0   
hour_cos                            0.001327       0.026686       11.0   
tx_month                            0.000948       0.003850       12.0   
hour_circular_distance              0.

### 핵심 feature

- card_fraud_last3
- mcc_highrisk_90
- client_merchant_is_new
- client_fraud_last3
- amount_deviation
- log_abs_amount
- hour_sin / hour_cos

### SHAP은 낮은데 Attention은 높은 것

- amount_vs_client_avg_diff  (attn_rank 2위)
- log_abs_amount             (attn_rank 1위)
- dev_x_velocity
- discover_x_cvv
- card_hist_x_error

### Drop 후보

- cb_Discover
- log_income_ratio_region
- num_credit_cards
- tx_month
- log_amount_limit_ratio

---

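## Ablation Test

> **주의:** 아래 BASE 실행은 early stopping 결과 `best_iter = 1`(트리 1개)에서 멈췄습니다. 따라서 이 절의 drop-one AUC / PR-AUC 차이는 트리 1개짜리 모델 기준이며, 해석에 주의가 필요합니다.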

In [17]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, average_precision_score
from tqdm.auto import tqdm

# LightGBM settings shared by every ablation run; only AUC is tracked here.
params = {
    "objective": "binary",
    "metric": "auc",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "min_data_in_leaf": 200,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "verbosity": -1,
}

def train_eval(X_tr, y_tr, X_va, y_va):
    """Train LightGBM with early stopping and score it on the validation split.

    Uses the module-level ``params``. Training runs for at most 3000 rounds
    and stops after 100 rounds without improvement on the validation set.

    Args:
        X_tr, y_tr: Training features and labels.
        X_va, y_va: Validation features and labels.

    Returns:
        Dict with keys ``"auc"``, ``"prauc"`` and ``"best_iter"``.
    """
    train_set = lgb.Dataset(X_tr, label=y_tr, free_raw_data=False)
    valid_set = lgb.Dataset(X_va, label=y_va, free_raw_data=False)

    # Early stopping plus a silenced per-iteration metric log.
    callbacks = [lgb.early_stopping(100), lgb.log_evaluation(0)]
    booster = lgb.train(
        params,
        train_set,
        num_boost_round=3000,
        valid_sets=[valid_set],
        callbacks=callbacks,
    )

    best_iter = booster.best_iteration
    scores = booster.predict(X_va, num_iteration=best_iter)

    return {
        "auc": roc_auc_score(y_va, scores),
        "prauc": average_precision_score(y_va, scores),
        "best_iter": best_iter,
    }


In [19]:
# Reference run with every feature; the drop-one results are compared to it.
base_result = train_eval(
    X_tr=X_train,
    y_tr=y_train,
    X_va=X_valid,
    y_va=y_valid,
)
print("BASE:", base_result)

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.969243
BASE: {'auc': 0.9692425987082229, 'prauc': 0.7302237658253057, 'best_iter': 1}


In [20]:
# Leave-one-feature-out ablation: retrain without each column in turn and
# record how far the validation metrics fall below the baseline.
drop_one_results = []

for col in tqdm(X_train.columns):
    res = train_eval(
        X_train.drop(columns=[col]),
        y_train,
        X_valid.drop(columns=[col]),
        y_valid,
    )

    # Positive *_drop means the model got worse without this feature.
    drop_one_results.append(
        {
            "dropped_feature": col,
            "auc_drop": base_result["auc"] - res["auc"],
            "prauc_drop": base_result["prauc"] - res["prauc"],
            "auc": res["auc"],
            "prauc": res["prauc"],
        }
    )

drop_one_df = pd.DataFrame(drop_one_results).sort_values("auc_drop", ascending=False)

drop_one_df

  0%|          | 0/59 [00:00<?, ?it/s]

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.970463
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.970463
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.970463
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.970463
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.970441
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.970469
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.970463
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.970424


Unnamed: 0,dropped_feature,auc_drop,prauc_drop,auc,prauc
47,seconds_since_prev_tx,0.010344,0.006613,0.958898,0.723611
49,client_tx_1h_avg_prev,0.010334,0.006001,0.958908,0.724222
50,card_velocity_spike_ratio,0.010334,0.006001,0.958908,0.724222
48,client_avg_interval_prev,0.010334,0.006001,0.958908,0.724222
40,card_mcc_change_cnt_last5,0.010331,0.00606,0.958912,0.724164
41,client_merchant_is_new,0.010331,0.00606,0.958912,0.724164
34,client_mcc_is_new,0.010331,0.00606,0.958912,0.724164
39,card_mcc_is_new,0.010331,0.00606,0.958912,0.724164
45,merchant_is_new_x_mcc_is_new,0.010331,0.00606,0.958912,0.724164
44,merchant_is_new,0.010331,0.00606,0.958912,0.724164


---

**Important**
> card_fraud_last3\
> prauc_drop = 0.130306


**행동 급변 + novelty cluster**
> seconds_since_prev_tx\
> client_tx_1h_avg_prev\
> card_velocity_spike_ratio\
> client_avg_interval_prev\
> merchant_is_new 계열\
> mcc_change 계열\
> mcc_is_new 계열\
> merchant_change_cnt_last5


**Noise or Redundancy**

> log_abs_amount\
> tx_month\
> err_bad_* 계열\
> has_error\
> current_age\
> hour_sin/cos


### Block 1 — Recent Fraud Memory (가장 중요)

- card_fraud_last3

- client_fraud_last3

### Block 2 — Velocity & Interval Change

- seconds_since_prev_tx

- client_tx_1h_avg_prev

- card_velocity_spike_ratio

- client_avg_interval_prev

### Block 3 — Merchant / MCC Novelty

- merchant_is_new

- card_merchant_is_new

- client_merchant_is_new

- mcc_change_cnt_last5

- mcc_is_new 계열

### Block 4 — Amount Deviation

- amount_vs_client_avg_diff

- amount_deviation