In [13]:
import numpy as np
import pandas as pd

from sklearn.model_selection import GroupShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, average_precision_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

In [7]:
# Load the cleaned transactions and pin every column to a compact dtype
# (IDs/codes -> small ints, amount -> float32, low-cardinality text -> category,
# binary flags and the target -> int8).
df = pd.read_parquet("transactions_clean.parquet")
df["date"] = pd.to_datetime(df["date"])

# One declarative column -> dtype mapping instead of a cast per column.
dtype_map = {
    **dict.fromkeys(["client_id", "card_id", "merchant_id"], "int32"),
    "mcc": "int16",
    "amount": "float32",
    **dict.fromkeys(["use_chip", "merchant_city", "merchant_state", "zip"], "category"),
    **dict.fromkeys(
        [
            "has_error",
            "err_card_credential",
            "err_authentication",
            "err_financial",
            "err_system",
            "fraud",
        ],
        "int8",
    ),
}
df = df.astype(dtype_map)

In [3]:
# Preview the loaded frame; the notebook renders only the first and last rows.
df

Unnamed: 0,date,client_id,card_id,amount,use_chip,merchant_id,merchant_city,merchant_state,zip,mcc,fraud,has_error,err_card_credential,err_authentication,err_financial,err_system
0,2010-01-01 00:01:00,1556,2972,-77.000000,Swipe Transaction,59935,Beulah,ND,58523.0,5499,0,0,0,0,0,0
1,2010-01-01 00:02:00,561,4575,14.570000,Swipe Transaction,67570,Bettendorf,IA,52722.0,5311,0,0,0,0,0,0
2,2010-01-01 00:02:00,1129,102,80.000000,Swipe Transaction,27092,Vista,CA,92084.0,4829,0,0,0,0,0,0
3,2010-01-01 00:06:00,848,3915,46.410000,Swipe Transaction,13051,Harwood,MD,20776.0,5813,0,0,0,0,0,0
4,2010-01-01 00:07:00,1807,165,4.810000,Swipe Transaction,20519,Bronx,NY,10464.0,5942,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8851556,2019-10-31 23:54:00,672,5001,12.930000,Chip Transaction,47508,Cosby,TN,37722.0,7230,0,0,0,0,0,0
8851557,2019-10-31 23:54:00,1384,3723,67.010002,Chip Transaction,58136,Williamson,GA,30292.0,5812,0,0,0,0,0,0
8851558,2019-10-31 23:56:00,1718,2379,1.110000,Chip Transaction,86438,West Covina,CA,91792.0,5499,0,0,0,0,0,0
8851559,2019-10-31 23:56:00,1766,2066,12.800000,Online Transaction,39261,ONLINE,ONLINE,ONLINE,5815,0,0,0,0,0,0


In [4]:
# Column dtypes and memory footprint after the casts in the load cell.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8851561 entries, 0 to 8851560
Data columns (total 16 columns):
 #   Column               Dtype         
---  ------               -----         
 0   date                 datetime64[ns]
 1   client_id            int32         
 2   card_id              int32         
 3   amount               float32       
 4   use_chip             category      
 5   merchant_id          int32         
 6   merchant_city        category      
 7   merchant_state       category      
 8   zip                  category      
 9   mcc                  int16         
 10  fraud                int8          
 11  has_error            int8          
 12  err_card_credential  int8          
 13  err_authentication   int8          
 14  err_financial        int8          
 15  err_system           int8          
dtypes: category(4), datetime64[ns](1), float32(1), int16(1), int32(3), int8(6)
memory usage: 321.8 MB


In [11]:
# Count missing values per column.
# NOTE(review): the saved output of this cell lists dow/month/day/date_ts and no
# `date` column, so it was captured after a feature-engineering step that is not
# among the visible cells; re-run after Restart & Run All to refresh it.
df.isnull().sum()

client_id              0
card_id                0
amount                 0
use_chip               0
merchant_id            0
merchant_city          0
merchant_state         0
zip                    0
mcc                    0
fraud                  0
has_error              0
err_card_credential    0
err_authentication     0
err_financial          0
err_system             0
dow                    0
month                  0
day                    0
date_ts                0
dtype: int64

In [5]:
# Class balance of the target: number of non-fraud (0) vs fraud (1) rows.
df["fraud"].value_counts()

fraud
0    8841548
1      10013
Name: count, dtype: int64

---

# 📊 Fraud 데이터 사전 검증 실험 요약 (Sanity Check)

## 1️⃣ 문제 상황

* 전체 거래 수: **약 885만 건**
* Fraud 비율: **약 0.11% (10,013건)**
* 👉 **극단적 불균형 데이터**

  * 그대로 학습하면 모델이 전부 `정상(0)`만 예측해도 성능이 좋아 보이는 착시 발생

---

## 2Ô∏è‚É£ Ïã§Ìóò Î™©Ï†Å

> **‚ÄúÏù¥ Îç∞Ïù¥ÌÑ∞Ïóê fraudÎ•º Íµ¨Î∂ÑÌï† Ïàò ÏûàÎäî Ïã†Ìò∏(signal)Í∞Ä Ïã§Ï†úÎ°ú Ï°¥Ïû¨ÌïòÎäîÍ∞Ä?‚Äù**

Ï¶â,

* Ïö¥ÏòÅ ÏÑ±Îä•ÏùÑ Î∞îÎ°ú Î≥¥Î†§Îäî Ïã§Ìóò ‚ùå
* **Îç∞Ïù¥ÌÑ∞ ÏûêÏ≤¥Í∞Ä Î™®Îç∏ÎßÅ Í∞ÄÏπòÍ∞Ä ÏûàÎäîÏßÄ Í≤ÄÏ¶ù**ÌïòÎäî Îã®Í≥Ñ ‚úÖ

---

## 3Ô∏è‚É£ Ïã§Ìóò Î∞©Î≤ï (ÏùòÎèÑÏ†ÅÏúºÎ°ú Îã®ÏàúÌïòÍ≤å ÏÑ§Í≥Ñ)

### (1) Îç∞Ïù¥ÌÑ∞ Íµ¨ÏÑ±

* Fraud(1): **Ï†ÑÎ∂Ä ÏÇ¨Ïö©**
* Non-fraud(0): FraudÏùò **1.2Î∞∞Îßå ÎûúÎç§ ÏÉòÌîåÎßÅ**

  * ‚Üí ÌïôÏäµÏù¥ Í∞ÄÎä•Ìïú **Ï§Ä-Í∑†Ìòï Îç∞Ïù¥ÌÑ∞ÏÖã** Íµ¨ÏÑ±

### (2) Î∞òÎ≥µ Ïã§Ìóò

* ÏúÑ Í≥ºÏ†ïÏùÑ **20Î≤à Î∞òÎ≥µ**
* Îß§Î≤à:

  * ÎûúÎç§ undersampling
  * stratified train/test split
  * Logistic Regression ÌïôÏäµ

üëâ **ÏÉòÌîåÎßÅ Ïö¥Îπ® Ï†úÍ±∞ + Í≤∞Í≥º ÏïàÏ†ïÏÑ± ÌôïÏù∏ Î™©Ï†Å**

---

## 4️⃣ 사용한 피처

* 모든 거래 단위 기본 정보 사용

  * 금액(amount)
  * 결제 방식(use_chip)
  * 가맹점 위치(city/state)
  * 업종(mcc)
  * 카드/고객 정보
  * 에러 플래그들
  * 시간 파생 변수(dow, month, day, timestamp)

---

## 5️⃣ 결과 요약

### 📈 반복 실험 결과 (20 runs)

* **ROC-AUC**

  * 평균: **0.865**
  * 표준편차: **0.007**
* **PR-AUC**

  * 평균: **0.834**
  * 표준편차: **0.017**

👉 run 간 편차가 매우 작음 → **우연이 아닌 구조적 신호**

---

## 6Ô∏è‚É£ Ïù¥ Í≤∞Í≥ºÍ∞Ä ÏùòÎØ∏ÌïòÎäî Í≤É

### ‚úÖ Í∏çÏ†ïÏ†ÅÏù∏ Ï†ê

* Îã®ÏàúÌïú Logistic RegressionÎßåÏúºÎ°úÎèÑ

  * fraud / non-fraud Íµ¨Î∂Ñ Í∞ÄÎä•
* Îç∞Ïù¥ÌÑ∞/ÎùºÎ≤®/ÌîºÏ≤òÏóê **Î™ÖÌôïÌïú ÌåêÎ≥Ñ Ïã†Ìò∏ Ï°¥Ïû¨**
* Î™®Îç∏Ïù¥ **ÏïàÏ†ïÏ†ÅÏúºÎ°ú ÌïôÏäµÎê®**

### ‚ùó Ï£ºÏùòÌï† Ï†ê

* Ïù¥ Ï†êÏàòÎäî **Í∑†Ìòï Îç∞Ïù¥ÌÑ∞ Í∏∞Ï§Ä**
* Ïã§Ï†ú Ïö¥ÏòÅ Î∂ÑÌè¨(0.11%)ÏóêÏÑúÏùò ÏÑ±Îä•ÏùÑ ÏùòÎØ∏ÌïòÏßÄÎäî ÏïäÏùå
* Ï¶â,

  * **‚ÄúÏù¥ Î™®Îç∏ÏùÑ Î∞îÎ°ú Ïì∏ Ïàò ÏûàÎã§‚Äù ‚ùå**
  * **‚ÄúÏù¥ Îç∞Ïù¥ÌÑ∞Î°ú Î™®Îç∏ÎßÅÏùÑ ÏßÑÌñâÌï† Í∞ÄÏπòÍ∞Ä ÏûàÎã§‚Äù ‚úÖ**

---

## 7️⃣ 결론 (한 줄 요약)

> **“현재 데이터는 fraud 탐지를 위한 의미 있는 신호를 충분히 포함하고 있으며,
> 추가적인 운영 환경 평가(원래 분포 기준, Top-K/threshold 평가)를 진행할 가치가 있다.”**

---

## 8Ô∏è‚É£ Îã§Ïùå Îã®Í≥Ñ (Ïã§Ï†ÑÏö©)

Ïù¥Ï†ú ÏïÑÎûò Îã®Í≥ÑÎ°ú ÎÑòÏñ¥Í∞ÄÎ©¥ **Ïã§Ï†ú Ï†ÅÏö© Í∞ÄÎä•ÏÑ±**ÏùÑ ÌåêÎã®Ìï† Ïàò ÏûàÏùå:

1. ÏõêÎûò Î∂ÑÌè¨ testÏóêÏÑú **Top-K precision / recall**
2. client_id Í∏∞Ï§Ä **Group split**
3. Tree Í≥ÑÏó¥ Î™®Îç∏(XGBoost/LightGBM) ÎπÑÍµê

---


In [None]:
# 0) Data preparation
#
# Sanity check on a near-balanced subsample: keep every fraud row, draw
# NEG_POS_RATIO times as many non-fraud rows at random, fit a plain logistic
# regression, and repeat N_RUNS times with different seeds to see how stable
# ROC-AUC / PR-AUC are. The scores describe the balanced subsample only, not
# the original class distribution.

df2 = df.copy()

# Fresh-kernel guard: straight after the load cell `date` is still a
# datetime64 column, which LogisticRegression cannot ingest, and the time
# features are not created by any visible cell. Derive them here when they
# are missing so the cell survives Restart & Run All.
# NOTE(review): `date_ts` is assumed to be whole seconds since the Unix epoch;
# the original feature-engineering code is not visible - TODO confirm.
if "date" in df2.columns:
    date = pd.to_datetime(df2["date"])
    time_features = {
        "dow": date.dt.dayofweek,
        "month": date.dt.month,
        "day": date.dt.day,
        "date_ts": (date - pd.Timestamp("1970-01-01")) // pd.Timedelta(seconds=1),
    }
    for name, values in time_features.items():
        if name not in df2.columns:
            df2[name] = values
    df2 = df2.drop(columns="date")

TARGET = "fraud"
X_cols = [c for c in df2.columns if c != TARGET]

# Split into positives (fraud) and the pool of negatives to sample from.
pos_df = df2[df2[TARGET] == 1]
neg_df = df2[df2[TARGET] == 0]

n_pos = len(pos_df)
if n_pos == 0:
    # Without positives the ratio below divides by zero and the stratified
    # split cannot work; fail with an explicit message instead.
    raise ValueError(f"No rows with {TARGET} == 1; nothing to undersample against.")

print("Positives:", n_pos)
print("Negatives pool:", len(neg_df))

# neg : pos = 1.2 : 1 (same ratio as the earlier experiment)
NEG_POS_RATIO = 1.2
n_neg_target = int(np.ceil(n_pos * NEG_POS_RATIO))
n_neg_target = min(n_neg_target, len(neg_df))  # never ask for more rows than exist

print("Neg/Pos target:", n_neg_target / n_pos)

# 1) Repeated experiment

N_RUNS = 20

roc_list = []
pr_list = []

# categorical / numeric
cat_cols = df2[X_cols].select_dtypes(include=["category", "object"]).columns.tolist()
num_cols = [c for c in X_cols if c not in cat_cols]

for run in range(N_RUNS):
    # ---- negative undersampling (the run index doubles as the seed)
    neg_sampled = neg_df.sample(
        n=n_neg_target,
        replace=False,
        random_state=run
    )

    df_bal = (
        pd.concat([pos_df, neg_sampled], axis=0)
        .sample(frac=1, random_state=run)
        .reset_index(drop=True)
    )

    X = df_bal[X_cols].copy()
    y = df_bal[TARGET].astype(int)

    # ---- stratified split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        stratify=y,
        random_state=run
    )

    # ---- categorical encoding
    # Cast to plain strings first so the encoder sees one uniform type.
    X_train[cat_cols] = X_train[cat_cols].astype(str)
    X_test[cat_cols]  = X_test[cat_cols].astype(str)

    # Fit on the training fold only; categories seen only in the test fold map to -1.
    enc = OrdinalEncoder(
        handle_unknown="use_encoded_value",
        unknown_value=-1
    )

    X_train[cat_cols] = enc.fit_transform(X_train[cat_cols])
    X_test[cat_cols]  = enc.transform(X_test[cat_cols])

    # ---- model
    # `n_jobs` was dropped: it does nothing for a single binary lbfgs fit.
    # NOTE(review): features are fed unscaled and IDs/ordinal codes are treated
    # as magnitudes; kept as-is so the recorded baseline stays comparable.
    clf = LogisticRegression(
        max_iter=2000,
        solver="lbfgs"
    )

    clf.fit(X_train, y_train)

    # Probability of the positive (fraud) class.
    y_prob = clf.predict_proba(X_test)[:, 1]

    roc = roc_auc_score(y_test, y_prob)
    pr  = average_precision_score(y_test, y_prob)

    roc_list.append(roc)
    pr_list.append(pr)

    print(f"[Run {run+1:02d}] ROC={roc:.4f}, PR={pr:.4f}")

print("\n================ FINAL SUMMARY ================")
print(f"ROC-AUC : mean={np.mean(roc_list):.4f}, std={np.std(roc_list):.4f}")
print(f"PR-AUC  : mean={np.mean(pr_list):.4f}, std={np.std(pr_list):.4f}")


Positives: 10013
Negatives pool: 8841548
Neg/Pos target: 1.2000399480675121
[Run 01] ROC=0.8761, PR=0.8639
[Run 02] ROC=0.8743, PR=0.8569
[Run 03] ROC=0.8696, PR=0.8310
[Run 04] ROC=0.8638, PR=0.8207
[Run 05] ROC=0.8618, PR=0.8221
[Run 06] ROC=0.8808, PR=0.8693
[Run 07] ROC=0.8591, PR=0.8175
[Run 08] ROC=0.8521, PR=0.8155
[Run 09] ROC=0.8626, PR=0.8418
[Run 10] ROC=0.8648, PR=0.8427
[Run 11] ROC=0.8587, PR=0.8168
[Run 12] ROC=0.8594, PR=0.8343
[Run 13] ROC=0.8630, PR=0.8318
[Run 14] ROC=0.8601, PR=0.8198
[Run 15] ROC=0.8618, PR=0.8141
[Run 16] ROC=0.8652, PR=0.8228
[Run 17] ROC=0.8570, PR=0.8328
[Run 18] ROC=0.8663, PR=0.8219
[Run 19] ROC=0.8690, PR=0.8453
[Run 20] ROC=0.8770, PR=0.8558

ROC-AUC : mean=0.8651, std=0.0072
PR-AUC  : mean=0.8338, std=0.0166
