In [3]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from imblearn.under_sampling import OneSidedSelection
import os
import glob
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [4]:
trans = pd.read_csv('common/trans_common.csv')

In [20]:
trans

Unnamed: 0.1,Unnamed: 0,date,client_id,card_id,amount,use_chip,merchant_id,mcc,fraud,has_error,err_card_credential,err_authentication,err_financial,err_system,is_online,home_state,is_out_of_state
0,0,2016-11-08 19:59:00,0,1271,7.28,Chip Transaction,13153,5812,0,0,0,0,0,0,0,ME,0
1,1,2015-04-06 20:35:00,0,4639,9.03,Chip Transaction,44919,5814,0,0,0,0,0,0,0,ME,1
2,2,2017-11-09 13:02:00,0,1271,42.28,Chip Transaction,60569,5300,0,0,0,0,0,0,0,ME,0
3,3,2014-08-05 19:39:00,0,1271,9.73,Swipe Transaction,887,5812,0,0,0,0,0,0,0,ME,1
4,4,2013-08-09 19:50:00,0,1271,8.89,Swipe Transaction,98648,5814,0,0,0,0,0,0,0,ME,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7647519,7647519,2011-02-11 08:18:00,1998,2160,13.28,Swipe Transaction,59935,5499,0,0,0,0,0,0,0,CA,0
7647520,7647520,2011-04-17 07:03:00,1998,2160,6.61,Online Transaction,9932,5311,0,0,0,0,0,0,0,CA,1
7647521,7647521,2010-09-01 13:34:00,1998,2160,3.52,Swipe Transaction,59935,5499,0,0,0,0,0,0,0,CA,0
7647522,7647522,2013-10-30 08:41:00,1998,2160,5.05,Online Transaction,47399,5815,0,0,0,0,0,0,0,CA,1


In [21]:
trans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7647524 entries, 0 to 7647523
Data columns (total 17 columns):
 #   Column               Dtype  
---  ------               -----  
 0   Unnamed: 0           int64  
 1   date                 object 
 2   client_id            int64  
 3   card_id              int64  
 4   amount               float64
 5   use_chip             object 
 6   merchant_id          int64  
 7   mcc                  int64  
 8   fraud                int64  
 9   has_error            int64  
 10  err_card_credential  int64  
 11  err_authentication   int64  
 12  err_financial        int64  
 13  err_system           int64  
 14  is_online            int64  
 15  home_state           object 
 16  is_out_of_state      int64  
dtypes: float64(1), int64(13), object(3)
memory usage: 991.9+ MB


In [None]:
trans["date"] = pd.to_datetime(trans["date"], errors="coerce")
trans["hour"] = trans["date"].dt.hour.astype("int16")
trans["dow"]  = trans["date"].dt.dayofweek.astype("int16")   # 0=Mon
trans["month"]= trans["date"].dt.month.astype("int16")

use_chip_ohe = pd.get_dummies(
    trans["use_chip"],
    prefix="use_chip",
    dtype="int8"
)

trans = pd.concat([trans.drop(columns=["use_chip"]), use_chip_ohe], axis=1)

obj_cols_all = trans.select_dtypes(include=["object"]).columns.tolist()
if len(obj_cols_all) > 0:
    trans = trans.drop(columns=obj_cols_all)

if "Unnamed: 0" in trans.columns:
    trans = trans.drop(columns=["Unnamed: 0"])

print(trans.dtypes)
print(trans.shape)

SAVE_DIR = "./oss_batches"
os.makedirs(SAVE_DIR, exist_ok=True)

RATIO = 20
RANDOM_STATE = 42

y_arr = trans["fraud"].astype(int).values
idx_pos = np.where(y_arr == 1)[0]
idx_neg = np.where(y_arr == 0)[0]

n_pos = len(idx_pos)
n_neg_per_batch = n_pos * RATIO

rng = np.random.RandomState(RANDOM_STATE)
rng.shuffle(idx_neg)

max_batches = len(idx_neg) // n_neg_per_batch
print("batches:", max_batches)
print(f"per batch: pos={n_pos:,}, neg={n_neg_per_batch:,}, total={n_pos+n_neg_per_batch:,}")

oss = OneSidedSelection(random_state=42, n_neighbors=1, n_seeds_S=5)

for i in tqdm(range(max_batches), desc="Batch -> OSS -> Save"):
    start = i * n_neg_per_batch
    end   = start + n_neg_per_batch
    idx_batch = np.concatenate([idx_pos, idx_neg[start:end]])

    batch = trans.iloc[idx_batch].copy()

    Xb = batch.drop(columns=["fraud", "date"])  
    yb = batch["fraud"].astype(int).values

    obj_cols = Xb.select_dtypes(include=["object"]).columns
    if len(obj_cols) > 0:
        Xb = Xb.drop(columns=obj_cols)

    X_res, y_res = oss.fit_resample(Xb, yb)

    df_res = pd.DataFrame(X_res, columns=Xb.columns)
    df_res["fraud"] = y_res

    save_path = f"{SAVE_DIR}/oss_batch_{i:02d}.parquet"
    df_res.to_parquet(save_path, index=False)

    if i == 0:
        print(
            f"[Batch {i}] saved -> {save_path} | "
            f"before={np.bincount(yb)}, after={np.bincount(y_res)}"
        )

print("All batches saved.")


date                           datetime64[ns]
client_id                               int64
card_id                                 int64
amount                                float64
merchant_id                             int64
mcc                                     int64
fraud                                   int64
has_error                               int64
err_card_credential                     int64
err_authentication                      int64
err_financial                           int64
err_system                              int64
is_online                               int64
is_out_of_state                         int64
hour                                    int16
dow                                     int16
month                                   int16
use_chip_Chip Transaction                int8
use_chip_Online Transaction              int8
use_chip_Swipe Transaction               int8
dtype: object
(7647524, 20)
batches: 34
per batch: pos=11,063, neg=221,260, tota

Batch -> OSS -> Save:   0%|          | 0/34 [00:00<?, ?it/s]

[Batch 0] saved -> ./oss_batches/oss_batch_00.parquet | before=[221260  11063], after=[218036  11063]
All batches saved.


In [None]:
X = trans.drop(columns=["fraud"])    
y = trans["fraud"].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

X_test = X_test.drop(columns=["date"])
X_train = X_train.drop(columns=["date"]) 

batch_files = sorted(glob.glob(f"{SAVE_DIR}/oss_batch_*.parquet"))
print("num batch files:", len(batch_files))

model = Pipeline([
    ("scaler", StandardScaler(with_mean=False)),
    ("lr", LogisticRegression(
        max_iter=300,
        solver="saga",
        n_jobs=-1
    ))
])


K = min(5, len(batch_files)) 
pr_auc_list = []
proba_sum = np.zeros(len(X_test), dtype=np.float64) 

for path in tqdm(batch_files[:K], desc="Train/Eval per batch"):
    df = pd.read_parquet(path)

    Xb = df.drop(columns=["fraud"])
    yb = df["fraud"].astype(int)

    
    model.fit(Xb, yb)

 
    proba = model.predict_proba(X_test)[:, 1]
    proba_sum += proba


    pr_auc = average_precision_score(y_test, proba)
    pr_auc_list.append(pr_auc)

    print(f"{os.path.basename(path)} | PR-AUC: {pr_auc:.6f}")

print("\nPR-AUC mean:", float(np.mean(pr_auc_list)), "std:", float(np.std(pr_auc_list)))

proba_ens = proba_sum / K
pr_auc_ens = average_precision_score(y_test, proba_ens)
print(f"\nEnsemble (avg of {K}) | PR-AUC: {pr_auc_ens:.6f}")

precision, recall, thr = precision_recall_curve(y_test, proba_ens)

target_p = 0.90
idx = np.where(precision >= target_p)[0]
if len(idx) > 0:
    j = idx[-1]  
    th = thr[j-1] if j > 0 and len(thr) > 0 else None
    print(f"Recall @ Precision>=0.90: {recall[j]:.6f} | threshold: {th}")
else:
    print("Precision>=0.90 달성 못함 (현재 앙상블 기준)")

TOPK = 1000
topk_idx = np.argsort(-proba_ens)[:TOPK]
topk_precision = y_test.iloc[topk_idx].mean()
print(f"Top-{TOPK} precision: {float(topk_precision):.6f}")

num batch files: 34


Train/Eval per batch:   0%|          | 0/5 [00:00<?, ?it/s]



oss_batch_00.parquet | PR-AUC: 0.020928
oss_batch_01.parquet | PR-AUC: 0.020742




oss_batch_02.parquet | PR-AUC: 0.020720




oss_batch_03.parquet | PR-AUC: 0.020844




oss_batch_04.parquet | PR-AUC: 0.020637

PR-AUC mean: 0.020774161548917373 std: 0.00010138352048670251

Ensemble (avg of 5) | PR-AUC: 0.020784
Recall @ Precision>=0.90: 0.000000 | threshold: 0.9999727155170426
Top-1000 precision: 0.034000


In [26]:
from sklearn.metrics import confusion_matrix, classification_report

threshold = 0.5
y_pred = (proba_ens >= threshold).astype(int)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix (threshold=0.5):\n", cm)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=4))


Confusion Matrix (threshold=0.5):
 [[1522399    4893]
 [   2031     182]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9987    0.9968    0.9977   1527292
           1     0.0359    0.0822    0.0499      2213

    accuracy                         0.9955   1529505
   macro avg     0.5173    0.5395    0.5238   1529505
weighted avg     0.9973    0.9955    0.9964   1529505



In [None]:
# 1:10
RATIO = 10
SAVE_DIR = "./oss_batches_ratio10"
os.makedirs(SAVE_DIR, exist_ok=True)

y_arr = trans["fraud"].astype(int).values
idx_pos = np.where(y_arr == 1)[0]
idx_neg = np.where(y_arr == 0)[0]

n_pos = len(idx_pos)
n_neg_per_batch = n_pos * RATIO

rng = np.random.RandomState(RANDOM_STATE)
rng.shuffle(idx_neg)

max_batches = len(idx_neg) // n_neg_per_batch
print("batches:", max_batches)
print(f"per batch: pos={n_pos:,}, neg={n_neg_per_batch:,}, total={n_pos+n_neg_per_batch:,}")

oss = OneSidedSelection(random_state=42, n_neighbors=1, n_seeds_S=5)

for i in tqdm(range(max_batches), desc="Batch(1:10) -> OSS -> Save"):
    start = i * n_neg_per_batch
    end   = start + n_neg_per_batch
    idx_batch = np.concatenate([idx_pos, idx_neg[start:end]])

    batch = trans.iloc[idx_batch].copy()

    Xb = batch.drop(columns=["fraud", "date"])
    yb = batch["fraud"].astype(int).values

    X_res, y_res = oss.fit_resample(Xb, yb)

    df_res = pd.DataFrame(X_res, columns=Xb.columns)
    df_res["fraud"] = y_res

    save_path = f"{SAVE_DIR}/oss_batch_{i:02d}.parquet"
    df_res.to_parquet(save_path, index=False)

print("All ratio=10 batches saved.")


batches: 69
per batch: pos=11,063, neg=110,630, total=121,693


Batch(1:10) -> OSS -> Save:   0%|          | 0/69 [00:00<?, ?it/s]

All ratio=10 batches saved.


In [None]:
path = "./oss_batches_ratio10/oss_batch_00.parquet"
df = pd.read_parquet(path)

X = df.drop(columns=["fraud"])
y = df["fraud"].astype(int)

X_tr, X_va, y_tr, y_va = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

model = Pipeline([
    ("scaler", StandardScaler(with_mean=False)),
    ("lr", LogisticRegression(
        max_iter=2000,
        solver="saga",
        n_jobs=-1
    ))
])

model.fit(X_tr, y_tr)

proba_va = model.predict_proba(X_va)[:, 1]
pr_auc = average_precision_score(y_va, proba_va)
print(classification_report(y_va, model.predict(X_va)))
print("Batch internal PR-AUC:", pr_auc)
print("Positive rate (val):", y_va.mean())


              precision    recall  f1-score   support

           0       0.94      0.98      0.96     21404
           1       0.64      0.40      0.49      2213

    accuracy                           0.92     23617
   macro avg       0.79      0.69      0.73     23617
weighted avg       0.91      0.92      0.91     23617

Batch internal PR-AUC: 0.5660149538983155
Positive rate (val): 0.09370368802134056




In [None]:
path = "./oss_batches/oss_batch_00.parquet"
df = pd.read_parquet(path)

X = df.drop(columns=["fraud"])
y = df["fraud"].astype(int)

X_tr, X_va, y_tr, y_va = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

model = Pipeline([
    ("scaler", StandardScaler(with_mean=False)),
    ("lr", LogisticRegression(
        max_iter=2000,
        solver="saga",
        n_jobs=-1
    ))
])

model.fit(X_tr, y_tr)

proba_va = model.predict_proba(X_va)[:, 1]
pr_auc = average_precision_score(y_va, proba_va)
print(classification_report(y_va, model.predict(X_va)))
print("Batch internal PR-AUC:", pr_auc)
print("Positive rate (val):", y_va.mean())


              precision    recall  f1-score   support

           0       0.96      1.00      0.98     43607
           1       0.55      0.08      0.14      2213

    accuracy                           0.95     45820
   macro avg       0.75      0.54      0.56     45820
weighted avg       0.94      0.95      0.93     45820

Batch internal PR-AUC: 0.3975185933273484
Positive rate (val): 0.048297686599738106




In [11]:
X_tr.info()

<class 'pandas.core.frame.DataFrame'>
Index: 183279 entries, 167685 to 67710
Data columns (total 18 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   client_id                    183279 non-null  int64  
 1   card_id                      183279 non-null  int64  
 2   amount                       183279 non-null  float64
 3   merchant_id                  183279 non-null  int64  
 4   mcc                          183279 non-null  int64  
 5   has_error                    183279 non-null  int64  
 6   err_card_credential          183279 non-null  int64  
 7   err_authentication           183279 non-null  int64  
 8   err_financial                183279 non-null  int64  
 9   err_system                   183279 non-null  int64  
 10  is_online                    183279 non-null  int64  
 11  is_out_of_state              183279 non-null  int64  
 12  hour                         183279 non-null  int16  
 13  