In [1]:
import numpy as np
import pandas as pd

import statsmodels.api as sm
from scipy.stats import ks_2samp, mannwhitneyu
from sklearn.metrics import average_precision_score

from tqdm.auto import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    average_precision_score,
    roc_auc_score,
    precision_recall_curve,
    classification_report
)
# Disable row truncation when pandas displays frames/series.
pd.set_option("display.max_rows", None)
from statsmodels.tools.sm_exceptions import PerfectSeparationError

# Column name constant; presumably the target label — it is not referenced
# again in the visible part of this notebook (TODO confirm usage).
LABEL = "fraud"

%matplotlib inline

In [2]:
# Load the input frame from a parquet path relative to the working directory.
df = pd.read_parquet("DATA/dataset/train_stage2")

In [3]:
# Print column names and dtypes of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5312525 entries, 0 to 5332978
Data columns (total 53 columns):
 #   Column                    Dtype         
---  ------                    -----         
 0   date                      datetime64[ns]
 1   client_id                 int64         
 2   card_id                   int64         
 3   amount                    float32       
 4   merchant_id               int64         
 5   mcc                       object        
 6   fraud                     int8          
 7   has_error                 int8          
 8   err_bad_card_number       int8          
 9   err_bad_expiration        int8          
 10  err_bad_cvv               int8          
 11  err_bad_pin               int8          
 12  err_bad_zipcode           int8          
 13  err_insufficient_balance  int8          
 14  err_technical_glitch      int8          
 15  tx_year                   int16         
 16  tx_month                  int8          
 17  tx_day       

amount

In [4]:
# Flag refund rows whose log-amount exceeds the 90th percentile of
# log_abs_amount taken over every row of the frame.
thr = df["log_abs_amount"].quantile(0.9)
is_refund_row = df["is_refund"].eq(1)
above_threshold = df["log_abs_amount"].gt(thr)
df["refund_high_amount"] = (is_refund_row & above_threshold).astype("int8")

In [5]:
# Remove the "amount" column from the frame (in place).
df.drop(columns=["amount"], inplace=True)

error

In [6]:
# Lagged error flags per card: has_error of the 1st / 2nd / 3rd preceding row
# of the same card, with 0 where no such row exists.
# NOTE(review): "preceding" means preceding in the current row order — this
# assumes the frame is already in chronological order; confirm upstream.
g = df.groupby("card_id")["has_error"]
e1, e2, e3 = (g.shift(lag).fillna(0).astype("int8") for lag in (1, 2, 3))

# Error flag of the immediately preceding row of the card.
df["card_error_last1"] = e1

# Number of error rows among the three preceding rows of the card.
df["card_error_last3"] = (e1 + e2 + e3).astype("int8")

In [7]:
# Count of fraud labels on strictly earlier rows of the same card.
#
# The per-card running total minus the row's own label is the "previous"
# total. Applying .shift(1) to the result of a grouped cumsum shifts over the
# whole frame instead of per card, which would hand the first row of each
# card the running total of whichever card sits on the row above it.
fraud_i32 = df["fraud"].astype("int32")
df["card_fraud_cum_prev"] = (
    fraud_i32.groupby(df["card_id"]).cumsum() - fraud_i32
).astype("int32")

# 1 when at least one earlier row of the card is labelled fraud.
df["card_has_fraud_history"] = (
    df["card_fraud_cum_prev"] > 0
).astype("int8")

# Interaction: card has earlier fraud AND the card's previous row had an error.
df["card_hist_x_error"] = (
    df["card_has_fraud_history"] * df["card_error_last1"]
)

# The two helper columns are only needed to build the interaction.
df.drop(columns=["card_fraud_cum_prev", "card_has_fraud_history"], inplace=True)

In [8]:
# Individual error-type flags removed from the frame; has_error and the
# remaining err_* columns are left untouched.
error_cols_to_drop = [
    "err_insufficient_balance",
    "err_technical_glitch",
    "err_bad_zipcode",
    "err_bad_pin",
]
df.drop(columns=error_cols_to_drop, inplace=True)

temporal

In [9]:
# Map the hour (0-23) onto the unit circle so that 23h and 0h end up adjacent.
hour_angle = 2 * np.pi * df["tx_hour"] / 24
df["hour_sin"] = np.sin(hour_angle).astype("float32")
df["hour_cos"] = np.cos(hour_angle).astype("float32")

static

In [10]:
# Hour-of-day components of the same client's previous row
# (NaN on each client's first row).
sin_prev = df.groupby("client_id")["hour_sin"].shift(1)
cos_prev = df.groupby("client_id")["hour_cos"].shift(1)

# Feature columns: previous row's value, falling back to the row's own value
# when the client has no previous row.
df["sin_shift"] = sin_prev.fillna(df["hour_sin"])
df["cos_shift"] = cos_prev.fillna(df["hour_cos"])

# Running sums over strictly earlier rows of the client. These are built from
# the *unfilled* lag: summing the filled sin_shift/cos_shift columns would add
# the first row's own value once as the fallback and once more as the lag of
# the second row, inflating every later "past mean" by first_value / cnt_past.
df["sin_cumsum"] = sin_prev.fillna(0).groupby(df["client_id"]).cumsum()
df["cos_cumsum"] = cos_prev.fillna(0).groupby(df["client_id"]).cumsum()

# Number of earlier rows of the same client (0 on the first row).
df["cnt_past"] = df.groupby("client_id").cumcount()

# Mean of the earlier rows; undefined (NaN) when there are none.
has_history = df["cnt_past"] > 0
df["client_sin_mean_past"] = np.where(
    has_history,
    df["sin_cumsum"] / df["cnt_past"],
    np.nan
)
df["client_cos_mean_past"] = np.where(
    has_history,
    df["cos_cumsum"] / df["cnt_past"],
    np.nan
)

# Without history, use the row's own value so the distance below becomes 0.
df["client_sin_mean_past"] = df["client_sin_mean_past"].fillna(df["hour_sin"])
df["client_cos_mean_past"] = df["client_cos_mean_past"].fillna(df["hour_cos"])

# Euclidean distance between the current hour and the client's past mean hour
# on the sin/cos plane.
df["hour_circular_distance"] = np.sqrt(
    (df["hour_sin"] - df["client_sin_mean_past"])**2 +
    (df["hour_cos"] - df["client_cos_mean_past"])**2
)

# Weekday of the client's previous row; the row's own weekday when there is none.
df["client_weekday_prev"] = df.groupby("client_id")["weekday"].shift(1)

df["client_weekday_prev"] = df["client_weekday_prev"].fillna(df["weekday"]).astype(df["weekday"].dtype)


# 1 when the current weekday equals the previous row's weekday
# (always 1 on a client's first row because of the fallback above).
df["client_weekday_match_last1"] = (
    df["weekday"] == df["client_weekday_prev"]
).astype("int8")

Card/Channel

In [11]:
# Interaction: product of the cb_Discover flag and the bad-CVV error flag.
df["discover_x_cvv"] = df["cb_Discover"].mul(df["err_bad_cvv"])

In [12]:
# Derive a single categorical card type. Rules are evaluated in order, so a
# row with is_credit == 1 is "credit" regardless of is_prepaid; rows matching
# neither rule get the default label.
card_type_rules = [
    (df["is_credit"] == 1, "credit"),
    (df["is_prepaid"] == 1, "debit(prepaid)"),
]
df["card_type"] = np.select(
    [condition for condition, _ in card_type_rules],
    [label for _, label in card_type_rules],
    default="debit(non_prepaid)",
)

In [13]:
# log_abs_amount for prepaid-debit rows, 0 for every other card type.
is_prepaid_debit = df["card_type"].eq("debit(prepaid)").astype(int)
df["prepaid_logamount_interaction"] = is_prepaid_debit * df["log_abs_amount"]

# card_type was only needed to build the interaction above.
df.drop(columns=["card_type"], inplace=True)

mcc

In [14]:
# Per-MCC fraud rate and row count, computed from the labels of this frame.
mcc_stats = df.groupby("mcc")["fraud"].agg(fraud_rate="mean", tx_count="count")

# Overall fraud rate of the frame.
base_rate = df["fraud"].mean()

# An MCC is "high risk" when it has at least 1000 rows and a fraud rate of at
# least three times the overall rate.
has_enough_rows = mcc_stats["tx_count"] >= 1000
has_elevated_rate = mcc_stats["fraud_rate"] >= base_rate * 3
highrisk_mcc = mcc_stats[has_enough_rows & has_elevated_rate].index.tolist()

# NOTE(review): the flag is derived from the target on the same rows it is
# attached to; confirm the same MCC list is reused (not refit) on other data.
df["mcc_highrisk_90"] = df["mcc"].isin(highrisk_mcc).astype("int8")

In [15]:
# Weekday codes treated as high risk.
# NOTE(review): the choice of 0, 4 and 6 is not derived in this notebook — confirm its source.
high_risk_days = [0, 4, 6]

on_highrisk_day = df["weekday"].isin(high_risk_days)
df["is_highrisk_weekday"] = on_highrisk_day.astype("int8")

In [16]:
# Number of earlier rows with the same (client, mcc) pair.
df["client_mcc_prior_count"] = df.groupby(["client_id", "mcc"]).cumcount()

# No earlier row with this pair => this row is the pair's first appearance.
df["client_mcc_is_new"] = df["client_mcc_prior_count"].eq(0).astype("int8")

In [17]:
# Number of earlier rows with the same (card, mcc) pair.
# The grouping key must be card_id: grouping by client_id here would make the
# card_* columns exact copies of the client_mcc_* columns built one cell above.
df["card_mcc_prior_count"] = df.groupby(["card_id", "mcc"]).cumcount()

# No earlier row with this pair => first time this card is used at this MCC.
df["card_mcc_is_new"] = (df["card_mcc_prior_count"] == 0).astype("int8")

In [18]:
g = df.groupby("client_id")["mcc"]

# MCC of the client's 1st..5th preceding rows (missing where the client has
# fewer earlier rows).
prev = g.shift(1)
prev2 = g.shift(2)
prev3 = g.shift(3)
prev4 = g.shift(4)
prev5 = g.shift(5)

# Number of MCC switches inside the five preceding rows, i.e. how many of the
# adjacent pairs (prev5->prev4, prev4->prev3, prev3->prev2, prev2->prev) differ.
#
# A pair only counts when both sides exist. A bare `a != b` evaluates to True
# whenever either side is missing (NaN != NaN is True), and the comparison
# result never contains NaN, so a trailing fillna(False) cannot undo that —
# a client's very first row would otherwise be scored as 4 changes.
chg1 = prev.ne(prev2) & prev.notna() & prev2.notna()
chg2 = prev2.ne(prev3) & prev2.notna() & prev3.notna()
chg3 = prev3.ne(prev4) & prev3.notna() & prev4.notna()
chg4 = prev4.ne(prev5) & prev4.notna() & prev5.notna()

df["client_mcc_change_cnt_last5"] = (chg1.astype("int8")
                                  + chg2.astype("int8")
                                  + chg3.astype("int8")
                                  + chg4.astype("int8"))

In [19]:
# Does the current MCC appear among the client's five preceding rows?
# The grouper is re-bound here so the cell does not rely on a `g` left behind
# by another cell.
g = df.groupby("client_id")["mcc"]

# m_k is True where the current MCC equals the MCC k rows back for the client.
# A missing lag compares as False.
m1, m2, m3, m4, m5 = (
    (df["mcc"] == g.shift(lag)).fillna(False) for lag in range(1, 6)
)

# 1 when the current MCC matches at least one of the five preceding rows.
df["client_mcc_seen_last5"] = (m1 | m2 | m3 | m4 | m5).fillna(False).astype("int8")

# How many of the five preceding rows share the current MCC (0..5).
df["client_mcc_repeat_cnt_last5"] = (
    m1.astype("int8")
    + m2.astype("int8")
    + m3.astype("int8")
    + m4.astype("int8")
    + m5.astype("int8")
)

# Same count as a fraction of the fixed window size of 5.
df["client_mcc_repeat_ratio_last5"] = (df["client_mcc_repeat_cnt_last5"] / 5).astype("float32")



In [20]:
g = df.groupby("card_id")["mcc"]

# MCC of the card's 1st..5th preceding rows (missing where the card has fewer
# earlier rows).
prev = g.shift(1)
prev2 = g.shift(2)
prev3 = g.shift(3)
prev4 = g.shift(4)
prev5 = g.shift(5)

# Number of MCC switches inside the card's five preceding rows.
# A pair only counts when both sides exist: `a != b` is True whenever either
# side is missing (NaN != NaN is True) and the boolean result carries no NaN
# for fillna(False) to replace, so missing history would be scored as changes.
chg1 = prev.ne(prev2) & prev.notna() & prev2.notna()
chg2 = prev2.ne(prev3) & prev2.notna() & prev3.notna()
chg3 = prev3.ne(prev4) & prev3.notna() & prev4.notna()
chg4 = prev4.ne(prev5) & prev4.notna() & prev5.notna()

df["card_mcc_change_cnt_last5"] = (chg1.astype("int8")
                                  + chg2.astype("int8")
                                  + chg3.astype("int8")
                                  + chg4.astype("int8"))


Merchant Novelty

In [21]:
# First-appearance flags: 1 on the first row of each (owner, merchant) pair,
# 0 on every later row of that pair. Built once per owner key.
for owner_key, flag_col in (
    ("client_id", "client_merchant_is_new"),
    ("card_id", "card_merchant_is_new"),
):
    earlier_rows = df.groupby([owner_key, "merchant_id"], sort=False).cumcount()
    df[flag_col] = earlier_rows.eq(0).astype("int8")

In [22]:
# Merchant of the card's previous row (missing on the card's first row).
prev_merchant = df.groupby("card_id", sort=False)["merchant_id"].shift(1)

# 1) Per-row change flag: True when the merchant differs from the previous
#    row's merchant; a card's first row counts as a change.
differs_from_prev = df["merchant_id"].ne(prev_merchant).fillna(True)
df["merchant_changed"] = differs_from_prev.astype("int8")

# 2) Number of change flags in a per-card rolling window of up to 5 rows.
#    The window ends at (and includes) the current row.
flags_by_card = df.groupby("card_id", sort=False)["merchant_changed"]
rolling_changes = flags_by_card.rolling(window=5, min_periods=1).sum()
df["merchant_change_cnt_last5"] = (
    rolling_changes
    .reset_index(level=0, drop=True)   # drop the card_id level to align on the row index
    .astype("int8")
)

# The per-row flag is only an intermediate.
df.drop(columns=["merchant_changed"], inplace=True)

In [23]:
# merchant_is_new is defined at card level.
df["merchant_is_new"] = df["card_merchant_is_new"].astype("int8")

# Fallback: build the card-level "first MCC appearance" flag when it is absent.
if "card_mcc_is_new" not in df.columns:
    df["card_mcc_is_new"] = (
        df.groupby(["card_id", "mcc"], sort=False).cumcount().eq(0).astype("int8")
    )

# merchant_is_new x mcc_is_new
new_merchant = df["merchant_is_new"].astype("int8")
df["merchant_is_new_x_mcc_is_new"] = (
    new_merchant * df["card_mcc_is_new"].astype("int8")
).astype("int8")

# merchant_is_new x has_error
df["merchant_is_new_x_has_error"] = (
    new_merchant * df["has_error"].astype("int8")
).astype("int8")

Interval, Velocity, Deviation

In [24]:
# 1) Timestamp of the client's previous row.
df["prev_tx_time"] = df.groupby("client_id")["date"].shift(1)

# 2) Gap to that row in seconds. A client's first row has no previous row and
#    is assigned a gap of 0.
gap = df["date"] - df["prev_tx_time"]
df["seconds_since_prev_tx"] = gap.dt.total_seconds().fillna(0)

In [25]:
by_client = df["client_id"]

# log(1 + seconds) of the gap to the client's previous row.
df["log_interval"] = np.log1p(df["seconds_since_prev_tx"])

# Previous row's log-interval for the same client.
df["log_interval_shift"] = df["log_interval"].groupby(by_client).shift(1)

# Running sum of the lagged values (a missing lag contributes 0).
df["interval_cumsum"] = df["log_interval_shift"].fillna(0).groupby(by_client).cumsum()

# Number of earlier rows of the client.
df["interval_cnt_past"] = df.groupby("client_id").cumcount()

# Mean over earlier rows; a client's first row falls back to its own value.
# NOTE(review): the first row's log_interval is log1p(0) = 0 (filled gap) and
# is part of every later mean — confirm that is intended.
has_history = df["interval_cnt_past"] > 0
past_mean = df["interval_cumsum"] / df["interval_cnt_past"]
df["client_avg_interval_prev"] = np.where(has_history, past_mean, df["log_interval"])

In [26]:
# Order rows by client and then by timestamp, and renumber the index.
sort_keys = ["client_id", "date"]
df = df.sort_values(by=sort_keys).reset_index(drop=True)

# Row count and zero-filled int32 buffers of that length.
# NOTE(review): no later cell in this notebook section reads these two arrays;
# the df columns of the same names are filled from `out` instead.
n = len(df)

client_tx_1h = np.zeros(n, dtype=np.int32)
card_tx_1h = np.zeros_like(client_tx_1h)

In [27]:
import numpy as np
import pandas as pd

# 0) Sort (required): the window search below needs each client's rows to be
#    contiguous and in ascending time order.
df = df.sort_values(["client_id", "date"]).reset_index(drop=True)

# 1) Pull out plain numpy arrays.
cid = df["client_id"].to_numpy()
t = df["date"].to_numpy(dtype="datetime64[s]").astype(np.int64)  # whole seconds as int64

n = len(df)
out = np.empty(n, dtype=np.int32)

# 2) Locate the client segments: a new segment starts wherever the id changes.
segment_breaks = np.flatnonzero(cid[1:] != cid[:-1]) + 1
segment_starts = np.concatenate(([0], segment_breaks))
segment_ends = np.concatenate((segment_breaks, [n]))

# 3) Within each segment, one vectorised searchsorted gives, for every row,
#    the position of the first row that is at most 3600 s older. The count is
#    the number of rows from that position up to and including the row itself.
for start, end in zip(segment_starts, segment_ends):
    tt = t[start:end]
    left = np.searchsorted(tt, tt - 3600, side="left")
    out[start:end] = (np.arange(end - start) - left + 1).astype(np.int32)

df["client_tx_1h"] = out

In [28]:
by_client = df["client_id"]

# Previous row's 1-hour count for the same client.
df["client_tx_1h_shift"] = df["client_tx_1h"].groupby(by_client).shift(1)

# Running sum of the lagged counts (a missing lag contributes 0).
df["client_tx_1h_cumsum"] = df["client_tx_1h_shift"].fillna(0).groupby(by_client).cumsum()

# Number of earlier rows of the client.
df["client_tx_cnt_past"] = df.groupby("client_id").cumcount()

# Mean 1-hour count over earlier rows; a client's first row uses its own count.
has_history = df["client_tx_cnt_past"] > 0
past_mean = df["client_tx_1h_cumsum"] / df["client_tx_cnt_past"]
df["client_tx_1h_avg_prev"] = np.where(has_history, past_mean, df["client_tx_1h"])

In [29]:
# 0) Sort (required): the window search below needs each card's rows to be
#    contiguous and in ascending time order.
df = df.sort_values(["card_id", "date"]).reset_index(drop=True)

# 1) Pull out plain numpy arrays (cid holds card ids in this cell).
cid = df["card_id"].to_numpy()
t = df["date"].to_numpy(dtype="datetime64[s]").astype(np.int64)  # whole seconds as int64

n = len(df)
out = np.empty(n, dtype=np.int32)

# 2) Locate the card segments: a new segment starts wherever the id changes.
segment_breaks = np.flatnonzero(cid[1:] != cid[:-1]) + 1
segment_starts = np.concatenate(([0], segment_breaks))
segment_ends = np.concatenate((segment_breaks, [n]))

# 3) Within each segment, one vectorised searchsorted gives, for every row,
#    the position of the first row that is at most 3600 s older. The count is
#    the number of rows from that position up to and including the row itself.
for start, end in zip(segment_starts, segment_ends):
    tt = t[start:end]
    left = np.searchsorted(tt, tt - 3600, side="left")
    out[start:end] = (np.arange(end - start) - left + 1).astype(np.int32)

df["card_tx_1h"] = out

In [30]:
by_card = df["card_id"]

# Previous row's 1-hour count for the same card.
df["card_tx_1h_shift"] = df["card_tx_1h"].groupby(by_card).shift(1)

# Running sum of the lagged counts (a missing lag contributes 0).
df["card_tx_1h_cumsum"] = df["card_tx_1h_shift"].fillna(0).groupby(by_card).cumsum()

# Number of earlier rows of the card.
df["card_tx_cnt_past"] = df.groupby("card_id").cumcount()

# Mean 1-hour count over earlier rows; a card's first row uses its own count.
has_history = df["card_tx_cnt_past"] > 0
past_mean = df["card_tx_1h_cumsum"] / df["card_tx_cnt_past"]
df["card_tx_1h_avg_prev"] = np.where(has_history, past_mean, df["card_tx_1h"])

# Current 1-hour count relative to the historical mean; the small constant
# keeps the denominator away from zero.
spike_denominator = df["card_tx_1h_avg_prev"] + 1e-6
df["card_velocity_spike_ratio"] = df["card_tx_1h"] / spike_denominator

In [31]:
# Back to client/time order (the preceding cell left the frame sorted by card).
df = df.sort_values(["client_id", "date"]).reset_index(drop=True)

# Amount column used by this and later cells.
AMT = "log_abs_amount"
by_client = df["client_id"]

# Previous row's amount for the same client.
df["amt_shift"] = df[AMT].groupby(by_client).shift(1)

# Running sum of the lagged amounts (a missing lag contributes 0).
df["amt_cumsum"] = df["amt_shift"].fillna(0).groupby(by_client).cumsum()

# Number of earlier rows of the client.
df["amt_cnt_past"] = df.groupby("client_id").cumcount()

# Mean amount over earlier rows; a client's first row uses its own amount,
# which makes the difference below 0 for that row.
has_history = df["amt_cnt_past"] > 0
past_mean = df["amt_cumsum"] / df["amt_cnt_past"]
df["client_avg_amt_prev"] = np.where(has_history, past_mean, df[AMT]).astype("float32")

# Current amount minus the client's historical mean.
df["amount_vs_client_avg_diff"] = (df[AMT] - df["client_avg_amt_prev"]).astype("float32")



Fraud History

In [32]:
# Fraud labels of the card's three preceding rows (0 where there is no such row).
card_fraud = df.groupby("card_id")["fraud"]
f1, f2, f3 = (card_fraud.shift(lag) for lag in (1, 2, 3))

# Number of fraud-labelled rows among the card's three preceding rows.
# NOTE(review): this feature is built from the target column; confirm earlier
# labels are actually known at scoring time.
lagged_flags = [f.fillna(0).astype("int8") for f in (f1, f2, f3)]
df["card_fraud_last3"] = lagged_flags[0] + lagged_flags[1] + lagged_flags[2]

In [33]:
# Fraud labels of the client's three preceding rows (0 where there is no such row).
client_fraud = df.groupby("client_id")["fraud"]
f1, f2, f3 = (client_fraud.shift(lag) for lag in (1, 2, 3))

# Number of fraud-labelled rows among the client's three preceding rows.
# NOTE(review): this feature is built from the target column; confirm earlier
# labels are actually known at scoring time.
lagged_flags = [f.fillna(0).astype("int8") for f in (f1, f2, f3)]
df["client_fraud_last3"] = lagged_flags[0] + lagged_flags[1] + lagged_flags[2]

Interaction

In [34]:
# Per-client mean and standard deviation of log_abs_amount, broadcast back to
# every row of the client.
# NOTE(review): both statistics cover all rows of the client, later rows
# included, unlike the "*_prev" features that only look backwards; std is NaN
# for a client with a single row. Confirm both are acceptable.
client_amt = df.groupby("client_id")["log_abs_amount"]
client_amt_mean = client_amt.transform("mean")
client_amt_std = client_amt.transform("std")

# z-score-like deviation; the small constant guards against a zero std.
centered_amount = df["log_abs_amount"] - client_amt_mean
df["amount_deviation"] = centered_amount / (client_amt_std + 1e-6)

In [35]:
by_client = df["client_id"]

# Previous row's 1-hour count for the same client.
df["client_tx_1h_shift"] = df["client_tx_1h"].groupby(by_client).shift(1)

# Running sum of the lagged counts (a missing lag contributes 0).
df["client_tx_1h_cumsum"] = df["client_tx_1h_shift"].fillna(0).groupby(by_client).cumsum()

# Number of earlier rows of the client.
df["client_tx_cnt_past"] = df.groupby("client_id").cumcount()

# Mean 1-hour count over earlier rows; a client's first row uses its own count.
has_history = df["client_tx_cnt_past"] > 0
past_mean = df["client_tx_1h_cumsum"] / df["client_tx_cnt_past"]
df["client_tx_1h_avg_prev"] = np.where(has_history, past_mean, df["client_tx_1h"])

# Current 1-hour count relative to the historical mean; the small constant
# keeps the denominator away from zero.
spike_denominator = df["client_tx_1h_avg_prev"] + 1e-6
df["velocity_spike_ratio"] = df["client_tx_1h"] / spike_denominator

In [36]:
# Interactions of the "first (client, mcc) appearance" flag with the error
# flag and with the client-level velocity spike ratio.
first_mcc_for_client = df["client_mcc_is_new"]
df["mccnew_x_error"] = first_mcc_for_client.mul(df["has_error"])
df["mccnew_x_velocity"] = first_mcc_for_client.mul(df["velocity_spike_ratio"])

In [37]:
eps = 1e-6
K = 10  # window size: the client's K most recent earlier rows

# Previous row's amount for the same client (NaN on a client's first row).
amt_prev = df.groupby("client_id")[AMT].shift(1)

# Mean of up to K earlier amounts of the same client.
# The rolling window is applied per client: groupby(...).shift(1) returns a
# plain Series, so calling .rolling() on it directly would slide one window
# over the whole frame and average in rows that belong to other clients.
df["client_recent_avg_amt"] = (
    amt_prev.groupby(df["client_id"])
      .rolling(K, min_periods=1)
      .mean()
      .reset_index(level=0, drop=True)   # drop the client_id level to align on the row index
).astype("float32")

# A client's first row has no earlier amount (NaN) -> fall back to the neutral
# all-history value.
df["client_recent_avg_amt"] = df["client_recent_avg_amt"].fillna(df["client_avg_amt_prev"]).astype("float32")

# Current amount relative to the recent-window mean.
df["amount_vs_recent_window_avg"] = (
    df[AMT] / (df["client_recent_avg_amt"] + eps)
).astype("float32")

# log(1 + ratio) of the value above.
df["log_amount_vs_recent_window_avg"] = np.log1p(df["amount_vs_recent_window_avg"]).astype("float32")



In [38]:
# Deviation measure used for the interactions below: the log ratio of the
# amount to the client's recent-window mean (not the amount_deviation column).
DEV = "log_amount_vs_recent_window_avg"
deviation = df[DEV]

# 1) deviation x first (client, mcc) appearance
df["dev_x_mccnew"] = (deviation * df["client_mcc_is_new"]).astype("float32")

# 2) deviation x card-level velocity spike ratio
df["dev_x_velocity"] = (deviation * df["card_velocity_spike_ratio"]).astype("float32")

In [39]:
# Columns removed before modelling, grouped by where they come from.
# Each label is listed exactly once.
calendar_cols = ["tx_day", "weekday", "tx_year"]

hour_profile_cols = [
    "client_sin_mean_past",
    "sin_cumsum",
    "client_cos_mean_past",
    "client_weekday_prev",
    "cos_cumsum",
    "cnt_past",
]

client_card_attr_cols = [
    "credit_limit",
    "total_debt",
    "yearly_income",
    "per_capita_income",
    "income_ratio_region",
    "months_to_expire",
    "months_from_account",
    "male",
    "expires_month",
    "num_cards_issued",
    "credit_score",
    "cb_Visa",
    "cb_Mastercard",
    "is_credit",
]

mcc_count_cols = ["client_mcc_prior_count", "card_mcc_prior_count"]

interval_cols = [
    "prev_tx_time",
    "log_interval",
    "log_interval_shift",
    "interval_cumsum",
    "interval_cnt_past",
]

velocity_cols = [
    "client_tx_1h",
    "client_tx_1h_shift",
    "client_tx_1h_cumsum",
    "client_tx_cnt_past",
    "client_tx_1h_avg_prev",
    "velocity_spike_ratio",
    "card_tx_1h",
    "card_tx_1h_shift",
    "card_tx_1h_cumsum",
    "card_tx_cnt_past",
    "card_tx_1h_avg_prev",
]

amount_cols = [
    "amt_shift",
    "amt_cumsum",
    "amt_cnt_past",
    "client_avg_amt_prev",
    "log_amount_vs_recent_window_avg",
    "amount_vs_recent_window_avg",
    "client_recent_avg_amt",
]

cols_to_remove = (
    calendar_cols
    + hour_profile_cols
    + client_card_attr_cols
    + mcc_count_cols
    + interval_cols
    + velocity_cols
    + amount_cols
)
df.drop(columns=cols_to_remove, inplace=True)

In [40]:
# (rows, columns) after the intermediate columns were dropped.
df.shape

(5312525, 69)

In [41]:
# Print column names and dtypes of the engineered frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5312525 entries, 0 to 5312524
Data columns (total 69 columns):
 #   Column                         Dtype         
---  ------                         -----         
 0   date                           datetime64[ns]
 1   client_id                      int64         
 2   card_id                        int64         
 3   merchant_id                    int64         
 4   mcc                            object        
 5   fraud                          int8          
 6   has_error                      int8          
 7   err_bad_card_number            int8          
 8   err_bad_expiration             int8          
 9   err_bad_cvv                    int8          
 10  tx_month                       int8          
 11  tx_hour                        int8          
 12  is_refund                      int8          
 13  log_abs_amount                 float32       
 14  current_age                    int64         
 15  num_credit_card

In [42]:
# Keep a handle on the frame that still carries the "date" column, then
# continue with a date-less frame.
# `data = df` only binds a second name to the same object, so an in-place
# drop on df would remove "date" from `data` as well; the non-in-place drop
# returns a new frame and leaves `data` untouched.
data = df
df = df.drop(columns=["date"])

In [43]:
# Number of missing values per column.
df.isnull().sum()

client_id                        0
card_id                          0
merchant_id                      0
mcc                              0
fraud                            0
has_error                        0
err_bad_card_number              0
err_bad_expiration               0
err_bad_cvv                      0
tx_month                         0
tx_hour                          0
is_refund                        0
log_abs_amount                   0
current_age                      0
num_credit_cards                 0
has_chip                         0
year_pin_last_changed            0
expires_year                     0
is_prepaid                       0
cb_Amex                          0
cb_Discover                      0
years_since_pin_change           0
years_to_retirement              0
log_yearly_income                0
log_income_ratio_region          0
abs_amount                       0
amount_income_ratio              0
amount_limit_ratio               0
log_amount_income_ra

In [44]:
# Identifier columns and the raw MCC code are left out of the modelling frame.
# df itself is not modified.
drop_cols = ["client_id", "card_id", "merchant_id", "mcc"]

df_model = df.drop(columns=drop_cols)

In [45]:
# Further columns removed from the modelling frame (in place).
model_cols_to_drop = [
    "amount_income_ratio",
    "amount_limit_ratio",
    "abs_amount",
    "expires_year",
    "years_since_pin_change",
]
df_model.drop(columns=model_cols_to_drop, inplace=True)

In [46]:
# Print column names and dtypes of the modelling frame.
df_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5312525 entries, 0 to 5312524
Data columns (total 59 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   fraud                          int8   
 1   has_error                      int8   
 2   err_bad_card_number            int8   
 3   err_bad_expiration             int8   
 4   err_bad_cvv                    int8   
 5   tx_month                       int8   
 6   tx_hour                        int8   
 7   is_refund                      int8   
 8   log_abs_amount                 float32
 9   current_age                    int64  
 10  num_credit_cards               int8   
 11  has_chip                       int8   
 12  year_pin_last_changed          Int16  
 13  is_prepaid                     int8   
 14  cb_Amex                        int8   
 15  cb_Discover                    int8   
 16  years_to_retirement            int16  
 17  log_yearly_income              float32
 18  lo

In [47]:
# Write the full modelling frame to parquet (path relative to the working directory).
df_model.to_parquet("DATA/full_dataset")

In [49]:
# Columns that make up the stage-1 dataset: the target plus the selected
# error, time-of-day, amount and risk-flag columns.
stage1_cols = [
    "fraud",
    "has_error",
    "err_bad_card_number",
    "err_bad_expiration",
    "err_bad_cvv",
    "tx_month",
    "tx_hour",
    "is_refund",
    "log_abs_amount",
    "hour_sin",
    "hour_cos",
    "sin_shift",
    "cos_shift",
    "hour_circular_distance",
    "mcc_highrisk_90",
    "is_highrisk_weekday",
]

# Independent copy so later changes to df_stage1 cannot touch df_model.
df_stage1 = (
    df_model[stage1_cols]
    .copy()
)

# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only appends a stray "None" line to the output.
df_stage1.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5312525 entries, 0 to 5312524
Data columns (total 16 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   fraud                   int8   
 1   has_error               int8   
 2   err_bad_card_number     int8   
 3   err_bad_expiration      int8   
 4   err_bad_cvv             int8   
 5   tx_month                int8   
 6   tx_hour                 int8   
 7   is_refund               int8   
 8   log_abs_amount          float32
 9   hour_sin                float32
 10  hour_cos                float32
 11  sin_shift               float32
 12  cos_shift               float32
 13  hour_circular_distance  float64
 14  mcc_highrisk_90         int8   
 15  is_highrisk_weekday     int8   
dtypes: float32(5), float64(1), int8(10)
memory usage: 192.5 MB
None


In [50]:
# Write the stage-1 subset to parquet (path relative to the working directory).
df_stage1.to_parquet("DATA/train_stage1")