In [1]:
import pandas as pd
import numpy as np
import ijson

In [4]:
trans = pd.read_csv("../../DATA/original/transactions_data.csv")

In [7]:
ids = []
labels = []

with open("../../DATA/original/train_fraud_labels.json", "rb") as f:
    for k, v in ijson.kvitems(f, "target"):
        kid = int(str(k).strip())
        ids.append(kid)
        labels.append(1 if v == "Yes" else 0)

labels_df = pd.DataFrame({"id": ids, "fraud": labels}).astype({"fraud": "int8"})
labels_df

Unnamed: 0,id,fraud
0,10649266,0
1,23410063,0
2,9316588,0
3,12478022,0
4,9558530,0
...,...,...
8914958,14064699,0
8914959,7676538,0
8914960,15131030,0
8914961,17244732,0


In [8]:
trans = trans.merge(labels_df, on="id", how="inner")

In [9]:
trans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7647524 entries, 0 to 7647523
Data columns (total 13 columns):
 #   Column          Dtype  
---  ------          -----  
 0   id              int64  
 1   date            object 
 2   client_id       int64  
 3   card_id         int64  
 4   amount          object 
 5   use_chip        object 
 6   merchant_id     int64  
 7   merchant_city   object 
 8   merchant_state  object 
 9   zip             float64
 10  mcc             float64
 11  errors          object 
 12  fraud           int8   
dtypes: float64(2), int64(4), int8(1), object(6)
memory usage: 707.4+ MB


In [11]:
trans.isnull().sum()

id                      0
date                    0
client_id               0
card_id                 0
amount                  0
use_chip                0
merchant_id             0
merchant_city           0
merchant_state     893521
zip                942920
mcc                     0
errors            7525963
fraud                   0
dtype: int64

In [12]:
trans.shape

(7647524, 13)

In [13]:
trans["is_online"] = (trans["use_chip"] == "Online Transaction").astype(int)

In [14]:
trans["amount"] = (trans["amount"].str.replace("$", "", regex=False).astype("float32"))

In [15]:
errors_type = trans["errors"].unique()
errors_type

array([nan, 'Bad Expiration', 'Bad Card Number', 'Insufficient Balance',
       'Bad PIN', 'Technical Glitch', 'Bad CVV',
       'Insufficient Balance,Technical Glitch',
       'Bad PIN,Insufficient Balance', 'Bad Zipcode',
       'Bad Expiration,Technical Glitch',
       'Bad Card Number,Bad Expiration', 'Bad PIN,Technical Glitch',
       'Bad Card Number,Insufficient Balance',
       'Bad Expiration,Insufficient Balance', 'Bad Card Number,Bad CVV',
       'Bad CVV,Technical Glitch', 'Bad CVV,Insufficient Balance',
       'Bad Card Number,Technical Glitch',
       'Bad Zipcode,Insufficient Balance',
       'Bad Card Number,Bad Expiration,Insufficient Balance',
       'Bad Expiration,Bad CVV', 'Bad Zipcode,Technical Glitch'],
      dtype=object)

In [16]:
unique_errors = (
    pd.Series(errors_type)
    .dropna()
    .str.split(",")
    .explode()
    .str.strip()
    .unique()
)
print(unique_errors)

['Bad Expiration' 'Bad Card Number' 'Insufficient Balance' 'Bad PIN'
 'Technical Glitch' 'Bad CVV' 'Bad Zipcode']


In [17]:
err = trans["errors"]

# 에러 존재 여부 
trans["has_error"] = err.notna().astype("int8")

trans["err_bad_card_number"] = err.str.contains(
    "Bad Card Number",
    na=False
).astype("int8")

trans["err_bad_expiration"] = err.str.contains(
    "Bad Expiration",
    na=False
).astype("int8")

trans["err_bad_cvv"] = err.str.contains(
    "Bad CVV",
    na=False
).astype("int8")

trans["err_bad_pin"] = err.str.contains(
    "Bad PIN",
    na=False
).astype("int8")

trans["err_bad_zipcode"] = err.str.contains(
    "Bad Zipcode",
    na=False
).astype("int8")

trans["err_insufficient_balance"] = err.str.contains(
    "Insufficient Balance",
    na=False
).astype("int8")

trans["err_technical_glitch"] = err.str.contains(
    "Technical Glitch",
    na=False
).astype("int8")

In [18]:
trans.drop("errors", axis=1, inplace=True)
trans.isnull().sum()

id                               0
date                             0
client_id                        0
card_id                          0
amount                           0
use_chip                         0
merchant_id                      0
merchant_city                    0
merchant_state              893521
zip                         942920
mcc                              0
fraud                            0
is_online                        0
has_error                        0
err_bad_card_number              0
err_bad_expiration               0
err_bad_cvv                      0
err_bad_pin                      0
err_bad_zipcode                  0
err_insufficient_balance         0
err_technical_glitch             0
dtype: int64

In [20]:
trans.drop(columns=["merchant_state", "zip"], inplace=True)

In [21]:
trans

Unnamed: 0,id,date,client_id,card_id,amount,use_chip,merchant_id,merchant_city,mcc,fraud,is_online,has_error,err_bad_card_number,err_bad_expiration,err_bad_cvv,err_bad_pin,err_bad_zipcode,err_insufficient_balance,err_technical_glitch
0,7475327,2010-01-01 00:01:00,1556,2972,-77.000000,Swipe Transaction,59935,Beulah,5499.0,0,0,0,0,0,0,0,0,0,0
1,7475328,2010-01-01 00:02:00,561,4575,14.570000,Swipe Transaction,67570,Bettendorf,5311.0,0,0,0,0,0,0,0,0,0,0
2,7475329,2010-01-01 00:02:00,1129,102,80.000000,Swipe Transaction,27092,Vista,4829.0,0,0,0,0,0,0,0,0,0,0
3,7475332,2010-01-01 00:06:00,848,3915,46.410000,Swipe Transaction,13051,Harwood,5813.0,0,0,0,0,0,0,0,0,0,0
4,7475333,2010-01-01 00:07:00,1807,165,4.810000,Swipe Transaction,20519,Bronx,5942.0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7647519,21421689,2018-06-23 11:12:00,227,5867,58.320000,Chip Transaction,64730,Minneapolis,5211.0,0,0,0,0,0,0,0,0,0,0
7647520,21421690,2018-06-23 11:12:00,371,3134,21.700001,Chip Transaction,3847,Richmond,5411.0,0,0,0,0,0,0,0,0,0,0
7647521,21421691,2018-06-23 11:12:00,1225,2338,37.000000,Chip Transaction,18586,San Rafael,5310.0,0,0,0,0,0,0,0,0,0,0
7647522,21421693,2018-06-23 11:12:00,1362,2274,23.209999,Chip Transaction,75316,Star Tannery,5812.0,0,0,0,0,0,0,0,0,0,0


In [25]:
trans["use_chip"].unique()

array(['Swipe Transaction', 'Online Transaction', 'Chip Transaction'],
      dtype=object)

In [27]:
trans["use_chip"].value_counts()

use_chip
Swipe Transaction     4453665
Chip Transaction      2303088
Online Transaction     890771
Name: count, dtype: int64

In [28]:
trans["is_online"] = (trans["use_chip"] == "Online Transaction").astype(int)

In [29]:
trans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7647524 entries, 0 to 7647523
Data columns (total 19 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   id                        int64  
 1   date                      object 
 2   client_id                 int64  
 3   card_id                   int64  
 4   amount                    float32
 5   use_chip                  object 
 6   merchant_id               int64  
 7   merchant_city             object 
 8   mcc                       float64
 9   fraud                     int8   
 10  is_online                 int64  
 11  has_error                 int8   
 12  err_bad_card_number       int8   
 13  err_bad_expiration        int8   
 14  err_bad_cvv               int8   
 15  err_bad_pin               int8   
 16  err_bad_zipcode           int8   
 17  err_insufficient_balance  int8   
 18  err_technical_glitch      int8   
dtypes: float32(1), float64(1), int64(5), int8(9), object(3)
memory 

In [30]:
trans["date"] = pd.to_datetime(
    trans["date"],
    format="%Y-%m-%d %H:%M:%S",
    errors="coerce"
)

In [31]:
trans["tx_ts"] = trans["date"].astype("datetime64[ns]")
trans["tx_year"]  = trans["tx_ts"].dt.year.astype("int16")
trans["tx_month"] = trans["tx_ts"].dt.month.astype("int8")
trans["tx_day"]   = trans["tx_ts"].dt.day.astype("int8")     
trans["tx_hour"]  = trans["tx_ts"].dt.hour.astype("int8")

In [32]:
trans = trans.sort_values(
    ["date"]
).reset_index(drop=True)

In [33]:
trans["is_weekend"] = (trans["date"].dt.weekday >= 5).astype("int8")

In [34]:
trans

Unnamed: 0,id,date,client_id,card_id,amount,use_chip,merchant_id,merchant_city,mcc,fraud,...,err_bad_pin,err_bad_zipcode,err_insufficient_balance,err_technical_glitch,tx_ts,tx_year,tx_month,tx_day,tx_hour,is_weekend
0,7475327,2010-01-01 00:01:00,1556,2972,-77.000000,Swipe Transaction,59935,Beulah,5499.0,0,...,0,0,0,0,2010-01-01 00:01:00,2010,1,1,0,0
1,7475328,2010-01-01 00:02:00,561,4575,14.570000,Swipe Transaction,67570,Bettendorf,5311.0,0,...,0,0,0,0,2010-01-01 00:02:00,2010,1,1,0,0
2,7475329,2010-01-01 00:02:00,1129,102,80.000000,Swipe Transaction,27092,Vista,4829.0,0,...,0,0,0,0,2010-01-01 00:02:00,2010,1,1,0,0
3,7475332,2010-01-01 00:06:00,848,3915,46.410000,Swipe Transaction,13051,Harwood,5813.0,0,...,0,0,0,0,2010-01-01 00:06:00,2010,1,1,0,0
4,7475333,2010-01-01 00:07:00,1807,165,4.810000,Swipe Transaction,20519,Bronx,5942.0,0,...,0,0,0,0,2010-01-01 00:07:00,2010,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7647519,21421691,2018-06-23 11:12:00,1225,2338,37.000000,Chip Transaction,18586,San Rafael,5310.0,0,...,0,0,0,0,2018-06-23 11:12:00,2018,6,23,11,1
7647520,21421693,2018-06-23 11:12:00,1362,2274,23.209999,Chip Transaction,75316,Star Tannery,5812.0,0,...,0,0,0,0,2018-06-23 11:12:00,2018,6,23,11,1
7647521,21421689,2018-06-23 11:12:00,227,5867,58.320000,Chip Transaction,64730,Minneapolis,5211.0,0,...,0,0,0,0,2018-06-23 11:12:00,2018,6,23,11,1
7647522,21421690,2018-06-23 11:12:00,371,3134,21.700001,Chip Transaction,3847,Richmond,5411.0,0,...,0,0,0,0,2018-06-23 11:12:00,2018,6,23,11,1


In [35]:
trans.isnull().sum()

id                          0
date                        0
client_id                   0
card_id                     0
amount                      0
use_chip                    0
merchant_id                 0
merchant_city               0
mcc                         0
fraud                       0
is_online                   0
has_error                   0
err_bad_card_number         0
err_bad_expiration          0
err_bad_cvv                 0
err_bad_pin                 0
err_bad_zipcode             0
err_insufficient_balance    0
err_technical_glitch        0
tx_ts                       0
tx_year                     0
tx_month                    0
tx_day                      0
tx_hour                     0
is_weekend                  0
dtype: int64

In [36]:
trans["use_chip"].value_counts()

use_chip
Swipe Transaction     4453665
Chip Transaction      2303088
Online Transaction     890771
Name: count, dtype: int64

In [37]:
trans = trans[trans["use_chip"]=="Online Transaction"].copy()

In [38]:
trans["use_chip"].value_counts()

use_chip
Online Transaction    890771
Name: count, dtype: int64

In [39]:
trans.info()

<class 'pandas.core.frame.DataFrame'>
Index: 890771 entries, 5 to 7647494
Data columns (total 25 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   id                        890771 non-null  int64         
 1   date                      890771 non-null  datetime64[ns]
 2   client_id                 890771 non-null  int64         
 3   card_id                   890771 non-null  int64         
 4   amount                    890771 non-null  float32       
 5   use_chip                  890771 non-null  object        
 6   merchant_id               890771 non-null  int64         
 7   merchant_city             890771 non-null  object        
 8   mcc                       890771 non-null  float64       
 9   fraud                     890771 non-null  int8          
 10  is_online                 890771 non-null  int64         
 11  has_error                 890771 non-null  int8          
 12  err_ba

In [40]:
trans.drop("is_online", axis=1, inplace=True)

In [41]:
trans.info()

<class 'pandas.core.frame.DataFrame'>
Index: 890771 entries, 5 to 7647494
Data columns (total 24 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   id                        890771 non-null  int64         
 1   date                      890771 non-null  datetime64[ns]
 2   client_id                 890771 non-null  int64         
 3   card_id                   890771 non-null  int64         
 4   amount                    890771 non-null  float32       
 5   use_chip                  890771 non-null  object        
 6   merchant_id               890771 non-null  int64         
 7   merchant_city             890771 non-null  object        
 8   mcc                       890771 non-null  float64       
 9   fraud                     890771 non-null  int8          
 10  has_error                 890771 non-null  int8          
 11  err_bad_card_number       890771 non-null  int8          
 12  err_ba

In [43]:
trans.drop("tx_ts", axis=1, inplace=True)

In [44]:
trans.info()

<class 'pandas.core.frame.DataFrame'>
Index: 890771 entries, 5 to 7647494
Data columns (total 23 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   id                        890771 non-null  int64         
 1   date                      890771 non-null  datetime64[ns]
 2   client_id                 890771 non-null  int64         
 3   card_id                   890771 non-null  int64         
 4   amount                    890771 non-null  float32       
 5   use_chip                  890771 non-null  object        
 6   merchant_id               890771 non-null  int64         
 7   merchant_city             890771 non-null  object        
 8   mcc                       890771 non-null  float64       
 9   fraud                     890771 non-null  int8          
 10  has_error                 890771 non-null  int8          
 11  err_bad_card_number       890771 non-null  int8          
 12  err_ba

In [45]:
trans["merchant_city"].value_counts()

merchant_city
ONLINE    890771
Name: count, dtype: int64

In [46]:
trans.drop("merchant_city", axis=1, inplace=True)

In [47]:
trans.info()

<class 'pandas.core.frame.DataFrame'>
Index: 890771 entries, 5 to 7647494
Data columns (total 22 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   id                        890771 non-null  int64         
 1   date                      890771 non-null  datetime64[ns]
 2   client_id                 890771 non-null  int64         
 3   card_id                   890771 non-null  int64         
 4   amount                    890771 non-null  float32       
 5   use_chip                  890771 non-null  object        
 6   merchant_id               890771 non-null  int64         
 7   mcc                       890771 non-null  float64       
 8   fraud                     890771 non-null  int8          
 9   has_error                 890771 non-null  int8          
 10  err_bad_card_number       890771 non-null  int8          
 11  err_bad_expiration        890771 non-null  int8          
 12  err_ba

In [48]:
trans["use_chip"].value_counts()

use_chip
Online Transaction    890771
Name: count, dtype: int64

In [49]:
trans.drop("use_chip", axis=1, inplace=True)

In [50]:
trans.info()

<class 'pandas.core.frame.DataFrame'>
Index: 890771 entries, 5 to 7647494
Data columns (total 21 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   id                        890771 non-null  int64         
 1   date                      890771 non-null  datetime64[ns]
 2   client_id                 890771 non-null  int64         
 3   card_id                   890771 non-null  int64         
 4   amount                    890771 non-null  float32       
 5   merchant_id               890771 non-null  int64         
 6   mcc                       890771 non-null  float64       
 7   fraud                     890771 non-null  int8          
 8   has_error                 890771 non-null  int8          
 9   err_bad_card_number       890771 non-null  int8          
 10  err_bad_expiration        890771 non-null  int8          
 11  err_bad_cvv               890771 non-null  int8          
 12  err_ba

In [52]:
trans["is_refund"] = (trans["amount"] < 0).astype("int8")

trans["log_abs_amount"] = np.log1p(
    np.abs(trans["amount"])
).astype("float32")

In [53]:
trans["mcc"] = pd.to_numeric(trans["mcc"], errors="coerce")
trans["mcc"] = trans["mcc"].astype("Int64")

In [None]:
MCC_GROUP = {
    # 1) Food & Daily
    "Food & Daily": [
        "5812","5814","5813","5411","5499","5912","5921",
        "5300","5310","5311"
    ],

    # 2) Transport & Travel  (+ freight 일부 흡수, + 4112/4411 포함)
    "Transport & Travel": [
        "4111","4121","4131","4112",
        "3722","3771","3775",
        "4511","4411",
        "4722","7011","4784",
        "4214"  # Motor Freight -> 여기로 흡수
    ],

    # 3) Digital & Online
    "Digital & Online": [
        "5815","5816","4814","4899","3780"
    ],

    # 4) Financial
    "Financial": [
        "4829","6300","7276","8931"
    ],

    # 5) Retail
    "Retail": [
        "5045","5732","5733",
        "5941","5942","5947",
        "5661","5651","5655","5621",
        "5977","5970","5932",
        "5192","5193",
        "5712","5719","5722",
        "5094"
    ],

    # 6) Medical
    "Medical": [
        "8011","8021","8041","8043","8049","8062","8099"
    ],

    # 7) Entertainment 
    "Entertainment": [
        "7832","7922","7996","7801","7802","7995"
    ],

    # 8) Automotive & Home
    "Automotive & Home": [
        "5541", 
        "7531","7538","7542","7549","5533",
        "1711","5251","5261","5211","3504",
        "7210","7230","7349",
        "3640"
    ],

    # 9) Utilities & Government
    "Utilities & Government": [
        "4900","9402"
    ],

    # 10) Professional Services
    "Professional Services": [
        "8111","7393"
    ],

    # 11) Industrial / Manufacturing (나머지 제조/가공 계열)
    "Industrial / Manufacturing": [
        "3000","3001","3005","3006","3007","3008","3009",
        "3058","3066","3075",
        "3132","3144","3174",
        "3256","3260",
        "3359","3387","3389","3390","3393","3395","3405",
        "3509","3596","3684",
        "3730" 
    ],
}



trans["mcc"] = trans["mcc"].astype(str)

for group_name, mcc_list in MCC_GROUP.items():
    col_name = f"mccg_{group_name.replace(' & ', '_').replace(' ', '_')}"
    trans[col_name] = trans["mcc"].isin(mcc_list).astype(np.int8)


In [57]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

LABEL_COL = "fraud"
MCC_COL = "mcc"

def _collapse_rare_to_other(s: pd.Series, min_count: int = 50, other: str = "OTHER") -> pd.Series:
    s = s.astype(str)
    vc = s.value_counts(dropna=False)
    rare = vc[vc < min_count].index
    return s.where(~s.isin(rare), other)

def make_mcc_onehot(trans: pd.DataFrame,
                    mcc_col: str = MCC_COL,
                    prefix: str = "mcc",
                    drop_first: bool = True,
                    min_count: int = 50,
                    other: str = "OTHER") -> tuple[pd.DataFrame, list]:
    df = trans.copy()
    df[mcc_col] = df[mcc_col].astype(str)

    df[mcc_col] = _collapse_rare_to_other(df[mcc_col], min_count=min_count, other=other)

    dummies = pd.get_dummies(df[mcc_col], prefix=prefix, drop_first=drop_first, dtype=np.int8)

    nunique = dummies.nunique(dropna=False)
    dummies = dummies.loc[:, nunique > 1]
    dummies = dummies.loc[:, ~dummies.T.duplicated()]

    out = pd.concat([df, dummies], axis=1)
    x_cols = list(dummies.columns)
    return out, x_cols

def make_mcc_group(trans: pd.DataFrame,
                   mcc_group: dict,
                   mcc_col: str = MCC_COL,
                   prefix: str = "mccg_") -> tuple[pd.DataFrame, list]:
    df = trans.copy()
    df[mcc_col] = df[mcc_col].astype(str)

    cols = []
    for group_name, mcc_list in mcc_group.items():
        col_name = f"{prefix}{group_name.replace(' & ', '_').replace(' ', '_').replace('/', '_')}"
        df[col_name] = df[mcc_col].isin(list(map(str, mcc_list))).astype(np.int8)
        cols.append(col_name)

    dummies = df[cols].copy()
    nunique = dummies.nunique(dropna=False)
    keep = list(nunique[nunique > 1].index)
    return df, keep

def fit_logit_and_print_summary(df: pd.DataFrame, y_col: str, x_cols: list, maxiter: int = 200, method: str = "lbfgs"):
    X = df[x_cols].copy()
    y = df[y_col].astype(int)

    X = sm.add_constant(X, has_constant="add")
    model = sm.Logit(y, X)
    res = model.fit(disp=False, maxiter=maxiter, method=method)
    print(res.summary())
    return res

missing = [c for c in [LABEL_COL, MCC_COL] if c not in trans.columns]
if missing:
    raise KeyError(f"Missing columns in trans: {missing}")

df_onehot, mcc_cols = make_mcc_onehot(
    trans,
    mcc_col=MCC_COL,
    prefix="mcc",
    drop_first=True,
    min_count=50,
    other="OTHER",
)
print("\n==============================")
print("LOGIT SUMMARY: MCC one-hot")
print("==============================\n")
res_onehot = fit_logit_and_print_summary(df_onehot, y_col=LABEL_COL, x_cols=mcc_cols, maxiter=200, method="lbfgs")

df_group, group_cols = make_mcc_group(
    trans,
    MCC_GROUP,
    mcc_col=MCC_COL,
    prefix="mccg_",
)


if len(group_cols) >= 2:
    group_cols_use = group_cols[1:]
else:
    group_cols_use = group_cols

print("\n==============================")
print("LOGIT SUMMARY: MCC grouped (mccg_*)")
print("==============================\n")
res_group = fit_logit_and_print_summary(df_group, y_col=LABEL_COL, x_cols=group_cols_use, maxiter=200, method="lbfgs")



LOGIT SUMMARY: MCC one-hot

                           Logit Regression Results                           
Dep. Variable:                  fraud   No. Observations:               890771
Model:                          Logit   Df Residuals:                   890720
Method:                           MLE   Df Model:                           50
Date:                Thu, 12 Feb 2026   Pseudo R-squ.:                  0.6358
Time:                        14:50:14   Log-Likelihood:                -17810.
converged:                       True   LL-Null:                       -48900.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          3.2617      0.729      4.477      0.000       1.834       4.690
mcc_3390       5.6663     11.952      0.474      0.635     -17.758      29.091
mcc_3504       7.2114  

In [62]:
LABEL_COL = "fraud"
MCC_COL = "mcc"

def _collapse_rare_to_other(s: pd.Series, min_count: int = 50, other: str = "OTHER") -> pd.Series:
    s = s.astype(str)
    vc = s.value_counts(dropna=False)
    rare = vc[vc < min_count].index
    return s.where(~s.isin(rare), other)

def make_mcc_onehot(trans: pd.DataFrame,
                    mcc_col: str = MCC_COL,
                    prefix: str = "mcc",
                    drop_first: bool = True,
                    min_count: int = 50,
                    other: str = "OTHER") -> tuple[pd.DataFrame, list]:
    df = trans.copy()
    df[mcc_col] = df[mcc_col].astype(str)
    df[mcc_col] = _collapse_rare_to_other(df[mcc_col], min_count=min_count, other=other)

    dummies = pd.get_dummies(df[mcc_col], prefix=prefix, drop_first=drop_first, dtype=np.int8)
    nunique = dummies.nunique(dropna=False)
    dummies = dummies.loc[:, nunique > 1]
    dummies = dummies.loc[:, ~dummies.T.duplicated()]

    out = pd.concat([df, dummies], axis=1)
    x_cols = list(dummies.columns)
    return out, x_cols

def make_mcc_group(trans: pd.DataFrame,
                   mcc_group: dict,
                   mcc_col: str = MCC_COL,
                   prefix: str = "mccg_") -> tuple[pd.DataFrame, list]:
    df = trans.copy()
    df[mcc_col] = df[mcc_col].astype(str)

    cols = []
    for group_name, mcc_list in mcc_group.items():
        col_name = f"{prefix}{group_name.replace(' & ', '_').replace(' ', '_').replace('/', '_')}"
        df[col_name] = df[mcc_col].isin(list(map(str, mcc_list))).astype(np.int8)
        cols.append(col_name)

    dummies = df[cols].copy()
    nunique = dummies.nunique(dropna=False)
    keep = list(nunique[nunique > 1].index)
    return df, keep

def fit_logit_and_print_summary(df: pd.DataFrame, y_col: str, x_cols: list, maxiter: int = 200, method: str = "lbfgs"):
    X = df[x_cols].copy()
    y = df[y_col].astype(int)

    X = sm.add_constant(X, has_constant="add")
    model = sm.Logit(y, X)
    res = model.fit(disp=False, maxiter=maxiter, method=method)
    print(res.summary())
    return res

def to_or_table(res) -> pd.DataFrame:
    params = res.params
    conf = res.conf_int()
    out = pd.DataFrame({
        "coef": params,
        "OR": np.exp(params),
        "OR_CI_low": np.exp(conf[0]),
        "OR_CI_high": np.exp(conf[1]),
        "p_value": res.pvalues
    })
    return out

missing = [c for c in [LABEL_COL, MCC_COL] if c not in trans.columns]
if missing:
    raise KeyError(f"Missing columns in trans: {missing}")

df_onehot, mcc_cols = make_mcc_onehot(
    trans,
    mcc_col=MCC_COL,
    prefix="mcc",
    drop_first=True,
    min_count=50,
    other="OTHER",
)
print("\n==============================")
print("LOGIT SUMMARY: MCC one-hot")
print("==============================\n")
res_onehot = fit_logit_and_print_summary(df_onehot, y_col=LABEL_COL, x_cols=mcc_cols, maxiter=200, method="lbfgs")

df_group, group_cols = make_mcc_group(
    trans,
    MCC_GROUP,
    mcc_col=MCC_COL,
    prefix="mccg_",
)

group_cols_use = group_cols[1:] if len(group_cols) >= 2 else group_cols

print("\n==============================")
print("LOGIT SUMMARY: MCC grouped (mccg_*)")
print("==============================\n")
res_group = fit_logit_and_print_summary(df_group, y_col=LABEL_COL, x_cols=group_cols_use, maxiter=200, method="lbfgs")

or_group = to_or_table(res_group).drop(index="const", errors="ignore")
print("\n==============================")
print("OR TABLE: MCC grouped (mccg_*)")
print("==============================\n")
print(or_group.sort_values("OR", ascending=False).to_string())

or_group.to_csv("mcc_group_or_table.csv", index=True)



LOGIT SUMMARY: MCC one-hot

                           Logit Regression Results                           
Dep. Variable:                  fraud   No. Observations:               890771
Model:                          Logit   Df Residuals:                   890720
Method:                           MLE   Df Model:                           50
Date:                Thu, 12 Feb 2026   Pseudo R-squ.:                  0.6358
Time:                        15:03:54   Log-Likelihood:                -17810.
converged:                       True   LL-Null:                       -48900.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          3.2617      0.729      4.477      0.000       1.834       4.690
mcc_3390       5.6663     11.952      0.474      0.635     -17.758      29.091
mcc_3504       7.2114  

In [63]:
def to_or_table(res):
    params = res.params
    conf = res.conf_int()
    out = pd.DataFrame({
        "coef": params,
        "OR": np.exp(params),
        "OR_CI_low": np.exp(conf[0]),
        "OR_CI_high": np.exp(conf[1]),
        "p_value": res.pvalues
    })
    return out


# -----------------------------
# ONE-HOT OR TABLE
# -----------------------------
or_onehot = to_or_table(res_onehot).drop(index="const", errors="ignore")

print("\n==============================")
print("OR TABLE: MCC one-hot")
print("==============================\n")

print(or_onehot.sort_values("OR", ascending=False).to_string())



OR TABLE: MCC one-hot

                coef            OR      OR_CI_low     OR_CI_high       p_value
mcc_5300   17.809791  5.428677e+07   0.000000e+00            inf  9.896399e-01
mcc_4829   17.111329  2.699949e+07   0.000000e+00            inf  9.862950e-01
mcc_OTHER  16.511172  1.481531e+07   0.000000e+00            inf  9.826196e-01
mcc_5310   13.345797  6.251812e+05  4.142116e-156  9.436035e+166  9.438112e-01
mcc_5719   11.814779  1.352363e+05   7.379689e-87   2.478270e+96  9.122545e-01
mcc_5912   10.967363  5.795156e+04   1.533991e-64   2.189311e+73  8.917183e-01
mcc_5814   10.443213  3.431072e+04   1.964773e-53   5.991662e+61  8.765906e-01
mcc_5094    8.625377  5.571265e+03   1.293757e-26   2.399136e+33  8.043256e-01
mcc_4214    8.527466  5.051631e+03   1.334043e-25   1.912905e+32  7.995033e-01
mcc_3722    7.430790  1.687140e+03   1.505677e-16   1.890473e+22  7.398618e-01
mcc_4112    7.300312  1.480761e+03   1.025502e-15   2.138128e+21  7.322071e-01
mcc_3504    7.211388  1.3547

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [64]:
or_group

Unnamed: 0,coef,OR,OR_CI_low,OR_CI_high,p_value
mccg_Transport_Travel,-3.2958,0.037038,0.034455,0.03981545,0.0
mccg_Digital_Online,-1.985396,0.137326,0.127029,0.1484584,0.0
mccg_Financial,0.028032,1.028428,0.946204,1.117798,0.5096854
mccg_Retail,0.772092,2.164289,2.029681,2.307824,8.537535e-123
mccg_Entertainment,-1.277008,0.27887,0.248317,0.3131833,3.515858e-103
mccg_Automotive_Home,-0.831853,0.435242,0.387902,0.4883598,1.645215e-45
mccg_Utilities_Government,-3.835209,0.021597,0.016064,0.02903472,2.555056e-142
mccg_Professional_Services,-5.53392,0.00395,0.000387,0.04037414,3.065752e-06
mccg_Industrial___Manufacturing,11.856829,141044.188471,638.617164,31150840.0,1.666201e-05


In [65]:
or_onehot

Unnamed: 0,coef,OR,OR_CI_low,OR_CI_high,p_value
mcc_3390,5.666314,288.9675,1.939066e-08,4306313000000.0,0.6354262
mcc_3504,7.211388,1354.771,3.575008e-15,5.133988e+20,0.7269434
mcc_3509,6.388399,594.9031,5.063322e-11,6989675000000000.0,0.6773714
mcc_3596,7.028766,1128.637,4.029511e-14,3.161231e+19,0.7160361
mcc_3640,6.542871,694.277,1.081782e-11,4.455801e+16,0.686686
mcc_3684,6.175704,480.9214,3.590197e-10,644213500000000.0,0.6646675
mcc_3722,7.43079,1687.14,1.505677e-16,1.890473e+22,0.7398618
mcc_3730,7.075027,1182.076,2.220363e-14,6.293129e+19,0.7188098
mcc_3771,6.982094,1077.172,7.265335e-14,1.597035e+19,0.7132315
mcc_3775,5.954848,385.6183,2.270848e-09,65482810000000.0,0.651729


In [68]:
def filter_sig_or(or_df: pd.DataFrame,
                  p_thresh: float = 0.05,
                  ci_low: str = "OR_CI_low",
                  ci_high: str = "OR_CI_high",
                  p_col: str = "p_value",
                  or_col: str = "OR") -> pd.DataFrame:
    df = or_df.copy()

    need = {p_col, ci_low, ci_high, or_col}
    missing = need - set(df.columns)
    if missing:
        raise KeyError(f"Missing columns in OR table: {missing}")

    df = df.dropna(subset=[p_col, ci_low, ci_high, or_col])

    sig = df[p_col] < p_thresh
    ci_not_cross_1 = (df[ci_low] > 1.0) | (df[ci_high] < 1.0)

    out = df.loc[sig & ci_not_cross_1].copy()

    out["direction"] = np.where(out[or_col] > 1.0, "risk↑ (OR>1)", "risk↓ (OR<1)")
    out = out.sort_values([p_col, or_col], ascending=[True, False])

    return out

sig_group = filter_sig_or(or_group, p_thresh=0.05)
sig_onehot = filter_sig_or(or_onehot, p_thresh=0.05)

In [69]:
sig_group

Unnamed: 0,coef,OR,OR_CI_low,OR_CI_high,p_value,direction
mccg_Digital_Online,-1.985396,0.137326,0.127029,0.1484584,0.0,risk↓ (OR<1)
mccg_Transport_Travel,-3.2958,0.037038,0.034455,0.03981545,0.0,risk↓ (OR<1)
mccg_Utilities_Government,-3.835209,0.021597,0.016064,0.02903472,2.555056e-142,risk↓ (OR<1)
mccg_Retail,0.772092,2.164289,2.029681,2.307824,8.537535e-123,risk↑ (OR>1)
mccg_Entertainment,-1.277008,0.27887,0.248317,0.3131833,3.515858e-103,risk↓ (OR<1)
mccg_Automotive_Home,-0.831853,0.435242,0.387902,0.4883598,1.645215e-45,risk↓ (OR<1)
mccg_Professional_Services,-5.53392,0.00395,0.000387,0.04037414,3.065752e-06,risk↓ (OR<1)
mccg_Industrial___Manufacturing,11.856829,141044.188471,638.617164,31150840.0,1.666201e-05,risk↑ (OR>1)


| 그룹                         | OR      | 해석                           |
| -------------------------- | ------- | ---------------------------- |
| Industrial / Manufacturing | 141,044 | 사실상 완전 분리 (quasi-separation) |
| Retail                     | 2.16    | baseline 대비 약 2.2배 위험        |

In [70]:
sig_onehot

Unnamed: 0,coef,OR,OR_CI_low,OR_CI_high,p_value,direction
mcc_4121,-10.301286,3.358987e-05,7.980917e-06,0.000141,7.86447e-45,risk↓ (OR<1)
mcc_4899,-10.866078,1.909512e-05,4.192457e-06,8.7e-05,8.050454e-45,risk↓ (OR<1)
mcc_4814,-9.181984,0.0001028762,2.446358e-05,0.000433,5.1612689999999996e-36,risk↓ (OR<1)
mcc_7801,-8.066037,0.0003140253,7.408319e-05,0.001331,6.944637e-28,risk↓ (OR<1)
mcc_7922,-8.007457,0.0003329702,7.875918e-05,0.001408,1.33969e-27,risk↓ (OR<1)
mcc_4722,-7.838819,0.0003941344,9.370917e-05,0.001658,1.0703489999999999e-26,risk↓ (OR<1)
mcc_5815,-7.642223,0.0004797609,0.000114695,0.002007,1.223877e-25,risk↓ (OR<1)
mcc_5661,-7.618738,0.0004911612,0.000111509,0.002163,7.396075e-24,risk↓ (OR<1)
mcc_5311,-7.058003,0.0008604945,0.0002061595,0.003592,3.614865e-22,risk↓ (OR<1)
mcc_5193,-7.056315,0.0008619488,0.0002022745,0.003673,1.416754e-21,risk↓ (OR<1)


rule + one-hot 조합으로 가야함

In [71]:
trans.info()

<class 'pandas.core.frame.DataFrame'>
Index: 890771 entries, 5 to 7647494
Data columns (total 34 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   id                               890771 non-null  int64         
 1   date                             890771 non-null  datetime64[ns]
 2   client_id                        890771 non-null  int64         
 3   card_id                          890771 non-null  int64         
 4   amount                           890771 non-null  float32       
 5   merchant_id                      890771 non-null  int64         
 6   mcc                              890771 non-null  object        
 7   fraud                            890771 non-null  int8          
 8   has_error                        890771 non-null  int8          
 9   err_bad_card_number              890771 non-null  int8          
 10  err_bad_expiration               890771 non-null