In [94]:
import pandas as pd
import matplotlib.pyplot as plt
import ijson
import numpy as np
import os

In [2]:
card = pd.read_csv("cards_data.csv")
transaction = pd.read_csv("transactions_data.csv")
user = pd.read_csv("users_data.csv")

# Dataset setting according to user intersection

In [3]:
card = card.sort_values(by="client_id", ascending=True)
transaction = transaction.sort_values(by="client_id", ascending=True)
user = user.sort_values(by="id", ascending=True)

In [4]:
card_id = card["client_id"].unique()
trans_id = transaction["client_id"].unique()
user_id = user["id"].unique()

In [5]:
common = set(card_id) & set(trans_id) & set(user_id)
print("len of common: ", len(common))
print("unique of common:", common)

len of common:  1219
unique of common: {np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(11), np.int64(13), np.int64(14), np.int64(16), np.int64(17), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(24), np.int64(27), np.int64(28), np.int64(32), np.int64(33), np.int64(34), np.int64(37), np.int64(38), np.int64(39), np.int64(40), np.int64(42), np.int64(44), np.int64(45), np.int64(46), np.int64(47), np.int64(48), np.int64(50), np.int64(51), np.int64(52), np.int64(53), np.int64(55), np.int64(57), np.int64(58), np.int64(59), np.int64(60), np.int64(61), np.int64(62), np.int64(63), np.int64(64), np.int64(65), np.int64(68), np.int64(69), np.int64(73), np.int64(74), np.int64(75), np.int64(77), np.int64(78), np.int64(79), np.int64(80), np.int64(81), np.int64(84), np.int64(86), np.int64(87), np.int64(89), np.int64(90), np.int64(92), np.int64(94), np.int64(96), np.int64(98), np.int64(100), np.int64(103), np.int64(104), np.int64(106), np.int64(1

In [6]:
card_common = card[card["client_id"].isin(common)]
trans_common = transaction[transaction["client_id"].isin(common)]
user_common = user[user["id"].isin(common)]
print("nunique check:", card_common["client_id"].nunique(), trans_common["client_id"].nunique(), user_common["id"].nunique())
print("card shape:", card_common.shape)
print("trans shape:", trans_common.shape)
print("user shape:", user_common.shape)

nunique check: 1219 1219 1219
card shape: (4514, 13)
trans shape: (11413790, 12)
user shape: (1219, 14)


# Dataset setting according to card intersection

In [7]:
trans_common.columns

Index(['id', 'date', 'client_id', 'card_id', 'amount', 'use_chip',
       'merchant_id', 'merchant_city', 'merchant_state', 'zip', 'mcc',
       'errors'],
      dtype='object')

In [8]:
card_common.columns

Index(['id', 'client_id', 'card_brand', 'card_type', 'card_number', 'expires',
       'cvv', 'has_chip', 'num_cards_issued', 'credit_limit', 'acct_open_date',
       'year_pin_last_changed', 'card_on_dark_web'],
      dtype='object')

In [9]:
trans_card = trans_common["card_id"].unique()
card_common = card_common[card_common["id"].isin(trans_card)]

In [10]:
card_common.shape

(3992, 13)

In [11]:
card_common["id"].duplicated().sum()

np.int64(0)

In [12]:
common_id = set(card_common["id"].unique()) & set(trans_common["card_id"].unique())
print(len(common_id))

3992


# Adding target column to trans_common

In [13]:
ids = []
labels = []

with open("train_fraud_labels.json", "rb") as f:
    for k, v in ijson.kvitems(f, "target"):
        kid = int(str(k).strip())
        ids.append(kid)
        labels.append(1 if v == "Yes" else 0)

labels_df = pd.DataFrame({"id": ids, "fraud": labels}).astype({"fraud": "int8"})
labels_df

Unnamed: 0,id,fraud
0,10649266,0
1,23410063,0
2,9316588,0
3,12478022,0
4,9558530,0
...,...,...
8914958,14064699,0
8914959,7676538,0
8914960,15131030,0
8914961,17244732,0


In [14]:
trans_common = trans_common.merge(labels_df, on="id", how="inner")
trans_common

Unnamed: 0,id,date,client_id,card_id,amount,use_chip,merchant_id,merchant_city,merchant_state,zip,mcc,errors,fraud
0,18632463,2016-11-08 19:59:00,0,1271,$7.28,Chip Transaction,13153,Scarborough,ME,4074.0,5812.0,,0
1,15915081,2015-04-06 20:35:00,0,4639,$9.03,Chip Transaction,44919,Philadelphia,PA,19139.0,5814.0,,0
2,20356840,2017-11-09 13:02:00,0,1271,$42.28,Chip Transaction,60569,Lewiston,ME,4240.0,5300.0,,0
3,14789943,2014-08-05 19:39:00,0,1271,$9.73,Swipe Transaction,887,Detroit,MI,48227.0,5812.0,,0
4,13143772,2013-08-09 19:50:00,0,1271,$8.89,Swipe Transaction,98648,Scarborough,ME,4074.0,5814.0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7647519,9140827,2011-02-11 08:18:00,1998,2160,$13.28,Swipe Transaction,59935,Burlingame,CA,94010.0,5499.0,,0
7647520,9417002,2011-04-17 07:03:00,1998,2160,$6.61,Online Transaction,9932,ONLINE,,,5311.0,,0
7647521,8459267,2010-09-01 13:34:00,1998,2160,$3.52,Swipe Transaction,59935,Burlingame,CA,94010.0,5499.0,,0
7647522,13513522,2013-10-30 08:41:00,1998,2160,$5.05,Online Transaction,47399,ONLINE,,,5815.0,,0


# [Setting trans_common]

In [15]:
trans_common = trans_common.drop("id", axis=1)

In [16]:
trans_common.columns

Index(['date', 'client_id', 'card_id', 'amount', 'use_chip', 'merchant_id',
       'merchant_city', 'merchant_state', 'zip', 'mcc', 'errors', 'fraud'],
      dtype='object')

In [17]:
trans_common.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7647524 entries, 0 to 7647523
Data columns (total 12 columns):
 #   Column          Dtype  
---  ------          -----  
 0   date            object 
 1   client_id       int64  
 2   card_id         int64  
 3   amount          object 
 4   use_chip        object 
 5   merchant_id     int64  
 6   merchant_city   object 
 7   merchant_state  object 
 8   zip             float64
 9   mcc             float64
 10  errors          object 
 11  fraud           int8   
dtypes: float64(2), int64(3), int8(1), object(6)
memory usage: 649.1+ MB


### date
### amount

In [18]:
trans_common["date"] = pd.to_datetime(trans_common["date"])
trans_common["amount"] = (trans_common["amount"].str.replace("$", "", regex=False).astype("float32"))

In [19]:
trans_common.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7647524 entries, 0 to 7647523
Data columns (total 12 columns):
 #   Column          Dtype         
---  ------          -----         
 0   date            datetime64[ns]
 1   client_id       int64         
 2   card_id         int64         
 3   amount          float32       
 4   use_chip        object        
 5   merchant_id     int64         
 6   merchant_city   object        
 7   merchant_state  object        
 8   zip             float64       
 9   mcc             float64       
 10  errors          object        
 11  fraud           int8          
dtypes: datetime64[ns](1), float32(1), float64(2), int64(3), int8(1), object(4)
memory usage: 619.9+ MB


### error

In [21]:
trans_common["errors"].value_counts()

errors
Insufficient Balance                                   75237
Bad PIN                                                18462
Technical Glitch                                       15196
Bad Card Number                                         4463
Bad CVV                                                 3504
Bad Expiration                                          3492
Bad Zipcode                                              669
Bad PIN,Insufficient Balance                             166
Insufficient Balance,Technical Glitch                    136
Bad Card Number,Insufficient Balance                      43
Bad PIN,Technical Glitch                                  36
Bad CVV,Insufficient Balance                              35
Bad Expiration,Insufficient Balance                       30
Bad Card Number,Bad Expiration                            27
Bad Card Number,Bad CVV                                   20
Bad Expiration,Bad CVV                                    16
Bad Expiration,Te

In [22]:
errors_type = trans_common["errors"].unique()
errors_type

array([nan, 'Insufficient Balance', 'Bad PIN', 'Bad CVV',
       'Technical Glitch', 'Bad Card Number', 'Bad Zipcode',
       'Bad Expiration', 'Bad Expiration,Technical Glitch',
       'Bad PIN,Technical Glitch', 'Bad PIN,Insufficient Balance',
       'Bad Card Number,Bad Expiration',
       'Insufficient Balance,Technical Glitch', 'Bad Expiration,Bad CVV',
       'Bad Expiration,Insufficient Balance',
       'Bad CVV,Insufficient Balance', 'Bad Card Number,Bad CVV',
       'Bad Zipcode,Insufficient Balance',
       'Bad Card Number,Insufficient Balance',
       'Bad Card Number,Technical Glitch',
       'Bad Card Number,Bad Expiration,Insufficient Balance',
       'Bad Zipcode,Technical Glitch', 'Bad CVV,Technical Glitch'],
      dtype=object)

In [23]:
unique_errors = (
    pd.Series(errors_type)
    .dropna()
    .str.split(",")
    .explode()
    .str.strip()
    .unique()
)
print(unique_errors)

['Insufficient Balance' 'Bad PIN' 'Bad CVV' 'Technical Glitch'
 'Bad Card Number' 'Bad Zipcode' 'Bad Expiration']


In [24]:
err = trans_common["errors"]

# 에러 존재 여부 
trans_common["has_error"] = err.notna().astype("int8")

# 1. Card Credential Errors
trans_common["err_card_credential"] = err.str.contains(
    "Bad Card Number|Bad Expiration|Bad CVV",
    na=False
).astype("int8")

# 2. Authentication Errors
trans_common["err_authentication"] = err.str.contains(
    "Bad PIN|Bad Zipcode",
    na=False
).astype("int8")

# 3. Financial Constraint Errors
trans_common["err_financial"] = err.str.contains(
    "Insufficient Balance",
    na=False
).astype("int8")

# 4. System Errors
trans_common["err_system"] = err.str.contains(
    "Technical Glitch",
    na=False
).astype("int8")

In [25]:
trans_common.head()

Unnamed: 0,date,client_id,card_id,amount,use_chip,merchant_id,merchant_city,merchant_state,zip,mcc,errors,fraud,has_error,err_card_credential,err_authentication,err_financial,err_system
0,2016-11-08 19:59:00,0,1271,7.28,Chip Transaction,13153,Scarborough,ME,4074.0,5812.0,,0,0,0,0,0,0
1,2015-04-06 20:35:00,0,4639,9.03,Chip Transaction,44919,Philadelphia,PA,19139.0,5814.0,,0,0,0,0,0,0
2,2017-11-09 13:02:00,0,1271,42.279999,Chip Transaction,60569,Lewiston,ME,4240.0,5300.0,,0,0,0,0,0,0
3,2014-08-05 19:39:00,0,1271,9.73,Swipe Transaction,887,Detroit,MI,48227.0,5812.0,,0,0,0,0,0,0
4,2013-08-09 19:50:00,0,1271,8.89,Swipe Transaction,98648,Scarborough,ME,4074.0,5814.0,,0,0,0,0,0,0


In [26]:
# error 없는데 있다고 체크된 것 있는지 확인 
bad_rows = trans_common[(trans_common["errors"].notna()) & (trans_common["has_error"] == 0)]
len(bad_rows)

0

In [27]:
trans_common = trans_common.drop(columns=["errors"])
trans_common.columns

Index(['date', 'client_id', 'card_id', 'amount', 'use_chip', 'merchant_id',
       'merchant_city', 'merchant_state', 'zip', 'mcc', 'fraud', 'has_error',
       'err_card_credential', 'err_authentication', 'err_financial',
       'err_system'],
      dtype='object')

### mcc

In [28]:
trans_common["mcc"].nunique()

109

In [71]:
trans_common["mcc"] = trans_common["mcc"].astype("Int64").astype("category")
trans_common.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7647524 entries, 0 to 7647523
Data columns (total 16 columns):
 #   Column               Dtype         
---  ------               -----         
 0   date                 datetime64[ns]
 1   client_id            int64         
 2   card_id              int64         
 3   amount               float32       
 4   use_chip             category      
 5   merchant_id          int64         
 6   mcc                  category      
 7   fraud                int8          
 8   has_error            int8          
 9   err_card_credential  int8          
 10  err_authentication   int8          
 11  err_financial        int8          
 12  err_system           int8          
 13  is_online            int8          
 14  home_state           category      
 15  is_out_of_state      int8          
dtypes: category(3), datetime64[ns](1), float32(1), int64(3), int8(8)
memory usage: 342.8 MB


### use_chip 

In [20]:
trans_common["use_chip"].unique()

array(['Chip Transaction', 'Swipe Transaction', 'Online Transaction'],
      dtype=object)

In [30]:
trans_common["use_chip"] = trans_common["use_chip"].astype("category")
trans_common.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7647524 entries, 0 to 7647523
Data columns (total 16 columns):
 #   Column               Dtype         
---  ------               -----         
 0   date                 datetime64[ns]
 1   client_id            int64         
 2   card_id              int64         
 3   amount               float32       
 4   use_chip             category      
 5   merchant_id          int64         
 6   merchant_city        object        
 7   merchant_state       object        
 8   zip                  float64       
 9   mcc                  category      
 10  fraud                int8          
 11  has_error            int8          
 12  err_card_credential  int8          
 13  err_authentication   int8          
 14  err_financial        int8          
 15  err_system           int8          
dtypes: category(2), datetime64[ns](1), float32(1), float64(1), int64(3), int8(6), object(2)
memory usage: 495.9+ MB


### zip (우편번호)

In [31]:
trans_common = trans_common.drop(columns=["zip"])

In [32]:
trans_common.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7647524 entries, 0 to 7647523
Data columns (total 15 columns):
 #   Column               Dtype         
---  ------               -----         
 0   date                 datetime64[ns]
 1   client_id            int64         
 2   card_id              int64         
 3   amount               float32       
 4   use_chip             category      
 5   merchant_id          int64         
 6   merchant_city        object        
 7   merchant_state       object        
 8   mcc                  category      
 9   fraud                int8          
 10  has_error            int8          
 11  err_card_credential  int8          
 12  err_authentication   int8          
 13  err_financial        int8          
 14  err_system           int8          
dtypes: category(2), datetime64[ns](1), float32(1), int64(3), int8(6), object(2)
memory usage: 437.6+ MB


### merchant_city & state

In [33]:
trans_common["merchant_city"].nunique()

11890

In [34]:
trans_common["merchant_state"].nunique()

191

In [35]:
def build_home_state(trans_common,
                     client_col="client_id",
                     state_col="merchant_state"):
    tmp = trans_common.copy()
    tmp[state_col] = tmp[state_col].astype(str).str.strip()

    tmp = tmp[tmp[state_col].str.upper() != "ONLINE"]

    home_state = (
        tmp.groupby([client_col, state_col])
           .size()
           .reset_index(name="cnt")
           .sort_values([client_col, "cnt"], ascending=[True, False])
           .drop_duplicates(client_col)
           .rename(columns={state_col: "home_state"})
           [[client_col, "home_state"]]
    )
    return home_state


def add_location_features(df,
                          home_state_df,
                          client_col="client_id",
                          state_col="merchant_state",
                          drop_raw_cols=True):
    out = df.copy()
    out[state_col] = out[state_col].astype(str).str.strip()

    out["is_online"] = (out[state_col].str.upper() == "ONLINE").astype(np.int8)

    out = out.merge(home_state_df, on=client_col, how="left")

    out["is_out_of_state"] = (
        (out["is_online"] == 0) &
        (out[state_col].str.upper() != out["home_state"].astype(str).str.upper())
    ).astype(np.int8)

    out["home_state"] = out["home_state"].astype("category")

    if drop_raw_cols:
        out = out.drop(columns=[state_col])

    return out

home_state_df = build_home_state(trans_common)

trans_common = add_location_features(
    trans_common,
    home_state_df
)

In [36]:
trans_common = trans_common.drop(columns="merchant_city")

In [37]:
trans_common.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7647524 entries, 0 to 7647523
Data columns (total 16 columns):
 #   Column               Dtype         
---  ------               -----         
 0   date                 datetime64[ns]
 1   client_id            int64         
 2   card_id              int64         
 3   amount               float32       
 4   use_chip             category      
 5   merchant_id          int64         
 6   mcc                  category      
 7   fraud                int8          
 8   has_error            int8          
 9   err_card_credential  int8          
 10  err_authentication   int8          
 11  err_financial        int8          
 12  err_system           int8          
 13  is_online            int8          
 14  home_state           category      
 15  is_out_of_state      int8          
dtypes: category(3), datetime64[ns](1), float32(1), int64(3), int8(8)
memory usage: 342.8 MB


In [38]:
trans_common

Unnamed: 0,date,client_id,card_id,amount,use_chip,merchant_id,mcc,fraud,has_error,err_card_credential,err_authentication,err_financial,err_system,is_online,home_state,is_out_of_state
0,2016-11-08 19:59:00,0,1271,7.280000,Chip Transaction,13153,5812.0,0,0,0,0,0,0,0,ME,0
1,2015-04-06 20:35:00,0,4639,9.030000,Chip Transaction,44919,5814.0,0,0,0,0,0,0,0,ME,1
2,2017-11-09 13:02:00,0,1271,42.279999,Chip Transaction,60569,5300.0,0,0,0,0,0,0,0,ME,0
3,2014-08-05 19:39:00,0,1271,9.730000,Swipe Transaction,887,5812.0,0,0,0,0,0,0,0,ME,1
4,2013-08-09 19:50:00,0,1271,8.890000,Swipe Transaction,98648,5814.0,0,0,0,0,0,0,0,ME,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7647519,2011-02-11 08:18:00,1998,2160,13.280000,Swipe Transaction,59935,5499.0,0,0,0,0,0,0,0,CA,0
7647520,2011-04-17 07:03:00,1998,2160,6.610000,Online Transaction,9932,5311.0,0,0,0,0,0,0,0,CA,1
7647521,2010-09-01 13:34:00,1998,2160,3.520000,Swipe Transaction,59935,5499.0,0,0,0,0,0,0,0,CA,0
7647522,2013-10-30 08:41:00,1998,2160,5.050000,Online Transaction,47399,5815.0,0,0,0,0,0,0,0,CA,1


## - Null

In [39]:
trans_common.isnull().sum()

date                   0
client_id              0
card_id                0
amount                 0
use_chip               0
merchant_id            0
mcc                    0
fraud                  0
has_error              0
err_card_credential    0
err_authentication     0
err_financial          0
err_system             0
is_online              0
home_state             0
is_out_of_state        0
dtype: int64

# [Setting cards_common]

In [40]:
card_common.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3992 entries, 2911 to 4411
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     3992 non-null   int64 
 1   client_id              3992 non-null   int64 
 2   card_brand             3992 non-null   object
 3   card_type              3992 non-null   object
 4   card_number            3992 non-null   int64 
 5   expires                3992 non-null   object
 6   cvv                    3992 non-null   int64 
 7   has_chip               3992 non-null   object
 8   num_cards_issued       3992 non-null   int64 
 9   credit_limit           3992 non-null   object
 10  acct_open_date         3992 non-null   object
 11  year_pin_last_changed  3992 non-null   int64 
 12  card_on_dark_web       3992 non-null   object
dtypes: int64(6), object(7)
memory usage: 436.6+ KB


### name setting

In [41]:
card_common.head()

Unnamed: 0,id,client_id,card_brand,card_type,card_number,expires,cvv,has_chip,num_cards_issued,credit_limit,acct_open_date,year_pin_last_changed,card_on_dark_web
2911,1271,0,Mastercard,Debit,5050211780967429,04/2021,316,YES,2,$31490,02/2011,2011,No
2910,4639,0,Mastercard,Credit,5802759460691737,12/2019,312,YES,1,$17600,09/2007,2014,No
3986,4652,1,Visa,Credit,4419924074647230,12/2021,1,NO,1,$12800,09/2007,2011,No
3985,3682,1,Visa,Credit,4417513283605637,04/2014,84,YES,1,$10900,07/2002,2013,No
3984,4400,1,Visa,Debit,4843491272960882,01/2017,625,YES,1,$18105,08/2012,2012,No


In [42]:
card_common = (
    card_common
    .rename(columns={"id": "card_id"})
)
card_common

Unnamed: 0,card_id,client_id,card_brand,card_type,card_number,expires,cvv,has_chip,num_cards_issued,credit_limit,acct_open_date,year_pin_last_changed,card_on_dark_web
2911,1271,0,Mastercard,Debit,5050211780967429,04/2021,316,YES,2,$31490,02/2011,2011,No
2910,4639,0,Mastercard,Credit,5802759460691737,12/2019,312,YES,1,$17600,09/2007,2014,No
3986,4652,1,Visa,Credit,4419924074647230,12/2021,1,NO,1,$12800,09/2007,2011,No
3985,3682,1,Visa,Credit,4417513283605637,04/2014,84,YES,1,$10900,07/2002,2013,No
3984,4400,1,Visa,Debit,4843491272960882,01/2017,625,YES,1,$18105,08/2012,2012,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,5106,1997,Mastercard,Debit,5138861544730253,09/2014,695,YES,2,$3991,10/2008,2013,No
1091,4807,1997,Mastercard,Debit,5974521557482725,08/2020,89,YES,1,$25048,09/2011,2011,No
4410,2160,1998,Visa,Debit (Prepaid),4969817527377037,11/2022,756,YES,1,$110,03/2007,2007,No
4412,1258,1998,Visa,Credit,4573665962578726,11/2020,452,YES,2,$100,02/2010,2011,No


In [43]:
card_common.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3992 entries, 2911 to 4411
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   card_id                3992 non-null   int64 
 1   client_id              3992 non-null   int64 
 2   card_brand             3992 non-null   object
 3   card_type              3992 non-null   object
 4   card_number            3992 non-null   int64 
 5   expires                3992 non-null   object
 6   cvv                    3992 non-null   int64 
 7   has_chip               3992 non-null   object
 8   num_cards_issued       3992 non-null   int64 
 9   credit_limit           3992 non-null   object
 10  acct_open_date         3992 non-null   object
 11  year_pin_last_changed  3992 non-null   int64 
 12  card_on_dark_web       3992 non-null   object
dtypes: int64(6), object(7)
memory usage: 436.6+ KB


### card_brand

In [44]:
card_common["card_brand"].unique()

array(['Mastercard', 'Visa', 'Amex', 'Discover'], dtype=object)

In [45]:
card_common["card_brand"] = card_common["card_brand"].astype("category")
card_common.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3992 entries, 2911 to 4411
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   card_id                3992 non-null   int64   
 1   client_id              3992 non-null   int64   
 2   card_brand             3992 non-null   category
 3   card_type              3992 non-null   object  
 4   card_number            3992 non-null   int64   
 5   expires                3992 non-null   object  
 6   cvv                    3992 non-null   int64   
 7   has_chip               3992 non-null   object  
 8   num_cards_issued       3992 non-null   int64   
 9   credit_limit           3992 non-null   object  
 10  acct_open_date         3992 non-null   object  
 11  year_pin_last_changed  3992 non-null   int64   
 12  card_on_dark_web       3992 non-null   object  
dtypes: category(1), int64(6), object(6)
memory usage: 409.5+ KB


### card_type

In [46]:
card_common["card_type"].unique()

array(['Debit', 'Credit', 'Debit (Prepaid)'], dtype=object)

In [47]:
card_common["card_type"] = card_common["card_type"].astype("category")
card_common.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3992 entries, 2911 to 4411
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   card_id                3992 non-null   int64   
 1   client_id              3992 non-null   int64   
 2   card_brand             3992 non-null   category
 3   card_type              3992 non-null   category
 4   card_number            3992 non-null   int64   
 5   expires                3992 non-null   object  
 6   cvv                    3992 non-null   int64   
 7   has_chip               3992 non-null   object  
 8   num_cards_issued       3992 non-null   int64   
 9   credit_limit           3992 non-null   object  
 10  acct_open_date         3992 non-null   object  
 11  year_pin_last_changed  3992 non-null   int64   
 12  card_on_dark_web       3992 non-null   object  
dtypes: category(2), int64(6), object(5)
memory usage: 382.4+ KB


### card_number, cvv

In [48]:
card_common = card_common.drop(columns=["card_number", "cvv"])
card_common.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3992 entries, 2911 to 4411
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   card_id                3992 non-null   int64   
 1   client_id              3992 non-null   int64   
 2   card_brand             3992 non-null   category
 3   card_type              3992 non-null   category
 4   expires                3992 non-null   object  
 5   has_chip               3992 non-null   object  
 6   num_cards_issued       3992 non-null   int64   
 7   credit_limit           3992 non-null   object  
 8   acct_open_date         3992 non-null   object  
 9   year_pin_last_changed  3992 non-null   int64   
 10  card_on_dark_web       3992 non-null   object  
dtypes: category(2), int64(4), object(5)
memory usage: 320.0+ KB


In [49]:
card_common.head()

Unnamed: 0,card_id,client_id,card_brand,card_type,expires,has_chip,num_cards_issued,credit_limit,acct_open_date,year_pin_last_changed,card_on_dark_web
2911,1271,0,Mastercard,Debit,04/2021,YES,2,$31490,02/2011,2011,No
2910,4639,0,Mastercard,Credit,12/2019,YES,1,$17600,09/2007,2014,No
3986,4652,1,Visa,Credit,12/2021,NO,1,$12800,09/2007,2011,No
3985,3682,1,Visa,Credit,04/2014,YES,1,$10900,07/2002,2013,No
3984,4400,1,Visa,Debit,01/2017,YES,1,$18105,08/2012,2012,No


### credit_limit

In [50]:
card_common["credit_limit"] = (card_common["credit_limit"].str.replace("$", "", regex=False).astype("float32"))
card_common.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3992 entries, 2911 to 4411
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   card_id                3992 non-null   int64   
 1   client_id              3992 non-null   int64   
 2   card_brand             3992 non-null   category
 3   card_type              3992 non-null   category
 4   expires                3992 non-null   object  
 5   has_chip               3992 non-null   object  
 6   num_cards_issued       3992 non-null   int64   
 7   credit_limit           3992 non-null   float32 
 8   acct_open_date         3992 non-null   object  
 9   year_pin_last_changed  3992 non-null   int64   
 10  card_on_dark_web       3992 non-null   object  
dtypes: category(2), float32(1), int64(4), object(4)
memory usage: 304.4+ KB


### has chip

In [51]:
card_common["has_chip"].unique()

array(['YES', 'NO'], dtype=object)

In [52]:
card_common["has_chip"] = (card_common["has_chip"] == "YES").astype(np.int8)

### card_on_dark_web

In [53]:
card_common["card_on_dark_web"].unique()

array(['No'], dtype=object)

In [54]:
card_common = card_common.drop(columns=["card_on_dark_web"])

### acct_open_date

In [55]:
card_common["acct_open_date"] = pd.to_datetime(
    card_common["acct_open_date"],
    format="%m/%Y",
    errors="coerce"
)

In [56]:
card_common

Unnamed: 0,card_id,client_id,card_brand,card_type,expires,has_chip,num_cards_issued,credit_limit,acct_open_date,year_pin_last_changed
2911,1271,0,Mastercard,Debit,04/2021,1,2,31490.0,2011-02-01,2011
2910,4639,0,Mastercard,Credit,12/2019,1,1,17600.0,2007-09-01,2014
3986,4652,1,Visa,Credit,12/2021,0,1,12800.0,2007-09-01,2011
3985,3682,1,Visa,Credit,04/2014,1,1,10900.0,2002-07-01,2013
3984,4400,1,Visa,Debit,01/2017,1,1,18105.0,2012-08-01,2012
...,...,...,...,...,...,...,...,...,...,...
1095,5106,1997,Mastercard,Debit,09/2014,1,2,3991.0,2008-10-01,2013
1091,4807,1997,Mastercard,Debit,08/2020,1,1,25048.0,2011-09-01,2011
4410,2160,1998,Visa,Debit (Prepaid),11/2022,1,1,110.0,2007-03-01,2007
4412,1258,1998,Visa,Credit,11/2020,1,2,100.0,2010-02-01,2011


### expires

In [57]:
card_common["expires"] = pd.to_datetime(
    card_common["expires"],
    format="%m/%Y",
    errors="coerce"
)

In [58]:
card_common.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3992 entries, 2911 to 4411
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   card_id                3992 non-null   int64         
 1   client_id              3992 non-null   int64         
 2   card_brand             3992 non-null   category      
 3   card_type              3992 non-null   category      
 4   expires                3992 non-null   datetime64[ns]
 5   has_chip               3992 non-null   int8          
 6   num_cards_issued       3992 non-null   int64         
 7   credit_limit           3992 non-null   float32       
 8   acct_open_date         3992 non-null   datetime64[ns]
 9   year_pin_last_changed  3992 non-null   int64         
dtypes: category(2), datetime64[ns](2), float32(1), int64(4), int8(1)
memory usage: 245.9 KB


In [59]:
card_common

Unnamed: 0,card_id,client_id,card_brand,card_type,expires,has_chip,num_cards_issued,credit_limit,acct_open_date,year_pin_last_changed
2911,1271,0,Mastercard,Debit,2021-04-01,1,2,31490.0,2011-02-01,2011
2910,4639,0,Mastercard,Credit,2019-12-01,1,1,17600.0,2007-09-01,2014
3986,4652,1,Visa,Credit,2021-12-01,0,1,12800.0,2007-09-01,2011
3985,3682,1,Visa,Credit,2014-04-01,1,1,10900.0,2002-07-01,2013
3984,4400,1,Visa,Debit,2017-01-01,1,1,18105.0,2012-08-01,2012
...,...,...,...,...,...,...,...,...,...,...
1095,5106,1997,Mastercard,Debit,2014-09-01,1,2,3991.0,2008-10-01,2013
1091,4807,1997,Mastercard,Debit,2020-08-01,1,1,25048.0,2011-09-01,2011
4410,2160,1998,Visa,Debit (Prepaid),2022-11-01,1,1,110.0,2007-03-01,2007
4412,1258,1998,Visa,Credit,2020-11-01,1,2,100.0,2010-02-01,2011


In [60]:
card_common.isnull().sum()

card_id                  0
client_id                0
card_brand               0
card_type                0
expires                  0
has_chip                 0
num_cards_issued         0
credit_limit             0
acct_open_date           0
year_pin_last_changed    0
dtype: int64

# [Setting user common]

In [61]:
user_common.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1219 entries, 958 to 1426
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 1219 non-null   int64  
 1   current_age        1219 non-null   int64  
 2   retirement_age     1219 non-null   int64  
 3   birth_year         1219 non-null   int64  
 4   birth_month        1219 non-null   int64  
 5   gender             1219 non-null   object 
 6   address            1219 non-null   object 
 7   latitude           1219 non-null   float64
 8   longitude          1219 non-null   float64
 9   per_capita_income  1219 non-null   object 
 10  yearly_income      1219 non-null   object 
 11  total_debt         1219 non-null   object 
 12  credit_score       1219 non-null   int64  
 13  num_credit_cards   1219 non-null   int64  
dtypes: float64(2), int64(7), object(5)
memory usage: 142.9+ KB


In [62]:
user_common = (
    user_common
    .rename(columns={"id": "client_id"})
)

In [63]:
user_common

Unnamed: 0,client_id,current_age,retirement_age,birth_year,birth_month,gender,address,latitude,longitude,per_capita_income,yearly_income,total_debt,credit_score,num_credit_cards
958,0,33,69,1986,3,Male,858 Plum Avenue,43.59,-70.33,$29237,$59613,$36199,763,4
1298,1,43,74,1976,4,Female,113 Burns Lane,30.44,-87.18,$22247,$45360,$14587,704,3
1876,2,48,64,1971,8,Male,6035 Forest Avenue,40.84,-73.87,$13461,$27447,$80850,673,5
758,3,49,65,1970,12,Male,840 Elm Avenue,33.89,-98.51,$13705,$27943,$18693,681,4
424,4,54,72,1965,3,Female,6016 Little Creek Boulevard,47.61,-122.30,$37485,$76431,$115362,716,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1174,1994,48,66,1971,9,Female,9716 Third Lane,39.38,-119.87,$38345,$78180,$102759,740,3
996,1995,64,62,1955,7,Female,28 First Lane,39.92,-77.71,$20943,$39206,$9219,605,4
1361,1996,46,66,1973,6,Female,7853 Grant Street,29.43,-95.24,$21956,$44768,$59862,728,3
365,1997,76,69,1943,8,Male,765 Forest Street,44.92,-93.40,$39155,$38800,$25867,758,7


### gender

In [64]:
user_common["gender"] = user_common["gender"].astype("category")

### address

In [65]:
user_common = user_common.drop(columns = "address")

In [66]:
user_common.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1219 entries, 958 to 1426
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   client_id          1219 non-null   int64   
 1   current_age        1219 non-null   int64   
 2   retirement_age     1219 non-null   int64   
 3   birth_year         1219 non-null   int64   
 4   birth_month        1219 non-null   int64   
 5   gender             1219 non-null   category
 6   latitude           1219 non-null   float64 
 7   longitude          1219 non-null   float64 
 8   per_capita_income  1219 non-null   object  
 9   yearly_income      1219 non-null   object  
 10  total_debt         1219 non-null   object  
 11  credit_score       1219 non-null   int64   
 12  num_credit_cards   1219 non-null   int64   
dtypes: category(1), float64(2), int64(7), object(3)
memory usage: 125.1+ KB


### per_capita_income

In [67]:
user_common["per_capita_income"] = (user_common["per_capita_income"].str.replace("$", "", regex=False).astype("float32"))
user_common.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1219 entries, 958 to 1426
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   client_id          1219 non-null   int64   
 1   current_age        1219 non-null   int64   
 2   retirement_age     1219 non-null   int64   
 3   birth_year         1219 non-null   int64   
 4   birth_month        1219 non-null   int64   
 5   gender             1219 non-null   category
 6   latitude           1219 non-null   float64 
 7   longitude          1219 non-null   float64 
 8   per_capita_income  1219 non-null   float32 
 9   yearly_income      1219 non-null   object  
 10  total_debt         1219 non-null   object  
 11  credit_score       1219 non-null   int64   
 12  num_credit_cards   1219 non-null   int64   
dtypes: category(1), float32(1), float64(2), int64(7), object(2)
memory usage: 120.4+ KB


### yearly_income

In [68]:
user_common["yearly_income"] = (user_common["yearly_income"].str.replace("$", "", regex=False).astype("float32"))
user_common.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1219 entries, 958 to 1426
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   client_id          1219 non-null   int64   
 1   current_age        1219 non-null   int64   
 2   retirement_age     1219 non-null   int64   
 3   birth_year         1219 non-null   int64   
 4   birth_month        1219 non-null   int64   
 5   gender             1219 non-null   category
 6   latitude           1219 non-null   float64 
 7   longitude          1219 non-null   float64 
 8   per_capita_income  1219 non-null   float32 
 9   yearly_income      1219 non-null   float32 
 10  total_debt         1219 non-null   object  
 11  credit_score       1219 non-null   int64   
 12  num_credit_cards   1219 non-null   int64   
dtypes: category(1), float32(2), float64(2), int64(7), object(1)
memory usage: 115.6+ KB


### total_dept

In [69]:
user_common["total_debt"] = (user_common["total_debt"].str.replace("$", "", regex=False).astype("float32"))
user_common.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1219 entries, 958 to 1426
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   client_id          1219 non-null   int64   
 1   current_age        1219 non-null   int64   
 2   retirement_age     1219 non-null   int64   
 3   birth_year         1219 non-null   int64   
 4   birth_month        1219 non-null   int64   
 5   gender             1219 non-null   category
 6   latitude           1219 non-null   float64 
 7   longitude          1219 non-null   float64 
 8   per_capita_income  1219 non-null   float32 
 9   yearly_income      1219 non-null   float32 
 10  total_debt         1219 non-null   float32 
 11  credit_score       1219 non-null   int64   
 12  num_credit_cards   1219 non-null   int64   
dtypes: category(1), float32(3), float64(2), int64(7)
memory usage: 110.8 KB


In [70]:
user_common.isnull().sum()

client_id            0
current_age          0
retirement_age       0
birth_year           0
birth_month          0
gender               0
latitude             0
longitude            0
per_capita_income    0
yearly_income        0
total_debt           0
credit_score         0
num_credit_cards     0
dtype: int64

---

In [72]:
trans_common.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7647524 entries, 0 to 7647523
Data columns (total 16 columns):
 #   Column               Dtype         
---  ------               -----         
 0   date                 datetime64[ns]
 1   client_id            int64         
 2   card_id              int64         
 3   amount               float32       
 4   use_chip             category      
 5   merchant_id          int64         
 6   mcc                  category      
 7   fraud                int8          
 8   has_error            int8          
 9   err_card_credential  int8          
 10  err_authentication   int8          
 11  err_financial        int8          
 12  err_system           int8          
 13  is_online            int8          
 14  home_state           category      
 15  is_out_of_state      int8          
dtypes: category(3), datetime64[ns](1), float32(1), int64(3), int8(8)
memory usage: 342.8 MB


In [73]:
user_common.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1219 entries, 958 to 1426
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   client_id          1219 non-null   int64   
 1   current_age        1219 non-null   int64   
 2   retirement_age     1219 non-null   int64   
 3   birth_year         1219 non-null   int64   
 4   birth_month        1219 non-null   int64   
 5   gender             1219 non-null   category
 6   latitude           1219 non-null   float64 
 7   longitude          1219 non-null   float64 
 8   per_capita_income  1219 non-null   float32 
 9   yearly_income      1219 non-null   float32 
 10  total_debt         1219 non-null   float32 
 11  credit_score       1219 non-null   int64   
 12  num_credit_cards   1219 non-null   int64   
dtypes: category(1), float32(3), float64(2), int64(7)
memory usage: 110.8 KB


In [74]:
card_common.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3992 entries, 2911 to 4411
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   card_id                3992 non-null   int64         
 1   client_id              3992 non-null   int64         
 2   card_brand             3992 non-null   category      
 3   card_type              3992 non-null   category      
 4   expires                3992 non-null   datetime64[ns]
 5   has_chip               3992 non-null   int8          
 6   num_cards_issued       3992 non-null   int64         
 7   credit_limit           3992 non-null   float32       
 8   acct_open_date         3992 non-null   datetime64[ns]
 9   year_pin_last_changed  3992 non-null   int64         
dtypes: category(2), datetime64[ns](2), float32(1), int64(4), int8(1)
memory usage: 245.9 KB


In [81]:
trans_common["client_id"] = trans_common["client_id"].astype("int16")
trans_common["card_id"]   = trans_common["card_id"].astype("int16")
trans_common.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7647524 entries, 0 to 7647523
Data columns (total 16 columns):
 #   Column               Dtype         
---  ------               -----         
 0   date                 datetime64[ns]
 1   client_id            int16         
 2   card_id              int16         
 3   amount               float32       
 4   use_chip             category      
 5   merchant_id          int64         
 6   mcc                  category      
 7   fraud                int8          
 8   has_error            int8          
 9   err_card_credential  int8          
 10  err_authentication   int8          
 11  err_financial        int8          
 12  err_system           int8          
 13  is_online            int8          
 14  home_state           category      
 15  is_out_of_state      int8          
dtypes: category(3), datetime64[ns](1), float32(1), int16(2), int64(1), int8(8)
memory usage: 255.3 MB


In [83]:
user_common["client_id"] = user_common["client_id"].astype("int16")
user_common.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1219 entries, 958 to 1426
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   client_id          1219 non-null   int16   
 1   current_age        1219 non-null   int64   
 2   retirement_age     1219 non-null   int64   
 3   birth_year         1219 non-null   int64   
 4   birth_month        1219 non-null   int64   
 5   gender             1219 non-null   category
 6   latitude           1219 non-null   float64 
 7   longitude          1219 non-null   float64 
 8   per_capita_income  1219 non-null   float32 
 9   yearly_income      1219 non-null   float32 
 10  total_debt         1219 non-null   float32 
 11  credit_score       1219 non-null   int64   
 12  num_credit_cards   1219 non-null   int64   
dtypes: category(1), float32(3), float64(2), int16(1), int64(6)
memory usage: 103.7 KB


In [82]:
card_common["client_id"] = card_common["client_id"].astype("int16")
card_common["card_id"]   = card_common["card_id"].astype("int16")
card_common.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3992 entries, 2911 to 4411
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   card_id                3992 non-null   int16         
 1   client_id              3992 non-null   int16         
 2   card_brand             3992 non-null   category      
 3   card_type              3992 non-null   category      
 4   expires                3992 non-null   datetime64[ns]
 5   has_chip               3992 non-null   int8          
 6   num_cards_issued       3992 non-null   int64         
 7   credit_limit           3992 non-null   float32       
 8   acct_open_date         3992 non-null   datetime64[ns]
 9   year_pin_last_changed  3992 non-null   int64         
dtypes: category(2), datetime64[ns](2), float32(1), int16(2), int64(2), int8(1)
memory usage: 199.1 KB


---

In [88]:
trans_common.to_csv("trans_common.csv")

In [89]:
user_common.to_csv("user_common.csv")

In [90]:
card_common.to_csv("card_common.csv")

---

# Dataset _ trans

In [91]:
trans_common["fraud"].value_counts()

fraud
0    7636461
1      11063
Name: count, dtype: int64

## 불균형 대응

1) 데이터 그대로 
    - Accuracy는 높게 나옴 
    - Recall 붕괴
    (X)
2) SMOTE / 합성 오버샘플링
    - fraud는 행동 + 시점 + 맥락 의존 
    - 데이터 왜곡 위험 있음
    (X)
3) Class Weight만 사용
    - 모델 내부 최적화에는 도움
    - 데이터 분포 자체는 그대로 
    - EDA, feature validation 등 검증 부적합
    (X)

SOL)

>1:N Sampling

    -> 단일 비율이 아니라 시퀀스를 만듦
    - 불균형 비율은 모델 성능의 민감 변수
    - 특정 비율에서만 성능이 좋은 모델은 현업에서 불안정
    - 따라서
        - fraud 대비 정상 비율이 바뀔 때
        - feature의 방향성과 영향력이 유지되는지 확인 

In [97]:
def make_ratio_disjoint_sets_and_save_csv(
    df,
    target_col="fraud",
    ratios=(1, 3, 5, 7),
    n_sets=5,
    shuffle_seed=42,
    save_dir="./ratio_datasets_disjoint"
):
    os.makedirs(save_dir, exist_ok=True)

    fraud_df = df[df[target_col] == 1]
    normal_df = df[df[target_col] == 0]

    n_fraud = len(fraud_df)
    n_normal_total = len(normal_df)

    normal_idx = normal_df.index.to_numpy()
    rng = np.random.default_rng(shuffle_seed)
    rng.shuffle(normal_idx)

    paths = {}

    for r in ratios:
        need_per_set = n_fraud * r
        total_need = need_per_set * n_sets

        if total_need > n_normal_total:
            raise ValueError(
                f"ratio 1:{r} with n_sets={n_sets} needs {total_need} normals "
                f"but only {n_normal_total} available. Reduce n_sets or ratio."
            )

        for s in range(n_sets):
            start = s * need_per_set
            end = (s + 1) * need_per_set
            take_idx = normal_idx[start:end]

            normal_part = normal_df.loc[take_idx]
            out = (
                pd.concat([fraud_df, normal_part], axis=0)
                  .sample(frac=1, random_state=shuffle_seed + s)
                  .reset_index(drop=True)
            )

            name = f"fraud_1_to_{r}_set_{s+1}"
            path = os.path.join(save_dir, f"{name}.csv")
            out.to_csv(path, index=False)
            paths[name] = path

    return paths

In [98]:
csv_paths = make_ratio_disjoint_sets_and_save_csv(
    trans_common,
    ratios=(1, 3, 5, 7),
    n_sets=5,
    shuffle_seed=42,
    save_dir="./ratio_datasets_disjoint"
)


fraud = 1 데이터 => 모든 비율(1:1, 1:3, 1:5, 1:7)에 동일하게 전부 사용

fraud = 0 데이터 => 각 비율 안에서는 안 겹침

선택되지 않은 정상 데이터 => 그 비율 데이터셋에서는 버려짐

| 단계             | 사용하는 데이터          |
| -------------- | ----------------- |
| 현실 이해 / 리스크 파악 | **원본 분포 데이터**     |
| 모델 학습 안정화      | **불균형 보정 데이터**    |
| 성능 평가          | **원본 분포 검증셋**     |


---

In [99]:
import os
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [100]:
 FEATURES = [
    "amount",
    "is_online",
    "is_out_of_state",
    "has_error",
    "err_card_credential",
    "err_authentication",
    "err_financial",
    "err_system",
]

TARGET = "fraud"


### 원본 데이터셋에 대해 평가

In [101]:
X_eval = trans_common[FEATURES]
y_eval = trans_common[TARGET]


In [102]:
def recall_at_top_k(y_true, y_score, k=0.01):
    n = int(len(y_score) * k)
    idx = np.argsort(y_score)[::-1][:n]
    return y_true.iloc[idx].sum() / y_true.sum()


In [None]:
results = []

base_dir = "./ratio_datasets_disjoint"

for r in (1, 3, 5, 7):
    for s in range(1, 6):
        path = f"{base_dir}/fraud_1_to_{r}_set_{s}.csv"
        if not os.path.exists(path):
            continue

        df_train = pd.read_csv(path)

        X_train = df_train[FEATURES]
        y_train = df_train[TARGET]

        model = Pipeline([
            ("scaler", StandardScaler()),
            ("clf", LogisticRegression(
                max_iter=500,
                n_jobs=-1,
                solver="lbfgs"
            ))
        ])

        model.fit(X_train, y_train)

        y_score = model.predict_proba(X_eval)[:, 1]

        res = {
            "ratio": f"1:{r}",
            "set": s,
            "roc_auc": roc_auc_score(y_eval, y_score),
            "pr_auc": average_precision_score(y_eval, y_score),
            "recall_top_1pct": recall_at_top_k(y_eval, y_score, k=0.01),
            "recall_top_5pct": recall_at_top_k(y_eval, y_score, k=0.05),
        }

        results.append(res)

results_df = pd.DataFrame(results)
results_df


Unnamed: 0,ratio,set,roc_auc,pr_auc,recall_top_1pct,recall_top_5pct
0,1:1,1,0.890091,0.013101,0.124017,0.472295
1,1:1,2,0.89022,0.012982,0.125373,0.47121
2,1:1,3,0.890738,0.013548,0.127181,0.476091
3,1:1,4,0.890658,0.013799,0.126638,0.476182
4,1:1,5,0.890304,0.013496,0.128356,0.475097
5,1:3,1,0.890482,0.013208,0.124379,0.474103
6,1:3,2,0.890982,0.013755,0.129621,0.476272
7,1:3,3,0.890384,0.013274,0.124831,0.474012
8,1:3,4,0.890534,0.013349,0.125282,0.475459
9,1:3,5,0.890318,0.013178,0.124921,0.47347


In [104]:
summary = (
    results_df
    .groupby("ratio")
    .agg(
        roc_auc_mean=("roc_auc", "mean"),
        roc_auc_std=("roc_auc", "std"),
        pr_auc_mean=("pr_auc", "mean"),
        pr_auc_std=("pr_auc", "std"),
        recall1_mean=("recall_top_1pct", "mean"),
        recall1_std=("recall_top_1pct", "std"),
    )
    .reset_index()
)

summary


Unnamed: 0,ratio,roc_auc_mean,roc_auc_std,pr_auc_mean,pr_auc_std,recall1_mean,recall1_std
0,1:1,0.890402,0.000282,0.013385,0.000337,0.126313,0.001672
1,1:3,0.89054,0.000261,0.013353,0.000234,0.125807,0.002157
2,1:5,0.890479,0.000216,0.01326,0.000187,0.124993,0.001347
3,1:7,0.890605,0.000263,0.013354,0.000201,0.125734,0.002142


### 불균형 대응 데이터셋에서의 평가

In [106]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, average_precision_score

FEATURES = [
    "amount",
    "is_online",
    "is_out_of_state",
    "has_error",
    "err_card_credential",
    "err_authentication",
    "err_financial",
    "err_system",
]
TARGET = "fraud"

def recall_at_top_k(y_true, y_score, k=0.01):
    n = max(1, int(len(y_score) * k))
    idx = np.argsort(y_score)[::-1][:n]
    return y_true.iloc[idx].sum() / max(1, y_true.sum())

def eval_within_dataset(df, test_size=0.2, seed=42):
    X = df[FEATURES]
    y = df[TARGET]

    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=test_size, random_state=seed, stratify=y
    )

    model = Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=500, n_jobs=-1, solver="lbfgs"))
    ])

    model.fit(X_tr, y_tr)
    y_score = model.predict_proba(X_te)[:, 1]

    return {
        "roc_auc": roc_auc_score(y_te, y_score),
        "pr_auc": average_precision_score(y_te, y_score),
        "recall_top_1pct": recall_at_top_k(y_te, y_score, k=0.01),
        "recall_top_5pct": recall_at_top_k(y_te, y_score, k=0.05),
        "n_test": len(y_te),
        "fraud_rate_test": float(y_te.mean()),
    }

def run_within_eval_for_ratio_sets(
    base_dir="./ratio_datasets_disjoint",
    ratios=(1, 3, 5, 7),
    n_sets=5,
    test_size=0.2,
    seed=42
):
    rows = []
    for r in ratios:
        for s in range(1, n_sets + 1):
            path = os.path.join(base_dir, f"fraud_1_to_{r}_set_{s}.csv")
            if not os.path.exists(path):
                continue

            df = pd.read_csv(path)
            metrics = eval_within_dataset(df, test_size=test_size, seed=seed+s)

            rows.append({
                "ratio": f"1:{r}",
                "set": s,
                **metrics
            })

    res = pd.DataFrame(rows)

    summary = (
        res.groupby("ratio")
           .agg(
               roc_auc_mean=("roc_auc", "mean"),
               roc_auc_std=("roc_auc", "std"),
               pr_auc_mean=("pr_auc", "mean"),
               pr_auc_std=("pr_auc", "std"),
               recall1_mean=("recall_top_1pct", "mean"),
               recall1_std=("recall_top_1pct", "std"),
               fraud_rate_test_mean=("fraud_rate_test", "mean"),
           )
           .reset_index()
    )

    return res, summary


In [107]:
res_within, summary_within = run_within_eval_for_ratio_sets(
    base_dir="./ratio_datasets_disjoint",
    ratios=(1, 3, 5, 7),
    n_sets=5,
    test_size=0.2,
    seed=42
)

summary_within


Unnamed: 0,ratio,roc_auc_mean,roc_auc_std,pr_auc_mean,pr_auc_std,recall1_mean,recall1_std,fraud_rate_test_mean
0,1:1,0.889267,0.028864,0.870357,0.033451,0.01916,0.000685,0.5
1,1:3,0.890766,0.016689,0.713899,0.033386,0.034885,0.00161,0.250028
2,1:5,0.88823,0.004253,0.607729,0.006751,0.047628,0.001617,0.166692
3,1:7,0.891527,0.014933,0.540406,0.034779,0.057659,0.004149,0.125021
