In [1]:
import pandas as pd
import numpy as np
import ijson

In [2]:
trans = pd.read_csv("data/original/transactions_data.csv")
users = pd.read_csv("data/original/users_data.csv")
cards = pd.read_csv("data/original/cards_data.csv")

In [3]:
trans.columns

Index(['id', 'date', 'client_id', 'card_id', 'amount', 'use_chip',
       'merchant_id', 'merchant_city', 'merchant_state', 'zip', 'mcc',
       'errors'],
      dtype='object')

In [4]:
cards.columns

Index(['id', 'client_id', 'card_brand', 'card_type', 'card_number', 'expires',
       'cvv', 'has_chip', 'num_cards_issued', 'credit_limit', 'acct_open_date',
       'year_pin_last_changed', 'card_on_dark_web'],
      dtype='object')

In [5]:
users.columns

Index(['id', 'current_age', 'retirement_age', 'birth_year', 'birth_month',
       'gender', 'address', 'latitude', 'longitude', 'per_capita_income',
       'yearly_income', 'total_debt', 'credit_score', 'num_credit_cards'],
      dtype='object')

# client id에 대해서 dataset 병합

In [6]:
trans = trans[trans["client_id"].isin(users["id"])]
trans = trans[trans["card_id"].isin(cards["id"])]

In [7]:
users_renamed = users.rename(columns={"id": "client_id"})
trans = trans.merge(
    users_renamed,
    on="client_id",
    how="left",
    validate="m:1"
)

In [8]:
cards_renamed = cards.rename(columns={"id": "card_id"})
trans = trans.merge(
    cards_renamed,
    on="card_id",
    how="left",
    validate="m:1"
)
trans.shape

(11413790, 37)

# add target _ "fraud"

In [9]:
ids = []
labels = []

with open("data/original/train_fraud_labels.json", "rb") as f:
    for k, v in ijson.kvitems(f, "target"):
        kid = int(str(k).strip())
        ids.append(kid)
        labels.append(1 if v == "Yes" else 0)

labels_df = pd.DataFrame({"id": ids, "fraud": labels}).astype({"fraud": "int8"})
labels_df

Unnamed: 0,id,fraud
0,10649266,0
1,23410063,0
2,9316588,0
3,12478022,0
4,9558530,0
...,...,...
8914958,14064699,0
8914959,7676538,0
8914960,15131030,0
8914961,17244732,0


In [10]:
trans = trans.merge(labels_df, on="id", how="inner")

In [11]:
trans[["client_id_x", "client_id_y"]]

Unnamed: 0,client_id_x,client_id_y
0,1556,1556
1,561,561
2,1129,1129
3,848,848
4,1807,1807
...,...,...
7647519,227,227
7647520,371,371
7647521,1225,1225
7647522,1362,1362


In [12]:
trans.drop("client_id_y", axis=1, inplace=True)

In [13]:
trans.drop("id", axis=1, inplace=True)
trans.shape

(7647524, 36)

# type change

In [14]:
trans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7647524 entries, 0 to 7647523
Data columns (total 36 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   date                   object 
 1   client_id_x            int64  
 2   card_id                int64  
 3   amount                 object 
 4   use_chip               object 
 5   merchant_id            int64  
 6   merchant_city          object 
 7   merchant_state         object 
 8   zip                    float64
 9   mcc                    float64
 10  errors                 object 
 11  current_age            int64  
 12  retirement_age         int64  
 13  birth_year             int64  
 14  birth_month            int64  
 15  gender                 object 
 16  address                object 
 17  latitude               float64
 18  longitude              float64
 19  per_capita_income      object 
 20  yearly_income          object 
 21  total_debt             object 
 22  credit_score      

In [15]:
trans["is_online"] = (trans["use_chip"] == "Online Transaction").astype(int)

In [16]:
trans["amount"] = (trans["amount"].str.replace("$", "", regex=False).astype("float32"))

In [17]:
errors_type = trans["errors"].unique()
errors_type

array([nan, 'Bad Expiration', 'Bad Card Number', 'Insufficient Balance',
       'Bad PIN', 'Technical Glitch', 'Bad CVV',
       'Insufficient Balance,Technical Glitch',
       'Bad PIN,Insufficient Balance', 'Bad Zipcode',
       'Bad Expiration,Technical Glitch',
       'Bad Card Number,Bad Expiration', 'Bad PIN,Technical Glitch',
       'Bad Card Number,Insufficient Balance',
       'Bad Expiration,Insufficient Balance', 'Bad Card Number,Bad CVV',
       'Bad CVV,Technical Glitch', 'Bad CVV,Insufficient Balance',
       'Bad Card Number,Technical Glitch',
       'Bad Zipcode,Insufficient Balance',
       'Bad Card Number,Bad Expiration,Insufficient Balance',
       'Bad Expiration,Bad CVV', 'Bad Zipcode,Technical Glitch'],
      dtype=object)

In [18]:
unique_errors = (
    pd.Series(errors_type)
    .dropna()
    .str.split(",")
    .explode()
    .str.strip()
    .unique()
)
print(unique_errors)

['Bad Expiration' 'Bad Card Number' 'Insufficient Balance' 'Bad PIN'
 'Technical Glitch' 'Bad CVV' 'Bad Zipcode']


In [19]:
err = trans["errors"]

# 에러 존재 여부 
trans["has_error"] = err.notna().astype("int8")

trans["err_bad_card_number"] = err.str.contains(
    "Bad Card Number",
    na=False
).astype("int8")

trans["err_bad_expiration"] = err.str.contains(
    "Bad Expiration",
    na=False
).astype("int8")

trans["err_bad_cvv"] = err.str.contains(
    "Bad CVV",
    na=False
).astype("int8")

trans["err_bad_pin"] = err.str.contains(
    "Bad PIN",
    na=False
).astype("int8")

trans["err_bad_zipcode"] = err.str.contains(
    "Bad Zipcode",
    na=False
).astype("int8")

trans["err_insufficient_balance"] = err.str.contains(
    "Insufficient Balance",
    na=False
).astype("int8")

trans["err_technical_glitch"] = err.str.contains(
    "Technical Glitch",
    na=False
).astype("int8")

In [20]:
trans.drop("errors", axis=1, inplace=True)
trans.isnull().sum()

date                             0
client_id_x                      0
card_id                          0
amount                           0
use_chip                         0
merchant_id                      0
merchant_city                    0
merchant_state              893521
zip                         942920
mcc                              0
current_age                      0
retirement_age                   0
birth_year                       0
birth_month                      0
gender                           0
address                          0
latitude                         0
longitude                        0
per_capita_income                0
yearly_income                    0
total_debt                       0
credit_score                     0
num_credit_cards                 0
card_brand                       0
card_type                        0
card_number                      0
expires                          0
cvv                              0
has_chip            

In [21]:
trans.drop("cvv", axis=1, inplace=True)

In [22]:
trans["card_on_dark_web"].unique()

array(['No'], dtype=object)

In [23]:
trans.drop("card_on_dark_web", axis=1, inplace=True)

In [24]:
trans["year_pin_last_changed"].astype("Int16")

0          2008
1          2015
2          2008
3          2014
4          2015
           ... 
7647519    2016
7647520    2013
7647521    2014
7647522    2016
7647523    2013
Name: year_pin_last_changed, Length: 7647524, dtype: Int16

In [25]:
trans["acct_open_date"] = pd.to_datetime(
    trans["acct_open_date"],
    format="%m/%Y",
    errors="coerce"
)

In [26]:
trans["acct_open_year"] = trans["acct_open_date"].dt.year.astype("int16")
trans["acct_open_month"] = trans["acct_open_date"].dt.month.astype("int8")

In [27]:
trans["expires"] = pd.to_datetime(
    trans["expires"],
    format="%m/%Y",
    errors="coerce"
)
trans["expires_year"] = trans["expires"].dt.year.astype("int16")
trans["expires_month"] = trans["expires"].dt.month.astype("int8")

In [28]:
trans["date"] = pd.to_datetime(
    trans["date"],
    format="%Y-%m-%d %H:%M:%S",
    errors="coerce"
)

ref_date = trans["date"]

months_to_expire = (
    (trans["expires_year"] - ref_date.dt.year) * 12 +
    (trans["expires_month"] - ref_date.dt.month)
)

trans["months_to_expire"] = months_to_expire.astype("int16")

In [29]:
trans = trans[trans["months_to_expire"] >= 0].copy()

In [30]:
trans["is_online"] = trans["is_online"].astype("int8")

In [31]:
trans.drop(columns=["expires", "acct_open_date"], inplace=True)

In [32]:
trans = trans.sort_values(
    ["date"]
).reset_index(drop=True)

In [33]:
trans["tx_ts"] = trans["date"].astype("datetime64[ns]")
trans["tx_year"]  = trans["tx_ts"].dt.year.astype("int16")
trans["tx_month"] = trans["tx_ts"].dt.month.astype("int8")
trans["tx_day"]   = trans["tx_ts"].dt.day.astype("int8")     
trans["tx_hour"]  = trans["tx_ts"].dt.hour.astype("int8")

In [34]:
trans["is_weekend"] = (trans["date"].dt.weekday >= 5).astype("int8")

In [35]:
trans["credit_limit"] = (trans["credit_limit"].str.replace("$", "", regex=False).astype("float32"))

In [36]:
trans["has_chip"].replace({"YES": 1, "NO": 0}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  trans["has_chip"].replace({"YES": 1, "NO": 0}, inplace=True)
  trans["has_chip"].replace({"YES": 1, "NO": 0}, inplace=True)


In [37]:
trans["has_chip"] = trans["has_chip"].astype("int8")

In [38]:
trans["is_credit"] = (
    trans["card_type"]
      .astype(str)
      .str.upper()
      .str.contains("CREDIT")
      .astype("int8")
)
trans["is_prepaid"] = (
    trans["card_type"]
      .astype(str)
      .str.upper()
      .str.contains("PREPAID")
      .astype("int8")
)
trans.drop("card_type", axis=1, inplace=True)

In [39]:
trans["num_credit_cards"] = trans["num_credit_cards"].astype("int8")

In [40]:
trans["total_debt"] = (trans["total_debt"].str.replace("$", "", regex=False).astype("float32"))

In [41]:
trans["yearly_income"] = (trans["yearly_income"].str.replace("$", "", regex=False).astype("float32"))

In [42]:
trans["per_capita_income"] = (trans["per_capita_income"].str.replace("$", "", regex=False).astype("float32"))

In [43]:
trans["male"] = (trans["gender"] == "Male").astype("int8")
trans.drop("gender", axis=1, inplace=True)

In [44]:
trans["birth_month"] = trans["birth_month"].astype("int8")

In [45]:
trans.rename(columns={"client_id_x": "client_id"}, inplace=True)

In [46]:
trans["mcc"] = pd.to_numeric(trans["mcc"], errors="coerce")
trans["mcc"] = trans["mcc"].astype("Int64")

In [47]:
MCC_GROUP = {
    # 1) Food & Daily
    "Food & Daily": [
        "5812","5814","5813","5411","5499","5912","5921",
        "5300","5310","5311"
    ],

    # 2) Transport & Travel  (+ freight 일부 흡수, + 4112/4411 포함)
    "Transport & Travel": [
        "4111","4121","4131","4112",
        "3722","3771","3775",
        "4511","4411",
        "4722","7011","4784",
        "4214"  # Motor Freight -> 여기로 흡수
    ],

    # 3) Digital & Online
    "Digital & Online": [
        "5815","5816","4814","4899","3780"
    ],

    # 4) Financial
    "Financial": [
        "4829","6300","7276","8931"
    ],

    # 5) Retail
    "Retail": [
        "5045","5732","5733",
        "5941","5942","5947",
        "5661","5651","5655","5621",
        "5977","5970","5932",
        "5192","5193",
        "5712","5719","5722",
        "5094"
    ],

    # 6) Medical
    "Medical": [
        "8011","8021","8041","8043","8049","8062","8099"
    ],

    # 7) Entertainment 
    "Entertainment": [
        "7832","7922","7996","7801","7802","7995"
    ],

    # 8) Automotive & Home
    "Automotive & Home": [
        "5541", 
        "7531","7538","7542","7549","5533",
        "1711","5251","5261","5211","3504",
        "7210","7230","7349",
        "3640"
    ],

    # 9) Utilities & Government
    "Utilities & Government": [
        "4900","9402"
    ],

    # 10) Professional Services
    "Professional Services": [
        "8111","7393"
    ],

    # 11) Industrial / Manufacturing (나머지 제조/가공 계열)
    "Industrial / Manufacturing": [
        "3000","3001","3005","3006","3007","3008","3009",
        "3058","3066","3075",
        "3132","3144","3174",
        "3256","3260",
        "3359","3387","3389","3390","3393","3395","3405",
        "3509","3596","3684",
        "3730" 
    ],
}



trans["mcc"] = trans["mcc"].astype(str)

for group_name, mcc_list in MCC_GROUP.items():
    col_name = f"mccg_{group_name.replace(' & ', '_').replace(' ', '_')}"
    trans[col_name] = trans["mcc"].isin(mcc_list).astype(np.int8)


In [48]:
mccg_cols = [
    "mccg_Food_Daily",
    "mccg_Transport_Travel",
    "mccg_Digital_Online",
    "mccg_Financial",
    "mccg_Retail",
    "mccg_Medical",
    "mccg_Entertainment",
    "mccg_Automotive_Home",
    "mccg_Utilities_Government",
    "mccg_Professional_Services",
    "mccg_Industrial_/_Manufacturing",
]

all_zero_mask = (trans[mccg_cols].sum(axis=1) == 0)

all_zero_mask.sum()


np.int64(0)

In [49]:
trans.drop(columns=["mcc"], axis=1, inplace=True)

In [50]:
trans.drop("card_number", axis=1, inplace=True)

In [51]:
trans["card_brand"] = (
    trans["card_brand"]
    .astype(str)
    .str.strip()
    .str.title()   # Visa / Mastercard / Amex / Discover
)

BRANDS = ["Visa", "Mastercard", "Amex", "Discover"]

for b in BRANDS:
    trans[f"cb_{b}"] = (trans["card_brand"] == b).astype(np.int8)

In [52]:
trans.drop("card_brand", axis=1, inplace=True)

In [53]:
trans.drop("tx_ts", axis=1, inplace=True)

# e-commerce(비대면) vs POS(대면)

**리스크 발생 메커니즘**) 온라인 결제 != 오프라인 결제 

온라인(CNP) 거래는 오프라인 대비 fraud rate가 높고, 리스크 패턴이 다르기 때문에\
하나의 모델로 통합할 경우 신호가 희석될 가능성이 있다고 판단

거래 특성이 동질적인 집단을 먼저 모델링하는 것이 합리적

### 온라인 전용 거래가 main target

In [54]:
online = trans[trans["is_online"]==1]
online["fraud"].value_counts()

fraud
0    882074
1      8694
Name: count, dtype: int64

In [55]:
offline = trans[trans["is_online"]==0]
offline["fraud"].value_counts()

fraud
0    6754342
1       2368
Name: count, dtype: int64

In [56]:
online

Unnamed: 0,date,client_id,card_id,amount,use_chip,merchant_id,merchant_city,merchant_state,zip,current_age,...,mccg_Medical,mccg_Entertainment,mccg_Automotive_Home,mccg_Utilities_Government,mccg_Professional_Services,mccg_Industrial_/_Manufacturing,cb_Visa,cb_Mastercard,cb_Amex,cb_Discover
5,2010-01-01 00:14:00,1684,2140,26.459999,Online Transaction,39021,ONLINE,,,56,...,0,0,0,0,0,0,0,1,0,0
13,2010-01-01 00:34:00,394,4717,26.040001,Online Transaction,39021,ONLINE,,,52,...,0,0,0,0,0,0,0,1,0,0
18,2010-01-01 00:43:00,301,3742,10.170000,Online Transaction,39021,ONLINE,,,47,...,0,0,0,0,0,0,1,0,0,0
21,2010-01-01 00:48:00,1127,3869,22.570000,Online Transaction,39021,ONLINE,,,59,...,0,0,0,0,0,0,1,0,0,0
26,2010-01-01 01:01:00,820,127,270.220001,Online Transaction,73186,ONLINE,,,70,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7647427,2018-06-23 10:59:00,116,4899,36.619999,Online Transaction,39021,ONLINE,,,31,...,0,0,0,0,0,0,0,1,0,0
7647433,2018-06-23 11:00:00,876,3918,34.259998,Online Transaction,9932,ONLINE,,,65,...,0,0,0,0,0,0,1,0,0,0
7647434,2018-06-23 11:00:00,1433,5841,71.110001,Online Transaction,88459,ONLINE,,,73,...,0,0,0,0,0,0,0,1,0,0
7647439,2018-06-23 11:02:00,1591,2847,15.120000,Online Transaction,88998,ONLINE,,,58,...,0,0,0,0,0,0,1,0,0,0


In [57]:
# online에는 merchant_state, zip 없어서 drop
online.drop(columns=["merchant_state", "zip"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  online.drop(columns=["merchant_state", "zip"], inplace=True)


In [58]:
online["use_chip"].unique()

array(['Online Transaction'], dtype=object)

In [59]:
online.drop("use_chip", axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  online.drop("use_chip", axis=1, inplace=True)


In [60]:
online.drop(columns=["birth_year", "birth_month"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  online.drop(columns=["birth_year", "birth_month"], inplace=True)


In [61]:
online.drop(columns=["expires_year", "expires_month"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  online.drop(columns=["expires_year", "expires_month"], inplace=True)


In [62]:
online.info()

<class 'pandas.core.frame.DataFrame'>
Index: 890768 entries, 5 to 7647448
Data columns (total 56 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   date                             890768 non-null  datetime64[ns]
 1   client_id                        890768 non-null  int64         
 2   card_id                          890768 non-null  int64         
 3   amount                           890768 non-null  float32       
 4   merchant_id                      890768 non-null  int64         
 5   merchant_city                    890768 non-null  object        
 6   current_age                      890768 non-null  int64         
 7   retirement_age                   890768 non-null  int64         
 8   address                          890768 non-null  object        
 9   latitude                         890768 non-null  float64       
 10  longitude                        890768 non-null

In [63]:
months_from_account = (
    (online["date"].dt.to_period("M") -
     pd.to_datetime(
         dict(year=online["acct_open_year"],
              month=online["acct_open_month"],
              day=1)
     ).dt.to_period("M"))
).apply(lambda x: x.n)

online["months_from_account"] = months_from_account.astype("int16")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  online["months_from_account"] = months_from_account.astype("int16")


In [64]:
online.drop(columns=["acct_open_year", "acct_open_month"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  online.drop(columns=["acct_open_year", "acct_open_month"], inplace=True)


In [65]:
online["merchant_city"].value_counts()

merchant_city
ONLINE    890768
Name: count, dtype: int64

In [66]:
online.drop("merchant_city", axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  online.drop("merchant_city", axis=1, inplace=True)


In [67]:
online.info()

<class 'pandas.core.frame.DataFrame'>
Index: 890768 entries, 5 to 7647448
Data columns (total 54 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   date                             890768 non-null  datetime64[ns]
 1   client_id                        890768 non-null  int64         
 2   card_id                          890768 non-null  int64         
 3   amount                           890768 non-null  float32       
 4   merchant_id                      890768 non-null  int64         
 5   current_age                      890768 non-null  int64         
 6   retirement_age                   890768 non-null  int64         
 7   address                          890768 non-null  object        
 8   latitude                         890768 non-null  float64       
 9   longitude                        890768 non-null  float64       
 10  per_capita_income                890768 non-null

In [68]:
online["has_error"].value_counts()

has_error
0    870418
1     20350
Name: count, dtype: int64

In [69]:
online["years_since_pin_change"] = (
    online["date"].dt.year - online["year_pin_last_changed"]
).astype("int8")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  online["years_since_pin_change"] = (


In [70]:
(online["years_since_pin_change"] < 0).sum()


np.int64(131712)

In [71]:
online["years_since_pin_change"]

5          -2
13          1
18          1
21          1
26          1
           ..
7647427     2
7647433     5
7647434    13
7647439    10
7647448     3
Name: years_since_pin_change, Length: 890768, dtype: int8

In [72]:
online[online["years_since_pin_change"] < 0][
    ["date", "year_pin_last_changed"]
].head()

Unnamed: 0,date,year_pin_last_changed
5,2010-01-01 00:14:00,2012
45,2010-01-01 01:54:00,2016
58,2010-01-01 02:17:00,2012
61,2010-01-01 02:25:00,2011
83,2010-01-01 03:20:00,2011


In [73]:
online["year_pin_last_changed"].describe()
online["date"].dt.year.describe()

count    890768.000000
mean       2013.946940
std           2.422897
min        2010.000000
25%        2012.000000
50%        2014.000000
75%        2016.000000
max        2018.000000
Name: date, dtype: float64

In [74]:
online["years_since_pin_change"].describe()


count    890768.000000
mean          2.737550
std           3.467356
min         -10.000000
25%           1.000000
50%           3.000000
75%           5.000000
max          15.000000
Name: years_since_pin_change, dtype: float64

In [75]:
online["year_pin_last_changed"].describe()


count    890768.000000
mean       2011.209390
std           2.712284
min        2002.000000
25%        2010.000000
50%        2011.000000
75%        2013.000000
max        2020.000000
Name: year_pin_last_changed, dtype: float64

In [76]:
online["years_since_pin_change"] = (
    online["years_since_pin_change"]
    .clip(lower=0)
    .astype("int8")
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  online["years_since_pin_change"] = (


In [77]:
online.info()

<class 'pandas.core.frame.DataFrame'>
Index: 890768 entries, 5 to 7647448
Data columns (total 55 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   date                             890768 non-null  datetime64[ns]
 1   client_id                        890768 non-null  int64         
 2   card_id                          890768 non-null  int64         
 3   amount                           890768 non-null  float32       
 4   merchant_id                      890768 non-null  int64         
 5   current_age                      890768 non-null  int64         
 6   retirement_age                   890768 non-null  int64         
 7   address                          890768 non-null  object        
 8   latitude                         890768 non-null  float64       
 9   longitude                        890768 non-null  float64       
 10  per_capita_income                890768 non-null

In [78]:
online["address"].value_counts()

address
468 Spruce Street                   10070
375 Hillside Drive                   9333
391 Martin Luther King Boulevard     8198
6785 Essex Lane                      6963
79 South Boulevard                   6403
                                    ...  
42720 East Drive                       53
1426 Fifth Avenue                      45
7111 Lexington Avenue                  39
4701 Littlewood Drive                  38
822 Ocean Street                       32
Name: count, Length: 1219, dtype: int64

In [79]:
online["years_to_retirement"] = (
    online["retirement_age"] - online["current_age"]
).astype("int8")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  online["years_to_retirement"] = (


In [80]:
online["years_to_retirement"].describe()
(online["years_to_retirement"] < 0).sum()

np.int64(184720)

In [81]:
online["years_to_retirement"] = (
    online["years_to_retirement"]
    .clip(lower=0)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  online["years_to_retirement"] = (


In [82]:
online.drop("retirement_age", axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  online.drop("retirement_age", axis=1, inplace=True)


In [83]:
# 고객 평균 위치 계산 (고객별 home proxy)
home_loc = (
    online.groupby("client_id")[["latitude", "longitude"]]
    .mean()
    .rename(columns={
        "latitude": "home_lat",
        "longitude": "home_lon"
    })
)

online = online.join(home_loc, on="client_id")


In [84]:
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # km
    dlat = np.radians(lat2 - lat1)
    dlon = np.radians(lon2 - lon1)
    a = (np.sin(dlat/2)**2 +
         np.cos(np.radians(lat1)) *
         np.cos(np.radians(lat2)) *
         np.sin(dlon/2)**2)
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

online["distance_from_home"] = haversine(
    online["home_lat"],
    online["home_lon"],
    online["latitude"],
    online["longitude"]
)


In [85]:
online.info()

<class 'pandas.core.frame.DataFrame'>
Index: 890768 entries, 5 to 7647448
Data columns (total 58 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   date                             890768 non-null  datetime64[ns]
 1   client_id                        890768 non-null  int64         
 2   card_id                          890768 non-null  int64         
 3   amount                           890768 non-null  float32       
 4   merchant_id                      890768 non-null  int64         
 5   current_age                      890768 non-null  int64         
 6   address                          890768 non-null  object        
 7   latitude                         890768 non-null  float64       
 8   longitude                        890768 non-null  float64       
 9   per_capita_income                890768 non-null  float32       
 10  yearly_income                    890768 non-null

In [86]:
online["income_ratio_region"] = (
    online["yearly_income"] /
    (online["per_capita_income"] + 1e-6)
).astype("float32")

In [87]:
online["log_yearly_income"] = np.log1p(online["yearly_income"])
online["log_income_ratio_region"] = np.log1p(
    online["income_ratio_region"]
)

In [88]:
online.drop(columns=["latitude", "longitude", "address"], inplace=True)

In [89]:
online = online.drop(columns=["home_lat", "home_lon"])

---

In [90]:
online["is_refund"] = (online["amount"] < 0).astype("int8")

online["log_abs_amount"] = np.log1p(
    np.abs(online["amount"])
).astype("float32")

online["amount_income_ratio"] = (
    online["amount"] /
    (online["yearly_income"] + 1e-6)
).astype("float32")

online["amount_limit_ratio"] = (
    online["amount"] /
    (online["credit_limit"] + 1e-6)
).astype("float32")

In [91]:
online.info()

<class 'pandas.core.frame.DataFrame'>
Index: 890768 entries, 5 to 7647448
Data columns (total 60 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   date                             890768 non-null  datetime64[ns]
 1   client_id                        890768 non-null  int64         
 2   card_id                          890768 non-null  int64         
 3   amount                           890768 non-null  float32       
 4   merchant_id                      890768 non-null  int64         
 5   current_age                      890768 non-null  int64         
 6   per_capita_income                890768 non-null  float32       
 7   yearly_income                    890768 non-null  float32       
 8   total_debt                       890768 non-null  float32       
 9   credit_score                     890768 non-null  int64         
 10  num_credit_cards                 890768 non-null

In [92]:
online.head(20)

Unnamed: 0,date,client_id,card_id,amount,merchant_id,current_age,per_capita_income,yearly_income,total_debt,credit_score,...,years_since_pin_change,years_to_retirement,distance_from_home,income_ratio_region,log_yearly_income,log_income_ratio_region,is_refund,log_abs_amount,amount_income_ratio,amount_limit_ratio
5,2010-01-01 00:14:00,1684,2140,26.459999,39021,56,13668.0,27861.0,108313.0,782,...,0,9,0.0,2.038411,10.235019,1.111335,0,3.31273,0.00095,0.575217
13,2010-01-01 00:34:00,394,4717,26.040001,39021,52,34138.0,69604.0,47193.0,684,...,1,19,0.0,2.038901,11.150592,1.111496,0,3.297317,0.000374,0.001117
18,2010-01-01 00:43:00,301,3742,10.17,39021,47,25654.0,52308.0,135319.0,679,...,1,22,0.0,2.03898,10.864923,1.111522,0,2.413232,0.000194,0.000484
21,2010-01-01 00:48:00,1127,3869,22.57,39021,59,17455.0,35590.0,92603.0,720,...,1,11,0.0,2.038957,10.479848,1.111515,0,3.159975,0.000634,0.000816
26,2010-01-01 01:01:00,820,127,270.220001,73186,70,26762.0,41902.0,29713.0,633,...,1,0,0.0,1.565727,10.643113,0.942242,0,5.602931,0.006449,0.013017
28,2010-01-01 01:06:00,1758,4686,87.089996,17976,57,11061.0,22556.0,43260.0,645,...,0,9,0.0,2.039237,10.023801,1.111606,0,4.478359,0.003861,0.010521
31,2010-01-01 01:11:00,566,5577,14.66,16798,60,22680.0,46244.0,108449.0,814,...,1,7,0.0,2.038977,10.741709,1.111521,0,2.75111,0.000317,0.001451
41,2010-01-01 01:46:00,1575,224,34.34,16798,51,28272.0,57646.0,127892.0,724,...,0,17,0.0,2.038979,10.962093,1.111521,0,3.565016,0.000596,0.001492
45,2010-01-01 01:54:00,1449,241,119.230003,87530,66,15334.0,28104.0,12054.0,751,...,0,0,0.0,1.83279,10.243703,1.041262,0,4.789407,0.004242,0.010407
46,2010-01-01 01:56:00,760,5876,52.98,39021,56,18420.0,37558.0,72514.0,778,...,2,8,0.0,2.038979,10.533669,1.111522,0,3.988614,0.001411,0.005822


In [93]:
online.to_parquet("DATA/online")

In [94]:
train = online[online["date"] < "2016-01-01"].copy()
test  = online[(online["date"] >= "2016-01-01") & (online["date"] < "2017-01-01")].copy()
check = online[online["date"] >= "2017-01-01"].copy()

print(train["fraud"].value_counts())
print(test["fraud"].value_counts())
print(check["fraud"].value_counts())

fraud
0    603057
1      6598
Name: count, dtype: int64
fraud
0    112113
1      2096
Name: count, dtype: int64
fraud
0    166904
Name: count, dtype: int64


In [95]:
train.to_parquet("DATA/train")
test.to_parquet("DATA/test")
check.to_parquet("DATA/check")

## Sampling 여부 판단

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    average_precision_score,
    roc_auc_score,
    precision_recall_curve,
    confusion_matrix,
    classification_report,
)

LABEL = "fraud"

DROP_COLS = [
    "date", "client_id", "card_id", "merchant_id",  
    LABEL
]

def build_Xy(df: pd.DataFrame):
    X = df.drop(columns=[c for c in DROP_COLS if c in df.columns]).copy()
    y = df[LABEL].astype(int).copy()
    return X, y

X_train, y_train = build_Xy(train)
X_test,  y_test  = build_Xy(test)


num_cols = [c for c in X_train.columns if str(X_train[c].dtype).startswith(("float", "int"))]


preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(with_mean=True, with_std=True), num_cols),
    ],
    remainder="drop"
)


clf = LogisticRegression(
    solver="lbfgs",
    max_iter=2000,
    class_weight="balanced",   # 샘플링 없이 불균형 대응
    n_jobs=None
)

pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("clf", clf),
])

pipe.fit(X_train, y_train)


proba_test = pipe.predict_proba(X_test)[:, 1]

prauc = average_precision_score(y_test, proba_test)
rocauc = roc_auc_score(y_test, proba_test)

print(f"PR-AUC(test):  {prauc:.6f}")
print(f"ROC-AUC(test): {rocauc:.6f}")


target_recall = 0.70

prec, rec, thr = precision_recall_curve(y_test, proba_test)
prec_t = prec[:-1]
rec_t  = rec[:-1]
thr_t  = thr

mask = rec_t >= target_recall
if mask.any():
    best_idx = np.argmax(prec_t[mask])
    best_thr = thr_t[mask][best_idx]
    best_prec = prec_t[mask][best_idx]
    best_rec  = rec_t[mask][best_idx]
    note = "picked_best_precision_under_recall_constraint"
else:
    f1 = 2 * (prec_t * rec_t) / (prec_t + rec_t + 1e-12)
    best_idx = np.argmax(f1)
    best_thr = thr_t[best_idx]
    best_prec = prec_t[best_idx]
    best_rec  = rec_t[best_idx]
    note = "fallback_best_f1"

print("\n=== THRESHOLD PICK ===")
print({"threshold": float(best_thr), "precision": float(best_prec), "recall": float(best_rec), "note": note})

yhat = (proba_test >= best_thr).astype(int)
print("\nconfusion_matrix(test):")
print(confusion_matrix(y_test, yhat))

print("\nclassification_report(test):")
print(classification_report(y_test, yhat, digits=4))


PR-AUC(test):  0.236228
ROC-AUC(test): 0.881867

=== THRESHOLD PICK ===
{'threshold': 0.6178754716132203, 'precision': 0.10490210090038588, 'recall': 0.700381679389313, 'note': 'picked_best_precision_under_recall_constraint'}

confusion_matrix(test):
[[99587 12526]
 [  628  1468]]

classification_report(test):
              precision    recall  f1-score   support

           0     0.9937    0.8883    0.9380    112113
           1     0.1049    0.7004    0.1825      2096

    accuracy                         0.8848    114209
   macro avg     0.5493    0.7943    0.5603    114209
weighted avg     0.9774    0.8848    0.9242    114209



In [None]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    average_precision_score,
    roc_auc_score,
    precision_recall_curve,
    confusion_matrix,
)

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

LABEL = "fraud"


# 공통: X, y 구성

DROP_COLS = ["date", "client_id", "card_id", "merchant_id", LABEL]

def build_Xy(df: pd.DataFrame):
    X = df.drop(columns=[c for c in DROP_COLS if c in df.columns]).copy()
    y = df[LABEL].astype(int).copy()
    return X, y

X_test, y_test = build_Xy(test)

# num 컬럼
def get_num_cols(X: pd.DataFrame):
    return [c for c in X.columns if str(X[c].dtype).startswith(("float", "int"))]


# threshold 선택: recall >= target_recall에서 precision 최대

def pick_threshold_by_recall(y_true, y_score, target_recall=0.70):
    prec, rec, thr = precision_recall_curve(y_true, y_score)
    prec_t = prec[:-1]
    rec_t  = rec[:-1]
    thr_t  = thr

    mask = rec_t >= target_recall
    if mask.any():
        best_local = np.argmax(prec_t[mask])
        best_thr = thr_t[mask][best_local]
        best_prec = prec_t[mask][best_local]
        best_rec  = rec_t[mask][best_local]
        note = "picked_best_precision_under_recall_constraint"
    else:
        f1 = 2 * (prec_t * rec_t) / (prec_t + rec_t + 1e-12)
        best_idx = np.argmax(f1)
        best_thr = thr_t[best_idx]
        best_prec = prec_t[best_idx]
        best_rec  = rec_t[best_idx]
        note = "fallback_best_f1"
    return float(best_thr), float(best_prec), float(best_rec), note


# 모델 학습 + 평가

def fit_and_eval(train_df: pd.DataFrame, tag: str, target_recall=0.70):
    X_train, y_train = build_Xy(train_df)

    num_cols = get_num_cols(X_train)

    preprocess = ColumnTransformer(
        transformers=[("num", StandardScaler(), num_cols)],
        remainder="drop"
    )

    clf = LogisticRegression(
        solver="lbfgs",
        max_iter=2000,
        class_weight="balanced",   
    )

    pipe = Pipeline(steps=[("prep", preprocess), ("clf", clf)])
    pipe.fit(X_train, y_train)

    proba = pipe.predict_proba(X_test)[:, 1]

    prauc = average_precision_score(y_test, proba)
    rocauc = roc_auc_score(y_test, proba)

    thr, prec, rec, note = pick_threshold_by_recall(y_test, proba, target_recall=target_recall)
    yhat = (proba >= thr).astype(int)

    cm = confusion_matrix(y_test, yhat)
    tn, fp, fn, tp = cm.ravel()

    alert_rate = (tp + fp) / len(y_test)

    return {
        "tag": tag,
        "train_size": int(len(train_df)),
        "train_pos": int(train_df[LABEL].sum()),
        "train_pos_rate": float(train_df[LABEL].mean()),
        "PR_AUC_test": float(prauc),
        "ROC_AUC_test": float(rocauc),
        "threshold": float(thr),
        "precision@recall": float(prec),
        "recall": float(rec),
        "alert_rate": float(alert_rate),
        "TP": int(tp),
        "FP": int(fp),
        "FN": int(fn),
        "TN": int(tn),
        "note": note,
    }


# 1) Baseline 
results = []
results.append(fit_and_eval(train, tag="baseline_raw", target_recall=0.70))


# 2) UnderSampling (OSS) -> train_under 생성 후 같은 평가

def make_under_oss(train_df: pd.DataFrame, sampling_strategy=0.02, seed=42):
    X, y = build_Xy(train_df)
    rus = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=seed)
    X_res, y_res = rus.fit_resample(X, y)
    out = X_res.copy()
    out[LABEL] = y_res.values
    # 시계열 유지용 정렬(원하면)
    if "date" in train_df.columns:
        pass
    return out

train_under = make_under_oss(train, sampling_strategy=0.02, seed=42)
results.append(fit_and_eval(train_under, tag="under_1to50", target_recall=0.70))


# 3) OverSampling (OSS) -> train_over 생성 후 같은 평가

def make_over_oss(train_df: pd.DataFrame, sampling_strategy=0.10, seed=42):
    X, y = build_Xy(train_df)
    ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=seed)
    X_res, y_res = ros.fit_resample(X, y)
    out = X_res.copy()
    out[LABEL] = y_res.values
    return out

train_over = make_over_oss(train, sampling_strategy=0.10, seed=42)
results.append(fit_and_eval(train_over, tag="over_1to10", target_recall=0.70))

# 결과 표

df_res = pd.DataFrame(results)
cols = [
    "tag", "train_size", "train_pos", "train_pos_rate",
    "PR_AUC_test", "ROC_AUC_test",
    "threshold", "precision@recall", "recall", "alert_rate",
    "TP", "FP", "FN", "TN", "note"
]
df_res = df_res[cols].sort_values("tag")
print(df_res.to_string(index=False))


         tag  train_size  train_pos  train_pos_rate  PR_AUC_test  ROC_AUC_test  threshold  precision@recall   recall  alert_rate   TP    FP  FN    TN                                          note
baseline_raw      609655       6598        0.010823     0.236228      0.881867   0.617875          0.104902 0.700382    0.122530 1468 12526 628 99587 picked_best_precision_under_recall_constraint
  over_1to10      663362      60305        0.090908     0.224439      0.881353   0.620461          0.105060 0.700382    0.122346 1468 12505 628 99608 picked_best_precision_under_recall_constraint
 under_1to50      336498       6598        0.019608     0.234731      0.881991   0.618671          0.104872 0.700382    0.122565 1468 12530 628 99583 picked_best_precision_under_recall_constraint


In [98]:
def fit_and_eval(train_df: pd.DataFrame, tag: str, target_recall=0.70, use_class_weight=True):
    X_train, y_train = build_Xy(train_df)
    num_cols = get_num_cols(X_train)

    preprocess = ColumnTransformer(
        transformers=[("num", StandardScaler(), num_cols)],
        remainder="drop"
    )

    clf = LogisticRegression(
        solver="lbfgs",
        max_iter=2000,
        class_weight=("balanced" if use_class_weight else None),
    )

    pipe = Pipeline(steps=[("prep", preprocess), ("clf", clf)])
    pipe.fit(X_train, y_train)

    proba = pipe.predict_proba(X_test)[:, 1]

    prauc = average_precision_score(y_test, proba)
    rocauc = roc_auc_score(y_test, proba)

    thr, prec, rec, note = pick_threshold_by_recall(y_test, proba, target_recall=target_recall)
    yhat = (proba >= thr).astype(int)

    tn, fp, fn, tp = confusion_matrix(y_test, yhat).ravel()
    alert_rate = (tp + fp) / len(y_test)

    return {
        "tag": tag,
        "use_class_weight": use_class_weight,
        "train_size": int(len(train_df)),
        "train_pos_rate": float(train_df[LABEL].mean()),
        "PR_AUC_test": float(prauc),
        "precision@recall": float(prec),
        "recall": float(rec),
        "alert_rate": float(alert_rate),
        "TP": int(tp), "FP": int(fp), "FN": int(fn), "TN": int(tn),
        "threshold": float(thr),
        "note": note,
    }


In [99]:
results = []
results.append(fit_and_eval(train, "raw + cw", use_class_weight=True))
results.append(fit_and_eval(train, "raw + no_cw", use_class_weight=False))

results.append(fit_and_eval(train_under, "under + no_cw", use_class_weight=False))
results.append(fit_and_eval(train_under, "under + cw", use_class_weight=True))

results.append(fit_and_eval(train_over, "over + no_cw", use_class_weight=False))
results.append(fit_and_eval(train_over, "over + cw", use_class_weight=True))

pd.DataFrame(results).sort_values(["PR_AUC_test"], ascending=False)


Unnamed: 0,tag,use_class_weight,train_size,train_pos_rate,PR_AUC_test,precision@recall,recall,alert_rate,TP,FP,FN,TN,threshold,note
1,raw + no_cw,False,609655,0.010823,0.256806,0.098649,0.700382,0.130296,1468,13413,628,98700,0.016384,picked_best_precision_under_recall_constraint
2,under + no_cw,False,336498,0.019608,0.254981,0.099002,0.700382,0.129832,1468,13360,628,98753,0.029936,picked_best_precision_under_recall_constraint
4,over + no_cw,False,663362,0.090908,0.245812,0.102586,0.700382,0.125297,1468,12842,628,99271,0.143167,picked_best_precision_under_recall_constraint
0,raw + cw,True,609655,0.010823,0.236228,0.104902,0.700382,0.12253,1468,12526,628,99587,0.617875,picked_best_precision_under_recall_constraint
3,under + cw,True,336498,0.019608,0.234731,0.104872,0.700382,0.122565,1468,12530,628,99583,0.618671,picked_best_precision_under_recall_constraint
5,over + cw,True,663362,0.090908,0.224439,0.10506,0.700382,0.122346,1468,12505,628,99608,0.620461,picked_best_precision_under_recall_constraint


=> raw data로 진행 확정