In [1]:
import pandas as pd
# Read the parquet data stored at the relative path DATA/train into a DataFrame.
df = pd.read_parquet("DATA/train")

In [2]:
# Print a summary of the frame: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 609655 entries, 5 to 5332941
Data columns (total 60 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   date                             609655 non-null  datetime64[ns]
 1   client_id                        609655 non-null  int64         
 2   card_id                          609655 non-null  int64         
 3   amount                           609655 non-null  float32       
 4   merchant_id                      609655 non-null  int64         
 5   current_age                      609655 non-null  int64         
 6   per_capita_income                609655 non-null  float32       
 7   yearly_income                    609655 non-null  float32       
 8   total_debt                       609655 non-null  float32       
 9   credit_score                     609655 non-null  int64         
 10  num_credit_cards                 609655 non-null

In [3]:

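# Rows copied from the df.info() output above. Together with `date`, these are
# exactly the columns listed in `derived_cols` in the next cell.
#  5   current_age                      609655 non-null  int64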
#  6   per_capita_income                609655 non-null  float32       
#  7   yearly_income                    609655 non-null  float32       
#  8   total_debt                       609655 non-null  float32       
#  9   credit_score                     609655 non-null  int64         
#  10  num_credit_cards                 609655 non-null  int8          
#  11  has_chip                         609655 non-null  int8          
#  12  num_cards_issued                 609655 non-null  int64         
#  13  credit_limit                     609655 non-null  float32       
#  14  year_pin_last_changed            609655 non-null  int64               
#  25  months_to_expire                 609655 non-null  int16                  
#  31  is_credit                        609655 non-null  int8          
#  32  is_prepaid                       609655 non-null  int8          
#  33  male                             609655 non-null  int8                   
#  45  cb_Visa                          609655 non-null  int8          
#  46  cb_Mastercard                    609655 non-null  int8          
#  47  cb_Amex                          609655 non-null  int8          
#  48  cb_Discover                      609655 non-null  int8          
#  49  months_from_account              609655 non-null  int16         
#  50  years_since_pin_change           609655 non-null  int8          
#  51  years_to_retirement              609655 non-null  int8          
#  52  distance_from_home               609655 non-null  float64       
#  53  income_ratio_region              609655 non-null  float32       
#  54  log_yearly_income                609655 non-null  float32       
#  55  log_income_ratio_region          609655 non-null  float32       
#  58  amount_income_ratio              609655 non-null  float32       
#  59  amount_limit_ratio               609655 non-null  float32       


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Columns removed from the frame to obtain the transaction-only dataset.
# NOTE(review): the name says "derived", but the list also contains the raw
# `date` column — presumably because a datetime cannot be standardized; confirm.
derived_cols = [
    "date",
    "current_age", "per_capita_income", "yearly_income", "total_debt",
    "credit_score", "num_credit_cards", "has_chip", "num_cards_issued",
    "credit_limit", "year_pin_last_changed", "months_to_expire",
    "is_credit", "is_prepaid", "male",
    "cb_Visa", "cb_Mastercard", "cb_Amex", "cb_Discover",
    "months_from_account", "years_since_pin_change", "years_to_retirement",
    "distance_from_home",
    "income_ratio_region", "log_yearly_income", "log_income_ratio_region",
    "amount_income_ratio", "amount_limit_ratio",
]
id_cols = ["client_id", "card_id", "merchant_id"]
label_col = "fraud"

# Transaction-only dataset: the loaded frame minus the columns listed above.
df_tx_only = df.drop(columns=derived_cols)

# Separate the label from the features; identifier columns are not used as features.
y = df_tx_only[label_col]
X = df_tx_only.drop(columns=[*id_cols, label_col])

# Hold out 20% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardize the features, then fit a logistic regression whose class weights
# are balanced against the label frequencies.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", LogisticRegression(class_weight="balanced", max_iter=1000)),
    ]
)
pipe.fit(X_train, y_train)

# Per-class precision / recall / F1 for the hard predictions on the held-out rows.
y_pred = pipe.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))

# Average precision computed from the predicted probability of class 1.
y_prob = pipe.predict_proba(X_test)[:, 1]
pr_auc = average_precision_score(y_test, y_prob)
print(f"PR-AUC: {pr_auc:.4f}")


              precision    recall  f1-score   support

           0     0.9978    0.8388    0.9114    120611
           1     0.0534    0.8303    0.1003      1320

    accuracy                         0.8387    121931
   macro avg     0.5256    0.8345    0.5058    121931
weighted avg     0.9876    0.8387    0.9026    121931

PR-AUC: 0.2110


In [5]:
# Re-read the same parquet path that the first cell loads, rebinding `df`.
# NOTE(review): the baseline cell above only calls df.drop(...) without inplace,
# so `df` looks unmodified at this point and this reload may be redundant —
# confirm against any cells not shown here before removing it.
df = pd.read_parquet("DATA/train")

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

label_col = "fraud"
id_cols = ["client_id", "card_id", "merchant_id"]

# Keep every column except `date`. DataFrame.drop already returns a new frame,
# so no explicit copy or in-place mutation is needed and `df` stays untouched.
# NOTE(review): `date` is presumably removed because a datetime column cannot
# be standardized — confirm.
# The name `df_tx_only` is kept so later cells that reference it keep working,
# although in this cell the frame is no longer restricted to a column subset.
df_tx_only = df.drop(columns=["date"])

# Separate the label from the features; identifier columns are not used as features.
X = df_tx_only.drop(columns=id_cols + [label_col])
y = df_tx_only[label_col]

# Hold out 20% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Standardize the features, then fit a logistic regression whose class weights
# are balanced against the label frequencies.
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=1000, class_weight="balanced")),
])
pipe.fit(X_train, y_train)

# Per-class precision / recall / F1 for the hard predictions on the held-out rows.
y_pred = pipe.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))

# Average precision computed from the predicted probability of class 1.
y_prob = pipe.predict_proba(X_test)[:, 1]
pr_auc = average_precision_score(y_test, y_prob)
print(f"PR-AUC: {pr_auc:.4f}")


              precision    recall  f1-score   support

           0     0.9978    0.8393    0.9117    120611
           1     0.0536    0.8311    0.1006      1320

    accuracy                         0.8392    121931
   macro avg     0.5257    0.8352    0.5062    121931
weighted avg     0.9876    0.8392    0.9029    121931

PR-AUC: 0.2212
