- 사기거래탐지

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw tables used by the notebook from the local data directory.
card_df = pd.read_csv('../data/ftd/cards_data.csv')
trans_df = pd.read_csv('../data/ftd/transactions_data.csv')
user_df = pd.read_csv('../data/ftd/users_data.csv')

# The MCC JSON is parsed with orient='index' (top-level keys become the row
# index); the column labelled 0 is then renamed to 'category'.
mcc_df = pd.read_json('../data/ftd/mcc_codes.json', orient='index')
mcc_df = mcc_df.rename(columns={0: 'category'})

# Fraud labels for training -- presumably keyed by transaction id; verify
# against the JSON layout before joining.
fraud_df = pd.read_json('../data/ftd/train_fraud_labels.json')

NameError: name 'data' is not defined

In [None]:
0

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_curve, auc
import lightgbm as lgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Hold out 20% of the rows for validation. Splitting is stratified on y so
# both partitions keep the same label proportions, and the seed is fixed so
# the split is reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# ======================
# 평가 함수
# ======================
from sklearn.metrics import (
    roc_auc_score, f1_score, precision_recall_curve, auc,
    precision_score, recall_score, confusion_matrix, classification_report,
    matthews_corrcoef, log_loss
)

def evaluate_model(model, X_val, y_val, model_name="Model"):
    """Score a fitted binary classifier on validation data and print a report.

    Hard predictions come from ``model.predict`` and scores from the second
    column of ``model.predict_proba`` (presumably the positive/fraud class
    probability -- confirm against the estimator's ``classes_`` ordering).

    Rules of thumb carried over from the original notes:
      * ROC-AUC: ability to separate fraud from normal; 0.5 is random
        guessing, 1.0 is perfect separation.
      * F1: balance of precision and recall in [0, 1]; aim for 0.6 or more.
      * Precision: share of predicted-fraud transactions that really are
        fraud (low values mean customer inconvenience); 0.8+ is good,
        0.9+ is excellent.
      * Recall: share of real fraud the model catches (low values mean
        monetary loss); 0.7+ is good, 0.9+ is excellent.
      * PR-AUC: area under the precision-recall curve in [0, 1]; 0.7+ is
        considered good.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_val: Validation features.
        y_val: Validation labels.
        model_name: Label used in the printed report header.

    Returns:
        dict with keys ``roc_auc``, ``f1``, ``pr_auc``, ``precision``,
        ``recall``, ``mcc``, ``log_loss`` and ``confusion_matrix``.
    """
    hard_labels = model.predict(X_val)
    fraud_scores = model.predict_proba(X_val)[:, 1]

    # Metrics are computed in a fixed order and stored under the same keys
    # (and in the same insertion order) that callers receive.
    metrics = {"roc_auc": roc_auc_score(y_val, fraud_scores)}
    metrics["f1"] = f1_score(y_val, hard_labels)

    # PR-AUC is the trapezoidal area under the score-based PR curve.
    curve_precision, curve_recall, _thresholds = precision_recall_curve(
        y_val, fraud_scores
    )
    metrics["pr_auc"] = auc(curve_recall, curve_precision)

    metrics["precision"] = precision_score(y_val, hard_labels)
    metrics["recall"] = recall_score(y_val, hard_labels)
    metrics["mcc"] = matthews_corrcoef(y_val, hard_labels)
    metrics["log_loss"] = log_loss(y_val, fraud_scores)
    metrics["confusion_matrix"] = confusion_matrix(y_val, hard_labels)

    # Report: one "<label>: <value>" line per scalar metric, then the
    # confusion matrix and the per-class classification report.
    report_lines = (
        ("ROC-AUC", "roc_auc"),
        ("F1", "f1"),
        ("PR-AUC", "pr_auc"),
        ("Precision", "precision"),
        ("Recall", "recall"),
        ("MCC", "mcc"),
        ("LogLoss", "log_loss"),
    )
    print(f"\n📊 {model_name} 결과")
    for label, key in report_lines:
        print(f"{label}: {metrics[key]:.4f}")
    print("Confusion Matrix:")
    print(metrics["confusion_matrix"])
    print("\nClassification Report:")
    print(classification_report(y_val, hard_labels))

    return metrics



In [None]:
# LightGBM baseline: 500 boosting rounds at learning rate 0.05.
lgb_model = lgb.LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,  # no explicit depth cap; tree size is bounded by num_leaves
    num_leaves=64,
    subsample=0.8,
    # LightGBM only applies row subsampling when subsample_freq > 0; with the
    # default of 0 the subsample=0.8 setting above is silently ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)

# NOTE(review): unlike the XGBoost cell, no class-imbalance weighting is
# applied here -- confirm that is intended for the model comparison.
lgb_model.fit(X_train, y_train)
res_lgb = evaluate_model(lgb_model, X_val, y_val, "LightGBM")

In [None]:
# Class-imbalance correction: weight positives by the negative/positive ratio
# of the training labels. The positive count is computed once with a
# vectorized sum rather than iterating the labels in Python twice.
# Assumes y_train holds 0/1 labels with at least one positive -- TODO confirm.
n_pos = np.sum(y_train)
n_neg = len(y_train) - n_pos

xgb_model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=n_neg / n_pos,
    random_state=42,
    n_jobs=-1,
    # NOTE(review): use_label_encoder is deprecated in newer XGBoost releases;
    # kept so the cell behaves the same on the version this notebook targets.
    use_label_encoder=False,
    eval_metric="logloss",
)

xgb_model.fit(X_train, y_train)
res_xgb = evaluate_model(xgb_model, X_val, y_val, "XGBoost")


In [None]:
# CatBoost baseline, configured to mirror the other boosters (500 rounds,
# learning rate 0.05, depth 6, seed 42).
cat_params = dict(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    random_seed=42,
    verbose=100,
    task_type="CPU",  # switch to "GPU" if one is available
)
cat_model = CatBoostClassifier(**cat_params)

cat_model.fit(X_train, y_train)
res_cat = evaluate_model(cat_model, X_val, y_val, "CatBoost")

In [None]:
# Collect the per-model metric dicts into one table, one row per model.
model_names = ["LightGBM", "XGBoost", "CatBoost"]
metric_rows = [res_lgb, res_xgb, res_cat]
results = pd.DataFrame(metric_rows, index=model_names)

print("\n📌 최종 성능 비교")
print(results)