# 4.10 분류 실습 - 캐글 신용카드 사기 검출

In [2]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

# Evaluation helper
def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and headline classification metrics.

    Args:
        y_test: Ground-truth labels.
        pred: Predicted class labels, compared against ``y_test``.
        pred_proba: Optional scores passed to ``roc_auc_score``. When it is
            ``None`` the AUC is reported as ``N/A`` instead of being computed.

    Returns:
        None. All results are written to stdout.
    """
    # Confusion matrix
    confusion = confusion_matrix(y_test, pred)

    # Headline metrics
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    # ROC-AUC needs scores. The value is rendered to text up front because
    # ``None`` cannot be formatted with a float format spec (TypeError).
    if pred_proba is not None:
        auc_text = '{0: .4f}'.format(roc_auc_score(y_test, pred_proba))
    else:
        auc_text = 'N/A'

    # Report
    print('오차 행렬')
    print(confusion)
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1: {3:.4f}, AUC: {4}'.format(
        accuracy, precision, recall, f1, auc_text))


In [3]:
# 데이터 불러오기 및 기본 설정
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

# Load the credit-card dataset from a CSV file in the working directory.
card_df = pd.read_csv('./creditcard.csv')
# Peek at the first three rows.
card_df.head(3)


# 1. 데이터 전처리 함수
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def get_preprocessed_df(df=None):
    """Return a preprocessed copy of ``df``; the input frame is not modified.

    The ``Amount`` column is replaced by its ``log1p`` transform, stored as the
    first column under the name ``Amount_Scaled``, and the ``Time`` column is
    removed. All other columns keep their relative order.
    """
    frame = df.copy()
    # Take Amount out of the frame and put its log1p transform in front.
    frame.insert(0, 'Amount_Scaled', np.log1p(frame.pop('Amount')))
    return frame.drop(columns='Time')


# Train/test split helper
def get_train_test_dataset(df=None):
    """Preprocess ``df`` and split it into train and test sets.

    The last column of the preprocessed frame is used as the target and every
    other column as a feature. The split holds out 30% of the rows for testing,
    is stratified on the target, and uses ``random_state=0``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    processed = get_preprocessed_df(df)
    features, target = processed.iloc[:, :-1], processed.iloc[:, -1]
    return tuple(train_test_split(
        features,
        target,
        test_size=0.3,
        random_state=0,
        stratify=target,
    ))


# Build the train/test split from the raw card data.
X_train, X_test, y_train, y_test = get_train_test_dataset(card_df)

# Print each label's share (in percent) of the training labels, then of the
# test labels, so the two distributions can be compared.
print('학습 데이터 레이블 값 비율')
print(y_train.value_counts() / y_train.shape[0] * 100)
print('테스트 데이터 레이블 값 비율')
print(y_test.value_counts() / y_test.shape[0] * 100)


# 2. Evaluation function example (same function as used in chapter 3)
def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print accuracy, precision, recall, F1 and ROC AUC for a set of predictions.

    Note: this redefines the ``get_clf_eval`` declared earlier in this file, so
    calls made after this point use this version.

    Args:
        y_test: Ground-truth labels.
        pred: Predicted class labels, compared against ``y_test``.
        pred_proba: Optional scores passed to ``roc_auc_score``. When it is
            ``None`` the ROC AUC line reads ``N/A`` instead of being computed.

    Returns:
        None. All results are written to stdout.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
    print('Accuracy: {:.4f}'.format(accuracy_score(y_test, pred)))
    print('Precision: {:.4f}'.format(precision_score(y_test, pred)))
    print('Recall: {:.4f}'.format(recall_score(y_test, pred)))
    print('F1 score: {:.4f}'.format(f1_score(y_test, pred)))
    # pred_proba defaults to None; roc_auc_score cannot score None, so only
    # call it when scores were actually supplied.
    if pred_proba is None:
        print('ROC AUC: N/A')
    else:
        print('ROC AUC: {:.4f}'.format(roc_auc_score(y_test, pred_proba)))


# 3. Model train/evaluate helper
def get_model_train_eval(model, ftr_train, ftr_test, tgt_train, tgt_test):
    """Fit ``model`` on the training data and print its metrics on the test data.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        ftr_train: Training features.
        ftr_test: Test features.
        tgt_train: Training labels.
        tgt_test: Test labels.

    Returns:
        None. Metrics are printed by ``get_clf_eval``.
    """
    model.fit(ftr_train, tgt_train)

    predicted_labels = model.predict(ftr_test)
    # Column 1 of predict_proba is handed to the evaluator as the score.
    class_probabilities = model.predict_proba(ftr_test)
    positive_scores = class_probabilities[:, 1]

    get_clf_eval(tgt_test, predicted_labels, positive_scores)


# 4. 로지스틱 회귀 및 LightGBM 학습/평가
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

# Baseline 1: logistic regression on the original (non-resampled) training data.
# NOTE(review): max_iter=1000 is presumably there to let the solver converge — confirm.
lr_clf = LogisticRegression(max_iter=1000)
get_model_train_eval(lr_clf, X_train, X_test, y_train, y_test)

# Baseline 2: LightGBM with 1000 estimators and 64 leaves, using all cores (n_jobs=-1).
# NOTE(review): boost_from_average=False looks like a deliberate choice for the
# heavily imbalanced labels — confirm against the LightGBM documentation.
lgbm_clf = LGBMClassifier(n_estimators=1000, num_leaves=64, n_jobs=-1, boost_from_average=False)
get_model_train_eval(lgbm_clf, X_train, X_test, y_train, y_test)


# 5. Outlier removal helper (returns the outlier indices; does not drop them)
def get_outlier(df=None, column=None, weight=1.5):
    """Return index labels of fraud rows whose ``column`` value is an IQR outlier.

    Only rows with ``Class == 1`` are considered. The 25th and 75th percentiles
    of ``column`` are computed over those rows, and a value is an outlier when it
    is strictly below ``Q1 - weight * IQR`` or strictly above ``Q3 + weight * IQR``.

    Args:
        df: DataFrame containing a ``Class`` column and ``column``.
        column: Name of the column to inspect.
        weight: Multiplier applied to the IQR to set the fences.

    Returns:
        pandas Index with the labels of the outlier rows.
    """
    is_fraud = df['Class'] == 1
    fraud_values = df[is_fraud][column]

    q1, q3 = (np.percentile(fraud_values.values, pct) for pct in (25, 75))
    margin = (q3 - q1) * weight
    lower_fence = q1 - margin
    upper_fence = q3 + margin

    too_low = fraud_values < lower_fence
    too_high = fraud_values > upper_fence
    return fraud_values[too_low | too_high].index


# Example: locate V14 outliers among the fraud rows (fences at 1.5 x IQR).
# NOTE(review): the indices are only printed here; nothing in the visible code
# drops these rows from card_df before training.
outlier_index = get_outlier(df=card_df, column='V14', weight=1.5)
print('이상치 데이터 인덱스:', outlier_index)


# 6. Apply SMOTE
from imblearn.over_sampling import SMOTE
# Resample the training split only; X_test / y_test are left untouched and are
# still used for evaluation below.
smote = SMOTE(random_state=0)
X_train_over, y_train_over = smote.fit_resample(X_train, y_train)

# Show the training-set shapes before and after resampling, then the label
# counts of the resampled target.
print('SMOTE 적용 전 학습용 피처/레이블 데이터 세트:', X_train.shape, y_train.shape)
print('SMOTE 적용 후 학습용 피처/레이블 데이터 세트:', X_train_over.shape, y_train_over.shape)
print('SMOTE 적용 후 레이블 값 분포:\n', pd.Series(y_train_over).value_counts())

# Train and evaluate the models after applying SMOTE.
# This refits the same lr_clf / lgbm_clf objects that were fitted above.
get_model_train_eval(lr_clf, X_train_over, X_test, y_train_over, y_test)
get_model_train_eval(lgbm_clf, X_train_over, X_test, y_train_over, y_test)


학습 데이터 레이블 값 비율
Class
0    99.827451
1     0.172549
Name: count, dtype: float64
테스트 데이터 레이블 값 비율
Class
0    99.826785
1     0.173215
Name: count, dtype: float64
Accuracy: 0.9992
Precision: 0.8725
Recall: 0.6014
F1 score: 0.7120
ROC AUC: 0.9734
[LightGBM] [Info] Number of positive: 344, number of negative: 199020
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.125384 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 199364, number of used features: 29
Accuracy: 0.9995
Precision: 0.9576
Recall: 0.7635
F1 score: 0.8496
ROC AUC: 0.9796
이상치 데이터 인덱스: Index([8296, 8615, 9035, 9252], dtype='int64')
SMOTE 적용 전 학습용 피처/레이블 데이터 세트: (199364, 29) (199364,)
SMOTE 적용 후 학습용 피처/레이블 데이터 세트: (398040, 29) (398040,)
SMOTE 적용 후 레이블 값 분포:
 Class
0    199020
1    199020
Name: count, dtype: int64
Accuracy: 0.9767
Precision: 0.0632
Recall: 0.8986
F1 score: 0.1180
ROC AUC: … (출력이 여기서 잘림)

<결과>


*   Accuracy: 0.9994
*   Precision: 0.8615
*   Recall: 0.7568
*   F1 score: 0.8058
*   ROC AUC: 0.9780

