In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, average_precision_score, precision_recall_curve
from imblearn.over_sampling import SMOTE

# Read creditcard.csv from the working directory into the notebook's base DataFrame.
csv_path = 'creditcard.csv'
df = pd.read_csv(csv_path)


In [2]:
# 1. Load the data and take a first look
# 1-1. Basic overview of the loaded frame
print('###HEAD###')
print(df.head())
print('###INFO###')
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print('###DESCRIPTION###')
print(df.describe())

# 1-2. Count the rows per Class value and show each value's share of all rows
print(df['Class'].value_counts())
print(df['Class'].value_counts(normalize=True))

###HEAD###
   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27

In [3]:
# 2. Sampling
# 2-1. Split the rows by label, then undersample the Class == 0 rows
is_abnormal = df['Class'] == 1
is_normal = df['Class'] == 0
abnormal = df[is_abnormal]
normal = df[is_normal]

# Keep every abnormal row, but only a fixed-size random subset of the normal rows.
normal_sample = normal.sample(n=10000, random_state=42)
df_sampled = pd.concat([normal_sample, abnormal])

# 2-2. Check the class proportions of the sampled data
class_share = df_sampled['Class'].value_counts(normalize=True)
print("\n### Sampled Data Distribution ###")
print(class_share)


### Sampled Data Distribution ###
Class
0    0.953107
1    0.046893
Name: proportion, dtype: float64


In [4]:
# 3. Data preprocessing
#
# This cell derives a new frame instead of overwriting `df_sampled`, so it can be
# re-run safely: the previous version dropped 'Amount' from `df_sampled` in place
# and raised a KeyError on the second execution.

# 3-1. Standardize Amount (zero mean, unit variance) into Amount_Scaled
# NOTE(review): the scaler is fit on all sampled rows before the train/test split,
# so test-row statistics influence this feature. Fit on the training rows only if
# strict leakage-free evaluation is required.
scaler = StandardScaler()
amount_scaled = scaler.fit_transform(df_sampled['Amount'].values.reshape(-1, 1)).ravel()

# 3-2. Add the scaled column and drop the raw Amount column
df_preprocessed = (
    df_sampled
    .assign(Amount_Scaled=amount_scaled)
    .drop(columns=['Amount'])
)

# 3-3. Separate the feature matrix X from the label vector y
X = df_preprocessed.drop(columns=['Class'])
y = df_preprocessed['Class']

In [5]:
# 4. Split into training and test data
# 4-1. Hold out 20% of the rows as the test set, stratified on the label y
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Show the per-class row counts of each split.
print("\n### Train/Test Data Distribution ###")
print(f"Train Class counts:\n{y_train.value_counts()}")
print(f"Test Class counts:\n{y_test.value_counts()}")


### Train/Test Data Distribution ###
Train Class counts:
Class
0    7999
1     394
Name: count, dtype: int64
Test Class counts:
Class
0    2001
1      98
Name: count, dtype: int64


In [6]:
# 5. Apply SMOTE
# 5-1. Resample the training split only (the test split is not passed to SMOTE)
smote = SMOTE(random_state=42)
X_train_over, y_train_over = smote.fit_resample(X_train, y_train)

# 5-2. Print the class counts before and after resampling
for heading, labels in (
    ("\n### Before SMOTE ###", y_train),
    ("\n### After SMOTE ###", y_train_over),
):
    print(heading)
    print(labels.value_counts())


### Before SMOTE ###
Class
0    7999
1     394
Name: count, dtype: int64

### After SMOTE ###
Class
0    7999
1    7999
Name: count, dtype: int64


In [7]:
# 6. Model training
# 6-1. Build the random forest and fit it on the resampled training data
model = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)
model.fit(X_train_over, y_train_over)

# 6-2. Store the test-set probabilities (second predict_proba column) and hard labels
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# 6-3. Report the metrics of the default predictions, followed by PR-AUC
pr_auc = average_precision_score(y_test, y_pred_proba)
default_report = classification_report(y_test, y_pred)

print("\n### Classification Report (Default Threshold) ###")
print(default_report)
print(f"PR-AUC Score: {pr_auc:.4f}")



### Classification Report (Default Threshold) ###
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2001
           1       0.95      0.89      0.92        98

    accuracy                           0.99      2099
   macro avg       0.97      0.94      0.96      2099
weighted avg       0.99      0.99      0.99      2099

PR-AUC Score: 0.9537


In [8]:
# 7. Final evaluation and decision-threshold tuning
#
# NOTE(review): the threshold is selected on y_test and then evaluated on the same
# y_test, so the tuned metrics are optimistic. A separate validation split would be
# needed for an unbiased estimate.

# Targets for class 1 (recall, F1) and for PR-AUC, plus the fallback threshold.
TARGET_RECALL = 0.80
TARGET_F1 = 0.88
TARGET_PR_AUC = 0.90
DEFAULT_THRESHOLD = 0.5

precisions, recalls, thresholds = precision_recall_curve(y_test, y_pred_proba)

# 7-1. Find the threshold with the highest class-1 F1 among those meeting the recall target
print("\n### Finding Best Threshold ###")

# precision_recall_curve returns one more precision/recall value than thresholds
# (the final point has no associated threshold), so drop the last entry to align
# precision_at[i] / recall_at[i] with predictions `y_pred_proba >= thresholds[i]`.
precision_at = precisions[:-1]
recall_at = recalls[:-1]

# F1 for every threshold at once; defined as 0 where precision + recall == 0.
# This replaces one classification_report call per threshold, which was slow and
# emitted undefined-metric warnings.
pr_sum = precision_at + recall_at
f1_at = np.divide(
    2 * precision_at * recall_at,
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)

eligible = recall_at >= TARGET_RECALL
if eligible.any():
    # argmax returns the first maximum, i.e. the lowest qualifying threshold on ties,
    # matching the strict ">" comparison of a sequential scan.
    best_idx = int(np.argmax(np.where(eligible, f1_at, -np.inf)))
    best_threshold = thresholds[best_idx]
else:
    # Previously `best_recall` stayed undefined in this case and the print below
    # raised NameError; fall back to the default threshold and report its metrics.
    best_threshold = DEFAULT_THRESHOLD
    print(
        f"No threshold reaches recall >= {TARGET_RECALL:.2f}; "
        f"falling back to threshold {DEFAULT_THRESHOLD}"
    )

# Evaluate the model at the selected threshold
final_pred = (y_pred_proba >= best_threshold).astype(int)

# Read the reported class-1 metrics from a single report at the chosen threshold.
final_report = classification_report(
    y_test, final_pred, output_dict=True, zero_division=0
)
best_recall = final_report['1']['recall']
best_f1 = final_report['1']['f1-score']

print(f"Best Threshold: {best_threshold:.4f}")
print(f"Best F1: {best_f1:.4f}, Best Recall: {best_recall:.4f}")

print("\n### Final Classification Report (Tuned Threshold) ###")
print(classification_report(y_test, final_pred, zero_division=0))
print(f"Final PR-AUC: {pr_auc:.4f}")

if (best_recall >= TARGET_RECALL) and (best_f1 >= TARGET_F1) and (pr_auc >= TARGET_PR_AUC):
    print(">>> 목표 성능을 달성하였습니다!")
else:
    print(">>> 목표 성능을 완전히 달성하지 못했습니다.")


### Finding Best Threshold ###


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best Threshold: 0.6400
Best F1: 0.9239, Best Recall: 0.8673

### Final Classification Report (Tuned Threshold) ###
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2001
           1       0.99      0.87      0.92        98

    accuracy                           0.99      2099
   macro avg       0.99      0.93      0.96      2099
weighted avg       0.99      0.99      0.99      2099

Final PR-AUC: 0.9537
>>> 목표 성능을 달성하였습니다!
