In [13]:

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, log_loss
import kagglehub
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from xgboost import XGBClassifier, callback
import tensorflow as tf

# general setting
RANDOM_SEED = 42
TEST_SIZE = 0.3
# 載入資料集（from kagglehub）
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype(int)

# 準備數據
data = data.drop(['Time'], axis=1)
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))

fraud = data[data['Class'] == 1]
nonfraud = data[data['Class'] == 0]
print(f'Fraudulent:{len(fraud)}, non-fraudulent:{len(nonfraud)}')
print(f'the positive class (frauds) percentage: {len(fraud)}/{len(fraud) + len(nonfraud)} ({len(fraud)/(len(fraud) + len(nonfraud))*100:.3f}%)')

#from imblearn.over_sampling import SMOTE

# Extract 特徵和標籤
X = np.asarray(data.drop(columns=['Class']))
y = np.asarray(data['Class'])
# y = data['Class'].astype(int).values

# 分割資料集與訓練
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED
)

# PCA 降維
pca = PCA(n_components=25)  # 嘗試 2-20 的不同值
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)



# 用正常交易訓練 Isolation Forest
iso = IsolationForest(
    contamination=0.0017,
    random_state=RANDOM_SEED,
    n_estimators=500,
    max_features=15,
    # max_samples=0.8,
    bootstrap=True
    )
iso.fit(x_train)

# 用全部資料做預測（正常/異常）
iso_labels = iso.predict(x_train)
iso_labels = (iso_labels == -1).astype(int)


x_train = np.hstack([x_train, iso_labels.reshape(-1, 1)])
iso_pred_test = iso.predict(x_test)
iso_feature_test = (iso_pred_test == -1).astype(int)
x_test = np.hstack((x_test, iso_feature_test.reshape(-1, 1)))


xgb_model = XGBClassifier(
    objective='binary:logistic', #二元邏輯回歸用於二分類問題
    n_estimators=500,
    max_depth=7,
    learning_rate=0.3,
    scale_pos_weight=6,
    random_state=RANDOM_SEED,
)

xgb_model.fit(x_train, y_train)



#預測測試集
threshold = 0.4
y_proba = xgb_model.predict_proba(x_test)[:, 1]  # 取得預測為正類（詐欺）的機率
y_pred = (y_proba > threshold).astype(int)
# y_pred = xg_model.predict(x_test)


def evaluation(y_true, y_pred, model_name="Model"):
   accuracy = accuracy_score(y_true, y_pred)
   precision = precision_score(y_true, y_pred, zero_division=0)
   recall = recall_score(y_true, y_pred)
   f1 = f1_score(y_true, y_pred)

   print(f'\n{model_name} Evaluation:')
   print('===' * 15)
   print('         Accuracy:', accuracy)
   print('  Precision Score:', precision)
   print('     Recall Score:', recall)
   print('         F1 Score:', f1)
   print("\nClassification Report:")
   print(classification_report(y_true, y_pred))

evaluation(y_test, y_pred, model_name="Hybrid Model")

Fraudulent:492, non-fraudulent:284315
the positive class (frauds) percentage: 492/284807 (0.173%)

Hybrid Model Evaluation:
         Accuracy: 0.9996957035684608
  Precision Score: 0.9365079365079365
     Recall Score: 0.8676470588235294
         F1 Score: 0.9007633587786259

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.94      0.87      0.90       136

    accuracy                           1.00     85443
   macro avg       0.97      0.93      0.95     85443
weighted avg       1.00      1.00      1.00     85443

