In [None]:
# 載入必要套件
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report
)
import kagglehub  # 用於下載 Kaggle 資料集

# General settings
RANDOM_SEED = 42  # passed as random_state to the splits, the sampling step, and both models below
TEST_SIZE = 0.3  # test_size fraction handed to train_test_split

In [25]:
# Load the credit-card dataset and preprocess it
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv").drop(columns=['Time'])

# Standardise the 'Amount' column.
# NOTE(review): the scaler is fitted on the full dataset before any train/test
# split, so test rows contribute to the scaling statistics — confirm this is acceptable.
amount_2d = data['Amount'].to_numpy().reshape(-1, 1)
data['Amount'] = StandardScaler().fit_transform(amount_2d)

# Feature matrix = every remaining column except the label; target = 'Class'
X = data.drop(columns=['Class']).to_numpy()
y = data['Class'].to_numpy()

# Stratified train / test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
)

# Separate fraudulent (Class == 1) from non-fraudulent (Class == 0) transactions
class_labels = data['Class']
fraud = data[class_labels == 1]
nonfraud = data[class_labels == 0]
print(f'Fraudulent: {len(fraud)}, non-fraudulent: {len(nonfraud)}')
print(f'Positive class percentage: {len(fraud)/(len(fraud)+len(nonfraud))*100:.3f}%')

# Shared evaluation helper
def evaluation(y_true, y_pred, model_name="Model"):
    """Print accuracy, precision, recall, F1 and a classification report.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned element-wise with ``y_true``.
    model_name : str, default "Model"
        Label shown in the printed header.

    Returns
    -------
    None
        The metrics are only printed, never returned.
    """
    # Apply the same zero_division policy to every metric: when a score is
    # undefined (e.g. no positive predictions or no positive labels) report 0
    # without a warning, instead of silencing precision only.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    print('         Accuracy:', accuracy)
    print('  Precision Score:', precision)
    print('     Recall Score:', recall)
    print('         F1 Score:', f1)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, zero_division=0))


Fraudulent: 492, non-fraudulent: 284315
Positive class percentage: 0.173%


In [27]:
# -----------------------------------------
# 1. Feature selection and class re-balancing
# -----------------------------------------
selected_features = [
    'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7',
    'V9', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V18', 'V19',
    'Amount',
]
X = data.loc[:, selected_features].to_numpy()
y = data.loc[:, 'Class'].to_numpy()

# Keep every fraud row and under-sample the non-fraud rows to 5000.
nonfraud_sample = nonfraud.sample(n=5000, random_state=RANDOM_SEED)
data_balanced = pd.concat([fraud, nonfraud_sample], axis=0)

X_bal = data_balanced.loc[:, selected_features].to_numpy()
y_bal = data_balanced.loc[:, 'Class'].to_numpy()

# -----------------------------------------
# 2. Train / test split
# -----------------------------------------
# NOTE(review): the test split is drawn from the under-sampled data, so its
# class ratio differs from the full dataset — confirm that the reported
# metrics are meant to describe this re-balanced distribution.
X_train, X_test, y_train, y_test = train_test_split(
    X_bal,
    y_bal,
    stratify=y_bal,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
)

# -----------------------------------------
# 3. Isolation Forest -> anomaly score
#    Fitted on normal training samples only, to avoid information leakage
# -----------------------------------------
iso = IsolationForest(
    n_estimators=300,
    max_samples='auto',
    contamination='auto',
    bootstrap=True,
    random_state=RANDOM_SEED,
)
is_normal_train = (y_train == 0)
iso.fit(X_train[is_normal_train])

# decision_function: the lower the score, the more anomalous the sample.
# Negate it so that a larger value means higher risk, and shape it as a column.
train_decision = iso.decision_function(X_train)
test_decision = iso.decision_function(X_test)
iso_train = (-train_decision).reshape(-1, 1)
iso_test = (-test_decision).reshape(-1, 1)

# Append the anomaly score as one extra feature column
X_train_if = np.concatenate([X_train, iso_train], axis=1)
X_test_if = np.concatenate([X_test, iso_test], axis=1)

# -----------------------------------------
# 4. XGBoost (including the new feature)
# -----------------------------------------
# Weight the positive class by the negative / positive count ratio of the training split
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
scale_pos_weight = n_negative / n_positive

xgb_model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.07,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    eval_metric='logloss',
    random_state=RANDOM_SEED,
)
xgb_model.fit(X_train_if, y_train)

# Hard labels for the metrics; positive-class probability kept in y_prob
y_pred = xgb_model.predict(X_test_if)
y_prob = xgb_model.predict_proba(X_test_if)[:, 1]

evaluation(y_test, y_pred, model_name="Hybrid Model")


Hybrid Model Evaluation:
         Accuracy: 0.9866504854368932
  Precision Score: 0.9772727272727273
     Recall Score: 0.8716216216216216
         F1 Score: 0.9214285714285714

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1500
           1       0.98      0.87      0.92       148

    accuracy                           0.99      1648
   macro avg       0.98      0.93      0.96      1648
weighted avg       0.99      0.99      0.99      1648

