In [1]:
%pip install xgboost optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.1-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.5/242.5 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.1 colorlog-6.9.0 optuna-4.3.0


In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import kagglehub

# general setting. do not change TEST_SIZE
RANDOM_SEED = 42
TEST_SIZE = 0.3

# load dataset（from kagglehub）
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype(int)

# prepare data
data = data.drop(['Time'], axis=1)
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))

fraud = data[data['Class'] == 1]
nonfraud = data[data['Class'] == 0]
print(f'Fraudulent:{len(fraud)}, non-fraudulent:{len(nonfraud)}')
print(f'the positive class (frauds) percentage: {len(fraud)}/{len(fraud) + len(nonfraud)} ({len(fraud)/(len(fraud) + len(nonfraud))*100:.3f}%)')

X = np.asarray(data.iloc[:, ~data.columns.isin(['Class'])])
Y = np.asarray(data['Class'])

# split training set and data set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#　計算詐騙與非詐騙的比例
contamination = len(fraud) / len(nonfraud)
scale_pos_weight = len(nonfraud) / len(fraud)

Fraudulent:492, non-fraudulent:284315
the positive class (frauds) percentage: 492/284807 (0.173%)


In [18]:
from sklearn.ensemble import IsolationForest
from xgboost import XGBClassifier

# Step 1: 用 IsolationForest 做初步異常篩選，產生新特徵
iso = IsolationForest(
    # 異常比例
    contamination=contamination,
    # 使用 200 棵樹
    n_estimators=200,
    # 自動選擇樣本數
    max_samples='auto',
    random_state=RANDOM_SEED
  )

# 對訓練集做異常預測，回傳 +1 正常，-1 異常
iso_train_pred = iso.fit_predict(X_train)
# 對測試集也預測
iso_test_pred = iso.predict(X_test)

# 把 IsolationForest 預測結果轉成 0/1（1 代表異常）
iso_train_feature = (iso_train_pred == -1).astype(int).reshape(-1, 1)
iso_test_feature = (iso_test_pred == -1).astype(int).reshape(-1, 1)

# 將新特徵與原特徵合併
X_train_enhanced = np.hstack((X_train, iso_train_feature))
X_test_enhanced = np.hstack((X_test, iso_test_feature))

In [None]:
# Step 2: 用 XGBoost 進行監督式分類
xgb_model = XGBClassifier(
 # 樹的數量（弱分類器個數），越多可能提升效果，但計算成本也越高
    n_estimators=211,
    # 控制每棵樹對最終預測的貢獻，小學習率通常能提升泛化能力
    learning_rate=0.168601190206765,
    # 樹的最大深度控制模型複雜度與過擬合程度
    max_depth=9,
    # 每棵樹訓練時使用的樣本比例，用於防止過擬合
    subsample=0.9989914764140614,
    # 每棵樹訓練時使用的特徵比例，也能降低過擬合與特徵間干擾
    colsample_bytree=0.7464806626828626,
    # 分裂節點所需的最小資訊增益，數值越大越保守，有助於防止過擬合
    gamma=0.4911060467180274,
    # 類別不平衡處理：調整正負樣本的權重比例，提升對少數類別的辨識能力
    scale_pos_weight=scale_pos_weight,
    # 葉節點最小樣本權重總和，限制葉節點最小樣本數，避免過度擬合小樣本
    min_child_weight=1,
    # 使用直方圖加速訓練，特別適合大數據
    tree_method='hist',
    # 模型訓練的評估指標，這裡選用 logloss 評估預測機率的準確性
    eval_metric='logloss',
    # 隨機種子，確保結果可重現
    random_state=RANDOM_SEED
)

xgb_model.fit(X_train_enhanced, y_train.ravel())

# 預測與評估
y_pred = xgb_model.predict(X_test_enhanced)

In [20]:
# 定義評估函式，輸出常見指標
def evaluation(y_true, y_pred, model_name="Model"):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    print('         Accuracy:', accuracy)
    print('  Precision Score:', precision)
    print('     Recall Score:', recall)
    print('         F1 Score:', f1)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

# 評估模型在測試集上的表現
evaluation(y_test, y_pred, model_name="XGBClassifier")



XGBClassifier Evaluation:
         Accuracy: 0.9996371850239341
  Precision Score: 0.9133858267716536
     Recall Score: 0.8529411764705882
         F1 Score: 0.8821292775665399

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.91      0.85      0.88       136

    accuracy                           1.00     85443
   macro avg       0.96      0.93      0.94     85443
weighted avg       1.00      1.00      1.00     85443

