In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, classification_report,
    precision_recall_curve, auc,
)
from xgboost import XGBClassifier
import kagglehub

# === 1. Configuration and data loading ===
RANDOM_SEED = 42
TEST_SIZE = 0.3
N_NORMAL_SEED = 1000   # number of normal (class 0) training rows used to seed KMeans
N_FRAUD_SEED = 100     # number of fraud (class 1) training rows used to seed KMeans
N_PCA_COMPONENTS = 7   # keep the same dimensionality as the original PCA setup

path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")

data['Class'] = data['Class'].astype(int)
data = data.drop(['Time'], axis=1)
# NOTE: 'Amount' is deliberately NOT standardized here. Fitting a scaler on the
# full dataset before the split would use test-set statistics (data leakage),
# and it is redundant: every feature column, 'Amount' included, is standardized
# in step 3 with statistics learned from the training split only.

X = data.drop(columns=['Class']).to_numpy()
Y = data['Class'].to_numpy()

# === 2. Train/test split (stratified so both splits keep the class ratio) ===
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=Y
)

# === 3. Standardization (fit on train only, then applied to test) ===
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# === 4. PCA feature extraction (fit on train only, same dimensionality for both splits) ===
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=RANDOM_SEED)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

# === 5. Build initial KMeans centers from a small subset of the training data ===
# Take the first N rows of each class (in training order) as the seed sample.
normal_idx = np.where(y_train == 0)[0][:N_NORMAL_SEED]
fraud_idx = np.where(y_train == 1)[0][:N_FRAUD_SEED]
seed_X = np.vstack([X_train_pca[normal_idx], X_train_pca[fraud_idx]])
# Class labels of the seed rows; KMeans below is fit without them.
seed_labels = np.hstack([np.zeros(len(normal_idx)), np.ones(len(fraud_idx))])

seed_kmeans = KMeans(n_clusters=2, init='k-means++', random_state=RANDOM_SEED)
seed_kmeans.fit(seed_X)
centroids_init = seed_kmeans.cluster_centers_

# === 6. Fit KMeans on the full training set; distances to its centers become extra features ===
# n_init=1 because explicit initial centers are supplied.
kmeans = KMeans(n_clusters=2, init=centroids_init, n_init=1, random_state=RANDOM_SEED)
kmeans.fit(X_train_pca)

def compute_cluster_distances(X, centers):
    """Return the Euclidean distance from each row of ``X`` to each center.

    For a 2-D ``X`` of shape (n_samples, n_features) and ``centers`` of shape
    (n_clusters, n_features), the result has shape (n_samples, n_clusters).
    """
    # Insert a middle axis so each sample broadcasts against every center.
    offsets = X[:, None] - centers
    # Collapse the feature axis into a single L2 norm per (sample, center) pair.
    distances = np.linalg.norm(offsets, axis=2)
    return distances

train_kmeans_dist = compute_cluster_distances(X_train_pca, kmeans.cluster_centers_)
test_kmeans_dist = compute_cluster_distances(X_test_pca, kmeans.cluster_centers_)

# === 7. Assemble the final feature matrix (standardized + PCA + KMeans distances) ===
X_train_final = np.hstack((X_train_std, X_train_pca, train_kmeans_dist))
X_test_final = np.hstack((X_test_std, X_test_pca, test_kmeans_dist))

# === 8. Build and fit the XGBoost model ===
# `use_label_encoder` is intentionally not passed: the installed XGBoost no
# longer accepts it and only emits a "Parameters ... are not used" warning.
xgb_model = XGBClassifier(
    n_estimators=250,
    max_depth=6,
    learning_rate=0.08,
    subsample=0.8,
    colsample_bytree=1.0,
    scale_pos_weight=10,
    gamma=0.05,
    eval_metric='aucpr',
    tree_method='hist',
    random_state=RANDOM_SEED
)
xgb_model.fit(X_train_final, y_train)

# === 9. Predict with a custom decision threshold ===
y_prob = xgb_model.predict_proba(X_test_final)[:, 1]
threshold = 0.80
y_pred = (y_prob > threshold).astype(int)

# === Sweep candidate thresholds ===
# precision_recall_curve returns precision/recall arrays with one more entry
# than `thresholds`; the final (precision=1, recall=0) point has F1 = 0, so it
# can never be picked by argmax ahead of an earlier index.
precisions, recalls, thresholds = precision_recall_curve(y_test, y_prob)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-6)  # epsilon avoids 0/0

# === Pick the threshold with the highest F1-score ===
# NOTE(review): this threshold is selected on the test set, so any metric
# reported at it is optimistic; tune it on a held-out validation split instead.
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]
best_precision = precisions[best_idx]
best_recall = recalls[best_idx]
best_f1 = f1_scores[best_idx]

print(f"\nBest Threshold = {best_threshold:.4f}")




Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Best Threshold = 0.8030


In [32]:
def evaluation(y_true, y_pred, model_name="Model"):
    """Print accuracy, precision, recall and F1 for a set of predictions,
    followed by scikit-learn's full classification report.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels to score against ``y_true``.
        model_name: label shown in the printed header.

    Returns:
        None. All results are written to stdout.
    """
    # Compute everything up front; precision/recall/F1 are called with
    # zero_division=0.
    metric_rows = (
        ('         Accuracy:', accuracy_score(y_true, y_pred)),
        ('  Precision Score:', precision_score(y_true, y_pred, zero_division=0)),
        ('     Recall Score:', recall_score(y_true, y_pred, zero_division=0)),
        ('         F1 Score:', f1_score(y_true, y_pred, zero_division=0)),
    )

    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    for caption, score in metric_rows:
        print(caption, score)

    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

# === Results ===
# y_pred is produced solely by the XGBoost model (thresholded probabilities);
# no Isolation Forest is involved, so the report is labelled accordingly.
evaluation(y_test, y_pred, model_name="XGBoost (PCA + KMeans features)")


Combined (XGB + IsoForest) Evaluation:
         Accuracy: 0.9995903701883126
  Precision Score: 0.9669421487603306
     Recall Score: 0.7905405405405406
         F1 Score: 0.8698884758364313

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.97      0.79      0.87       148

    accuracy                           1.00     85443
   macro avg       0.98      0.90      0.93     85443
weighted avg       1.00      1.00      1.00     85443




Combined (XGB + IsoForest) Evaluation:
=============================================
         Accuracy: 0.9995903701883126
  Precision Score: 0.9669421487603306
     Recall Score: 0.7905405405405406
         F1 Score: 0.8698884758364313

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.97      0.79      0.87       148

    accuracy                           1.00     85443
   macro avg       0.98      0.90      0.93     85443
weighted avg       1.00      1.00      1.00     85443
