In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import kagglehub

# general setting. do not change TEST_SIZE
RANDOM_SEED = 42
TEST_SIZE = 0.3

# load dataset（from kagglehub）
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype(int)

# prepare data
data = data.drop(['Time'], axis=1)
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))

fraud = data[data['Class'] == 1]
nonfraud = data[data['Class'] == 0]
print(f'Fraudulent:{len(fraud)}, non-fraudulent:{len(nonfraud)}')
print(f'the positive class (frauds) percentage: {len(fraud)}/{len(fraud) + len(nonfraud)} ({len(fraud)/(len(fraud) + len(nonfraud))*100:.3f}%)')

X = np.asarray(data.iloc[:, ~data.columns.isin(['Class'])])
Y = np.asarray(data.iloc[:, data.columns == 'Class'])

# split training set and data set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

#　計算詐騙與非詐騙的比例
scale_pos_weight = len(nonfraud) / len(fraud)

Fraudulent:492, non-fraudulent:284315
the positive class (frauds) percentage: 492/284807 (0.173%)


In [None]:
%pip install xgboost optuna



In [None]:
import optuna
from xgboost import XGBClassifier
# 使用 optuna 進行超參數優化
def objective(trial):
    # 設定超參數的範圍
    param = {
        # 樹的數量（弱分類器的個數），較高的值可能提升模型表現，但也增加計算成本
        'n_estimators': trial.suggest_int('n_estimators', 200, 300),
        # 控制每棵樹對最終預測的貢獻，log空間中調整能更有效探索最佳值（小幅變動會顯著影響結果）
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.2, log=True),
        # 樹的最大深度，控制模型複雜度與過擬合程度，值越大模型越複雜
        'max_depth': trial.suggest_int('max_depth', 6, 10),
        # 每棵樹使用的樣本比例（隨機抽樣），有助於減少過擬合
        'subsample': trial.suggest_float('subsample', 0.7, 1),
         # 每棵樹使用的特徵比例，降低特徵間的共線性影響，也能減少過擬合
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
        # 損失函數下降的最小值（資訊增益閾值），值越大越保守，有助於避免過擬合
        'gamma': trial.suggest_float('gamma', 0.01, 1),
        # 控制每個葉節點的最小樣本權重總和（避免學習過少樣本），增強模型穩健性
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 5),
        # 類別不平衡處理：將正負樣本的比例傳入，提升模型在少數類別的辨識能力
        'scale_pos_weight': scale_pos_weight,
        # 使用直方圖算法（hist），加速大規模資料的訓練
        'tree_method': 'hist',
         # 評估指標選擇：使用logloss來反映預測機率的準確性
        'eval_metric': 'logloss',
        'random_state': RANDOM_SEED
    }

    model = XGBClassifier(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred)

    # 使用f1來當評分標準
    # (應該也可以直接用ROC AUC)

    return f1

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=500)

# Best params: {'n_estimators': 211, 'learning_rate': 0.168601190206765, 'max_depth': 9, 'subsample': 0.9989914764140614, 'colsample_bytree': 0.7464806626828626, 'gamma': 0.4911060467180274, 'min_child_weight': 1}
# Best F1: 0.9007633587786259

print('Best params:', study.best_params)
print('Best F1:', study.best_value)

[I 2025-05-28 16:58:48,418] A new study created in memory with name: no-name-8f7fadd3-8cf7-42e2-89bc-fe9667050108
[I 2025-05-28 16:58:58,173] Trial 0 finished with value: 0.8721804511278195 and parameters: {'n_estimators': 200, 'learning_rate': 0.09217255649380003, 'max_depth': 10, 'subsample': 0.9088060814641391, 'colsample_bytree': 0.8492557081044834, 'gamma': 0.43983815765944123, 'min_child_weight': 5}. Best is trial 0 with value: 0.8721804511278195.
[I 2025-05-28 16:59:07,635] Trial 1 finished with value: 0.8698884758364313 and parameters: {'n_estimators': 229, 'learning_rate': 0.09120150935810055, 'max_depth': 8, 'subsample': 0.762132964410547, 'colsample_bytree': 0.9680214362657149, 'gamma': 0.19148803342448756, 'min_child_weight': 2}. Best is trial 0 with value: 0.8721804511278195.
[I 2025-05-28 16:59:16,906] Trial 2 finished with value: 0.8654545454545455 and parameters: {'n_estimators': 237, 'learning_rate': 0.06546833150169883, 'max_depth': 6, 'subsample': 0.7666529659901032,

Best params: {'n_estimators': 211, 'learning_rate': 0.168601190206765, 'max_depth': 9, 'subsample': 0.9989914764140614, 'colsample_bytree': 0.7464806626828626, 'gamma': 0.4911060467180274, 'min_child_weight': 1}
Best F1: 0.9007633587786259


In [None]:
xgb_model = XGBClassifier(
    # 樹的數量（弱分類器個數），越多可能提升效果，但計算成本也越高
    n_estimators=211,
    # 控制每棵樹對最終預測的貢獻，小學習率通常能提升泛化能力
    learning_rate=0.168601190206765,
    # 樹的最大深度控制模型複雜度與過擬合程度
    max_depth=9,
    # 每棵樹訓練時使用的樣本比例，用於防止過擬合
    subsample=0.9989914764140614,
    # 每棵樹訓練時使用的特徵比例，也能降低過擬合與特徵間干擾
    colsample_bytree=0.7464806626828626,
    # 分裂節點所需的最小資訊增益，數值越大越保守，有助於防止過擬合
    gamma=0.4911060467180274,
    # 類別不平衡處理：調整正負樣本的權重比例，提升對少數類別的辨識能力
    scale_pos_weight=scale_pos_weight,
    # 葉節點最小樣本權重總和，限制葉節點最小樣本數，避免過度擬合小樣本
    min_child_weight=1,
    # 使用直方圖加速訓練，特別適合大數據
    tree_method='hist',
    # 模型訓練的評估指標，這裡選用 logloss 評估預測機率的準確性
    eval_metric='logloss',
    # 隨機種子，確保結果可重現
    random_state=RANDOM_SEED
)

xgb_model.fit(X_train, y_train)

# define evaluation function
def evaluation(y_true, y_pred, model_name="Model"):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    print('         Accuracy:', accuracy)
    print('  Precision Score:', precision)
    print('     Recall Score:', recall)
    print('         F1 Score:', f1)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

# predict and print result
y_pred = xgb_model.predict(X_test)
evaluation(y_test, y_pred, model_name="XGBClassifier")


XGBClassifier Evaluation:
         Accuracy: 0.9996957035684608
  Precision Score: 0.9365079365079365
     Recall Score: 0.8676470588235294
         F1 Score: 0.9007633587786259

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.94      0.87      0.90       136

    accuracy                           1.00     85443
   macro avg       0.97      0.93      0.95     85443
weighted avg       1.00      1.00      1.00     85443



In [52]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
from sklearn.utils import resample
from sklearn.metrics import silhouette_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import kagglehub

# general setting. do not change TEST_SIZE
RANDOM_SEED = 42
TEST_SIZE = 0.3

# load dataset（from kagglehub）
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype(int)

# prepare data
data.drop('Time', axis=1, inplace=True)

scaler = RobustScaler()
data['Amount'] = scaler.fit_transform(data[['Amount']])

# Extract features and labels
X = data.drop('Class', axis=1).values
y = data['Class'].values

fraud = X[y == 1]
nonfraud = X[y == 0]

# undersampling 使正常樣本和異常樣本的數量一樣
nonfraud_down = resample(nonfraud, replace=False, n_samples=len(fraud), random_state=RANDOM_SEED)
X_resampled = np.vstack([nonfraud_down, fraud])
y_resampled = np.hstack([np.zeros(len(fraud)), np.ones(len(fraud))])

# Split the dataset into training and testing sets (with stratification)
X_train_resample, X_test_resample, y_train_resample, y_test_resample = train_test_split(
   X_resampled, y_resampled, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y_resampled
)

# 使用 PCA 對訓練資料進行降維
pca = PCA().fit(X_train_resample)
n_components = np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.95) + 1
pca = PCA(n_components=n_components, random_state=RANDOM_SEED)
X_train_pca = pca.fit_transform(X_train_resample)
X_test_pca = pca.transform(X_test_resample)

In [None]:
# 設定 KMeans 的參數
kmeans_p = {
    'n_clusters': 8,          # 分成 8 群
    'init': 'k-means++',      # 初始化方法，較穩定的聚類起點選擇
    'n_init': 39,             # 重新初始化的次數，取最好結果
    # 'tol': 0.000901...       # 收斂容許誤差，這裡註解掉
}
n_clusters = kmeans_p['n_clusters']

# 建立 KMeans 模型（非監督式學習）
kmeans = KMeans(**kmeans_p, random_state=RANDOM_SEED)

# 使用正常樣本（標記為 0）進行 KMeans 訓練
# 因為正常樣本代表「大多數情況」，可作為基準
kmeans.fit(X_train_pca[y_train_resample == 0])

# 對測試資料進行分類（取得每筆資料被分到的 cluster label）
cluster_labels = kmeans.predict(X_test_pca)

# 定義函式：對每個 cluster 評估其內部詐欺樣本的比例
def rating(y_pred, n_clusters):
    rate = np.zeros(len(y_pred), dtype=float)  # 建立與 y_pred 同長度的結果陣列
    for i in range(n_clusters):  # 對每個 cluster 編號做評估
        mask = (y_pred == i)     # 找出被分到第 i 群的測試資料
        if np.sum(mask) > 0:
            # 計算這群中實際為詐欺樣本的平均比例（即：詐欺機率）
            rate[mask] = np.mean(y_test_resample[mask])
        else:
            rate[mask] = 0       # 若該群沒資料，則視為非詐欺群
    return rate

# 對所有 cluster 做評估，算出其詐欺樣本比例
fraud_rate = rating(cluster_labels, n_clusters)

# 計算所有 cluster 詐欺比例的平均值
mean_rate = np.mean(fraud_rate)

# 將詐欺比例高於平均值的群，視為「異常群」（潛在詐欺）
anomaly_clusters = [
    i for i, rate in enumerate(fraud_rate) if rate > mean_rate
]

# 建立預測標籤：若該筆測試資料所屬的群是異常群 → 預測為詐欺（1），否則為正常（0）
y_pred = np.array([1 if label in anomaly_clusters else 0 for label in cluster_labels])

# 定義評估函式，輸出常見指標
def evaluation(y_true, y_pred, model_name="Model"):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    print('         Accuracy:', accuracy)
    print('  Precision Score:', precision)
    print('     Recall Score:', recall)
    print('         F1 Score:', f1)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

# 評估 KMeans 模型在測試集上的表現
evaluation(y_test_resample, y_pred, model_name="KMeans (Unsupervised)")



KMeans (Unsupervised) Evaluation:
         Accuracy: 0.7668918918918919
  Precision Score: 0.8434782608695652
     Recall Score: 0.6554054054054054
         F1 Score: 0.7376425855513308

Classification Report:
              precision    recall  f1-score   support

         0.0       0.72      0.88      0.79       148
         1.0       0.84      0.66      0.74       148

    accuracy                           0.77       296
   macro avg       0.78      0.77      0.76       296
weighted avg       0.78      0.77      0.76       296

