In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, precision_recall_curve
)
from xgboost import XGBClassifier
import kagglehub
import torch
import torch.nn as nn

# Shared configuration used by the cells below.
RANDOM_SEED = 42  # reused for the train/test split and the seeded models
TEST_SIZE = 0.3  # passed as test_size to train_test_split

class AutoEncoder(nn.Module):
    """Small fully connected autoencoder.

    The encoder maps ``input_dim -> hidden_dim -> latent_dim`` and the decoder
    mirrors it back to ``input_dim``. A ReLU follows the first layer of each
    half; the bottleneck and the output layer are left linear.

    Args:
        input_dim: Number of features per sample.
        hidden_dim: Width of the intermediate layer in both halves
            (default 16).
        latent_dim: Size of the bottleneck representation (default 4).
    """

    def __init__(self, input_dim, hidden_dim=16, latent_dim=4):
        super().__init__()
        # Encoder: compress the input down to the latent bottleneck.
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim)
        )
        # Decoder: expand the latent code back to the original dimensionality.
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim)
        )

    def forward(self, x):
        """Return the reconstruction of ``x`` (same trailing dimension as the input)."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# Download the dataset and load the CSV.
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype(int)
data = data.drop(['Time'], axis=1)
# NOTE: 'Amount' is intentionally NOT scaled here. Fitting a scaler on the
# full dataset before the split lets test rows influence the preprocessing
# statistics. The RobustScaler fitted on the training split below already
# scales every column (including 'Amount'), so the earlier step was redundant.

X = data.drop(columns=['Class']).values
y = data['Class'].values

# Stratify so both splits keep the same class ratio.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y
)

# Fit scaling statistics on the training split only, then apply them to test.
scaler = RobustScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)


In [None]:
# PCA dimensionality reduction to drop redundant feature directions.
# The projection is fitted on the training split and reused for the test split.
pca = PCA(n_components=20, random_state=RANDOM_SEED)
x_train_pca = pca.fit_transform(x_train_scaled)
x_test_pca = pca.transform(x_test_scaled)

# IsolationForest (unsupervised), fitted on the PCA-projected training data.
iso = IsolationForest(
    n_estimators=300,
    contamination=0.02,
    random_state=RANDOM_SEED,
).fit(x_train_pca)

# Continuous anomaly scores for the train and test splits.
anomaly_scores_if_train, anomaly_scores_if_test = (
    iso.decision_function(split) for split in (x_train_pca, x_test_pca)
)
# Predicted labels for both splits, multiplied by 10 before being used as features.
iso_labels_train, iso_labels_test = (
    10 * iso.predict(split) for split in (x_train_pca, x_test_pca)
)
# LOF (Local Outlier Factor, unsupervised), fitted in novelty mode so it can
# score rows that were not part of the fit.
lof = LocalOutlierFactor(n_neighbors=40, contamination=0.015, novelty=True)
lof.fit(x_train_pca)
# Training rows: in novelty mode, calling predict()/decision_function() on the
# fitting data would treat each row as its own neighbour, so the scores
# computed during fit (negative_outlier_factor_) are used instead.
# Subtracting offset_ puts them on the same shifted scale as
# decision_function(), which is what the test rows use. Without the shift the
# train and test columns of this feature would be on different scales.
lof_scores_train = lof.negative_outlier_factor_ - lof.offset_
lof_scores_test = lof.decision_function(x_test_pca)
# Labels follow the same rule predict() applies: shifted score < 0 -> outlier (-1).
lof_labels_train = np.where(lof_scores_train < 0, -1, 1) * 10
lof_labels_test = lof.predict(x_test_pca) * 10
# AutoEncoder training.
# Seed BEFORE constructing the model: nn.Linear draws its initial weights at
# construction time, so seeding afterwards leaves the initialisation (and every
# downstream reconstruction error) non-reproducible.
torch.manual_seed(RANDOM_SEED)
ae = AutoEncoder(input_dim=x_train_scaled.shape[1]).float()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(ae.parameters(), lr=0.01)

# Build the tensors once and reuse them for training and inference.
x_tensor = torch.tensor(x_train_scaled, dtype=torch.float32)
x_test_tensor = torch.tensor(x_test_scaled, dtype=torch.float32)

# Full-batch training: each of the 40 steps uses the whole training matrix.
for _ in range(40):
    optimizer.zero_grad()
    outputs = ae(x_tensor)
    loss = criterion(outputs, x_tensor)
    loss.backward()
    optimizer.step()

# Reconstruct both splits without tracking gradients.
with torch.no_grad():
    train_recon = ae(x_tensor).numpy()
    test_recon = ae(x_test_tensor).numpy()

# Per-row mean squared reconstruction error, used as an anomaly feature.
recon_error_train = np.mean((train_recon - x_train_scaled)**2, axis=1)
recon_error_test = np.mean((test_recon - x_test_scaled)**2, axis=1)
# Feature integration: scaled features + PCA components + one column per
# unsupervised signal (IsolationForest score/label, LOF score/label,
# AutoEncoder reconstruction error), in that order.
def _as_column(values):
    """Reshape a 1-D array into a single-column 2-D array."""
    return values.reshape(-1, 1)


train_blocks = [x_train_scaled, x_train_pca]
train_blocks += [
    _as_column(signal)
    for signal in (
        anomaly_scores_if_train,
        iso_labels_train,
        lof_scores_train,
        lof_labels_train,
        recon_error_train,
    )
]
x_train_all = np.hstack(train_blocks)

test_blocks = [x_test_scaled, x_test_pca]
test_blocks += [
    _as_column(signal)
    for signal in (
        anomaly_scores_if_test,
        iso_labels_test,
        lof_scores_test,
        lof_labels_test,
        recon_error_test,
    )
]
x_test_all = np.hstack(test_blocks)


In [None]:
# XGBoost training on the integrated feature matrix.
#
# The decision threshold is tuned on a validation slice carved out of the
# TRAINING data. Picking the F1-optimal threshold on y_test (as was done
# before) uses test labels for model selection and inflates the reported
# test metrics.
THRESHOLD_VAL_SIZE = 0.2
x_fit, x_val, y_fit, y_val = train_test_split(
    x_train_all, y_train,
    test_size=THRESHOLD_VAL_SIZE, random_state=RANDOM_SEED, stratify=y_train
)

# Negative/positive ratio of the rows the model is fitted on, used to
# re-weight the positive class.
n_negative = int((y_fit == 0).sum())
n_positive = int((y_fit == 1).sum())

# `use_label_encoder` is intentionally omitted: the installed XGBoost reports
# it as an unused parameter.
xgb_model = XGBClassifier(
    n_estimators=300, max_depth=6, learning_rate=0.042,
    gamma=0.12, subsample=0.95, colsample_bytree=0.87,
    min_child_weight=1, reg_alpha=0.02, reg_lambda=1.0,
    max_delta_step=3, tree_method='hist',
    scale_pos_weight=n_negative / n_positive,
    objective='binary:logistic', eval_metric='aucpr',
    random_state=RANDOM_SEED
)

xgb_model.fit(x_fit, y_fit)

# Choose the threshold that maximises F1 on the validation slice.
val_proba = xgb_model.predict_proba(x_val)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_val, val_proba)
# precision/recall have one more entry than thresholds (the final point has no
# threshold), so drop it to keep the argmax a valid index into `thresholds`.
f1_scores = 2 * precision[:-1] * recall[:-1] / (precision[:-1] + recall[:-1] + 1e-8)
best_idx = np.argmax(f1_scores)
best_thresh = thresholds[best_idx]

# Apply the validation-selected threshold to the untouched test set.
y_proba = xgb_model.predict_proba(x_test_all)[:, 1]
y_pred_final = (y_proba >= best_thresh).astype(int)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
def evaluation(y_true, y_pred, model_name="Hybrid Model"):
    """Print headline classification metrics followed by the full report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        model_name: Name shown in the header line of the printout.
    """
    # Labels are pre-padded so the colons line up in the printed output.
    metric_table = (
        ("Accuracy       ", accuracy_score),
        ("Precision Score", precision_score),
        ("Recall Score   ", recall_score),
        ("F1 Score       ", f1_score),
    )

    print(f"\n{model_name} Evaluation:")
    print("=" * 40)
    for label, metric_fn in metric_table:
        print(f"{label}: {metric_fn(y_true, y_pred):.15f}")
    print()  # blank line between the headline metrics and the report
    print("Classification Report:")
    print(classification_report(y_true, y_pred, digits=2))

evaluation(y_test, y_pred_final)


Hybrid Model Evaluation:
Accuracy       : 0.999508444225975
Precision Score: 0.920634920634921
Recall Score   : 0.783783783783784
F1 Score       : 0.846715328467153

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.92      0.78      0.85       148

    accuracy                           1.00     85443
   macro avg       0.96      0.89      0.92     85443
weighted avg       1.00      1.00      1.00     85443

