In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, classification_report
)
from xgboost import XGBClassifier
import kagglehub

# === 1. Configuration and data loading ===
RANDOM_SEED = 42
TEST_SIZE = 0.3

# Download the Kaggle credit-card-fraud dataset and read the CSV it contains.
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")

data['Class'] = data['Class'].astype(int)
data = data.drop(['Time'], axis=1)
# 'Amount' is deliberately left unscaled at this point. Fitting a scaler on the
# full frame would compute its mean/std from rows that later land in the test
# split (preprocessing leakage), and it is redundant anyway: the StandardScaler
# fit on the training split further down standardizes every feature column,
# 'Amount' included.

# Feature matrix (everything except the label) and label vector as NumPy arrays.
X = data.drop(columns=['Class']).to_numpy()
Y = data['Class'].to_numpy()

# === 2. Train/test split ===
# stratify=Y asks for the label distribution to be preserved in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=Y
)

# === 3. Standardization ===
# Fit on the training split only; the test split is transformed with the
# statistics learned from the training data.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# === 4. Isolation Forest anomaly score as an extra feature ===
# contamination is the mean of the training labels (the positive-class share
# for 0/1 labels). ndarray.mean() is vectorized; the builtin sum() would walk
# the array element by element in Python.
iso_forest = IsolationForest(
    n_estimators=300,
    contamination=y_train.mean(),
    max_samples='auto',
    random_state=RANDOM_SEED
)
iso_forest.fit(X_train_std)
# One score per row, reshaped to a column so it can be stacked as a feature.
train_anomaly_scores = iso_forest.decision_function(X_train_std).reshape(-1, 1)
test_anomaly_scores = iso_forest.decision_function(X_test_std).reshape(-1, 1)

# === 5. PCA features ===
# A fixed 15 components are kept. No variance target is enforced here; inspect
# pca.explained_variance_ratio_.sum() after fitting to see how much is retained.
pca = PCA(n_components=15, random_state=RANDOM_SEED)
pca.fit(X_train_std)
X_train_pca = pca.transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

# === 6. Combine all features ===
# Column order: standardized features, anomaly score, PCA components.
X_train_final = np.hstack((X_train_std, train_anomaly_scores, X_train_pca))
X_test_final = np.hstack((X_test_std, test_anomaly_scores, X_test_pca))

# === 7. XGBoost classifier on the combined feature set ===
# `use_label_encoder` is intentionally not passed: the installed XGBoost
# ignores it and emits a "Parameters: { "use_label_encoder" } are not used."
# warning on every fit.
xgb_model = XGBClassifier(
    n_estimators=250,
    max_depth=6,
    learning_rate=0.08,
    subsample=0.8,
    colsample_bytree=1.0,
    scale_pos_weight=15,
    gamma=0.05,
    eval_metric='logloss',
    tree_method='hist',
    random_state=RANDOM_SEED
)

xgb_model.fit(X_train_final, y_train)

# === 8. Predict with a custom probability threshold ===
# Column 1 of predict_proba is taken as the positive-class probability.
y_prob = xgb_model.predict_proba(X_test_final)[:, 1]
# NOTE(review): how 0.43 was chosen is not shown in this notebook. If it was
# tuned against the test split, the reported metrics are optimistic -- confirm,
# and prefer selecting it on a held-out validation split.
threshold = 0.43
y_pred = (y_prob > threshold).astype(int)



  from .autonotebook import tqdm as notebook_tqdm
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [2]:
def evaluation(y_true, y_pred, model_name="Model"):
    """Print headline classification metrics and the full report for one model.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        model_name: Name shown in the printed header.

    Returns:
        None. All results are written to stdout.
    """
    # Compute every metric up front so nothing is printed if one of them raises.
    # zero_division=0 is forwarded to the metrics that accept it.
    metric_rows = (
        ('         Accuracy:', accuracy_score(y_true, y_pred)),
        ('  Precision Score:', precision_score(y_true, y_pred, zero_division=0)),
        ('     Recall Score:', recall_score(y_true, y_pred, zero_division=0)),
        ('         F1 Score:', f1_score(y_true, y_pred, zero_division=0)),
    )

    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    for row_label, row_value in metric_rows:
        print(row_label, row_value)

    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

# === Results ===
# The standalone-baseline calls below are left disabled: `xgb_pred` and
# `iso_pred` are not assigned in any cell visible here, so enabling them as-is
# would raise NameError unless they are defined elsewhere.
#evaluation(y_test, xgb_pred, model_name="XGBoost")
#evaluation(y_test, iso_pred, model_name="Isolation Forest")
# Report metrics for the thresholded test-set predictions (`y_pred`).
evaluation(y_test, y_pred, model_name="Combined (XGB + IsoForest)")


Combined (XGB + IsoForest) Evaluation:
         Accuracy: 0.9995435553526912
  Precision Score: 0.9224806201550387
     Recall Score: 0.8040540540540541
         F1 Score: 0.8592057761732852

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.92      0.80      0.86       148

    accuracy                           1.00     85443
   macro avg       0.96      0.90      0.93     85443
weighted avg       1.00      1.00      1.00     85443




Combined (XGB + IsoForest) Evaluation:
=============================================
         Accuracy: 0.9995435553526912
  Precision Score: 0.936
     Recall Score: 0.7905405405405406
         F1 Score: 0.8571428571428571

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.94      0.79      0.86       148

    accuracy                           1.00     85443
   macro avg       0.97      0.90      0.93     85443
weighted avg       1.00      1.00      1.00     85443
