In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
import kagglehub

# 通用設置
RANDOM_SEED = 42
TEST_SIZE = 0.3

# 載入數據集
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype(int)
data = data.drop(['Time'], axis=1)
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))

# 數據集資訊
fraud = data[data['Class'] == 1]
nonfraud = data[data['Class'] == 0]
print(f'Fraudulent: {len(fraud)}, Non-Fraudulent: {len(nonfraud)}')
print(f'The positive class (frauds) percentage: {len(fraud)}/{len(fraud) + len(nonfraud)} ({len(fraud)/(len(fraud) + len(nonfraud))*100:.3f}%)')

# 準備特徵和標籤
X = data.drop('Class', axis=1).values
y = data['Class'].values

# 分割數據
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

# 步驟 1: 使用 Isolation Forest 進行異常檢測並添加異常分數作為特徵
iso_forest = IsolationForest(contamination=0.005, random_state=RANDOM_SEED, n_jobs=-1)
iso_forest.fit(X_train)
iso_scores = iso_forest.decision_function(X_train)
iso_threshold = np.percentile(iso_scores, 5)  # 篩選異常分數前5%
X_train_iso = X_train[iso_scores <= iso_threshold]
y_train_iso = y_train[iso_scores <= iso_threshold]
iso_scores_train = iso_scores[iso_scores <= iso_threshold].reshape(-1, 1)
X_train_iso = np.hstack([X_train_iso, iso_scores_train])  # 添加異常分數作為特徵

# 對測試集和原始訓練集添加異常分數
iso_scores_test = iso_forest.decision_function(X_test).reshape(-1, 1)
X_test = np.hstack([X_test, iso_scores_test])
iso_scores_full_train = iso_forest.decision_function(X_train).reshape(-1, 1)
X_train = np.hstack([X_train, iso_scores_full_train])  # 為原始訓練集添加異常分數

# 步驟 2: 應用 SMOTE 平衡數據
smote = SMOTE(sampling_strategy=0.2, random_state=RANDOM_SEED)
X_train_smote, y_train_smote = smote.fit_resample(X_train_iso, y_train_iso)
print(f"Before SMOTE: {np.bincount(y_train_iso)}")
print(f"After SMOTE: {np.bincount(y_train_smote)}")

# 步驟 3: XGBoost 隨機搜索參數範圍
param_dist = {
    'n_estimators': [ 500,600],
    'max_depth': [6, 9, 12],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'scale_pos_weight': [30, 50, 70]
}

# 隨機搜索（優化 F1 分數）
xgb_model = XGBClassifier(random_state=RANDOM_SEED, n_jobs=-1)
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=15,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    verbose=1,
    random_state=RANDOM_SEED
)
random_search.fit(X_train_smote, y_train_smote)

# 輸出最佳參數
print(f"\nBest Parameters: {random_search.best_params_}")
print(f"Best Cross-Validation F1 Score: {random_search.best_score_:.4f}")

# 定義評估函數
def evaluation(y_true, y_pred, model_name="Model"):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    print(f'         Accuracy: {accuracy:.4f}')
    print(f'  Precision Score: {precision:.4f}')
    print(f'     Recall Score: {recall:.4f}')
    print(f'         F1 Score: {f1:.4f}')
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

# 步驟 4: 評估原始 XGBoost（未調優）
original_xgb = XGBClassifier(random_state=RANDOM_SEED, n_jobs=-1)
original_xgb.fit(X_train, y_train)  # 使用包含異常分數的 X_train
y_pred_orig = original_xgb.predict(X_test)
evaluation(y_test, y_pred_orig, model_name="Original XGBoost")

# 步驟 5: 動態搜索最佳閾值
best_model = random_search.best_estimator_
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
thresholds = np.arange(0.5, 0.9, 0.05)
best_threshold = 0.5
best_precision = 0
best_f1 = 0
for threshold in thresholds:
    y_pred = (y_pred_proba >= threshold).astype(int)
    precision = precision_score(y_true=y_test, y_pred=y_pred, zero_division=0)
    if precision >= 0.94:
        f1 = f1_score(y_true=y_test, y_pred=y_pred, zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
            best_precision = precision

y_pred_best = (y_pred_proba >= best_threshold).astype(int)
evaluation(y_test, y_pred_best, model_name=f"Isolation Forest + SMOTE + Tuned XGBoost (Threshold={best_threshold:.2f})")



Fraudulent: 492, Non-Fraudulent: 284315
The positive class (frauds) percentage: 492/284807 (0.173%)
Before SMOTE: [9667  302]
After SMOTE: [9667 1933]
Fitting 3 folds for each of 15 candidates, totalling 45 fits

Best Parameters: {'subsample': 0.8, 'scale_pos_weight': 30, 'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.05, 'colsample_bytree': 0.9}
Best Cross-Validation F1 Score: 0.9889

Original XGBoost Evaluation:
         Accuracy: 0.9996
  Precision Score: 0.9417
     Recall Score: 0.8309
         F1 Score: 0.8828

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.94      0.83      0.88       136

    accuracy                           1.00     85443
   macro avg       0.97      0.92      0.94     85443
weighted avg       1.00      1.00      1.00     85443


Isolation Forest + SMOTE + Tuned XGBoost (Threshold=0.50) Evaluation:
         Accuracy: 0.9996
  Precision Score: 0.