In [147]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import IsolationForest
import kagglehub

# === General setting ===
RANDOM_SEED = 42
TEST_SIZE = 0.3

# === Load dataset from Kaggle ===
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype(int)

# === Preprocessing ===
data = data.drop(['Time'], axis=1)
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
X = np.asarray(data.drop(columns=['Class']))
Y = np.asarray(data['Class'])

# === Split dataset ===
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=Y)

# === Train XGBoost model ===
xgb_model = XGBClassifier(
    n_estimators=500,#最好
    max_depth=7, #最好
    learning_rate=0.05,#最好
    subsample=0.8,
    colsample_bytree=1.0,
    scale_pos_weight=5, #最好
    gamma=0.05, #最好
    use_label_encoder=False,
    bootstrap=True,
    eval_metric='logloss',
    random_state=RANDOM_SEED
)
xgb_model.fit(X_train, y_train)

# === Train Isolation Forest model ===
iso_forest = IsolationForest(
    n_estimators=500,
    max_samples="auto",
    contamination=sum(y_train) / len(y_train),  # roughly fraud ratio
    random_state=RANDOM_SEED,
    bootstrap=True,
)
iso_forest.fit(X_train)
iso_scores = iso_forest.decision_function(X_test)
iso_pred = iso_forest.predict(X_test)
iso_pred = np.where(iso_pred == -1, 1, 0)  # Anomaly -> 1

# === Combine both models ===
xgb_prob = xgb_model.predict_proba(X_test)[:, 1]
xgb_pred = (xgb_prob > 0.45).astype(int)

# Soft voting (weighted average)
combined_score = 0.9 * xgb_prob + 0.1 * (-iso_scores)  # iso_scores 越小越異常
combined_pred = (combined_score > 0.3).astype(int)


Parameters: { "bootstrap", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [148]:
def evaluation(y_true, y_pred, model_name="Model"):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    print('         Accuracy:', accuracy)
    print('  Precision Score:', precision)
    print('     Recall Score:', recall)
    print('         F1 Score:', f1)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

# === Results ===
#evaluation(y_test, xgb_pred, model_name="XGBoost")
#evaluation(y_test, iso_pred, model_name="Isolation Forest")
evaluation(y_test, combined_pred, model_name="Combined (XGB + IsoForest)")


Combined (XGB + IsoForest) Evaluation:
         Accuracy: 0.9995669627705019
  Precision Score: 0.937007874015748
     Recall Score: 0.8040540540540541
         F1 Score: 0.8654545454545455

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.94      0.80      0.87       148

    accuracy                           1.00     85443
   macro avg       0.97      0.90      0.93     85443
weighted avg       1.00      1.00      1.00     85443

