In [20]:
#isolation + xgboost
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from xgboost import XGBClassifier
import kagglehub

warnings.filterwarnings("ignore")

# 設定常數
RANDOM_SEED = 42
TEST_SIZE = 0.3
np.random.seed(RANDOM_SEED)

def evaluation(y_true, y_pred, model_name="Model"):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    print('         Accuracy:', accuracy)
    print('  Precision Score:', precision)
    print('     Recall Score:', recall)
    print('         F1 Score:', f1)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

# 載入和預處理資料
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype(int)
data = data.drop(['Time'], axis=1)
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))

X = np.asarray(data.iloc[:, ~data.columns.isin(['Class'])])
Y = np.asarray(data.iloc[:, data.columns == 'Class'])
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

# 創建 Isolation Forest 特徵
iso_configs = [
    {'contamination': 0.0017, 'n_estimators': 150, 'random_state': RANDOM_SEED},
    {'contamination': 0.0025, 'n_estimators': 180, 'random_state': RANDOM_SEED + 1},
    {'contamination': 0.0012, 'n_estimators': 120, 'random_state': RANDOM_SEED + 2}
]

iso_features = []
for config in iso_configs:
    iso_model = IsolationForest(**config, n_jobs=-1)
    iso_scores = iso_model.fit(X).decision_function(X)
    iso_scores_norm = (iso_scores - iso_scores.min()) / (iso_scores.max() - iso_scores.min())
    iso_features.append(iso_scores_norm)

# 特徵融合
ensemble_avg = np.mean(iso_features, axis=0)
ensemble_weighted = 0.4 * iso_features[0] + 0.35 * iso_features[1] + 0.25 * iso_features[2]

selected_iso_features = np.column_stack([
    ensemble_avg,
    ensemble_weighted,
    iso_features[0]
])

X_enhanced = np.hstack([X, selected_iso_features])
X_train_enh, X_test_enh, y_train_enh, y_test_enh = train_test_split(
    X_enhanced, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED
)

#  XGBoost 配置
xgb = XGBClassifier(
    n_estimators=325,
    max_depth=7,
    learning_rate=0.066,
    subsample=0.95,
    colsample_bytree=0.85,
    scale_pos_weight=8.2,
    random_state=RANDOM_SEED,
    use_label_encoder=False,
    eval_metric='logloss',
    tree_method='hist'
)

# 訓練模型
xgb.fit(X_train_enh, y_train_enh)
y_proba = xgb.predict_proba(X_test_enh)[:, 1]

# 使用閾值進行預測
final_pred = (y_proba > 0.570).astype(int)

# 輸出最終結果
evaluation(y_test_enh, final_pred, "isolation forest and xgboost evaluation")


isolation forest and xgboost evaluation Evaluation:
         Accuracy: 0.9996839998595555
  Precision Score: 0.943089430894309
     Recall Score: 0.8529411764705882
         F1 Score: 0.8957528957528957

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.94      0.85      0.90       136

    accuracy                           1.00     85443
   macro avg       0.97      0.93      0.95     85443
weighted avg       1.00      1.00      1.00     85443

