<a href="https://colab.research.google.com/github/Rukiren/NTCU-Machine-Learning/blob/main/ex2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, roc_auc_score, precision_recall_curve
)
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import kagglehub

# === 新的評估函數 ===
def evaluation(y_true, y_pred, model_name="Model"):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    print('         Accuracy:', accuracy)
    print('  Precision Score:', precision)
    print('     Recall Score:', recall)
    print('         F1 Score:', f1)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

# === 1. 基本參數設定 ===
RANDOM_SEED = 42
TEST_SIZE = 0.3
N_NORMAL_SEED = 1000
N_FRAUD_SEED = 100
N_PCA_COMPONENTS = 7

# === 2. 載入與預處理資料 ===
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype(int)
data = data.drop(['Time'], axis=1)
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))

X = data.drop(columns=['Class']).to_numpy()
Y = data['Class'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=Y
)

# === 3. 標準化處理 ===
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# === 4. PCA 降維特徵 ===
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=RANDOM_SEED)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

# === 5. Isolation Forest 異常檢測特徵 ===
iso_forest = IsolationForest(
    n_estimators=100,
    contamination=0.002,
    random_state=RANDOM_SEED
)
iso_forest.fit(X_train_std)
X_train_iso_score = iso_forest.decision_function(X_train_std).reshape(-1, 1)
X_test_iso_score = iso_forest.decision_function(X_test_std).reshape(-1, 1)

# === 6. KMeans 聚類距離特徵 ===
normal_idx = np.where(y_train == 0)[0][:N_NORMAL_SEED]
fraud_idx = np.where(y_train == 1)[0][:N_FRAUD_SEED]
seed_X = np.vstack([X_train_pca[normal_idx], X_train_pca[fraud_idx]])

seed_kmeans = KMeans(n_clusters=2, init='k-means++', random_state=RANDOM_SEED)
seed_kmeans.fit(seed_X)

kmeans = KMeans(
    n_clusters=2,
    init=seed_kmeans.cluster_centers_,
    n_init=1,
    random_state=RANDOM_SEED
)
kmeans.fit(X_train_pca)

def compute_cluster_distances(X, centers):
    return np.linalg.norm(X[:, np.newaxis] - centers, axis=2)

train_kmeans_dist = compute_cluster_distances(X_train_pca, kmeans.cluster_centers_)
test_kmeans_dist = compute_cluster_distances(X_test_pca, kmeans.cluster_centers_)

# === 7. 組合所有特徵 ===
X_train_hybrid = np.hstack((
    X_train_std,           # 原始標準化特徵
    X_train_pca,           # PCA 特徵
    X_train_iso_score,     # Isolation Forest 異常分數
    train_kmeans_dist      # KMeans 聚類距離
))

X_test_hybrid = np.hstack((
    X_test_std,
    X_test_pca,
    X_test_iso_score,
    test_kmeans_dist
))

# === 8. SMOTE 處理類別不平衡 ===
smote = SMOTE(random_state=RANDOM_SEED)
X_resampled, y_resampled = smote.fit_resample(X_train_hybrid, y_train)

# === 9. 使用你的最佳 XGBoost 參數 ===
xgb_hybrid = XGBClassifier(
    tree_method='hist',
    device='cpu',  # 根據你的環境調整
    n_estimators=500,
    max_depth=8,
    learning_rate=0.02,
    subsample=0.7,
    colsample_bytree=1.0,
    scale_pos_weight=2,
    gamma=0.05,
    eval_metric='logloss',
    random_state=RANDOM_SEED
)
xgb_hybrid.fit(X_resampled, y_resampled)

# === 10. 閾值優化 ===
y_prob_hybrid = xgb_hybrid.predict_proba(X_test_hybrid)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, y_prob_hybrid)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-6)

best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]
y_pred_hybrid_optimized = (y_prob_hybrid > best_threshold).astype(int)

# === 11. 評估混合模型 ===
print("=== Multi-Hybrid Model (PCA + Isolation Forest + KMeans + XGBoost) ===")
print(f"Best Threshold: {best_threshold:.4f}")
evaluation(y_test, y_pred_hybrid_optimized, model_name="Multi-Hybrid Model")
print(f"ROC-AUC: {roc_auc_score(y_test, y_prob_hybrid):.4f}")

=== Multi-Hybrid Model (PCA + Isolation Forest + KMeans + XGBoost) ===
Best Threshold: 0.9963

Multi-Hybrid Model Evaluation:
         Accuracy: 0.9995084442259752
  Precision Score: 0.9732142857142857
     Recall Score: 0.7364864864864865
         F1 Score: 0.8384615384615385

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.97      0.74      0.84       148

    accuracy                           1.00     85443
   macro avg       0.99      0.87      0.92     85443
weighted avg       1.00      1.00      1.00     85443

ROC-AUC: 0.9717
