<a href="https://colab.research.google.com/github/hyazoe/NTCU-Machine-Learning/blob/main/ex2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
pip install pandas numpy scikit-learn xgboost



In [7]:
# 匯入必要套件
import pandas as pd
import numpy as np
import kagglehub

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import IsolationForest
from xgboost import XGBClassifier

# Fixed random seed and hold-out fraction so the split and the models are reproducible
RANDOM_SEED = 42
TEST_SIZE = 0.3

# Load the data: download the "mlg-ulb/creditcardfraud" dataset and read its CSV
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype(int)

# Preprocessing: drop the 'Time' column.
# 'Amount' is intentionally NOT standardised here. Fitting a scaler on the full
# dataset before the split uses test-set statistics, and the train-only scaler
# below already standardises every feature column (including 'Amount'), which
# yields the same values up to floating-point rounding.
data = data.drop(columns=['Time'])

x = data.drop(columns=['Class']).values
y = data['Class'].values

# Stratified split keeps the class ratio of y the same in train and test
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y
)

# Fit the scaler on the training split only, then apply the same transform to test
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Evaluation helper
def evaluation(y_true, y_pred, model_name="Model"):
    """Print accuracy, precision, recall, F1 and a classification report.

    The four scores are computed with scikit-learn's metric functions using
    their default averaging settings and printed with 16 decimal places,
    followed by the text output of ``classification_report``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.
        model_name: Label shown in the printed header.

    Returns:
        None. All results are written to stdout.
    """
    accuracy = accuracy_score(y_true, y_pred)
    # zero_division=0 is passed to every ratio metric (not only precision) so a
    # degenerate case, e.g. no predicted or no actual positives, consistently
    # yields 0 without emitting an undefined-metric warning.
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print(f"\n{model_name} Evaluation:")
    print("=" * 45)
    print(f"         Accuracy: {accuracy:.16f}")
    print(f"  Precision Score: {precision:.16f}")
    print(f"     Recall Score: {recall:.16f}")
    print(f"         F1 Score: {f1:.16f}\n")
    print("Classification Report:")
    print(classification_report(y_true, y_pred, zero_division=0))


In [8]:
# ========== Fusing an unsupervised and a supervised model ========== #
# Class counts of the training split, computed once and reused by both models.
n_train_pos = int(np.sum(y_train == 1))
n_train_neg = int(np.sum(y_train == 0))

# 1. Isolation Forest produces the anomaly feature.
#    contamination is set to the share of label-1 rows in the training split.
iso_forest = IsolationForest(
    contamination=n_train_pos / len(y_train),
    random_state=RANDOM_SEED
)
iso_forest.fit(x_train)

# predict() output is recoded into a single column: 1 where the forest
# returned -1, otherwise 0.
iso_feature_train = np.where(iso_forest.predict(x_train).reshape(-1, 1) == -1, 1, 0)
iso_feature_test = np.where(iso_forest.predict(x_test).reshape(-1, 1) == -1, 1, 0)

# 2. Append the anomaly flag as one extra feature column
x_train_fused = np.hstack((x_train, iso_feature_train))
x_test_fused = np.hstack((x_test, iso_feature_test))

# 3. Train XGBoost on the fused features.
#    scale_pos_weight is the negative/positive count ratio of the training split.
#    use_label_encoder is no longer passed: the installed XGBoost reports it as
#    an unused parameter.
scale_weight = n_train_neg / n_train_pos
xgb_model = XGBClassifier(
    random_state=RANDOM_SEED,
    eval_metric='logloss',
    scale_pos_weight=scale_weight
)
xgb_model.fit(x_train_fused, y_train)
y_pred_fused = xgb_model.predict(x_test_fused)

# 4. Evaluate the fused model on the held-out split
evaluation(y_test, y_pred_fused, model_name="XGBoost with Isolation Forest Feature")

Parameters: { "use_label_encoder" } are not used.




XGBoost with Isolation Forest Feature Evaluation:
         Accuracy: 0.9994148145547324
  Precision Score: 0.8769230769230769
     Recall Score: 0.7702702702702703
         F1 Score: 0.8201438848920863

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.88      0.77      0.82       148

    accuracy                           1.00     85443
   macro avg       0.94      0.89      0.91     85443
weighted avg       1.00      1.00      1.00     85443

