<a href="https://colab.research.google.com/github/thomaschen01/NTCU-Machine-Learning/blob/main/ACS111106_ex2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

非監督式學習：Isolation Forest 與 PCA 降維 + 監督式學習：XGBoost

In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
import kagglehub
# Download the Kaggle credit-card fraud dataset; the returned path is used as
# the directory that contains creditcard.csv.
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")

data['Class'] = data['Class'].astype(int)
data = data.drop(['Time'], axis=1)
# 'Amount' is deliberately NOT scaled here. Fitting a scaler on the full
# dataset before the split would let test rows influence a fitted transform,
# and it is redundant anyway: every column (including 'Amount') is
# standardized below with statistics taken from the training split only.

X = data.drop(columns=['Class']).to_numpy()
Y = data['Class'].to_numpy()
RANDOM_SEED = 42
TEST_SIZE = 0.3
# NOTE(review): the split is not stratified; with a rare positive class,
# consider stratify=Y (this would change the reported metrics).
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

# Fit the scaler on the training split only, then reuse those statistics
# for the test split.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# === 3. Isolation Forest feature ===
# Fitted on the standardized training split only; fit() returns the
# estimator itself, so construction and fitting are chained.
iso_forest = IsolationForest(
    contamination=0.0017,
    random_state=RANDOM_SEED,
).fit(X_train_std)

# decision_function gives one score per row; a trailing axis is added so the
# scores can be appended as a single extra column.
train_anomaly_scores = iso_forest.decision_function(X_train_std)[:, np.newaxis]
test_anomaly_scores = iso_forest.decision_function(X_test_std)[:, np.newaxis]

# === 4. PCA feature extraction (10 components) ===
# Fitted on the training split, then applied to both splits.
pca = PCA(n_components=10, random_state=RANDOM_SEED).fit(X_train_std)
X_train_pca, X_test_pca = (
    pca.transform(split) for split in (X_train_std, X_test_std)
)

# === 5. Concatenate standardized features + IF score + PCA features ===
X_train_enhanced = np.concatenate(
    [X_train_std, train_anomaly_scores, X_train_pca], axis=1
)
X_test_enhanced = np.concatenate(
    [X_test_std, test_anomaly_scores, X_test_pca], axis=1
)

# === 6. Build and train the XGBoost model ===
# `use_label_encoder` is no longer passed: the installed XGBoost ignores it
# and emits a "Parameters: { "use_label_encoder" } are not used." warning.
xgb_model = XGBClassifier(
    colsample_bytree=1.0,
    learning_rate=0.1,
    max_depth=6,
    n_estimators=200,
    subsample=0.8,
    scale_pos_weight=2.5,
    eval_metric='logloss',
    tree_method='hist',
    random_state=RANDOM_SEED
)

xgb_model.fit(X_train_enhanced, y_train)
# Column 1 of predict_proba holds the predicted probability of label 1.
y_prob = xgb_model.predict_proba(X_test_enhanced)[:, 1]
# Custom decision threshold used in place of the default 0.5 cut-off.
threshold = 0.43
y_pred_custom = (y_prob > threshold).astype(int)
def evaluation(y_true, y_pred, model_name="Model"):
    """Print accuracy, precision, recall, F1 and a classification report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels to compare against ``y_true``.
        model_name: Label shown in the report heading.

    Returns:
        None. Results are written to standard output only.
    """
    scorers = (
        ('         Accuracy:', accuracy_score),
        ('  Precision Score:', precision_score),
        ('     Recall Score:', recall_score),
        ('         F1 Score:', f1_score),
    )
    # Compute every metric before printing anything, so a failing metric
    # does not leave a half-printed report.
    results = [(label, scorer(y_true, y_pred)) for label, scorer in scorers]

    print(f'\n{model_name} Evaluation:')
    print('=' * 45)
    for label, value in results:
        print(label, value)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))
# Report test-set metrics for the thresholded XGBoost predictions.
evaluation(
    y_true=y_test,
    y_pred=y_pred_custom,
    model_name="XGBoost with IF + PCA Features",
)

Parameters: { "use_label_encoder" } are not used.

  msg += " or "



XGBoost with IF + PCA Features Evaluation:
         Accuracy: 0.9996957035684608
  Precision Score: 0.9365079365079365
     Recall Score: 0.8676470588235294
         F1 Score: 0.9007633587786259

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.94      0.87      0.90       136

    accuracy                           1.00     85443
   macro avg       0.97      0.93      0.95     85443
weighted avg       1.00      1.00      1.00     85443

