In [None]:
!git clone https://github.com/ggguuuaaannn/NTCU-Machine-Learning.git

Cloning into 'NTCU-Machine-Learning'...
remote: Enumerating objects: 18, done.[K
remote: Counting objects: 100% (2/2), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 18 (delta 1), reused 0 (delta 0), pack-reused 16 (from 1)[K
Receiving objects: 100% (18/18), 5.13 KiB | 1.03 MiB/s, done.
Resolving deltas: 100% (3/3), done.


In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, precision_recall_curve

# 下載資料
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
df = pd.read_csv(f"{path}/creditcard.csv")

# 前處理
df['Amount'] = StandardScaler().fit_transform(df[['Amount']])
df = df.drop(columns=['Time'])
X = df.drop(columns=['Class'])
y = df['Class']

# 資料切分與標準化
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Isolation Forest
iso = IsolationForest(n_estimators=600, contamination=0.017, max_features=10, random_state=42)
iso.fit(X_train_scaled)

X_train_df = pd.DataFrame(X_train_scaled, columns=X.columns)
X_test_df = pd.DataFrame(X_test_scaled, columns=X.columns)
X_train_df["anomaly_score"] = iso.decision_function(X_train_scaled)
X_test_df["anomaly_score"] = iso.decision_function(X_test_scaled)
X_train_df["isolation_label"] = (iso.predict(X_train_scaled) == -1).astype(int)
X_test_df["isolation_label"] = (iso.predict(X_test_scaled) == -1).astype(int)

# XGBoost 訓練（精調）
xgb = XGBClassifier(
    max_depth=7,
    n_estimators=710,
    learning_rate=0.031,
    subsample=0.91,
    colsample_bytree=0.84,
    scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),
    eval_metric='logloss',
    random_state=42
)
xgb.fit(X_train_df, y_train)

# 預測 + 最佳 threshold
y_prob = xgb.predict_proba(X_test_df)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_prob)
f1_scores = 2 * precision * recall / (precision + recall + 1e-6)
best_threshold = thresholds[np.argmax(f1_scores)]

# 最終預測
y_pred = (y_prob >= best_threshold).astype(int)

# 評估
print("\nHybrid Model Evaluation:")
print("=" * 50)
print(f"Accuracy       : {accuracy_score(y_test, y_pred)}")
print(f"Precision Score: {precision_score(y_test, y_pred)}")
print(f"Recall Score   : {recall_score(y_test, y_pred)}")
print(f"F1 Score       : {f1_score(y_test, y_pred)}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=2))




Hybrid Model Evaluation:
Accuracy       : 0.9996839998595555
Precision Score: 0.9658119658119658
Recall Score   : 0.8308823529411765
F1 Score       : 0.8932806324110671

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.97      0.83      0.89       136

    accuracy                           1.00     85443
   macro avg       0.98      0.92      0.95     85443
weighted avg       1.00      1.00      1.00     85443

