<a href="https://colab.research.google.com/github/Thomas993300/NTCU-Machine-Learning/blob/main/ex02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import kagglehub

# Constants
RANDOM_SEED = 42
TEST_SIZE = 0.3

# Load the credit-card fraud dataset via kagglehub and read it into a DataFrame.
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")

# Clean: force an integer label, discard the 'Time' column, standardise 'Amount'.
# NOTE(review): the scaler is fitted on the full dataset before the split below,
# so test-set statistics influence the training features. Consider fitting it on
# the training split only.
data = data.astype({'Class': int}).drop(columns=['Time'])
data['Amount'] = StandardScaler().fit_transform(data[['Amount']].to_numpy())

# Split: every column except 'Class' is a feature; stratify to keep the class ratio.
X = data.drop(columns=['Class']).to_numpy()
y = data['Class'].to_numpy()
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
    stratify=y,
)


In [3]:
def evaluation(y_true, y_pred, model_name="Model"):
    """Print a metric summary and classification report for a set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        model_name: Label used in the printed header.

    Returns:
        None. All results are written to standard output.
    """
    # Compute every metric up front so the printed section is pure formatting.
    # Labels are padded so the colons line up in the output.
    scores = (
        (' Accuracy       ', accuracy_score(y_true, y_pred)),
        (' Precision Score', precision_score(y_true, y_pred)),
        (' Recall Score   ', recall_score(y_true, y_pred)),
        (' F1 Score       ', f1_score(y_true, y_pred)),
    )

    print(f'\n{model_name} Evaluation:')
    print('=' * 30)
    for label, value in scores:
        print(f'{label}: {value:.8f}')
    print('\nClassification Report:')
    print(classification_report(y_true, y_pred))

In [36]:
from sklearn.ensemble import IsolationForest

# Fit the detector only on training rows labelled 0, then score both splits.
iso = IsolationForest(random_state=RANDOM_SEED, contamination=0.002)
iso.fit(x_train[y_train == 0])

# Continuous anomaly score for each row of the train and test splits.
anomaly_score_train, anomaly_score_test = (
    iso.decision_function(split) for split in (x_train, x_test)
)

# Hard labels from the detector: -1 = anomaly, 1 = normal.
iso_pred_train, iso_pred_test = (
    iso.predict(split) for split in (x_train, x_test)
)




In [37]:
from sklearn.decomposition import PCA

# Number of principal components to keep; adjust as needed.
N_PCA_COMPONENTS = 8

pca = PCA(random_state=RANDOM_SEED, n_components=N_PCA_COMPONENTS)
# Fit the projection on the training split, then apply the same projection to test.
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)


In [38]:
# Combine all features column-wise: original features + PCA components
# + Isolation Forest anomaly score + Isolation Forest predicted label.
# np.column_stack turns the 1-D score/label vectors into single columns.
x_train_all, x_test_all = (
    np.column_stack(parts)
    for parts in (
        (x_train, x_train_pca, anomaly_score_train, iso_pred_train),
        (x_test, x_test_pca, anomaly_score_test, iso_pred_test),
    )
)


In [34]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


# Gradient-boosted classifier trained on the stacked feature matrix.
# `use_label_encoder` is intentionally not passed: this XGBoost version does not
# use it and only emits a "Parameters: { "use_label_encoder" } are not used." warning.
xgb = XGBClassifier(
    colsample_bytree=1.0,
    learning_rate=0.1,
    max_depth=8,
    n_estimators=600,
    subsample=0.8,
    scale_pos_weight=30,  # extra weight on the positive class
    eval_metric='logloss',
    tree_method='hist',
    random_state=RANDOM_SEED
)

# Train the XGBoost model, then predict labels for the test split.
xgb.fit(x_train_all, y_train)
y_pred = xgb.predict(x_test_all)

# Report metrics for the default-threshold predictions.
evaluation(y_test, y_pred, model_name="IsolationForest + XGBoost")


Parameters: { "use_label_encoder" } are not used.




IsolationForest + XGBoost Evaluation:
 Accuracy       : 0.99953185
 Precision Score: 0.92187500
 Recall Score   : 0.79729730
 F1 Score       : 0.85507246

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.92      0.80      0.86       148

    accuracy                           1.00     85443
   macro avg       0.96      0.90      0.93     85443
weighted avg       1.00      1.00      1.00     85443



In [39]:
import numpy as np

# Predicted probability of class 1 for every test row.
y_proba = xgb.predict_proba(x_test_all)[:, 1]

# Sweep decision thresholds and keep the one with the highest F1.
# NOTE(review): the threshold is selected on the same test split that is then
# reported, so the "best threshold" metrics are optimistic. Consider tuning on
# a separate validation split.
best_f1 = 0
best_thresh = 0.5
# Initialised so the summary line below cannot raise NameError when no
# threshold yields F1 > 0 (F1 == 0 implies no true positives, hence recall 0).
best_recall = 0.0
for t in np.arange(0.1, 0.9, 0.01):
    y_pred_new = (y_proba > t).astype(int)
    f1 = f1_score(y_test, y_pred_new)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t
        # Recall is only needed for the current best candidate.
        best_recall = recall_score(y_test, y_pred_new)

print(f"Best F1: {best_f1:.5f} at threshold: {best_thresh:.2f} (Recall: {best_recall:.5f})")
# Print the full set of scores for the best threshold found.
y_pred_best = (y_proba > best_thresh).astype(int)
evaluation(y_test, y_pred_best, model_name="XGBoost + IF + PCA + IFlabel (Best Threshold)")


Best F1: 0.86022 at threshold: 0.35 (Recall: 0.81081)

XGBoost + IF + PCA + IFlabel (Best Threshold) Evaluation:
 Accuracy       : 0.99954356
 Precision Score: 0.91603053
 Recall Score   : 0.81081081
 F1 Score       : 0.86021505

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.92      0.81      0.86       148

    accuracy                           1.00     85443
   macro avg       0.96      0.91      0.93     85443
weighted avg       1.00      1.00      1.00     85443

