<a href="https://colab.research.google.com/github/Thomas993300/NTCU-Machine-Learning/blob/main/ex02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:

!pip install --upgrade xgboost imbalanced-learn


Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl (253.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.9/253.9 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 2.1.4
    Uninstalling xgboost-2.1.4:
      Successfully uninstalled xgboost-2.1.4
Successfully installed xgboost-3.0.2


In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import kagglehub

In [4]:
def evaluation(y_true, y_pred, model_name="Model"):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    print(f"\n{model_name} Evaluation:")
    print("=" * 40)
    print(f" Accuracy       : {acc:.8f}")
    print(f" Precision Score: {prec:.8f}")
    print(f" Recall Score   : {rec:.8f}")
    print(f" F1 Score       : {f1:.8f}\n")
    print("Classification Report:")
    print(classification_report(y_true, y_pred))

In [5]:
def load_data():
    path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
    data = pd.read_csv(f"{path}/creditcard.csv")
    data['Class'] = data['Class'].astype(int)
    data.drop(['Time'], axis=1, inplace=True)
    data['Amount'] = StandardScaler().fit_transform(
        data['Amount'].values.reshape(-1, 1))
    return data

# Load and split
RANDOM_SEED = 42
TEST_SIZE = 0.3

data = load_data()
X = data.drop(columns=['Class']).values
y = data['Class'].values
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y)

In [7]:
iso = IsolationForest(contamination=0.002, random_state=RANDOM_SEED)
iso.fit(x_train[y_train == 0])
score_train = iso.decision_function(x_train)
score_test  = iso.decision_function(x_test)

x_train_feat = np.hstack([x_train, score_train.reshape(-1,1)])
x_test_feat  = np.hstack([x_test,  score_test.reshape(-1,1)])

In [16]:
param_grid = {
    'n_estimators': [200, 400],
    'max_depth': [5, 8],
    'learning_rate': [0.05, 0.1],
    'scale_pos_weight': [10, 20]
}


xgb = XGBClassifier(
    tree_method='hist',  # <-- changed here: use CPU 'hist' instead of 'gpu_hist'
    eval_metric='logloss',
    scale_pos_weight=0.172,
    random_state=RANDOM_SEED
)
grid = GridSearchCV(
    xgb, param_grid, scoring='f1', cv=3, n_jobs=-1, verbose=1
)

grid.fit(x_train_feat, y_train)
best_model = grid.best_estimator_
print("Best parameters:", grid.best_params_)

# Default threshold (0.5)
y_pred_default = best_model.predict(x_test_feat)
evaluation(y_test, y_pred_default, model_name="DefaultThreshold")

Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best parameters: {'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 400, 'scale_pos_weight': 10}

DefaultThreshold Evaluation:
 Accuracy       : 0.99952015
 Precision Score: 0.92125984
 Recall Score   : 0.79054054
 F1 Score       : 0.85090909

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.92      0.79      0.85       148

    accuracy                           1.00     85443
   macro avg       0.96      0.90      0.93     85443
weighted avg       1.00      1.00      1.00     85443



In [17]:
y_proba = best_model.predict_proba(x_test_feat)[:,1]

# 門檻掃描
best_f1, best_thresh = 0, 0.5
for t in np.arange(0.1, 0.9, 0.01):
    preds = (y_proba > t).astype(int)
    f1 = f1_score(y_test, preds)
    if f1 > best_f1:
        best_f1, best_thresh = f1, t

print(f"Best F1: {best_f1:.5f} at threshold: {best_thresh:.2f}")

# 使用最佳門檻
y_pred_tuned = (y_proba > best_thresh).astype(int)
evaluation(y_test, y_pred_tuned, model_name=f"Threshold {best_thresh:.2f}")

Best F1: 0.85507 at threshold: 0.42

Threshold 0.42 Evaluation:
 Accuracy       : 0.99953185
 Precision Score: 0.92187500
 Recall Score   : 0.79729730
 F1 Score       : 0.85507246

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.92      0.80      0.86       148

    accuracy                           1.00     85443
   macro avg       0.96      0.90      0.93     85443
weighted avg       1.00      1.00      1.00     85443

