In [2]:
# 監督式學習 XGBoost

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import kagglehub

# General settings used by the cells below. TEST_SIZE must stay at 0.3 — do not change it.
RANDOM_SEED = 42  # seed passed to train_test_split and to the XGBoost model
TEST_SIZE = 0.3   # fraction of rows held out as the test set


In [3]:
# Fetch the credit-card fraud dataset through kagglehub and read its CSV file
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data = data.astype({'Class': int})

# Preprocessing: discard the 'Time' column, then standardise 'Amount'.
# The scaler is fitted on the whole table, i.e. before any train/test split.
data = data.drop(columns=['Time'])
data['Amount'] = StandardScaler().fit_transform(data[['Amount']].to_numpy())


In [4]:
# Partition the rows by label so the class imbalance can be reported
fraud = data.loc[data['Class'].eq(1)]
nonfraud = data.loc[data['Class'].eq(0)]

# Absolute counts first, then the share of the positive (fraud) class
print(f'Fraudulent:{len(fraud)}, non-fraudulent:{len(nonfraud)}')
print(f'the positive class (frauds) percentage: {len(fraud)}/{len(fraud) + len(nonfraud)} ({len(fraud)/(len(fraud) + len(nonfraud))*100:.3f}%)')

Fraudulent:492, non-fraudulent:284315
the positive class (frauds) percentage: 492/284807 (0.173%)


In [5]:
from xgboost import XGBClassifier

# Feature matrix: every column except the label.
X = data.loc[:, data.columns != 'Class'].to_numpy()
# Target: the 'Class' column, selected through a column mask so it stays 2-D (n_rows, 1).
Y = data.loc[:, data.columns == 'Class'].to_numpy()

# Split into training set and test set.
# NOTE(review): this split is not stratified, while the auto-encoder cell further
# down stratifies on the label, so the two models are scored on different test
# sets — confirm whether that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
)

# Gradient-boosted tree classifier; scale_pos_weight up-weights the rare fraud class.
xgb_params = {
    'n_estimators': 325,
    'max_depth': 7,
    'learning_rate': 0.066,
    'subsample': 0.95,
    'colsample_bytree': 0.85,
    'scale_pos_weight': 7.5,
    'random_state': RANDOM_SEED,
    'use_label_encoder': False,
    'eval_metric': 'logloss',
    'tree_method': 'hist',
}
xgb_model = XGBClassifier(**xgb_params)

# Fit on the training split, then predict hard labels for the test split.
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

def evaluation(y_true, y_pred, model_name="Model"):
    """Print accuracy, precision, recall, F1 and a classification report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels to score against ``y_true``.
        model_name: Label shown in the printed header.

    Returns:
        None. All results are written to standard output.
    """
    # Compute every score up front so nothing is printed if a metric raises.
    scores = (
        ('         Accuracy:', accuracy_score(y_true, y_pred)),
        ('  Precision Score:', precision_score(y_true, y_pred)),
        ('     Recall Score:', recall_score(y_true, y_pred)),
        ('         F1 Score:', f1_score(y_true, y_pred)),
    )

    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    for label, value in scores:
        print(label, value)

    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

# Print the metrics for the XGBoost test-set predictions computed above
evaluation(y_test, y_pred, model_name="XGBoost")






XGBoost Evaluation:
         Accuracy: 0.9996722961506501
  Precision Score: 0.9426229508196722
     Recall Score: 0.8455882352941176
         F1 Score: 0.8914728682170543

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.94      0.85      0.89       136

    accuracy                           1.00     85443
   macro avg       0.97      0.92      0.95     85443
weighted avg       1.00      1.00      1.00     85443



In [6]:
import numpy as np
import pandas as pd
import kagglehub
import warnings
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Suppress all Python warnings from here on and limit TensorFlow's logger to errors.
# NOTE(review): the blanket "ignore" filter also hides warnings raised by other
# libraries (e.g. scikit-learn metric warnings) — confirm this is intended.
warnings.filterwarnings("ignore")
tf.get_logger().setLevel("ERROR")

# --- Evaluation helper ---
def evaluation(y_true, y_pred, model_name="Model"):
    """Print accuracy, precision, recall, F1 and a classification report.

    This redefines the ``evaluation`` helper declared earlier in the notebook.
    Undefined ratios (for example, when no sample is predicted positive) are
    reported as 0 for every metric, so all scores handle the degenerate case
    the same way.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels to score against ``y_true``.
        model_name: Label shown in the printed header.

    Returns:
        None. All results are written to standard output.
    """
    accuracy = accuracy_score(y_true, y_pred)
    # zero_division=0 is applied to every ratio-based metric, not only precision,
    # so precision, recall, F1 and the report agree when a denominator is zero.
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    print('         Accuracy:', accuracy)
    print('  Precision Score:', precision)
    print('     Recall Score:', recall)
    print('         F1 Score:', f1)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, zero_division=0))

# --- Hyper-parameters ---
# NOTE(review): TEST_SIZE is re-assigned here with the same value as the
# notebook-level setting, and SEED duplicates RANDOM_SEED.
SEED        = 42  # seed for the train/test split and for TensorFlow
TEST_SIZE   = 0.30  # fraction of rows held out as the test set
ENC_UNITS   = [64,32,16]  # encoder layer widths; the last entry is the bottleneck
DROPOUT     = 0.2  # dropout rate applied after each non-bottleneck encoder layer
NOISE_STD   = 0.05  # std-dev of the Gaussian noise added to the model input
EPOCHS      = 70  # training epochs for the auto-encoder
BATCH       = 256  # mini-batch size for the auto-encoder
THRESH_PCT  = 99.9   # best threshold picked from the scan results (percentile of training reconstruction error)


# Build features and labels from `data` in this cell, so it no longer relies on
# the `X` array left in the kernel by the XGBoost cell. The value is the same:
# every column except 'Class', in the original column order.
X = data.drop(columns=["Class"]).to_numpy()
y = data["Class"].values

# Stratify on the label so both splits keep the (very small) fraud ratio
x_tr, x_te, y_tr, y_te = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=SEED)

# Fit the scaler on the training split only, then apply it to the test split
scaler = StandardScaler()
x_tr = scaler.fit_transform(x_tr)
x_te = scaler.transform(x_te)

# Train a denoising auto-encoder on normal (non-fraud) samples only
x_norm = x_tr[y_tr == 0]
tf.random.set_seed(SEED)

# Encoder: noisy input -> Dense + Dropout for each width except the last -> bottleneck
inp = tf.keras.Input(shape=(x_tr.shape[1],))
x   = tf.keras.layers.GaussianNoise(NOISE_STD)(inp)
for h in ENC_UNITS[:-1]:
    x = tf.keras.layers.Dense(h, activation="relu")(x)
    x = tf.keras.layers.Dropout(DROPOUT)(x)
encoded = tf.keras.layers.Dense(ENC_UNITS[-1], activation="relu")(x)

# Decoder: mirror the encoder widths in reverse order. Iterating over
# reversed(ENC_UNITS[:-1]) keeps the decoder symmetric for any length of
# ENC_UNITS instead of assuming exactly three entries; for [64, 32, 16] it
# builds the same 32 -> 64 layers as before.
x = encoded
for units in reversed(ENC_UNITS[:-1]):
    x = tf.keras.layers.Dense(units, activation="relu")(x)
out = tf.keras.layers.Dense(x_tr.shape[1], activation="linear")(x)

# Input and target are both the clean normal samples; the noise is injected by
# the GaussianNoise layer inside the model.
dae = tf.keras.Model(inp, out)
dae.compile(optimizer="adam", loss="mse")
dae.fit(x_norm, x_norm, epochs=EPOCHS, batch_size=BATCH, shuffle=True, verbose=0)

def _reconstruction_mse(model, samples):
    """Return the per-row mean squared reconstruction error of `model` on `samples`."""
    residual = model.predict(samples, verbose=0) - samples
    return np.square(residual).mean(axis=1)


# Reconstruction error on the training and test splits
mse_tr = _reconstruction_mse(dae, x_tr)
mse_te = _reconstruction_mse(dae, x_te)

# Fixed threshold: the THRESH_PCT-th percentile of the training errors;
# test rows whose error exceeds it are predicted as fraud (1)
thr = np.percentile(mse_tr, THRESH_PCT)
y_pred = (mse_te > thr).astype(int)

# Print the metrics through evaluation()
evaluation(y_te, y_pred, model_name=f"Denoising AE (thr={THRESH_PCT}%)")



Denoising AE (thr=99.9%) Evaluation:
         Accuracy: 0.9987828142738434
  Precision Score: 0.7619047619047619
     Recall Score: 0.43243243243243246
         F1 Score: 0.5517241379310345

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.76      0.43      0.55       148

    accuracy                           1.00     85443
   macro avg       0.88      0.72      0.78     85443
weighted avg       1.00      1.00      1.00     85443



# 新增區段