# 資料處理

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import kagglehub

# general setting. do not change TEST_SIZE
# 這個不能動
RANDOM_SEED = 42
TEST_SIZE = 0.3

################################ 資料處理 ####################################

# load dataset（from kagglehub）
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
# 0 for nonfraud 1 for fraud
data["Class"] = data["Class"].astype(int)

# prepare data
data = data.drop(["Time"], axis=1)  # 去除 time
data["Amount"] = StandardScaler().fit_transform(
    data["Amount"].values.reshape(-1, 1)
)  # 標準化

# 計算詐騙和正常交易的數量 (資料集極度不平衡)
fraud = data[data["Class"] == 1]
nonfraud = data[data["Class"] == 0]
print(f"Fraudulent:{len(fraud)}, non-fraudulent:{len(nonfraud)}")
print(
    f"the positive class (frauds) percentage: {len(fraud)}/{len(fraud) + len(nonfraud)} ({len(fraud)/(len(fraud) + len(nonfraud))*100:.3f}%)"
)

# 選擇非class的值轉成numpy array 且如果原本就是np array時不複製
X = np.asarray(data.iloc[:, ~data.columns.isin(["Class"])])
# 最佳化為 Pandas → NumPy 的不複製轉換
Y = data["Class"].to_numpy()

# split training set and data set
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED
)

# 非監督

In [None]:
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras import regularizers
from sklearn.metrics import (
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
import numpy as np
import random
import tensorflow as tf

SEED = 42
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)

# 只拿正常樣本訓練
X_train_auto = X_train[y_train == 0]

# 建立 AutoEncoder 結構
input_dim = X_train.shape[1]
encoding_dim = 18  # 壓縮
hidden_dim = int(encoding_dim / 2)

input_layer = Input(shape=(input_dim,))

# Encoder
encoder = Dense(29, activation="tanh", activity_regularizer=regularizers.l1(1e-5))(
    input_layer
)
encoder = Dense(17, activation="tanh")(encoder)
# 取出最重要的8個特徵
latent_output = Dense(8, activation="relu")(encoder)

# Decoder
decoder = Dense(17, activation="relu")(latent_output)
decoder = Dense(29, activation="tanh")(decoder)
decoder = Dense(input_dim, activation="linear")(decoder)  # 回到原始維度


autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer="adam", loss="mean_squared_error")

# 建出 Encoder 模型
encoder_model = Model(inputs=input_layer, outputs=latent_output)

def evaluation(y_true, y_pred, model_name="Model"):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f"\n{model_name} Evaluation:")
    print("===" * 15)
    print("         Accuracy:", accuracy)
    print("  Precision Score:", precision)
    print("     Recall Score:", recall)
    print("         F1 Score:", f1)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))


# 訓練
autoencoder.fit(
    X_train_auto,
    X_train_auto,
    epochs=40,
    batch_size=64,
    shuffle=True,
    validation_split=0.1,
    verbose=1,
)

X_test_pred = autoencoder.predict(X_test)
mse = np.mean(np.power(X_test - X_test_pred, 2), axis=1)

for c in [99.8, 99.825, 99.85, 99.875, 99.9]:
    threshold = np.percentile(mse, c)
    y_pred = (mse > threshold).astype(int)

    evaluation(y_test, y_pred, model_name="AutoEncoder")


# 計算 reconstruction error (可加到特徵中)
X_train_pred = autoencoder.predict(X_train)
X_test_pred = autoencoder.predict(X_test)

# 和原資料合併
X_train_latent = encoder_model.predict(X_train)
X_test_latent = encoder_model.predict(X_test)


train_mse = np.mean(np.power(X_train - X_train_pred, 2), axis=1).reshape(-1, 1)
test_mse = np.mean(np.power(X_test - X_test_pred, 2), axis=1).reshape(-1, 1)

# 加上 AE 重建誤差作為額外特徵
X_train_aug = np.hstack([X_train, train_mse])
X_test_aug = np.hstack([X_test, test_mse])

Epoch 1/40
[1m2799/2799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 0.6489 - val_loss: 0.3627
Epoch 2/40
[1m2799/2799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 0.3389 - val_loss: 0.3006
Epoch 3/40
[1m2799/2799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 0.2953 - val_loss: 0.2806
Epoch 4/40
[1m2799/2799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 0.2762 - val_loss: 0.2723
Epoch 5/40
[1m2799/2799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 0.2638 - val_loss: 0.2577
Epoch 6/40
[1m2799/2799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 0.2530 - val_loss: 0.2475
Epoch 7/40
[1m2799/2799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 0.2425 - val_loss: 0.2371
Epoch 8/40
[1m2799/2799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 0.2340 - val_loss: 0.2315
Epoch 9/40
[1m2799/2799

# 監督

In [85]:
from xgboost import XGBClassifier

########################## train model ################################

xgb_model = XGBClassifier(
    n_estimators=150,
    max_depth=8,
    # 處理資料集不平衡
    scale_pos_weight=len(nonfraud) / len(fraud),
    eval_metric="logloss",
    learning_rate=0.1,
    random_state=RANDOM_SEED,  # 42
)

xgb_model.fit(
    X_train_aug,
    y_train,
)

y_proba = xgb_model.predict_proba(X_test_aug)[:, 1]


def evaluation(y_true, y_pred, model_name="Model"):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f"\n{model_name} Evaluation:")
    print("===" * 15)
    print("         Accuracy:", accuracy)
    print("  Precision Score:", precision)
    print("     Recall Score:", recall)
    print("         F1 Score:", f1)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))


# 嘗試不同 threshold
for threshold in [0.9, 0.95, 0.96, 0.97, 0.975, 0.98, 0.99, 0.995]:
    y_pred_thresh = (y_proba > threshold).astype(int)
    evaluation(y_test, y_pred_thresh, model_name=f"XGB-threshold {threshold:.3f} ")

    y_train_pred = xgb_model.predict(X_train_aug)
    y_test_pred = xgb_model.predict(X_test_aug)

    f1_train = f1_score(y_train, y_train_pred)
    f1_test = f1_score(y_test, y_test_pred)

    print("Train F1:", f1_train)
    print("Test  F1:", f1_test)


XGB-threshold 0.900  Evaluation:
         Accuracy: 0.9996839998595555
  Precision Score: 0.9658119658119658
     Recall Score: 0.8308823529411765
         F1 Score: 0.8932806324110671

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.97      0.83      0.89       136

    accuracy                           1.00     85443
   macro avg       0.98      0.92      0.95     85443
weighted avg       1.00      1.00      1.00     85443

Train F1: 1.0
Test  F1: 0.8679245283018868

XGB-threshold 0.950  Evaluation:
         Accuracy: 0.9996722961506501
  Precision Score: 0.9655172413793104
     Recall Score: 0.8235294117647058
         F1 Score: 0.8888888888888888

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.97      0.82      0.89       136

    accuracy                           1.00   

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

params = {
    "n_estimators": [50, 75, 100, 125, 150],
    "max_depth": [6, 7, 8, 9],
    "learning_rate": [0.05, 0.1],
    "scale_pos_weight": [len(nonfraud) / len(fraud), len(nonfraud) * 2 / len(fraud)],
}

grid = GridSearchCV(
    XGBClassifier(random_state=42), param_grid=params, scoring="f1", cv=3, n_jobs=-1
)

grid.fit(X_train_aug, y_train)

print("Best Parameters:", grid.best_params_)