In [1]:
#import necessary package
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import kagglehub

# general setting. do not change TEST_SIZE
RANDOM_SEED = 42
TEST_SIZE = 0.3

In [2]:
# load dataset（from kagglehub）
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype(int)

# prepare data
data = data.drop(['Time'], axis=1)
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))


In [3]:
#fraud/Non-Fraud Transactions

fraud = data[data['Class'] == 1]
nonfraud = data[data['Class'] == 0]
print(f'Fraudulent:{len(fraud)}, non-fraudulent:{len(nonfraud)}')
print(f'the positive class (frauds) percentage: {len(fraud)}/{len(fraud) + len(nonfraud)} ({len(fraud)/(len(fraud) + len(nonfraud))*100:.3f}%)')


Fraudulent:492, non-fraudulent:284315
the positive class (frauds) percentage: 492/284807 (0.173%)


In [None]:
!pip install imbalanced-learn



In [15]:
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest

X, y = data.drop('Class', axis=1), data['Class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE,
    #stratify=y,
    random_state=RANDOM_SEED)

scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

pca = PCA(n_components=0.95, random_state=RANDOM_SEED)  # 保留 95% 解釋變異
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

iso = IsolationForest(contamination=0.01, random_state=RANDOM_SEED)
outlier_train = iso.fit_predict(X_train_pca).reshape(-1, 1)
outlier_test = iso.predict(X_test_pca).reshape(-1, 1)

# 組合成新的訓練與測試集
X_train_combined = np.concatenate([X_train_pca, outlier_train], axis=1)
X_test_combined = np.concatenate([X_test_pca, outlier_test], axis=1)

#y_iso_pred = iso.fit_predict(X_train_pca)

#保留異常樣本
# mask = y_iso_pred == 1
# X_train_combined = X_train_pca[mask]
# y_train = y_train.reset_index(drop=True)
# y_train_combined = y_train[mask].reset_index(drop=True)


In [8]:
!pip install optuna
!pip install xgboost

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.4.2 (from optuna)
  Downloading sqlalchemy-2.0.41-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting greenlet>=1 (from sqlalchemy>=1.4.2->optuna)
  Downloading greenlet-3.2.2-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.1-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.5/242.5 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sqlalchemy-2.0.41-cp311-cp311-manyli

In [23]:
# XGboost模型
# 使用 scale_pos_weight 對不平衡類別進行加權
from xgboost import XGBClassifier

#xgb_p = {'max_depth': 6, 'learning_rate': 0.13042046559796128, 'n_estimators': 239, 'subsample': 0.8734341721479638, 'colsample_bytree': 0.9573714080077769, 'scale_pos_weight': 0.39978732376580606, 'gamma': 0.14893186350468346}
xgb_p = {'max_depth': 10, 'learning_rate': 0.11469837208498773, 'n_estimators': 187, 'subsample': 0.6387559589864432, 'colsample_bytree': 0.5074817214741923, 'scale_pos_weight': 3.9961104352358396, 'gamma': 3.3674507273496967}
#xgb_p = {'max_depth': 10, 'learning_rate': 0.11909506754087697, 'n_estimators': 253, 'subsample': 0.8495263706521958, 'colsample_bytree': 0.5801064534999925, 'scale_pos_weight': 2.6261833195973074, 'gamma': 3.227701257413156}
xgb_model = XGBClassifier(
  **xgb_p,
  eval_metric='logloss',
  tree_method='hist',
  random_state=RANDOM_SEED

)
#xgb_model.fit(X_train_std, y_train)
xgb_model.fit(X_train_combined, y_train)

In [24]:
# evaluate
# define evaluation function
def evaluation(y_true, y_pred, model_name="Model"):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    print('         Accuracy:', accuracy)
    print('  Precision Score:', precision)
    print('     Recall Score:', recall)
    print('         F1 Score:', f1)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))


y_pred = xgb_model.predict(X_test_combined)

evaluation(y_test, y_pred, model_name="hybrid")


hybrid Evaluation:
         Accuracy: 0.9996605924417448
  Precision Score: 0.9572649572649573
     Recall Score: 0.8235294117647058
         F1 Score: 0.8853754940711462

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.96      0.82      0.89       136

    accuracy                           1.00     85443
   macro avg       0.98      0.91      0.94     85443
weighted avg       1.00      1.00      1.00     85443



In [7]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading mako-1.3.10-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.1-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.5/242.5 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading mako-1.3.10-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: 

In [17]:
import optuna
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 6, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 200, 350),
        'subsample': trial.suggest_float('subsample', 0.8, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.1, 10.0, log=True),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'eval_metric': 'logloss',
        'tree_method': 'hist',
        'random_state': RANDOM_SEED,
        #'use_label_encoder': False
    }

    model = XGBClassifier(**params)
    model.fit(X_train_combined, y_train)

    y_pred = model.predict(X_test_combined)
    return f1_score(y_test, y_pred)
study = optuna.create_study(direction='maximize')

study.optimize(objective, n_trials=50)  # 可調整試驗次數

print(" 最佳參數組合:", study.best_trial.params)
print(" 最佳 F1 分數:", study.best_value)

[I 2025-05-29 15:14:18,142] A new study created in memory with name: no-name-3f218aef-0521-4620-9fdc-e528a6825326
[I 2025-05-29 15:14:24,331] Trial 0 finished with value: 0.8455284552845529 and parameters: {'max_depth': 9, 'learning_rate': 0.1753812670940355, 'n_estimators': 303, 'subsample': 0.9479859625680532, 'colsample_bytree': 0.7682098199913296, 'scale_pos_weight': 0.9783297681944758, 'gamma': 2.066894755906623}. Best is trial 0 with value: 0.8455284552845529.
[I 2025-05-29 15:14:29,827] Trial 1 finished with value: 0.8333333333333334 and parameters: {'max_depth': 10, 'learning_rate': 0.15473029602414962, 'n_estimators': 341, 'subsample': 0.9245575168608494, 'colsample_bytree': 0.7110628387780951, 'scale_pos_weight': 0.5773582425990158, 'gamma': 1.0909151078150503}. Best is trial 0 with value: 0.8455284552845529.
[I 2025-05-29 15:14:35,760] Trial 2 finished with value: 0.8695652173913043 and parameters: {'max_depth': 6, 'learning_rate': 0.12849866631433254, 'n_estimators': 206, '

 最佳參數組合: {'max_depth': 8, 'learning_rate': 0.11909506754087697, 'n_estimators': 253, 'subsample': 0.8495263706521958, 'colsample_bytree': 0.5801064534999925, 'scale_pos_weight': 2.6261833195973074, 'gamma': 3.227701257413156}
 最佳 F1 分數: 0.8774703557312253


In [14]:
import optuna
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, precision_score
from sklearn.model_selection import StratifiedKFold
import numpy as np

def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 6, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 200, 300),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 50),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'eval_metric': 'logloss',
        'random_state': RANDOM_SEED,
    }

    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_SEED)
    scores = []

    for train_idx, val_idx in skf.split(X_train_combined, y_train_combined):
        X_train_cv, X_val_cv = X_train_combined[train_idx], X_train_combined[val_idx]
        y_train_cv, y_val_cv = y_train_combined[train_idx], y_train_combined[val_idx]

        model = XGBClassifier(**params)
        model.fit(X_train_cv, y_train_cv)

        proba = model.predict_proba(X_val_cv)[:, 1]
        preds = (proba > 0.43).astype(int)

        f1 = f1_score(y_val_cv, preds)
        precision = precision_score(y_val_cv, preds)
        score = f1 + precision  # 可改為 weighted sum 或 harmonic mean
        scores.append(score)

    return np.mean(scores)
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print("最佳參數：", study.best_params)
print("最佳綜合評分：", study.best_value)

[I 2025-05-29 15:13:57,169] A new study created in memory with name: no-name-34a960b1-97cf-4951-81df-361f85be5dbd
[W 2025-05-29 15:13:57,173] Trial 0 failed with parameters: {'max_depth': 6, 'learning_rate': 0.19531621054615458, 'n_estimators': 233, 'subsample': 0.8291362057136018, 'colsample_bytree': 0.6885010990263384, 'scale_pos_weight': 16.47718857758229, 'gamma': 3.3615561668770546} because of the following error: NameError("name 'y_train_combined' is not defined").
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "<ipython-input-14-459dd01aecbf>", line 23, in objective
    for train_idx, val_idx in skf.split(X_train_combined, y_train_combined):
                                                          ^^^^^^^^^^^^^^^^
NameError: name 'y_train_combined' is not defined
[W 2025-05-29 15:13:57,175] Trial 0 failed with value N

NameError: name 'y_train_combined' is not defined