In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import kagglehub
from xgboost import XGBClassifier, callback

# general setting. do not change TEST_SIZE
RANDOM_SEED = 42
TEST_SIZE = 0.3

In [4]:
# load dataset（from kagglehub）
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype(int)

In [5]:
# prepare data
data = data.drop(['Time'], axis=1)
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))

fraud = data[data['Class'] == 1]
nonfraud = data[data['Class'] == 0]
print(f'Fraudulent:{len(fraud)}, non-fraudulent:{len(nonfraud)}')
print(f'the positive class (frauds) percentage: {len(fraud)}/{len(fraud) + len(nonfraud)} ({len(fraud)/(len(fraud) + len(nonfraud))*100:.3f}%)')


Fraudulent:492, non-fraudulent:284315
the positive class (frauds) percentage: 492/284807 (0.173%)


- Time 拿掉
- Amount 限制範圍在 -1 ~ 1
- Class 前面被轉為 int，只有 1 / 0


In [6]:
# define evaluation function
def evaluation(y_true, y_pred, model_name="Model"):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    print('         Accuracy:', accuracy)
    print('  Precision Score:', precision)
    print('     Recall Score:', recall)
    print('         F1 Score:', f1)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))


# RandomForest

測試結果：
- depth 8~10 差異不大，但似乎 8 是最佳
- 數的數量大概 500 就好，因為在 8 差不多是極限
- subsample 比例 0.7 目前最好，繼續向上或往下都會掉下去
- colsample_bytree 用 1 會是最好
- learning_rate 其實就直接抓最小，目前還沒遇到過擬合

## 測試最佳參數

In [60]:
X = np.asarray(data.iloc[:,~data.columns.isin(['Class'])])
Y = np.asarray(data.iloc[:, data.columns == 'Class'])

# split training set and data set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED)


# 準備收集結果的 list
param_grid = [
    # {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.05, 'subsample': 0.8, 'colsample_bytree': 0.8},
    # {'n_estimators': 220, 'max_depth': 6, 'learning_rate': 0.12, 'subsample': 0.8, 'colsample_bytree': 1.0},
    # {'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.10, 'subsample': 0.8, 'colsample_bytree': 0.8},
    # {'n_estimators': 300, 'max_depth': 8, 'learning_rate': 0.10, 'subsample': 0.8, 'colsample_bytree': 0.8},
    # {'n_estimators': 500, 'max_depth': 10, 'learning_rate': 0.01, 'subsample': 0.8, 'colsample_bytree': 0.8},
    # {'n_estimators': 500, 'max_depth': 8, 'learning_rate': 0.02, 'subsample': 0.8, 'colsample_bytree': 0.8},
    # {'n_estimators': 500, 'max_depth': 10, 'learning_rate': 0.05, 'subsample': 0.8, 'colsample_bytree': 0.8},
    # {'n_estimators': 500, 'max_depth': 8, 'learning_rate': 0.02, 'subsample': 0.8, 'colsample_bytree': 1.0},
    # {'n_estimators': 500, 'max_depth': 8, 'learning_rate': 0.02, 'subsample': 1.0, 'colsample_bytree': 1.0},
    # {'n_estimators': 600, 'max_depth': 8, 'learning_rate': 0.02, 'subsample': 0.8, 'colsample_bytree': 1.0},
    {'n_estimators': 500, 'max_depth': 8, 'learning_rate': 0.02, 'subsample': 0.7, 'colsample_bytree': 1.0},
    {'n_estimators': 500, 'max_depth': 8, 'learning_rate': 0.02, 'subsample': 0.75, 'colsample_bytree': 1.0},
    {'n_estimators': 500, 'max_depth': 8, 'learning_rate': 0.02, 'subsample': 0.65, 'colsample_bytree': 1.0},
    {'n_estimators': 500, 'max_depth': 8, 'learning_rate': 0.02, 'subsample': 0.725, 'colsample_bytree': 1.0},
]

results = []
for params in param_grid:
    # 建立 GPU 版 XGBoost
    model = XGBClassifier(
        tree_method='hist',
        device='cuda',
        scale_pos_weight=2,
        gamma=0.05,
        eval_metric='logloss',
        random_state=42,
        **params
    )
    model.fit(X_train, y_train)
    # 預測
    y_pred = model.predict(X_test)
    # 計算指標
    results.append({
        **params,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred)
    })
    # evaluation(y_test, y_pred, model_name="XGB:")

# 將結果放入 DataFrame
df_results = pd.DataFrame(results)
print(df_results)

   n_estimators  max_depth  learning_rate  subsample  colsample_bytree  \
0           500          8           0.02      0.700               1.0   
1           500          8           0.02      0.750               1.0   
2           500          8           0.02      0.650               1.0   
3           500          8           0.02      0.725               1.0   

   Accuracy  Precision    Recall        F1  
0  0.999707   0.966387  0.845588  0.901961  
1  0.999661   0.934959  0.845588  0.888031  
2  0.999696   0.958333  0.845588  0.898438  
3  0.999684   0.950413  0.845588  0.894942  


In [59]:
xgb_best = XGBClassifier(
    tree_method='hist',
    device='cuda',
    n_estimators=500,
    max_depth=8,
    learning_rate=0.02,
    subsample=0.7,
    colsample_bytree=1.0,
    scale_pos_weight=2,
    gamma=0.05,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=RANDOM_SEED
)
xgb_best.fit(X_train, y_train)
y_pred_best = xgb_best.predict(X_test)

evaluation(y_test, y_pred_best, model_name="XGB")

Parameters: { "use_label_encoder" } are not used.




XGB Evaluation:
         Accuracy: 0.9997074072773662
  Precision Score: 0.9663865546218487
     Recall Score: 0.8455882352941176
         F1 Score: 0.9019607843137255

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.97      0.85      0.90       136

    accuracy                           1.00     85443
   macro avg       0.98      0.92      0.95     85443
weighted avg       1.00      1.00      1.00     85443



原始數據：
```
Random Forest Evaluation:
=============================================
         Accuracy: 0.9996137776061234
  Precision Score: 0.9478260869565217
     Recall Score: 0.8014705882352942
         F1 Score: 0.8685258964143426

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.95      0.80      0.87       136

    accuracy                           1.00     85443
   macro avg       0.97      0.90      0.93     85443
weighted avg       1.00      1.00      1.00     85443
```

# KMeans

In [38]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, silhouette_score
)
from scipy.stats import mode
import kagglehub

# --- Load and preprocess data ---
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data = data.drop(columns=["Time"])
data["Amount"] = StandardScaler().fit_transform(data[["Amount"]])

X = data.drop(columns=["Class"]).values
y = data["Class"].astype(int).values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y
)

In [51]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import silhouette_score

# --- Settings ---
RANDOM_SEED = 42
TEST_SIZE = 0.3


# 再次標準化特徵
data_scaler = StandardScaler()
X_train = data_scaler.fit_transform(X_train)
X_test  = data_scaler.transform(X_test)

# --- 使用正常樣本計算最佳 k ---
# 抽取前 1000 筆正常樣本
norm_samples = X_train[y_train == 0][:1000]
# Silhouette 分數列表
sil_scores = []
for k in range(2, 10):
    km = KMeans(
        n_clusters=k,
        init='k-means++',
        random_state=RANDOM_SEED
    )
    labels = km.fit_predict(norm_samples)
    sil_scores.append(silhouette_score(norm_samples, labels))
# 選出最佳 k
optimal_k = np.argmax(sil_scores) + 2
print(f"Optimal k (2-10) by Silhouette: {optimal_k}")

km_final = KMeans(
    n_clusters=optimal_k,
    init='k-means++',
    n_init=30,
    max_iter=700,
    tol=1e-4,
    random_state=RANDOM_SEED
)
km_final.fit(norm_samples)

# 在測試集上分群並對齊標籤
labels_test = km_final.predict(X_test)

def align_labels(y_true, y_pred, n_clusters):
    aligned = np.zeros_like(y_pred)
    for i in range(n_clusters):
        mask = (y_pred == i)
        if np.any(mask):
            aligned[mask] = np.bincount(y_true[mask]).argmax()
    return aligned

y_pred_aligned = align_labels(y_test, labels_test, optimal_k)

Optimal k (2-10) by Silhouette: 8


In [52]:
def evaluation(y_true, y_pred, model_name="Model"):
   accuracy = accuracy_score(y_true, y_pred)
   precision = precision_score(y_true, y_pred, zero_division=0)
   recall = recall_score(y_true, y_pred)
   f1 = f1_score(y_true, y_pred)

   print(f'\n{model_name} Evaluation:')
   print('===' * 15)
   print('         Accuracy:', accuracy)
   print('  Precision Score:', precision)
   print('     Recall Score:', recall)
   print('         F1 Score:', f1)
   print("\nClassification Report:")
   print(classification_report(y_true, y_pred))

evaluation(y_test, y_pred, model_name="KMeans (Unsupervised)")


KMeans (Unsupervised) Evaluation:
         Accuracy: 0.9989817773252344
  Precision Score: 0.8144329896907216
     Recall Score: 0.5337837837837838
         F1 Score: 0.6448979591836734

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.81      0.53      0.64       148

    accuracy                           1.00     85443
   macro avg       0.91      0.77      0.82     85443
weighted avg       1.00      1.00      1.00     85443

