In [2]:
# !pip install kagglehub
# !pip install ipywidgets

## Import Necessary Packages

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (silhouette_score, accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, classification_report, precision_recall_curve)
from sklearn.cluster import KMeans
from xgboost import XGBClassifier
import kagglehub
import optuna

# General settings shared by the cells below. Do not change TEST_SIZE.
RANDOM_SEED = 42  # passed as random_state to train_test_split, PCA and KMeans
TEST_SIZE = 0.3  # passed as test_size to train_test_split

## Load Dataset & Prepare Data

In [4]:
# Download the credit-card fraud dataset through kagglehub and read the CSV.
# The label column is cast to int and the 'Time' column is dropped up front.
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
print(path)
data = (
    pd.read_csv(f"{path}/creditcard.csv")
    .astype({'Class': int})
    .drop(columns=['Time'])
)

# Standardise 'Amount' with a throwaway scaler fitted on every row of the
# frame (this happens before any train/test split is made).
data['Amount'] = StandardScaler().fit_transform(data[['Amount']].to_numpy())

/home/u7539525/.cache/kagglehub/datasets/mlg-ulb/creditcardfraud/versions/3


## Fraud/Non-Fraud Transactions

In [5]:
# Partition the rows by label and report how rare the positive (fraud) class is.
is_fraud = data['Class'] == 1
is_nonfraud = data['Class'] == 0
fraud, nonfraud = data[is_fraud], data[is_nonfraud]

n_fraud, n_nonfraud = len(fraud), len(nonfraud)
n_total = n_fraud + n_nonfraud
print(f'Fraudulent:{n_fraud}, non-fraudulent:{n_nonfraud}')
print(f'the positive class (frauds) percentage: {n_fraud}/{n_total} ({n_fraud/n_total*100:.3f}%)')

Fraudulent:492, non-fraudulent:284315
the positive class (frauds) percentage: 492/284807 (0.173%)


In [6]:
# define evaluation function
def evaluation(y_true, y_pred, model_name="Model"):
    """Print headline classification metrics followed by the full report.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, one per entry of ``y_true``.
        model_name: Label shown in the heading line of the printout.

    Returns:
        None. Everything is written to stdout.
    """
    # All four scores are computed before anything is printed, so a metric
    # that raises leaves no partial output behind.
    metric_rows = (
        ('         Accuracy:', accuracy_score(y_true, y_pred)),
        ('  Precision Score:', precision_score(y_true, y_pred)),
        ('     Recall Score:', recall_score(y_true, y_pred)),
        ('         F1 Score:', f1_score(y_true, y_pred)),
    )

    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    for label, value in metric_rows:
        print(label, value)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

## 監督式學習 (XGBoost)

**Baseline**:

```
Random forest Evaluation:
=============================================
         Accuracy: 0.9996371850239341
  Precision Score: 0.9411764705882353
     Recall Score: 0.8235294117647058
         F1 Score: 0.8784313725490196

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.94      0.82      0.88       136

    accuracy                           1.00     85443
   macro avg       0.97      0.91      0.94     85443
weighted avg       1.00      1.00      1.00     85443
```

### 說明

XGBoost 是使用 Gradient Boosting 方式，依序訓練多個決策樹，每棵新的樹都會對前一棵樹進行學習跟修正。相較於 Random Forest 的每棵樹彼此獨立，XGBoost 有更高的 accuracy，並且訓練效率更高。

參數調整：

- `enable_categorical`: 啟用 XGBoost 對類別型（categorical）特徵的支援，即下文所稱的「分類模式」。
- `n_estimators`: 經過多組參數測試，設置 250 的效果最好，設更高結果不再提升。
- `tree_method`: 分類模式需要使用 `approx` 或 `hist` 演算法，前者兼顧效率與準確度。
- `device`: 使用 GPU 加速計算。
- `learning_rate`: 預設值是 0.3，在 40 步之後開始出現 overfitting 的現象，因此調降為 0.1。
- `n_jobs`: -1 表示用所有 CPU 核心進行平行計算。

### 結果

四項指標皆有提升：

|      指標       | Baseline |   My Model   |
|:---------------:|:--------:|:------------:|
|    Accuracy     | 0.999637 | **0.999672** |
| Precision Score | 0.941176 | **0.950000** |
|  Recall Score   | 0.823529 | **0.838235** |
|    F1 Score     | 0.878431 | **0.890625** |

In [7]:
# Separate the feature matrix from the target labels.
X = np.asarray(data.drop(columns=['Class']))
Y = np.asarray(data['Class'])  # 1-D array

# Split into a training set and a test set.
# NOTE(review): this split is not stratified; the baseline quoted in the
# markdown reports the same class-1 support (136), so it is kept as is to
# leave the comparison like-for-like.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

model = XGBClassifier(
    # NOTE(review): X is a plain NumPy array, so presumably no column is
    # treated as categorical -- confirm this flag is actually needed.
    enable_categorical=True,
    n_estimators=250,
    tree_method='approx',
    device='cuda',  # trains on the GPU; needs a CUDA-capable environment
    learning_rate=0.1,
    n_jobs=-1,
    # Seed the model like every other stochastic step in this notebook so a
    # Restart & Run All stays reproducible even if sampling options are added.
    random_state=RANDOM_SEED,
)

# Train
model.fit(X_train, y_train)

# Predict
# NOTE(review): the recorded output contains XGBoost's device-mismatch hint,
# presumably because the booster is on 'cuda' while X_test is a host array.
y_pred = model.predict(X_test)
evaluation(y_test, y_pred, model_name="XGBoost")

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.





XGBoost Evaluation:
         Accuracy: 0.9996722961506501
  Precision Score: 0.95
     Recall Score: 0.8382352941176471
         F1 Score: 0.890625

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.95      0.84      0.89       136

    accuracy                           1.00     85443
   macro avg       0.97      0.92      0.95     85443
weighted avg       1.00      1.00      1.00     85443



## 非監督式學習(KMeans)

Baseline:

```
KMeans (Unsupervised) Evaluation:
=============================================
         Accuracy: 0.9987242957293166
  Precision Score: 0.782608695652174
     Recall Score: 0.36486486486486486
         F1 Score: 0.4976958525345622

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.78      0.36      0.50       148

    accuracy                           1.00     85443
   macro avg       0.89      0.68      0.75     85443
weighted avg       1.00      1.00      1.00     85443
```

### 調整方法

雖然資料集本身已經過 PCA 處理，但或許還能再去除較不重要的資料，提升判斷的準確性。在嘗試多種參數後，設置 `n_components=0.95` 的提升最大，保留了 27 維的資料。

### 結果

四項指標皆有提升：

|      指標       | Baseline |   My Model   |
|:---------------:|:--------:|:------------:|
|    Accuracy     | 0.998724 | **0.998748** |
| Precision Score | 0.782609 | **0.788732** |
|  Recall Score   | 0.364865 | **0.378378** |
|    F1 Score     | 0.497696 | **0.511416** |

In [12]:
from sklearn.decomposition import PCA

# Extract features and labels
X = np.asarray(data.drop(columns=['Class']))
y = np.asarray(data['Class'])

# Split the dataset into training and testing sets (with stratification)
x_train, x_test, y_train, y_test = train_test_split(
   X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y
)

# Standardise every feature; the scaler is fitted on the training rows only
# and then applied unchanged to the test rows.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Reduce dimensionality; n_components=0.95 asks PCA to keep enough components
# to explain 95% of the training variance. The projection is fitted on the
# training rows only.
pca = PCA(n_components=0.95, random_state=RANDOM_SEED)
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

print("保留維度數量:", x_train_pca.shape[1])

# Select a small sample of normal (non-fraud) data for unsupervised training
N_NORMAL_SAMPLES = 1000
n_x_train = x_train_pca[y_train == 0][:N_NORMAL_SAMPLES]

# Fit one KMeans per candidate k and score it with the silhouette coefficient.
# Each fitted model is kept so the winner can be reused directly: the model
# that predicts is then exactly the one that was scored, with no second fit.
candidate_ks = list(range(2, 5))
models = []
scores = []
for k in candidate_ks:
   candidate = KMeans(n_clusters=k, init='k-means++', random_state=RANDOM_SEED)
   candidate.fit(n_x_train)
   score = silhouette_score(n_x_train, candidate.labels_)
   models.append(candidate)
   scores.append(score)

# Look the winner up in candidate_ks instead of adding a hard-coded offset,
# so changing the candidate range cannot silently select the wrong k.
best_idx = int(np.argmax(scores))
optimal_k = candidate_ks[best_idx]
kmeans = models[best_idx]
y_pred_test = kmeans.predict(x_test_pca)

def align_labels(y_true, y_pred, n_clusters):
   """Relabel cluster ids with the majority true class of each cluster.

   Args:
       y_true: Array of non-negative integer class labels.
       y_pred: Array of cluster ids, one per entry of ``y_true``.
       n_clusters: Cluster ids ``0 .. n_clusters - 1`` are examined.

   Returns:
       Array shaped and typed like ``y_pred`` in which every point of an
       examined cluster carries the most frequent ``y_true`` value found in
       that cluster. Points of empty or unexamined clusters stay 0.
   """
   aligned = np.zeros_like(y_pred)
   for cluster_id in range(n_clusters):
      members = (y_pred == cluster_id)
      if not np.any(members):
         # Nothing assigned to this cluster; the default 0 (normal) stands.
         continue
      class_counts = np.bincount(y_true[members])
      aligned[members] = class_counts.argmax()
   return aligned

# Turn cluster ids into class labels, then score them against the test labels.
# NOTE(review): align_labels is given y_test, so the cluster-to-class mapping
# is derived from the same labels the predictions are evaluated against.
y_pred_aligned = align_labels(y_test, y_pred_test, optimal_k)
evaluation(y_test, y_pred_aligned, model_name="KMeans (Unsupervised)")


Isolation Forest (Unsupervised) Evaluation:
         Accuracy: 0.9949205903350772
  Precision Score: 0.15789473684210525
     Recall Score: 0.44594594594594594
         F1 Score: 0.2332155477031802

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.16      0.45      0.23       148

    accuracy                           0.99     85443
   macro avg       0.58      0.72      0.62     85443
weighted avg       1.00      0.99      1.00     85443

