In [None]:
# ex1.ipynb (as .py script for compatibility)

# 匯入必要套件
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, classification_report, silhouette_score
)
import kagglehub

# General settings shared by the supervised and unsupervised sections
RANDOM_SEED = 42
TEST_SIZE = 0.3

# Download the dataset through kagglehub and load the CSV it contains
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
csv_file = f"{path}/creditcard.csv"
data = pd.read_csv(csv_file)
data['Class'] = data['Class'].astype(int)

# Preprocessing: drop the 'Time' column and standardise 'Amount' in place.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before any
# train/test split happens further down.
data = data.drop(columns=['Time'])
amount_column = data['Amount'].to_numpy().reshape(-1, 1)
data['Amount'] = StandardScaler().fit_transform(amount_column)

# Report the fraud / non-fraud row counts and the positive-class percentage
fraud = data[data['Class'] == 1]
nonfraud = data[data['Class'] == 0]
fraud_count = len(fraud)
nonfraud_count = len(nonfraud)
print(f"Fraudulent: {fraud_count}, Non-fraudulent: {nonfraud_count}")
positive_pct = fraud_count / (fraud_count + nonfraud_count) * 100
print(f"Positive class ratio: {positive_pct:.3f}%")

# Supervised learning - Random Forest
X = np.asarray(data.drop(columns=['Class']))
Y = np.asarray(data['Class'])

# Stratify on the label so the train and test sets keep the same fraud ratio.
# Fraud rows are a small minority, so an unstratified split can leave the test
# set with an unrepresentative number of positives. This also keeps the split
# consistent with the stratified split used for the KMeans section.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=Y)

# Fit the forest on the training portion and predict the held-out portion
rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

def evaluate(y_true, y_pred, model_name):
    """Print accuracy, precision, recall, F1 and a classification report.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned element-wise with ``y_true``.
        model_name: Name shown in the header line of the printed section.

    Returns:
        None. All results are written to standard output.
    """
    print(f"\n{model_name} Evaluation")
    print("=" * 45)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    # zero_division=0 reports 0 when a metric's denominator is empty (for
    # example when no sample is predicted as positive) without raising an
    # UndefinedMetricWarning; the reported value matches the library default.
    print("Precision:", precision_score(y_true, y_pred, zero_division=0))
    print("Recall:", recall_score(y_true, y_pred, zero_division=0))
    print("F1 Score:", f1_score(y_true, y_pred, zero_division=0))
    print("\nClassification Report:\n",
          classification_report(y_true, y_pred, zero_division=0))

# Report the Random Forest results on its held-out test set
evaluate(y_test, y_pred_rf, "Random Forest")

# Unsupervised learning - KMeans
feature_frame = data.drop(columns=['Class'])
X_unsup = np.asarray(feature_frame)
y_unsup = np.asarray(data['Class'])

# Stratified split with the same test size and seed as the rest of the script
unsup_split = train_test_split(
    X_unsup,
    y_unsup,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
    stratify=y_unsup,
)
x_train_unsup, x_test_unsup, y_train_unsup, y_test_unsup = unsup_split

# Fit the scaler on the training rows only, then apply it to both partitions
scaler = StandardScaler()
scaler.fit(x_train_unsup)
x_train_unsup = scaler.transform(x_train_unsup)
x_test_unsup = scaler.transform(x_test_unsup)

# Use the first 1000 non-fraud training samples as the KMeans training data
n_x_train = x_train_unsup[y_train_unsup == 0][:1000]

# Choose K by silhouette score. Each candidate model is kept so the winner
# does not have to be fitted a second time (same data, k and seed would give
# the same model anyway).
k_candidates = range(2, 5)
scores = []
fitted_models = {}
for k in k_candidates:
    candidate = KMeans(n_clusters=k, random_state=RANDOM_SEED)
    candidate.fit(n_x_train)
    score = silhouette_score(n_x_train, candidate.labels_)
    scores.append(score)
    fitted_models[k] = candidate

# Look the best k up in the candidate range itself instead of adding a
# hard-coded offset, so changing the range cannot desynchronise the two.
optimal_k = k_candidates[int(np.argmax(scores))]
kmeans = fitted_models[optimal_k]

# Assign every test sample to its nearest cluster
y_pred_kmeans = kmeans.predict(x_test_unsup)

# 對齊 KMeans 標籤與實際標籤
def align_labels(y_true, y_pred, n_clusters):
    """Relabel cluster ids with the majority true class inside each cluster.

    Args:
        y_true: Array of non-negative integer class labels.
        y_pred: Array of cluster ids, aligned element-wise with ``y_true``.
        n_clusters: Cluster ids ``0 .. n_clusters - 1`` are remapped.

    Returns:
        An array shaped and typed like ``y_pred`` in which every member of a
        cluster carries that cluster's most frequent true label. Positions
        whose cluster id is outside the remapped range stay 0.
    """
    mapped = np.zeros_like(y_pred)
    for cluster_id in range(n_clusters):
        members = y_pred == cluster_id
        if not np.any(members):
            # Empty cluster: nothing to relabel.
            continue
        # bincount counts each true label; argmax picks the most frequent one
        # (the smallest label wins a tie).
        majority_label = np.bincount(y_true[members]).argmax()
        mapped[members] = majority_label
    return mapped

# Map each cluster to its majority true class, then score the result with the
# same report used for the Random Forest.
kmeans_title = f"KMeans (k={optimal_k})"
y_pred_kmeans_aligned = align_labels(
    y_test_unsup, y_pred_kmeans, optimal_k
)
evaluate(y_test_unsup, y_pred_kmeans_aligned, kmeans_title)


Fraudulent:16, non-fraudulent:9984
the positive class (frauds) percentage: 16/10000 (0.160%)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2993
           1       1.00      0.29      0.44         7

    accuracy                           1.00      3000
   macro avg       1.00      0.64      0.72      3000
weighted avg       1.00      1.00      1.00      3000

