In [2]:
#hw 1-2 my best solution
import numpy as np
import pandas as pd
import kagglehub
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import (
    silhouette_score, accuracy_score, precision_score,
    recall_score, f1_score, classification_report
)

# ===[ Fixed parameters ]===
# Seed shared by the train/test split, PCA and every KMeans fit.
RANDOM_SEED = 42
# Fraction of the data held out as the test set.
TEST_SIZE = 0.3
# Maximum number of normal (Class == 0) training rows used to fit PCA and KMeans.
N_NORMAL_SEED = 1000
# NOTE(review): not referenced by the code visible in this cell — confirm it is still needed.
N_FRAUD_SEED = 100
# Number of principal components kept by PCA.
N_PCA_COMPONENTS = 10
# Percentile of the test-set nearest-centroid distances used as the global anomaly cutoff.
CONTAMINATION_PERCENTILE = 99.4  # raised to improve precision
# Per-cluster cutoff is mean + this many standard deviations of the cluster's distances.
CLUSTER_STD_MULTIPLIER = 4.0


# ===[ Data loading and preprocessing ]===
# Download the Kaggle credit-card fraud dataset and read its CSV, casting the
# label to int and discarding the 'Time' column.
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
df = (
    pd.read_csv(f"{path}/creditcard.csv")
    .astype({'Class': int})
    .drop(columns=['Time'])
)

# Standardise the transaction amount to zero mean / unit variance.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test rows contribute to the scaling statistics — confirm this is intended.
amount_scaler = StandardScaler()
df['Amount'] = amount_scaler.fit_transform(df[['Amount']].to_numpy()).ravel()

# Label vector and feature matrix (every remaining column except the label).
y = df['Class'].to_numpy()
X = df.drop(columns=['Class']).to_numpy()

# Stratified hold-out split so both parts keep the same class ratio.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
    stratify=y,
)

# Train on normal transactions only: keep the first N_NORMAL_SEED rows of the
# training split whose label is 0.
normal_rows = np.flatnonzero(y_train == 0)[:N_NORMAL_SEED]
x_train_normal = x_train[normal_rows]

# PCA dimensionality reduction: fit on the normal training rows, then project
# the whole test split with the same fitted components.
pca = PCA(random_state=RANDOM_SEED, n_components=N_PCA_COMPONENTS)
x_train_pca = pca.fit_transform(x_train_normal)
x_test_pca = pca.transform(x_test)

# Choose k automatically: try k = 2..8 and keep the first k with the highest
# silhouette score on the PCA-projected training rows.
best_k, best_score = 2, -1
for n_clusters in range(2, 9):
    candidate = KMeans(
        n_clusters=n_clusters, init='k-means++', random_state=RANDOM_SEED
    ).fit(x_train_pca)
    candidate_score = silhouette_score(x_train_pca, candidate.labels_)
    if not candidate_score > best_score:
        continue
    best_k, best_score = n_clusters, candidate_score
print(f" 最適合的 K: {best_k}（silhouette score = {best_score:.4f}）")

# Fit the final KMeans model with the selected number of clusters.
kmeans = KMeans(
    n_clusters=best_k, init='k-means++', random_state=RANDOM_SEED
).fit(x_train_pca)

# Distance from every test point to every centroid; each point is assigned to
# its nearest centroid and keeps that nearest distance as its anomaly score.
cluster_dists = kmeans.transform(x_test_pca)
min_dists = np.min(cluster_dists, axis=1)
assigned_cluster = np.argmin(cluster_dists, axis=1)

# Two-gate anomaly rule. A point is flagged only if its distance exceeds BOTH
#   1. the global CONTAMINATION_PERCENTILE of all test distances, and
#   2. its own cluster's mean + CLUSTER_STD_MULTIPLIER * std of distances.
# The global percentile is computed once here; recomputing it for every sample
# would rescan the whole distance array once per test row.
threshold = np.percentile(min_dists, CONTAMINATION_PERCENTILE)
above_global = min_dists > threshold
y_pred = np.zeros_like(min_dists, dtype=int)

for i in range(best_k):
    cluster_indices = (assigned_cluster == i)
    cluster_distances = min_dists[cluster_indices]

    # Clusters with fewer than two test points have no meaningful spread, so
    # nothing in them is flagged.
    if len(cluster_distances) < 2:
        continue

    mean_dist = np.mean(cluster_distances)
    std_dist = np.std(cluster_distances)
    local_threshold = mean_dist + CLUSTER_STD_MULTIPLIER * std_dist

    # Flag every member of this cluster that passes both gates in one step.
    y_pred[cluster_indices & above_global & (min_dists > local_threshold)] = 1


# ===[ Evaluation helper ]===
def evaluation(y_true, y_pred, model_name="Model"):
    """Print binary-classification metrics for a set of predictions.

    Prints accuracy, precision, recall and F1, followed by scikit-learn's
    full classification report. Nothing is returned.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        model_name: Label printed in the report header.
    """
    accuracy = accuracy_score(y_true, y_pred)
    # zero_division=0 is passed to every metric that accepts it, so a
    # degenerate prediction (e.g. no positives predicted) reports 0 everywhere
    # instead of warning for some metrics and not for others.
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print(f'\n{model_name} 評估結果:')
    print('===' * 15)
    print('         Accuracy:', accuracy)
    print('  Precision Score:', precision)
    print('     Recall Score:', recall)
    print('         F1 Score:', f1)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, zero_division=0))

# Report the test-set metrics, labelling the run with its key settings.
model_label = (
    f"KMeans (k={best_k}, PCA={N_PCA_COMPONENTS}) @ {CONTAMINATION_PERCENTILE}% + ClusterFilter"
)
evaluation(y_test, y_pred, model_name=model_label)

 最適合的 K: 5（silhouette score = 0.2159）

KMeans (k=5, PCA=10) @ 99.4% + ClusterFilter 評估結果:
         Accuracy: 0.9987242957293166
  Precision Score: 0.6666666666666666
     Recall Score: 0.527027027027027
         F1 Score: 0.5886792452830188

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.67      0.53      0.59       148

    accuracy                           1.00     85443
   macro avg       0.83      0.76      0.79     85443
weighted avg       1.00      1.00      1.00     85443

