In [56]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import (
    silhouette_score, davies_bouldin_score,
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix, classification_report
)
import kagglehub

# --- Settings ---
RANDOM_SEED = 42
TEST_SIZE = 0.3
N_NORMAL_SEED = 1000
N_FRAUD_SEED = 100
N_PCA_COMPONENTS = 7
ENSEMBLE_RUNS = 1

# --- Load and prepare data ---
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
# Drop Time, scale Amount
data = data.drop(['Time'], axis=1)
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))

# Features and label
X = data.drop(columns=['Class']).values
y = data['Class'].astype(int).values
# Train/test split stratified
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE,
    random_state=RANDOM_SEED, stratify=y
)
# Standardize features
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# --- Dimensionality Reduction with PCA ---
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=RANDOM_SEED)
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

# --- Seed samples for initialization ---
normal_idx = np.where(y_train == 0)[0][:N_NORMAL_SEED]
fraud_idx = np.where(y_train == 1)[0][:N_FRAUD_SEED]
seed_X = np.vstack([x_train_pca[normal_idx], x_train_pca[fraud_idx]])
seed_labels = np.hstack([np.zeros(len(normal_idx)), np.ones(len(fraud_idx))])
# Compute centroids from seed samples
seed_km = KMeans(n_clusters=2, init='k-means++', random_state=RANDOM_SEED)
seed_km.fit(seed_X)
centroids_init = seed_km.cluster_centers_

# --- Ensemble of seeded KMeans ---
predictions = np.zeros((ENSEMBLE_RUNS, len(x_test_pca)), dtype=int)
for run in range(ENSEMBLE_RUNS):
    km = KMeans(
        n_clusters=2,
        init=centroids_init,
        n_init=1,
        random_state=RANDOM_SEED + run
    )
    km.fit(x_train_pca)
    predictions[run] = km.predict(x_test_pca)

# --- Majority vote ---
from scipy.stats import mode
y_pred_majority = mode(predictions, axis=0).mode.ravel()

# --- Align cluster labels to true classes ---
cluster_ids = np.unique(y_pred_majority)
cluster_map = {}
for cid in cluster_ids:
    mask = (y_pred_majority == cid)
    # assign based on majority true label in test mask
    cluster_map[cid] = np.bincount(y_test[mask]).argmax()

y_pred_aligned = np.vectorize(cluster_map.get)(y_pred_majority)


In [58]:
def evaluation(y_true, y_pred, model_name="Model"):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    print(f'         Accuracy: {accuracy}')
    print(f'  Precision Score: {precision}')
    print(f'     Recall Score: {recall}')
    print(f'         F1 Score: {f1}')
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

# 執行評估
evaluation(y_test, y_pred_aligned, model_name="Kmeans (Unsupervised)")


Kmeans (Unsupervised) Evaluation:
         Accuracy: 0.9989817773252344
  Precision Score: 0.8144329896907216
     Recall Score: 0.5337837837837838
         F1 Score: 0.6448979591836734

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.81      0.53      0.64       148

    accuracy                           1.00     85443
   macro avg       0.91      0.77      0.82     85443
weighted avg       1.00      1.00      1.00     85443

