# ACS111120 - 挑戰一
監督式與非監督式學習程式碼

In [None]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, silhouette_score
import kagglehub

# Seed shared by every randomized step, and the fraction of rows held out
# for testing.
RANDOM_SEED = 42
TEST_SIZE = 0.3

# Fetch the dataset through kagglehub and load its CSV file.
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")

# Drop the 'Time' column, store the label as int, and standardize 'Amount'
# (StandardScaler expects a 2-D column, hence the reshape).
data = data.drop(columns=['Time'])
data['Class'] = data['Class'].astype(int)
amount_column = data['Amount'].values.reshape(-1, 1)
data['Amount'] = StandardScaler().fit_transform(amount_column)

# Split the rows by label (Class == 1 -> fraud, Class == 0 -> non-fraud)
# and report how imbalanced the two groups are.
is_fraud = data['Class'] == 1
fraud = data[is_fraud]
nonfraud = data[data['Class'] == 0]
n_fraud, n_nonfraud = len(fraud), len(nonfraud)
print(f"Fraudulent: {n_fraud}, non-fraudulent: {n_nonfraud}")
print(f"Fraud %: {n_fraud/(n_fraud+n_nonfraud)*100:.3f}%")


In [None]:

# Separate the feature columns from the 'Class' label as NumPy arrays.
features = data.drop(columns=['Class'])
X = features.to_numpy()
Y = data['Class'].to_numpy()

# Hold out TEST_SIZE of the rows; stratify on the label so both sides of
# the split keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
    stratify=Y,
)

# Supervised model: a 100-tree random forest fitted on the training rows,
# then used to predict the held-out rows.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=RANDOM_SEED,
)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

def evaluation(y_true, y_pred, model_name="Model"):
    """Print accuracy, precision, recall, F1 and a classification report.

    The precision/recall/F1 lines use scikit-learn's default arguments for
    averaging and positive label.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, element-wise aligned with ``y_true``.
        model_name: Name shown in the printed header.

    Returns:
        None. Everything is written to standard output.
    """
    # zero_division=0 is passed to every metric that accepts it, so an
    # undefined ratio (e.g. no predicted or no true positives) is reported
    # as 0 without a warning on every line, not just on the precision line.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    report = classification_report(y_true, y_pred, zero_division=0)

    print(f"\n{model_name} Evaluation:")
    print("=" * 45)
    print(f"Accuracy : {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall   : {recall:.4f}")
    print(f"F1 Score : {f1:.4f}")
    print("\nClassification Report:\n", report)

# Report the random forest's metrics on the held-out split.
evaluation(y_true=y_test, y_pred=y_pred_rf, model_name="Random Forest")


In [None]:

# Unsupervised model: KMeans fitted only on class-0 (non-fraud) rows.
# The scaler and the clustering see just the first 1000 class-0 training
# rows; the test set is transformed with that same fitted scaler.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[y_train == 0][:1000])
X_test_scaled = scaler.transform(X_test)

# Candidate cluster counts are named once so that the selection below
# cannot drift out of sync with the loop (no hard-coded "+ 2" offset).
candidate_ks = range(2, 5)
scores = []
for k in candidate_ks:
    km = KMeans(n_clusters=k, random_state=RANDOM_SEED)
    km.fit(X_train_scaled)
    scores.append(silhouette_score(X_train_scaled, km.labels_))

# Pick the k with the highest silhouette score (first one wins on ties,
# as np.argmax returns the first maximum); keep it as a plain int.
best_k = candidate_ks[int(np.argmax(scores))]

# Refit with the selected k and assign every test row to a cluster.
km = KMeans(n_clusters=best_k, random_state=RANDOM_SEED)
km.fit(X_train_scaled)
y_km_pred = km.predict(X_test_scaled)

def align_labels(y_true, y_pred, n_clusters):
    """Relabel each cluster with the most frequent true label of its members.

    Args:
        y_true: Array-like of non-negative integer class labels
            (a requirement of ``np.bincount``).
        y_pred: Array-like of cluster ids, element-wise aligned with
            ``y_true``.
        n_clusters: Cluster ids ``0 .. n_clusters - 1`` are remapped.

    Returns:
        numpy.ndarray shaped like ``y_pred``. Each position holds the
        majority true label of its cluster (the smallest label wins a tie).
        Positions whose cluster id is outside ``range(n_clusters)`` keep
        the initial value 0.
    """
    # Coerce to arrays so plain lists work: with a list, `y_pred == i` is a
    # single False and every cluster would silently look empty.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    labels = np.zeros_like(y_pred)
    for cluster_id in range(n_clusters):
        members = y_pred == cluster_id
        # Empty clusters need no handling: there is nothing to assign and
        # `labels` already starts at 0.
        if members.any():
            labels[members] = np.bincount(y_true[members]).argmax()
    return labels

# Convert cluster ids into class labels by majority vote, then score them.
# NOTE(review): the cluster-to-class mapping is derived from y_test itself,
# so these metrics are optimistic compared with a mapping learned on
# training data.
y_km_aligned = align_labels(y_true=y_test, y_pred=y_km_pred, n_clusters=best_k)
evaluation(y_true=y_test, y_pred=y_km_aligned, model_name="KMeans")
