In [2]:
# ex1.ipynb - 挑戰一 練習作業框架
# Author: [請寫上你的名字或學號]

# ====== 🔹 套件匯入區 ======
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# ====== 🔹 資料載入與前處理 ======
# 如果 kagglehub 有裝就用這個；如果你下載 csv，改成讀本地檔
# Load the credit-card fraud dataset: prefer a kagglehub download, and fall
# back to a local CSV when kagglehub is missing or the download/read fails.
try:
    import kagglehub
    path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
    data = pd.read_csv(f"{path}/creditcard.csv")
except Exception as exc:
    # `except Exception` (not a bare `except`) so Ctrl-C / SystemExit still
    # propagate instead of silently triggering the local-file fallback.
    print("使用本地資料路徑")
    print(f"  (kagglehub path failed: {type(exc).__name__}: {exc})")
    data = pd.read_csv("creditcard.csv")  # <-- put the downloaded file here

# Make the label an integer column, drop the Time column, and standardize
# the Amount column in place.
data['Class'] = data['Class'].astype(int)
data = data.drop(['Time'], axis=1)
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))

print("✅ 資料載入完成，總筆數：", len(data))

# ====== 🔹 評估函式 ======
def evaluation(y_true, y_pred, model_name="Model"):
    """Print accuracy, precision, recall, F1 and a classification report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.
        model_name: Label shown in the printed header.

    Returns:
        None. All results are written to stdout.
    """
    print(f"\n📊 {model_name} 評估結果")
    print("="*40)
    print("Accuracy :", accuracy_score(y_true, y_pred))
    # zero_division=0 reports 0.0 when a model predicts no positives (or a
    # class has no samples) without emitting UndefinedMetricWarning; the
    # printed value is the same one the default "warn" setting produces.
    print("Precision:", precision_score(y_true, y_pred, zero_division=0))
    print("Recall   :", recall_score(y_true, y_pred, zero_division=0))
    print("F1 Score :", f1_score(y_true, y_pred, zero_division=0))
    print(
        "\nClassification Report:\n",
        classification_report(y_true, y_pred, zero_division=0),
    )

# ====== 🔹 Random Forest：有監督式學習 ======
# Separate the feature matrix from the label vector.
feature_frame = data.drop(columns=['Class'])
X = np.asarray(feature_frame)
Y = np.asarray(data['Class'])

# Hold out 30% of the rows for testing, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    stratify=Y,
    test_size=0.3,
    random_state=42,
)

# Supervised baseline: a 100-tree random forest with balanced class weights.
rf_model = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=100,
    random_state=42,
)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

evaluation(y_test, y_pred_rf, model_name="Random Forest")

# ====== 🔹 KMeans：非監督式學習 ======
# Fit the scaler on the training split only, so statistics of the held-out
# test rows do not leak into the features the clustering model is fit on.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
# Scaled copy of the full matrix, kept under its original name in case later
# cells rely on it.
X_scaled = scaler.transform(X)

# First 1000 non-fraud samples, taken from the training split only so that
# none of them can also appear in the evaluation set.
X_train_unsupervised = X_train_scaled[y_train == 0][:1000]

# Choose k by the highest silhouette score over the candidate values.
candidate_ks = range(2, 5)
scores = []
for k in candidate_ks:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_train_unsupervised)
    score = silhouette_score(X_train_unsupervised, kmeans.labels_)
    scores.append(score)

# Index back into the candidates instead of hard-coding the range offset,
# so changing `candidate_ks` cannot silently desynchronize the result.
optimal_k = candidate_ks[int(np.argmax(scores))]
print("🌀 KMeans 最佳群數 k =", optimal_k)

# Refit with the selected k and assign a cluster to every test sample.
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
kmeans.fit(X_train_unsupervised)
X_test_scaled = scaler.transform(X_test)
y_pred_kmeans = kmeans.predict(X_test_scaled)

# 對齊群集標籤
def align_labels(y_true, y_pred, n_clusters):
    """Relabel cluster ids with the majority true label of their members.

    For every cluster id in ``range(n_clusters)``, the samples assigned to
    that cluster all receive the most frequent value of ``y_true`` among
    them (ties go to the smallest label, per ``argmax``). Cluster ids with
    no members are skipped; positions not covered by any id stay 0.

    Args:
        y_true: Array-like of non-negative integer labels
            (a requirement of ``np.bincount``).
        y_pred: Array-like of cluster ids, same length as ``y_true``.
        n_clusters: Number of cluster ids to map, starting at 0.

    Returns:
        ``np.ndarray`` shaped like ``y_pred`` (and with its dtype) holding
        the mapped labels.
    """
    # Coerce to arrays: with a plain list, `y_pred == i` is a scalar False
    # (every cluster would look empty) and `y_true[mask]` would raise.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    labels = np.zeros_like(y_pred)
    for cluster_id in range(n_clusters):
        mask = y_pred == cluster_id
        if mask.any():
            labels[mask] = np.bincount(y_true[mask]).argmax()
    return labels

# Map each KMeans cluster id to the majority true class among the test
# samples assigned to it, then score the mapped labels like a classifier.
# Note that this mapping is derived from y_test itself.
y_pred_aligned = align_labels(y_test, y_pred_kmeans, optimal_k)
evaluation(y_test, y_pred_aligned, "KMeans (Unsupervised)")

# ====== 🔹 TODO：你可以在這裡進行改進 ======
# 例如：
# - 改用其他分類器（如 XGBoost、SVM）
# - 嘗試調整 Random Forest 參數
# - SMOTE 資料平衡處理
# - 比較更多非監督模型
# - 把結果寫入 ex1.md 檔案


  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/mlg-ulb/creditcardfraud?dataset_version_number=3...


100%|██████████| 66.0M/66.0M [00:25<00:00, 2.76MB/s]

Extracting files...





✅ 資料載入完成，總筆數： 284807

📊 Random Forest 評估結果
Accuracy : 0.9994499256814484
Precision: 0.9719626168224299
Recall   : 0.7027027027027027
F1 Score : 0.8156862745098039

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.97      0.70      0.82       148

    accuracy                           1.00     85443
   macro avg       0.99      0.85      0.91     85443
weighted avg       1.00      1.00      1.00     85443

🌀 KMeans 最佳群數 k = 2

📊 KMeans (Unsupervised) 評估結果
Accuracy : 0.9982678510820079
Precision: 0.0
Recall   : 0.0
F1 Score : 0.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.00      0.00      0.00       148

    accuracy                           1.00     85443
   macro avg       0.50      0.50      0.50     85443
weighted avg       1.00      1.00      1.00     85443



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
