<a href="https://colab.research.google.com/github/hyazoe/NTCU-Machine-Learning/blob/main/ex1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import kagglehub
# Fetch the Kaggle "mlg-ulb/creditcardfraud" dataset through kagglehub. `path` is
# used below as the directory from which creditcard.csv is read.
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")

In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, accuracy_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Reproducibility and split configuration for the supervised experiment.
RANDOM_SEED = 42
TEST_SIZE = 0.3

# Load the dataset (downloaded through kagglehub).
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype('int')

# Prepare the data: 'Time' is dropped and not used as a feature.
data = data.drop('Time', axis=1)

X = data.drop('Class', axis=1)
y = data['Class']

# Stratify so the train and test splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y)

# Standardize 'Amount' using statistics learned from the training rows only, so
# nothing about the test split leaks into preprocessing. `assign` returns new
# frames, which avoids writing into the slices produced by train_test_split.
amount_scaler = StandardScaler().fit(X_train[['Amount']])
X_train = X_train.assign(Amount=amount_scaler.transform(X_train[['Amount']]).ravel())
X_test = X_test.assign(Amount=amount_scaler.transform(X_test[['Amount']]).ravel())

# Use the shared seed constant rather than a second hard-coded 42.
model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

def evaluation(y_true, y_pred, model_name="Model"):
    """Print accuracy, precision, recall, F1 and a classification report.

    Precision, recall and F1 use scikit-learn's default arguments apart from
    ``zero_division``.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, element-wise aligned with ``y_true``.
        model_name: Name shown in the printed header.

    Returns:
        None. Everything is written to stdout.
    """
    accuracy = accuracy_score(y_true, y_pred)
    # zero_division=0 yields the same 0.0 scikit-learn falls back to when a
    # metric is undefined (e.g. no positive predictions), but without emitting
    # UndefinedMetricWarning.
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    print('Accuracy:', accuracy)
    print('Precision Score:', precision)
    print('Recall Score:', recall)
    print('F1 Score:', f1)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, zero_division=0))

# Report the Random Forest predictions against the held-out test labels.
evaluation(y_test, y_pred, model_name="Random Forest")



Random Forest Evaluation:
Accuracy: 0.9995318516437859
Precision Score: 0.9576271186440678
Recall Score: 0.7635135135135135
F1 Score: 0.849624060150376

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.96      0.76      0.85       148

    accuracy                           1.00     85443
   macro avg       0.98      0.88      0.92     85443
weighted avg       1.00      1.00      1.00     85443



In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, accuracy_score, precision_score, recall_score, f1_score, classification_report
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Reproducibility and split configuration for the unsupervised experiment.
RANDOM_SEED = 42
TEST_SIZE = 0.3
# Candidate cluster counts tried during model selection.
K_CANDIDATES = range(2, 5)
# silhouette_score needs pairwise distances between all scored samples, which is
# quadratic in their number; scoring a fixed-size random subset keeps the
# search tractable on the full training split.
SILHOUETTE_SAMPLE_SIZE = 10_000

# Load the dataset (downloaded through kagglehub).
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype('int')

# Prepare the data: drop 'Time' and standardize 'Amount' on the full table.
# (Every feature is standardized again below using training rows only.)
data = data.drop('Time', axis=1)
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))

# Preprocessing: split features / labels into plain NumPy arrays.
x = np.asarray(data.drop(columns=['Class']))
y = np.asarray(data['Class'])

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y)

# Fit the scaler on the training split only, then apply it to the test split.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Find the best k according to the silhouette score.
scores = []
for k in K_CANDIDATES:
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=RANDOM_SEED)
    kmeans.fit(x_train)
    # Seeded subsample: every k is scored on the same reproducible subset.
    score = silhouette_score(
        x_train, kmeans.labels_,
        sample_size=SILHOUETTE_SAMPLE_SIZE, random_state=RANDOM_SEED)
    scores.append(score)

# Index the candidate range instead of relying on a magic "+ 2" offset.
optimal_k = K_CANDIDATES[int(np.argmax(scores))]
print(f"Best k chosen based on silhouette score: {optimal_k}")
print(f"Silhouette scores for k=2~4: {scores}")

# Build the final KMeans model with the selected k.
kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=RANDOM_SEED)
kmeans.fit(x_train)
y_pred = kmeans.predict(x_test)

# Label alignment (majority voting)
def align_labels(y_true, y_pred, n_clusters):
    """Translate cluster ids into class labels by per-cluster majority vote.

    Args:
        y_true: Array-like of non-negative integer class labels.
        y_pred: Array-like of cluster ids with the same length as ``y_true``.
        n_clusters: Cluster ids ``0 .. n_clusters - 1`` are considered; samples
            whose id falls outside that range keep the label 0.

    Returns:
        numpy.ndarray with the shape and dtype of ``y_pred`` holding, for each
        sample, the most frequent true label inside its cluster (ties go to
        the smallest label).
    """
    # Coerce to arrays first: with plain lists `y_pred == i` is a single bool,
    # which made every cluster look empty and silently returned all zeros.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    labels = np.zeros_like(y_pred)
    for i in range(n_clusters):
        mask = (y_pred == i)
        # Skip clusters with no members; np.bincount of an empty array has no argmax.
        if mask.any():
            labels[mask] = np.bincount(y_true[mask].astype(int)).argmax()
    return labels

# Convert the predicted cluster ids into class labels.
# NOTE(review): the cluster -> class vote is taken on y_test itself, so the test
# labels influence the predictions that are scored afterwards.
y_pred_aligned = align_labels(y_test, y_pred, optimal_k)

# Evaluation function
def evaluation(y_true, y_pred, model_name="Model"):
    """Print accuracy, precision, recall, F1 and a classification report.

    Precision, recall and F1 use scikit-learn's default arguments apart from
    ``zero_division``.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, element-wise aligned with ``y_true``.
        model_name: Name shown in the printed header.

    Returns:
        None. Everything is written to stdout.
    """
    accuracy = accuracy_score(y_true, y_pred)
    # Majority-vote cluster labels can end up all-negative, which leaves the
    # positive-class precision undefined. zero_division=0 reports the same 0.0
    # scikit-learn would fall back to, without UndefinedMetricWarning, and is
    # now applied to every metric rather than precision alone.
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print(f'\n{model_name} Evaluation:')
    print('=' * 45)
    print(f"         Accuracy: {accuracy:.16f}")
    print(f"  Precision Score: {precision:.16f}")
    print(f"     Recall Score: {recall:.16f}")
    print(f"         F1 Score: {f1:.16f}\n")
    print("Classification Report:")
    print(classification_report(y_true, y_pred, zero_division=0))

# Run the evaluation on the class labels derived from the KMeans clusters.
evaluation(y_test, y_pred_aligned, model_name="KMeans (Unsupervised)")

KeyboardInterrupt: 