<a href="https://colab.research.google.com/github/Thomas993300/NTCU-Machine-Learning/blob/main/ex01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
#import
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import kagglehub

from sklearn.metrics import silhouette_score
from sklearn.metrics import classification_report
#general set
RANDOM_SEED = 42
TEST_SIZE = 0.3

#download
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")

#prepare
data['Class'] = data['Class'].astype(int)
data = data.drop(['Time'], axis=1)
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))


In [11]:
#output
def evaluation(y_true, y_pred, model_name="Model"):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    print(f'\n{model_name} Evaluation:')
    print('=' * 30)
    print(f' Accuracy       : {accuracy:.8f}')
    print(f' Precision Score: {precision:.8f}')
    print(f' Recall Score   : {recall:.8f}')
    print(f' F1 Score       : {f1:.8f}')
    print('\nClassification Report:')
    print(classification_report(y_true, y_pred))


In [12]:
#basic

X = data.drop(columns=['Class']).values
y = data['Class'].values

#splite
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

#rf model
rf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)
rf.fit(X_train, y_train)

#output
y_pred_rf = rf.predict(X_test)
evaluation(y_test, y_pred_rf, model_name="Random Forest")




Random Forest Evaluation:
 Accuracy       : 0.99963719
 Precision Score: 0.94117647
 Recall Score   : 0.82352941
 F1 Score       : 0.87843137

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.94      0.82      0.88       136

    accuracy                           1.00     85443
   macro avg       0.97      0.91      0.94     85443
weighted avg       1.00      1.00      1.00     85443



In [13]:
!pip install xgboost



In [14]:
#XGBoost
from xgboost import XGBClassifier

xgb = XGBClassifier(
    n_estimators=600,
    max_depth=8,
    learning_rate=0.1,
    scale_pos_weight=8,
    random_state=RANDOM_SEED,
    #use_label_encoder=False,
    eval_metric='logloss'
)
xgb.fit(X_train, y_train.ravel())
y_pred_xgb = xgb.predict(X_test)
evaluation(y_test, y_pred_xgb, model_name="XGBoost")




XGBoost Evaluation:
 Accuracy       : 0.99969570
 Precision Score: 0.94354839
 Recall Score   : 0.86029412
 F1 Score       : 0.90000000

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.94      0.86      0.90       136

    accuracy                           1.00     85443
   macro avg       0.97      0.93      0.95     85443
weighted avg       1.00      1.00      1.00     85443



In [15]:
#Kmeans
X = np.asarray(data.drop(columns=['Class']))
y = np.asarray(data['Class'])

#Split
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y
)

#ragular
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

#train KMeans
n_x_train = x_train[y_train == 0]
n_x_train = n_x_train[:1000]

#find best k
scores = []
for k in range(2, 5):
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=RANDOM_SEED)
    kmeans.fit(n_x_train)
    score = silhouette_score(n_x_train, kmeans.labels_)
    scores.append(score)
optimal_k = np.argmax(scores) + 2

#train best k
kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=RANDOM_SEED)
kmeans.fit(n_x_train)
y_pred_test = kmeans.predict(x_test)

def align_labels(y_true, y_pred, n_clusters):
    labels = np.zeros_like(y_pred)
    for i in range(n_clusters):
        mask = (y_pred == i)
        if np.sum(mask) > 0:
            labels[mask] = np.bincount(y_true[mask]).argmax()
        else:
            labels[mask] = 0  # Default to normal class
    return labels

y_pred_aligned = align_labels(y_test, y_pred_test, optimal_k)


evaluation(y_test, y_pred_aligned, model_name="KMeans (Unsupervised)")



KMeans (Unsupervised) Evaluation:
 Accuracy       : 0.99872430
 Precision Score: 0.78260870
 Recall Score   : 0.36486486
 F1 Score       : 0.49769585

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.78      0.36      0.50       148

    accuracy                           1.00     85443
   macro avg       0.89      0.68      0.75     85443
weighted avg       1.00      1.00      1.00     85443



In [None]:
!pip install --upgrade --force-reinstall --no-cache-dir jax jaxlib


Collecting jax
  Downloading jax-0.6.1-py3-none-any.whl.metadata (13 kB)
Collecting jaxlib
  Downloading jaxlib-0.6.1-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting ml_dtypes>=0.5.0 (from jax)
  Downloading ml_dtypes-0.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (21 kB)
Collecting numpy>=1.25 (from jax)
  Downloading numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting opt_einsum (from jax)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting scipy>=1.11.1 (from jax)
  Downloading scipy-1.15.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m141.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jax-0.6.1-py3-none-any.whl (2.4 MB)
[2K  

In [16]:
#MY_KMeans

normal = x_train[y_train == 0][:800]
fraud  = x_train[y_train == 1][:200]
n_x_train = np.vstack([normal, fraud])

# find k
scores = []
for k in range(2, 5):
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=RANDOM_SEED)
    kmeans.fit(n_x_train)
    score = silhouette_score(n_x_train, kmeans.labels_)
    scores.append(score)
optimal_k = np.argmax(scores) + 2

#train with k
kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=RANDOM_SEED)
kmeans.fit(n_x_train)


y_pred_test = kmeans.predict(x_test)
def align_labels(y_true, y_pred, n_clusters):
    labels = np.zeros_like(y_pred)
    for i in range(n_clusters):
        mask = (y_pred == i)
        if np.sum(mask) > 0:
            labels[mask] = np.bincount(y_true[mask]).argmax()
        else:
            labels[mask] = 0
    return labels

y_pred_aligned = align_labels(y_test, y_pred_test, optimal_k)


evaluation(y_test, y_pred_aligned, model_name="MY_KMeans")





MY_KMeans Evaluation:
 Accuracy       : 0.99897007
 Precision Score: 0.83333333
 Recall Score   : 0.50675676
 F1 Score       : 0.63025210

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.83      0.51      0.63       148

    accuracy                           1.00     85443
   macro avg       0.92      0.75      0.81     85443
weighted avg       1.00      1.00      1.00     85443

