<a href="https://colab.research.google.com/github/Sandrala0413/NTCU-Machine-Learning/blob/main/NTCU_ML_Challenge1_Example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Necessary Packages

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import kagglehub

# General settings shared by the cells below. Do not change TEST_SIZE.
RANDOM_SEED = 42  # passed as random_state to train_test_split and KMeans
TEST_SIZE = 0.3  # passed as test_size to train_test_split

## Load Dataset & Prepare Data

In [None]:
# Download the dataset through kagglehub and read the CSV into a DataFrame,
# casting the Class label column to int.
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv").astype({'Class': int})

# Prepare data: drop the Time column, then standardize Amount.
# NOTE(review): the scaler is fit on the full dataset, i.e. before any
# train/test split is made further down.
data = data.drop(columns=['Time'])
amount_2d = data['Amount'].values.reshape(-1, 1)
data['Amount'] = StandardScaler().fit_transform(amount_2d)

## Fraud/Non-Fraud Transactions

In [None]:
# Partition the rows by label (Class == 1 -> fraud, Class == 0 -> non-fraud)
# and report how imbalanced the two groups are.
class_labels = data['Class']
fraud = data[class_labels == 1]
nonfraud = data[class_labels == 0]
print(f'Fraudulent:{len(fraud)}, non-fraudulent:{len(nonfraud)}')
print(f'the positive class (frauds) percentage: {len(fraud)}/{len(fraud) + len(nonfraud)} ({len(fraud)/(len(fraud) + len(nonfraud))*100:.3f}%)')

Fraudulent:492, non-fraudulent:284315
the positive class (frauds) percentage: 492/284807 (0.173%)


## 監督式學習 (random forest)

In [None]:
# Input features X: every column of the dataset except Class.
X = np.asarray(data.drop(columns=['Class']))
# Labels Y (fraud or not): the Class column, kept 2-D here and flattened
# after the split.
Y = np.asarray(data[['Class']])

# Split into training set and test set.
# NOTE(review): unlike the KMeans section, this split does not pass
# stratify=..., so the fraud ratio of the two parts is left to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
)

# Flatten the label arrays into 1-D vectors.
y_train, y_test = y_train.ravel(), y_test.ravel()

#define GridSearchCV

# param_grid = {
#     'n_estimators': [100, 150, 200, 250],           # 樹的數量：先從 100、150 開始，不要太多
#     'max_depth': [10, 15, 20],                # 限制樹的深度：避免過擬合
#     'min_samples_split': [2],             # 最小切分樣本數：先固定
#     'min_samples_leaf': [1],              # 最小葉節點樣本數：先固定
#     'class_weight': [
#         'balanced',
#         'balanced_subsample',
#         {0: 1, 1: 5},
#         {0: 1, 1: 10},
#         {0: 1, 1: 15},
#         {0: 1, 1: 20}
#     ]
# }

# grid_search = GridSearchCV(
#     estimator=RandomForestClassifier(random_state=42),
#     param_grid=param_grid,
#     cv=3,
#     scoring='f1',
#     n_jobs=-1,
#     verbose=1
# )

# grid_search.fit(X_train, y_train)

# print("Best Parameters:")
# print(grid_search.best_params_)

# 評估所有模型在 test set 的 precision 和 recall
# results = []
# for i, params in enumerate(grid_search.cv_results_["params"]):
#     est = grid_search.cv_results_["params"][i]
#     model = RandomForestClassifier(**est, random_state=42)
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)

#     precision = precision_score(y_test, y_pred)
#     recall = recall_score(y_test, y_pred)
#     f1 = f1_score(y_test, y_pred)

#     if precision >= 0.94 and recall >= 0.82:
#         results.append({
#             "params": est,
#             "precision": precision,
#             "recall": recall,
#             "f1": f1
#         })

# 顯示符合條件的參數組合
# for r in results:
#     print(f"\n✓ 符合條件的模型：")
#     print(f"params = {r['params']}")
#     print(f"precision = {r['precision']:.4f}, recall = {r['recall']:.4f}, f1 = {r['f1']:.4f}")

# Build the Random Forest model with fixed hyperparameters.
# class_weight gives label 1 ten times the weight of label 0.
rf_best = RandomForestClassifier(
    n_estimators=150,
    max_depth=25,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight={0: 1, 1: 10},
    max_features=15,
    # Use the notebook-wide seed rather than a second hard-coded 42 so the
    # split and the model can never drift apart.
    random_state=RANDOM_SEED,
)
rf_best.fit(X_train, y_train)

# rf_model = RandomForestClassifier(n_estimators=250, random_state=RANDOM_SEED)
# rf_model.fit(X_train, y_train)

## Result of 監督式學習(random forest)

In [None]:
# define evaluation function
def evaluation(y_true, y_pred, model_name="Model"):
    """Print accuracy, precision, recall, F1 and a classification report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.
        model_name: Label used in the printed header.

    Returns:
        None. All results are written to stdout.
    """
    # zero_division=0 reports an undefined ratio (e.g. precision when no
    # positive is predicted) as 0 without emitting a warning; it is applied
    # to every metric so the scores and the report behave the same way.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    print('         Accuracy:', accuracy)
    print('  Precision Score:', precision)
    print('     Recall Score:', recall)
    print('         F1 Score:', f1)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, zero_division=0))

# Predict on the test features and print the metrics.
# Alternatives that are currently disabled:
# best_rf = grid_search.best_estimator_
# y_probs = rf_best.predict_proba(X_test)[:, 1]
# y_pred = (y_probs >= 0.483).astype(int)
# y_pred = rf_model.predict(X_test)
# print(classification_report(y_test, y_pred))
y_pred = rf_best.predict(X_test)
evaluation(
    y_true=y_test,
    y_pred=y_pred,
    model_name="Random Forest(Supervised)",
)

## 非監督式學習(KMeans)

In [None]:
# Extract features and labels.
# NOTE: this rebinds X, y_train and y_test, which the Random Forest section
# also assigns; from here on they refer to the stratified split below.
X = np.asarray(data.loc[:, data.columns != 'Class'])
y = np.asarray(data['Class'])

# Split into training and testing sets, stratified on the label.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
    stratify=y,
)

# Standardize the features: fit on the training part only, then apply the
# same transform to both parts.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Select a small sample of normal (non-fraud) training rows for
# unsupervised training.
N_NORMAL_SAMPLES = 1000
n_x_train = x_train[y_train == 0]
n_x_train = n_x_train[:N_NORMAL_SAMPLES]

# Score every candidate cluster count by its silhouette score on the sample.
K_CANDIDATES = range(2, 5)
scores = []
for k in K_CANDIDATES:
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=RANDOM_SEED)
    kmeans.fit(n_x_train)
    score = silhouette_score(n_x_train, kmeans.labels_)
    scores.append(score)

# Look the best k up in the candidate range itself instead of adding a
# hard-coded offset, so editing K_CANDIDATES cannot desynchronise the two.
optimal_k = K_CANDIDATES[int(np.argmax(scores))]

# Refit with the selected k and assign every test row to a cluster.
kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=RANDOM_SEED)
kmeans.fit(n_x_train)
y_pred_test = kmeans.predict(x_test)
def align_labels(y_true, y_pred, n_clusters):
    """Relabel each cluster with the most frequent true label of its members.

    Args:
        y_true: 1-D array-like of non-negative integer class labels
            (required by ``np.bincount``).
        y_pred: 1-D array-like of cluster ids, aligned with ``y_true``.
        n_clusters: Cluster ids ``0 .. n_clusters - 1`` are considered.

    Returns:
        ndarray shaped like ``y_pred``. Each sample carries the majority
        ``y_true`` label of its cluster (ties resolve to the smallest
        label). Samples whose cluster id is outside ``range(n_clusters)``,
        and clusters with no members, are left at 0.
    """
    # Coerce to ndarrays: with plain lists `y_pred == i` is a single bool,
    # which made every cluster look empty and returned all zeros.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    labels = np.zeros_like(y_pred)
    for cluster_id in range(n_clusters):
        mask = y_pred == cluster_id
        # Empty clusters need no work: their entries already default to 0.
        if mask.any():
            labels[mask] = np.bincount(y_true[mask]).argmax()
    return labels

# Turn cluster ids into class labels via each cluster's majority true label.
# NOTE(review): the mapping is derived from y_test itself, so the metrics
# computed from y_pred_aligned are not a fully held-out estimate.
y_pred_aligned = align_labels(
    y_true=y_test,
    y_pred=y_pred_test,
    n_clusters=optimal_k,
)

## Result of 非監督式學習(KMeans)

In [None]:
def evaluation(y_true, y_pred, model_name="Model"):
    """Print accuracy, precision, recall, F1 and a classification report.

    This redefinition replaces the ``evaluation`` defined in the Random
    Forest results cell for the rest of the notebook.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.
        model_name: Label used in the printed header.

    Returns:
        None. All results are written to stdout.
    """
    # zero_division=0 reports an undefined ratio (e.g. precision when no
    # cluster maps to the positive class) as 0 without a warning. It is
    # passed to every metric and to the report, which would otherwise still
    # warn in exactly the case precision_score was silenced for.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    print('         Accuracy:', accuracy)
    print('  Precision Score:', precision)
    print('     Recall Score:', recall)
    print('         F1 Score:', f1)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, zero_division=0))

# Report the metrics of the cluster-derived predictions on the test labels.
evaluation(
    y_true=y_test,
    y_pred=y_pred_aligned,
    model_name="KMeans (Unsupervised)",
)


KMeans (Unsupervised) Evaluation:
         Accuracy: 0.9987242957293166
  Precision Score: 0.782608695652174
     Recall Score: 0.36486486486486486
         F1 Score: 0.4976958525345622

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.78      0.36      0.50       148

    accuracy                           1.00     85443
   macro avg       0.89      0.68      0.75     85443
weighted avg       1.00      1.00      1.00     85443

