In [28]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, accuracy_score,precision_score, recall_score, f1_score
import kagglehub
from sklearn.decomposition import PCA

# general setting. do not change TEST_SIZE
RANDOM_SEED = 42
TEST_SIZE = 0.3

# load dataset（from kagglehub）
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype(int)

# prepare data
data = data.drop(['Time'], axis=1)
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
fraud = data[data['Class'] == 1]
nonfraud = data[data['Class'] == 0].sample(frac=0.2, random_state=42)
data_small = pd.concat([fraud, nonfraud])
print(f'Fraudulent:{len(fraud)}, non-fraudulent:{len(nonfraud)}')
print(f'the positive class (frauds) percentage: {len(fraud)}/{len(fraud) + len(nonfraud)} ({len(fraud)/(len(fraud) + len(nonfraud))*100:.3f}%)')

Fraudulent:492, non-fraudulent:56863
the positive class (frauds) percentage: 492/57355 (0.858%)


監督式 

In [26]:
X = np.asarray(data.iloc[:, ~data.columns.isin(['Class'])])
Y = np.asarray(data.iloc[:, data.columns == 'Class'])

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

# 建立 Random Forest 模型
rf = RandomForestClassifier(
    n_estimators=125,
    max_depth=20,
    min_samples_leaf=2,
    class_weight='balanced',
    random_state=RANDOM_SEED
)

rf.fit(X_train, y_train)

  rf.fit(X_train, y_train)


result of 監督式

In [27]:
# define evaluation function
def evaluation(y_true, y_pred, model_name="Model"):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    print('         Accuracy:', accuracy)
    print('  Precision Score:', precision)
    print('     Recall Score:', recall)
    print('         F1 Score:', f1)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

# predict and print result
y_pred = rf.predict(X_test)
evaluation(y_test, y_pred)
#print(classification_report(y_test, y_pred))



Model Evaluation:
         Accuracy: 0.9996605924417448
  Precision Score: 0.9421487603305785
     Recall Score: 0.8382352941176471
         F1 Score: 0.8871595330739299

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.94      0.84      0.89       136

    accuracy                           1.00     85443
   macro avg       0.97      0.92      0.94     85443
weighted avg       1.00      1.00      1.00     85443



非監督式

In [29]:
# Extract features and labels
selected_features = data_small.columns.drop('Class')
X = data_small[selected_features].to_numpy()
y = data_small['Class'].to_numpy()

# Train-test split (with stratify)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y
)

# Train Isolation Forest only on normal data
from sklearn.ensemble import IsolationForest

iso_model = IsolationForest(
    n_estimators=1000,
    contamination='auto',
    random_state=RANDOM_SEED
)
iso_model.fit(X_train[y_train == 0])  # only fit normal

# Predict anomaly scores
test_scores = -iso_model.decision_function(X_test)
threshold = np.percentile(test_scores, 97)
y_pred = (test_scores > threshold).astype(int)


result of 非監督式

In [31]:
def evaluation(y_true, y_pred, model_name="Model"):
   accuracy = accuracy_score(y_true, y_pred)
   precision = precision_score(y_true, y_pred, zero_division=0)
   recall = recall_score(y_true, y_pred)
   f1 = f1_score(y_true, y_pred)

   print(f'\n{model_name} Evaluation:')
   print('===' * 15)
   print('         Accuracy:', accuracy)
   print('  Precision Score:', precision)
   print('     Recall Score:', recall)
   print('         F1 Score:', f1)
   print("\nClassification Report:")
   print(classification_report(y_true, y_pred))

evaluation(y_test, y_pred, model_name="KMeans (Unsupervised)")


KMeans (Unsupervised) Evaluation:
         Accuracy: 0.9750682861626082
  Precision Score: 0.22823984526112184
     Recall Score: 0.7972972972972973
         F1 Score: 0.35488721804511275

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     17059
           1       0.23      0.80      0.35       148

    accuracy                           0.98     17207
   macro avg       0.61      0.89      0.67     17207
weighted avg       0.99      0.98      0.98     17207

