In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,precision_score, recall_score, f1_score
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import kagglehub
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA


# general setting. do not change TEST_SIZE
RANDOM_SEED = 42
TEST_SIZE = 0.3


In [None]:
import kagglehub
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")

In [None]:
# load dataset（from kagglehub）
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype(int)

# prepare data
data = data.drop(['Time'], axis=1)
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))


In [None]:
fraud = data[data['Class'] == 1]
nonfraud = data[data['Class'] == 0]
print(f'Fraudulent:{len(fraud)}, non-fraudulent:{len(nonfraud)}')
print(f'the positive class (frauds) percentage: {len(fraud)}/{len(fraud) + len(nonfraud)} ({len(fraud)/(len(fraud) + len(nonfraud))*100:.3f}%)')

Fraudulent:492, non-fraudulent:284315
the positive class (frauds) percentage: 492/284807 (0.173%)


In [None]:
X = np.asarray(data.iloc[:, ~data.columns.isin(['Class'])])
Y = np.asarray(data.iloc[:, data.columns == 'Class'])

# split training set and data set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

# build Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)
rf_model.fit(X_train, y_train)


  return fit_method(estimator, *args, **kwargs)


In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=100,        #50、100、150
    max_depth=10,           #adjust depth
    min_samples_split=10,
    class_weight='balanced', # unbalance solved
    random_state=42
)


In [None]:
# 2. model training
model.fit(X_train, y_train)

# 3. test the dataset
y_pred = model.predict(X_test)
# define evaluation function
def evaluation(y_true, y_pred, model_name="Model"):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    print('         Accuracy:', accuracy)
    print('  Precision Score:', precision)
    print('     Recall Score:', recall)
    print('         F1 Score:', f1)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

# predict and print result
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred))

evaluation(y_test, y_pred, model_name="Random Forest")

# train model
model.fit(X_train, y_train)

# new model prediction
y_pred_new = model.predict(X_test)
evaluation(y_test, y_pred_new, model_name="Tuned Random Forest")



  return fit_method(estimator, *args, **kwargs)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.94      0.82      0.88       136

    accuracy                           1.00     85443
   macro avg       0.97      0.91      0.94     85443
weighted avg       1.00      1.00      1.00     85443


Random Forest Evaluation:
         Accuracy: 0.9996371850239341
  Precision Score: 0.9411764705882353
     Recall Score: 0.8235294117647058
         F1 Score: 0.8784313725490196

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.94      0.82      0.88       136

    accuracy                           1.00     85443
   macro avg       0.97      0.91      0.94     85443
weighted avg       1.00      1.00      1.00     85443



  return fit_method(estimator, *args, **kwargs)



Tuned Random Forest Evaluation:
         Accuracy: 0.9994850368081645
  Precision Score: 0.8333333333333334
     Recall Score: 0.8455882352941176
         F1 Score: 0.8394160583941606

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.83      0.85      0.84       136

    accuracy                           1.00     85443
   macro avg       0.92      0.92      0.92     85443
weighted avg       1.00      1.00      1.00     85443



In [None]:
# Extract features and labels
X = np.asarray(data.drop(columns=['Class']))
y = np.asarray(data['Class'])

# Split the dataset into training and testing sets (with stratification)
x_train, x_test, y_train, y_test = train_test_split(
   X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y
)

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Select a small sample of normal (non-fraud) data for unsupervised training
n_x_train = x_train[y_train == 0]
n_x_train = n_x_train[:1000]


In [None]:
scores = []
for k in range(2, 5):
   kmeans = KMeans(n_clusters=k, init='k-means++', random_state=RANDOM_SEED)
   kmeans.fit(n_x_train)
   score = silhouette_score(n_x_train, kmeans.labels_)
   scores.append(score)

optimal_k = np.argmax(scores) + 2
kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=RANDOM_SEED)
kmeans.fit(n_x_train)
y_pred_test = kmeans.predict(x_test)
def align_labels(y_true, y_pred, n_clusters):
   labels = np.zeros_like(y_pred)
   for i in range(n_clusters):
       mask = (y_pred == i)
       if np.sum(mask) > 0:
           labels[mask] = np.bincount(y_true[mask]).argmax()
       else:
           labels[mask] = 0  # Default to normal class
   return labels

y_pred_aligned = align_labels(y_test, y_pred_test, optimal_k)


In [None]:
def evaluation(y_true, y_pred, model_name="Model"):
   accuracy = accuracy_score(y_true, y_pred)
   precision = precision_score(y_true, y_pred, zero_division=0)
   recall = recall_score(y_true, y_pred)
   f1 = f1_score(y_true, y_pred)

   print(f'\n{model_name} Evaluation:')
   print('===' * 15)
   print('         Accuracy:', accuracy)
   print('  Precision Score:', precision)
   print('     Recall Score:', recall)
   print('         F1 Score:', f1)
   print("\nClassification Report:")
   print(classification_report(y_true, y_pred))

evaluation(y_test, y_pred_aligned, model_name="KMeans (Unsupervised)")

pca = PCA(n_components=5, random_state=RANDOM_SEED)
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)



KMeans (Unsupervised) Evaluation:
         Accuracy: 0.9987242957293166
  Precision Score: 0.782608695652174
     Recall Score: 0.36486486486486486
         F1 Score: 0.4976958525345622

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.78      0.36      0.50       148

    accuracy                           1.00     85443
   macro avg       0.89      0.68      0.75     85443
weighted avg       1.00      1.00      1.00     85443



In [None]:
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# PCA
pca = PCA(n_components=0.9, random_state=RANDOM_SEED)
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

# searching K
scores = []
for k in range(2, 5):
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=RANDOM_SEED)
    kmeans.fit(x_train_pca[y_train == 0][:1000])
    score = silhouette_score(x_train_pca[y_train == 0][:1000], kmeans.labels_)
    scores.append(score)

optimal_k = np.argmax(scores) + 2
print("Best K value:", optimal_k)

#training
kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=RANDOM_SEED)
kmeans.fit(x_train_pca[y_train == 0][:1000])
y_pred_test = kmeans.predict(x_test_pca)

#evaluate again
evaluation(y_test, y_pred_aligned, model_name="KMeans + PCA")

# sil score output
sil_score = silhouette_score(x_test_pca, y_pred_test)
print(f"\nSilhouette Score on test set: {sil_score:.4f}")


Best K value: 4

KMeans + PCA Evaluation:
         Accuracy: 0.9987242957293166
  Precision Score: 0.782608695652174
     Recall Score: 0.36486486486486486
         F1 Score: 0.4976958525345622

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.78      0.36      0.50       148

    accuracy                           1.00     85443
   macro avg       0.89      0.68      0.75     85443
weighted avg       1.00      1.00      1.00     85443


Silhouette Score on test set: 0.1075


In [48]:
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, precision_recall_curve
from xgboost import XGBClassifier

# 載入資料
data = pd.read_csv(f"{path}/creditcard.csv")

# 特徵與標籤分開
X = data.drop(columns=["Class"])
y = data["Class"]

# 分割訓練與測試集（保持詐欺比例）
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# 資料標準化
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# ==============================
# 1️⃣ IsolationForest 異常分數作為特徵
# ==============================
iso = IsolationForest(n_estimators=100, contamination=0.002, random_state=42)
iso.fit(x_train_scaled)

# 取得 anomaly score（愈小代表愈異常）
train_scores = iso.decision_function(x_train_scaled)
test_scores = iso.decision_function(x_test_scaled)

# 合併 anomaly score 作為新特徵
x_train_combined = np.hstack([x_train_scaled, train_scores.reshape(-1, 1)])
x_test_combined = np.hstack([x_test_scaled, test_scores.reshape(-1, 1)])

# ==============================
# 2️⃣ XGBoost 訓練
# ==============================
xgb = XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    scale_pos_weight=(len(y_train[y_train == 0]) / len(y_train[y_train == 1])),  # 動態調整
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

xgb.fit(x_train_combined, y_train)

# ==============================
# 3️⃣ 預測 + 門檻調整 + 評估
# ==============================

# 預測詐欺機率
y_prob = xgb.predict_proba(x_test_combined)[:, 1]

# 利用 precision_recall_curve 找最佳門檻使 Precision >= 0.9，且 Recall 最大化
precisions, recalls, thresholds = precision_recall_curve(y_test, y_prob)

best_threshold = 0.5
best_recall = 0
for p, r, t in zip(precisions, recalls, thresholds):
    if p >= 0.9 and r > best_recall:
        best_recall = r
        best_threshold = t

# 使用最佳門檻轉換預測標籤
y_pred_adj = (y_prob >= best_threshold).astype(int)

def evaluation(y_true, y_pred, model_name="Model"):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    print('         Accuracy:', round(accuracy, 4))
    print('  Precision Score:', round(precision, 4))
    print('     Recall Score:', round(recall, 4))
    print('         F1 Score:', round(f1, 4))
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

evaluation(y_test, y_pred_adj, model_name="Threshold Adjusted Hybrid Model")


Parameters: { "use_label_encoder" } are not used.




Threshold Adjusted Hybrid Model Evaluation:
         Accuracy: 0.9994
  Precision Score: 0.9024
     Recall Score: 0.75
         F1 Score: 0.8192

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.90      0.75      0.82       148

    accuracy                           1.00     85443
   macro avg       0.95      0.87      0.91     85443
weighted avg       1.00      1.00      1.00     85443

