<a href="https://colab.research.google.com/github/Sandrala0413/NTCU-Machine-Learning/blob/main/NTCU_ML_Challenge2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Necessary Package


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from xgboost import XGBClassifier
from sklearn.metrics import precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt
import kagglehub

# general setting. do not change TEST_SIZE
RANDOM_SEED = 42
TEST_SIZE = 0.3

## Load Dataset & Prepare Data

In [294]:
# load dataset（from kagglehub）
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype(int)

# prepare data
data = data.drop(['Time'], axis=1)  #拿掉Time欄位
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))

## Fraud/Non-Fraud Transactions

In [4]:
fraud = data[data['Class'] == 1]
nonfraud = data[data['Class'] == 0]
print(f'Fraudulent:{len(fraud)}, non-fraudulent:{len(nonfraud)}')
print(f'the positive class (frauds) percentage: {len(fraud)}/{len(fraud) + len(nonfraud)} ({len(fraud)/(len(fraud) + len(nonfraud))*100:.3f}%)')

Fraudulent:492, non-fraudulent:284315
the positive class (frauds) percentage: 492/284807 (0.173%)


## 非監督學習（isolation）+ 監督學習（XGBoost）


In [290]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_recall_curve, f1_score, confusion_matrix
from xgboost import XGBClassifier

# === 資料切分與標準化 ===
X = data.drop(columns=['Class'])
y = data['Class']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=42,
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

pca = PCA(n_components=25)
x_train = pca.fit_transform(X_train_scaled)
x_test = pca.transform(X_test_scaled)
# === Isolation Forest 異常偵測 ===
iso_forest = IsolationForest(
    n_estimators=600,
    contamination=0.017,
    max_features=10,
    random_state=42
)
iso_forest.fit(X_train_scaled)

# 計算異常標籤與異常分數（分數越小越異常）
iso_train_labels = (iso_forest.predict(X_train_scaled) == -1).astype(int)
train_anomaly_scores = iso_forest.decision_function(X_train_scaled)

iso_test_labels = (iso_forest.predict(X_test_scaled) == -1).astype(int)
test_anomaly_scores = iso_forest.decision_function(X_test_scaled)

# 將異常資訊加入資料中
X_train_df = pd.DataFrame(X_train_scaled, columns=X.columns)
X_test_df = pd.DataFrame(X_test_scaled, columns=X.columns)

X_train_df['isolation_label'] = iso_train_labels
X_train_df['anomaly_score'] = train_anomaly_scores

X_test_df['isolation_label'] = iso_test_labels
X_test_df['anomaly_score'] = test_anomaly_scores

# === 訓練 XGBoost 模型 ===
scale_weight = (y_train == 0).sum() / (y_train == 1).sum()

xgb_model = XGBClassifier(
    max_depth=8,
    n_estimators=600,
    scale_pos_weight=70,
    eval_metric='logloss',
    early_stopping_rounds=10,
    random_state=42
)

xgb_model.fit(X_train_df, y_train, eval_set=[(X_test_df, y_test)], verbose=False)

# === 預測 ===
y_probs = xgb_model.predict_proba(X_test_df)[:, 1]


## Result

In [293]:
def evaluation(y_true, y_pred, model_name="Model"):
   accuracy = accuracy_score(y_true, y_pred)
   precision = precision_score(y_true, y_pred, zero_division=0)
   recall = recall_score(y_true, y_pred)
   f1 = f1_score(y_true, y_pred)

   print(f'\n{model_name} Evaluation:')
   print('===' * 15)
   print('         Accuracy:', accuracy)
   print('  Precision Score:', precision)
   print('     Recall Score:', recall)
   print('         F1 Score:', f1)
   print("\nClassification Report:")
   print(classification_report(y_true, y_pred))

# precision-recall 曲線找最佳 threshold
precision, recall, thresholds = precision_recall_curve(y_test, y_probs)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-6)
best_threshold = thresholds[np.argmax(f1_scores)]

print(f"Best Threshold by F1: {best_threshold:.4f}")

y_pred = (y_probs >= 0.4988).astype(int)
evaluation(y_test, y_pred, model_name="Hybrid Model")

🔍 Best Threshold by F1: 0.4988

Hybrid Model Evaluation:
         Accuracy: 0.9996839998595555
  Precision Score: 0.936
     Recall Score: 0.8602941176470589
         F1 Score: 0.896551724137931

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.94      0.86      0.90       136

    accuracy                           1.00     85443
   macro avg       0.97      0.93      0.95     85443
weighted avg       1.00      1.00      1.00     85443

