
### Random Forest
1. RF baseline  
2. RF + SMOTE  
3. RF + PCA(80%, 35維)  
4. RF + PCA(90%, 44維)  
5. RF + PCA(80%, 35維) + SMOTE  
6. RF + PCA(90%, 44維) + SMOTE  


In [3]:

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Location of the labelled dataset; adjust the path for your environment.
file_path = "/Users/ys/Documents/GitHub/NCCU_ML_AI/data/filled_data_label_numeric.xlsx"
df = pd.read_excel(file_path,engine='openpyxl')
X = df.drop(columns=['abnormal_target'])
y = df['abnormal_target']

# Split BEFORE scaling: fitting the scaler on every row would let the mean and
# variance of the test rows leak into the features the models are trained on.
# The partition itself depends only on y, the sample count and the seed.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# The scaler learns its statistics from the training rows only; the test rows
# are transformed with those same statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix scaled with the training statistics, kept under its original name
# in case another cell still refers to it.
X_scaled = scaler.transform(X)

print(f"訓練集大小: {X_train.shape}, 測試集大小: {X_test.shape}")
print(f"類別比例 (train): {np.bincount(y_train)} (test): {np.bincount(y_test)})")


訓練集大小: (3215, 116), 測試集大小: (804, 116)
類別比例 (train): [3185   30] (test): [796   8])


In [5]:

# Accumulates one metrics record per evaluated configuration (see run_rf_and_collect).
results = []

def run_rf_and_collect(X_train, X_test, y_train, y_test, desc, **kwargs):
    """Fit a random forest, evaluate it on the test split, print and record the metrics.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``fit``.
    X_test, y_test : held-out features and labels used for every reported metric.
    desc : str
        Label of the configuration; used as the printed header and as the
        '方法' value of the stored record.
    **kwargs
        Extra ``RandomForestClassifier`` arguments. They take precedence over
        the defaults ``n_estimators=200, max_depth=8, random_state=42``.

    Returns
    -------
    None
        The metrics are stored in the module-level ``results`` list. A record
        already stored under the same ``desc`` is replaced in place, so
        re-running a cell does not duplicate rows in the summary table.
    """
    # Defaults come first so that a caller can override any of them without
    # triggering a "multiple values for keyword argument" TypeError.
    rf_params = {'n_estimators': 200, 'max_depth': 8, 'random_state': 42, **kwargs}
    clf = RandomForestClassifier(**rf_params)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    macro_f1 = f1_score(y_test, y_pred, average='macro')

    # The report dictionary is looked up with the string key '1'; when that key
    # is absent the class-1 metrics are recorded as NaN.
    positive = report.get('1')
    record = {
        '方法': desc,
        '準確率': report['accuracy'],
        'Recall_1': positive['recall'] if positive is not None else np.nan,
        'F1_1': positive['f1-score'] if positive is not None else np.nan,
        'F1_macro': macro_f1,
        '混淆矩陣': cm.tolist()
    }

    # Overwrite an earlier record with the same label (keeping its position);
    # only append when this label has not been recorded yet.
    for position, previous in enumerate(results):
        if previous['方法'] == desc:
            results[position] = record
            break
    else:
        results.append(record)

    print(f"\n========== {desc} ==========")
    print("混淆矩陣：\n", cm)
    print(classification_report(y_test, y_pred, target_names=["Normal (0)", "Abnormal (1)"]))
    print(f"F1-score（macro）：{macro_f1:.4f}")


## 1. Random Forest baseline（無SMOTE、無PCA）

In [6]:
# Baseline run: the scaled train/test split as-is, without resampling or PCA.
run_rf_and_collect(
    X_train, X_test, y_train, y_test,
    desc="RF baseline",
)


混淆矩陣：
 [[796   0]
 [  4   4]]
              precision    recall  f1-score   support

  Normal (0)       0.99      1.00      1.00       796
Abnormal (1)       1.00      0.50      0.67         8

    accuracy                           1.00       804
   macro avg       1.00      0.75      0.83       804
weighted avg       1.00      1.00      0.99       804

F1-score（macro）：0.8321


## 2. RF + SMOTE

In [7]:

# Resample the training split only with SMOTE; the test split is passed through unchanged.
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
run_rf_and_collect(
    X_train_sm, X_test, y_train_sm, y_test,
    desc="RF + SMOTE",
)



混淆矩陣：
 [[796   0]
 [  3   5]]
              precision    recall  f1-score   support

  Normal (0)       1.00      1.00      1.00       796
Abnormal (1)       1.00      0.62      0.77         8

    accuracy                           1.00       804
   macro avg       1.00      0.81      0.88       804
weighted avg       1.00      1.00      1.00       804

F1-score（macro）：0.8837


## 3. RF + PCA(80%, 35維)

In [8]:

# Reduce to 35 principal components (the "80%" setting named in the section title).
# The projection is fitted on the training split and then applied to the test split.
pca_80_35 = PCA(n_components=35, random_state=42)
X_train_pca_80_35 = pca_80_35.fit_transform(X_train)
X_test_pca_80_35 = pca_80_35.transform(X_test)
run_rf_and_collect(
    X_train_pca_80_35, X_test_pca_80_35, y_train, y_test,
    desc="RF + PCA(80%, 35)",
)



混淆矩陣：
 [[796   0]
 [  8   0]]
              precision    recall  f1-score   support

  Normal (0)       0.99      1.00      0.99       796
Abnormal (1)       0.00      0.00      0.00         8

    accuracy                           0.99       804
   macro avg       0.50      0.50      0.50       804
weighted avg       0.98      0.99      0.99       804

F1-score（macro）：0.4975


## 4. RF + PCA(90%, 44維)

In [9]:

# Reduce to 44 principal components (the "90%" setting named in the section title).
# The projection is fitted on the training split and then applied to the test split.
pca_90_44 = PCA(n_components=44, random_state=42)
X_train_pca_90_44 = pca_90_44.fit_transform(X_train)
X_test_pca_90_44 = pca_90_44.transform(X_test)
run_rf_and_collect(
    X_train_pca_90_44, X_test_pca_90_44, y_train, y_test,
    desc="RF + PCA(90%, 44)",
)



混淆矩陣：
 [[796   0]
 [  8   0]]
              precision    recall  f1-score   support

  Normal (0)       0.99      1.00      0.99       796
Abnormal (1)       0.00      0.00      0.00         8

    accuracy                           0.99       804
   macro avg       0.50      0.50      0.50       804
weighted avg       0.98      0.99      0.99       804

F1-score（macro）：0.4975


## 5. RF + PCA(80%, 35維) + SMOTE

In [10]:

# Apply the existing `smote` sampler to the 35-component training projection;
# the projected test split is evaluated without resampling.
X_train_pca_80_35_sm, y_train_pca_80_35_sm = smote.fit_resample(X_train_pca_80_35, y_train)
run_rf_and_collect(
    X_train_pca_80_35_sm, X_test_pca_80_35, y_train_pca_80_35_sm, y_test,
    desc="RF + PCA(80%, 35) + SMOTE",
)



混淆矩陣：
 [[788   8]
 [  6   2]]
              precision    recall  f1-score   support

  Normal (0)       0.99      0.99      0.99       796
Abnormal (1)       0.20      0.25      0.22         8

    accuracy                           0.98       804
   macro avg       0.60      0.62      0.61       804
weighted avg       0.98      0.98      0.98       804

F1-score（macro）：0.6067


## 6. RF + PCA(90%, 44維) + SMOTE

In [11]:

# Apply the existing `smote` sampler to the 44-component training projection;
# the projected test split is evaluated without resampling.
X_train_pca_90_44_sm, y_train_pca_90_44_sm = smote.fit_resample(X_train_pca_90_44, y_train)
run_rf_and_collect(
    X_train_pca_90_44_sm, X_test_pca_90_44, y_train_pca_90_44_sm, y_test,
    desc="RF + PCA(90%, 44) + SMOTE",
)



混淆矩陣：
 [[787   9]
 [  7   1]]
              precision    recall  f1-score   support

  Normal (0)       0.99      0.99      0.99       796
Abnormal (1)       0.10      0.12      0.11         8

    accuracy                           0.98       804
   macro avg       0.55      0.56      0.55       804
weighted avg       0.98      0.98      0.98       804

F1-score（macro）：0.5505


## 所有方法 summary 一覽

In [12]:

# Turn the recorded runs into a table and display the metric columns in a fixed order.
summary = pd.DataFrame(results)
summary.loc[:, ['方法', '準確率', 'Recall_1', 'F1_1', 'F1_macro', '混淆矩陣']]


Unnamed: 0,方法,準確率,Recall_1,F1_1,F1_macro,混淆矩陣
0,RF baseline,0.995025,0.5,0.666667,0.83208,"[[796, 0], [4, 4]]"
1,RF + SMOTE,0.996269,0.625,0.769231,0.883675,"[[796, 0], [3, 5]]"
2,"RF + PCA(80%, 35)",0.99005,0.0,0.0,0.4975,"[[796, 0], [8, 0]]"
3,"RF + PCA(90%, 44)",0.99005,0.0,0.0,0.4975,"[[796, 0], [8, 0]]"
4,"RF + PCA(80%, 35) + SMOTE",0.982587,0.25,0.222222,0.606709,"[[788, 8], [6, 2]]"
5,"RF + PCA(90%, 44) + SMOTE",0.9801,0.125,0.111111,0.550524,"[[787, 9], [7, 1]]"
