In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import json
import xgboost as xgb
from sklearn.model_selection import ParameterSampler, train_test_split, GridSearchCV, StratifiedKFold, ParameterGrid
from scipy.stats import uniform, randint
from xgboost import XGBClassifier
from sklearn.ensemble import  RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import (
    accuracy_score, confusion_matrix, roc_auc_score, roc_curve, auc,
    precision_score, recall_score, f1_score, classification_report, balanced_accuracy_score
)
from sklearn.svm import SVC
from itertools import product
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm

In [2]:
df = pd.read_csv('ALL_NHANES_MERGED_20072018.csv')

In [3]:
df = df.rename(columns={
    "SEQN": "ID",
    "DIQ010": "diabetes",
    "RIDAGEYR": "age",
    "RIAGENDR":"gender",

    "BMXHT": "height_cm",
    "BMXWT": "weight_kg",
    "BMXBMI": "bmi",
    "BMXWAIST": "waist_cm",

    "BPXSY1":"systolic1",
    "BPXDI1":"diastolic1",
    "BPXSY2":"systolic2",
    "BPXDI2":"diastolic2",
    "BPXSY3":"systolic3",
    "BPXDI3":"diastolic3",

    "LBXGLU":"glucose_fast",
    "LBXIN":"insulin",
    "LBXGH":"HbA1c",
    "LBXTC":"cholesterol_total",
    "LBDHDD":"hdl",
    "LBDLDL":"ldl",
    "LBXTR":"triglycerides",

    "SMQ020": "ever_smoked",
    "MCQ300C": "family_diabetes",
    "ALQ130":"alcohol_drink",
    "PAQ665":"moderate_excercise",
    "PAQ650":"serious_excercise",
    "SLD012":"sleep_time",
    "HUQ010":"health_score"
})


In [4]:
# 先確保欄位是數值型態，才能比較 < 1e-10
# 假設所有相關欄位在重命名後都是字串，需要先轉為數值：
for col in ["systolic1", "diastolic1", "systolic2", "diastolic2", "systolic3", "diastolic3", "age"]:
    df[col] = pd.to_numeric(df[col], errors='coerce') 

# 定義要檢查的血壓和年齡欄位
columns_to_check = ['diastolic1', 'diastolic2', 'diastolic3', 'age'] 

# 對這些欄位中極小的數值賦予 NaN (代表可能是 LOD 或錯誤值)
# 這會將整個 df 中滿足條件的儲存格設為 NaN
df[columns_to_check] = df[columns_to_check].mask(df[columns_to_check].lt(1e-10))

In [5]:
print(df.head())

        ID  diabetes   age  gender  height_cm  weight_kg    bmi  waist_cm  \
0  41475.0       2.0  62.0     2.0      154.7      138.9  58.04     156.3   
1  41476.0       2.0   6.0     2.0      120.4       22.0  15.18      52.7   
2  41477.0       1.0  71.0     1.0      167.1       83.9  30.05     109.5   
3  41478.0       2.0   1.0     2.0        NaN       11.5    NaN       NaN   
4  41479.0       2.0  52.0     1.0      154.4       65.7  27.56      95.4   

   systolic1  diastolic1  ...    ldl  triglycerides  ever_smoked  \
0      128.0        64.0  ...    NaN            NaN          2.0   
1        NaN         NaN  ...    NaN            NaN          NaN   
2      144.0        60.0  ...    NaN            NaN          1.0   
3        NaN         NaN  ...    NaN            NaN          NaN   
4      112.0        70.0  ...  121.0           99.0          2.0   

   family_diabetes  alcohol_drink  moderate_excercise  serious_excercise  \
0              1.0            NaN                 2.

#### NHANES原始代碼：


In [6]:
# 3. 目標變數 'diabetes' 的清洗與轉換
# 將 'diabetes' 欄位轉為數值 (原本可能是 '1.0', '2.0', '3.0' 字串)
df["diabetes"] = pd.to_numeric(df["diabetes"], errors='coerce')

# 篩選掉 'diabetes' 為 NaN 的樣本
df = df[df["diabetes"].notna()].copy() 

# 篩選掉 'diabetes' 為 3.0 的樣本 (移除不確定或邊緣類別)
# 由於已經轉為數值，與 3.0 比較
df = df[df["diabetes"] != 3.0].copy() 

# 將 'diabetes' 中表示「沒有糖尿病」的數值 2.0 更改為 0.0 (二元化)
df.loc[df["diabetes"] == 2.0, "diabetes"] = 0.0

In [7]:
print(sum(df["diabetes"]==0.0))
print(sum(df["diabetes"]==1.0))

51753
4710


In [8]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 56463 entries, 0 to 57380
Data columns (total 29 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  56463 non-null  float64
 1   diabetes            56463 non-null  float64
 2   age                 56463 non-null  float64
 3   gender              56463 non-null  float64
 4   height_cm           51889 non-null  float64
 5   weight_kg           53553 non-null  float64
 6   bmi                 51821 non-null  float64
 7   waist_cm            49463 non-null  float64
 8   systolic1           41252 non-null  float64
 9   diastolic1          40774 non-null  float64
 10  systolic2           41818 non-null  float64
 11  diastolic2          41211 non-null  float64
 12  systolic3           41663 non-null  float64
 13  diastolic3          40956 non-null  float64
 14  glucose_fast        18164 non-null  float64
 15  insulin             17735 non-null  float64
 16  HbA1c    

In [9]:
seed = 2025
# 假設 'Diagnosis' 是目標欄位（請依你資料中的實際欄位名調整）
X = df.drop(columns=['ID',"diabetes", "Source"])
# 將目標變數從浮點數 (float) 轉換為整數 (int)
y = df['diabetes'].astype(int)

# Step 1: 先切出 20% 的測試資料
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=seed
)

# 檢查比例
print(f"Train: {len(X_train)} samples")
print(f"Test: {len(X_test)} samples")


Train: 45170 samples
Test: 11293 samples


In [10]:
from collections import Counter
def print_class_distribution(name, y):
    counter = Counter(y)
    total = len(y)
    print(f"{name} Set Class Distribution:")
    for cls, count in counter.items():
        print(f"Class {cls}: {count} samples ({count / total:.2%})")
    print()

# 顯示類別比例
print_class_distribution("Train", y_train)
print_class_distribution("Test", y_test)

Train Set Class Distribution:
Class 0: 41402 samples (91.66%)
Class 1: 3768 samples (8.34%)

Test Set Class Distribution:
Class 0: 10351 samples (91.66%)
Class 1: 942 samples (8.34%)



# Data Preprocessing Pipeline


In [11]:
# 三個 systolic 欄位
sys_cols = ["systolic1", "systolic2", "systolic3"]

# 三個 diastolic 欄位
dia_cols = ["diastolic1", "diastolic2", "diastolic3"]

def missing_pattern_report(df, cols, name=""):
    # 計算每列缺失數量
    missing_count = df[cols].isna().sum(axis=1)

    total = len(df)

    report = {
        "all_missing (3/3)": (missing_count == 3).sum(),
        "two_missing (2/3)": (missing_count == 2).sum(),
        "one_missing (1/3)": (missing_count == 1).sum(),
        "none_missing (0/3)": (missing_count == 0).sum(),
    }

    # 轉成比例
    report_pct = {k: f"{v} ({v/total*100:.2f}%)" for k, v in report.items()}

    print(f"\n=== Missing Report for {name} ===")
    for k, v in report_pct.items():
        print(f"{k}: {v}")

    return report_pct


# 對 systolic 舒張壓做報告
sys_report = missing_pattern_report(X_train, sys_cols, name="Systolic BP")
dia_report = missing_pattern_report(X_train, dia_cols, name="Diastolic BP")



=== Missing Report for Systolic BP ===
all_missing (3/3): 10825 (23.97%)
two_missing (2/3): 671 (1.49%)
one_missing (1/3): 2023 (4.48%)
none_missing (0/3): 31651 (70.07%)

=== Missing Report for Diastolic BP ===
all_missing (3/3): 11046 (24.45%)
two_missing (2/3): 869 (1.92%)
one_missing (1/3): 2378 (5.26%)
none_missing (0/3): 30877 (68.36%)


In [12]:
## height/weight/bmi
def fill_height_weight_bmi_partial(df):
    h = df["height_cm"]
    w = df["weight_kg"]
    b = df["bmi"]

    # 補 BMI：有身高與體重，但 BMI 缺
    mask_bmi_missing = b.isna() & h.notna() & w.notna()
    df.loc[mask_bmi_missing, "bmi"] = df.loc[mask_bmi_missing, "weight_kg"] / (df.loc[mask_bmi_missing, "height_cm"] / 100) ** 2

    # 補體重：有身高與 BMI，但體重缺
    mask_weight_missing = w.isna() & h.notna() & b.notna()
    df.loc[mask_weight_missing, "weight_kg"] = df.loc[mask_weight_missing, "bmi"] * (df.loc[mask_weight_missing, "height_cm"] / 100) ** 2

    # 補身高：有體重與 BMI，但身高缺
    mask_height_missing = h.isna() & w.notna() & b.notna()
    df.loc[mask_height_missing, "height_cm"] = np.sqrt(df.loc[mask_height_missing, "weight_kg"] / df.loc[mask_height_missing, "bmi"]) * 100

    return df

def fill_remaining_height_weight_bmi(df, median_height, median_weight):
    # === 情況 1: height_weight_missing (身高 + 體重缺，BMI 有值) ===
    mask_height_weight_missing = df["height_cm"].isna() & df["weight_kg"].isna() & df["bmi"].notna()
    df.loc[mask_height_weight_missing, "weight_kg"] = median_weight
    # 使用填補後的體重 & 現有 BMI 算出身高
    df.loc[mask_height_weight_missing, "height_cm"] = np.sqrt(
        df.loc[mask_height_weight_missing, "weight_kg"] / df.loc[mask_height_weight_missing, "bmi"]
    ) * 100

    # === 情況 2: height_bmi_missing (身高 + BMI 缺) ===
    mask_height_bmi_missing = df["height_cm"].isna() & df["weight_kg"].notna() & df["bmi"].isna()
    df.loc[mask_height_bmi_missing, "height_cm"] = median_height
    # 使用補完的身高 & 現有體重計算 BMI
    df.loc[mask_height_bmi_missing, "bmi"] = df.loc[mask_height_bmi_missing, "weight_kg"] / (
        (df.loc[mask_height_bmi_missing, "height_cm"] / 100) ** 2
    )

    # === 情況 3: weight_bmi_missing (體重 + BMI 缺) ===
    mask_weight_bmi_missing = df["weight_kg"].isna() & df["height_cm"].notna() & df["bmi"].isna()
    df.loc[mask_weight_bmi_missing, "weight_kg"] = median_weight
    # 使用補完的體重 & 現有身高計算 BMI
    df.loc[mask_weight_bmi_missing, "bmi"] = df.loc[mask_weight_bmi_missing, "weight_kg"] / (
        (df.loc[mask_weight_bmi_missing, "height_cm"] / 100) ** 2
    )

    # === 情況 4: all_missing (全部缺) ===
    mask_all_missing = df["height_cm"].isna() & df["weight_kg"].isna() & df["bmi"].isna()
    df.loc[mask_all_missing, "height_cm"] = median_height
    df.loc[mask_all_missing, "weight_kg"] = median_weight
    df.loc[mask_all_missing, "bmi"] = df.loc[mask_all_missing, "weight_kg"] / (
        (df.loc[mask_all_missing, "height_cm"] / 100) ** 2
    )

    return df


# 1. 先補 X_train 的 height/weight/bmi 的可推情況（不補中位數）
X_train = fill_height_weight_bmi_partial(X_train)

# 2. 現在再算中位數
train_median_height = X_train["height_cm"].median()
train_median_weight = X_train["weight_kg"].median()

# 3. 補剩下缺的欄位
X_train = fill_remaining_height_weight_bmi(X_train, train_median_height, train_median_weight)

# 4. 用同樣中位數去補 X_test
X_test = fill_height_weight_bmi_partial(X_test)
X_test = fill_remaining_height_weight_bmi(X_test, train_median_height, train_median_weight)

# 假設欄位已重命名並轉為數值
systolic_cols = ['systolic1', 'systolic2', 'systolic3']
diastolic_cols = ['diastolic1', 'diastolic2', 'diastolic3']
X_train['systolic_avg'] = X_train[systolic_cols].mean(axis=1)
X_test['systolic_avg']  = X_test[systolic_cols].mean(axis=1)

X_train['diastolic_avg'] = X_train[diastolic_cols].mean(axis=1)
X_test['diastolic_avg']  = X_test[diastolic_cols].mean(axis=1)

# 年齡 >= 20 → 用中位數補
alcohol_median = X_train.loc[X_train["age"] >= 20, "alcohol_drink"].median()

X_train.loc[X_train["age"] < 20, "alcohol_drink"] = X_train.loc[X_train["age"] < 20, "alcohol_drink"].fillna(0)
X_test.loc[X_test["age"] < 20, "alcohol_drink"] = X_test.loc[X_test["age"] < 20, "alcohol_drink"].fillna(0)

X_train["alcohol_drink"] = X_train["alcohol_drink"].fillna(alcohol_median)
X_test["alcohol_drink"] = X_test["alcohol_drink"].fillna(alcohol_median)

## 中位數填補
# 1. 定義要補的欄位
median_fill_columns = [
    "age", "waist_cm", "glucose_fast", "insulin", "HbA1c", "cholesterol_total", "hdl", "ldl", "triglycerides",
    "sleep_time", "health_score", 'systolic_avg', 'diastolic_avg'
]


# 2. 根據 X_train 計算中位數
train_medians = X_train[median_fill_columns].median()

# 3. 使用中位數填補缺失
X_train[median_fill_columns] = X_train[median_fill_columns].fillna(train_medians)
X_test[median_fill_columns] = X_test[median_fill_columns].fillna(train_medians)


X_train.loc[X_train["age"] < 20, "ever_smoked"] = X_train.loc[X_train["age"] < 20, "ever_smoked"].fillna(2)
X_test.loc[X_test["age"] < 20, "ever_smoked"] = X_test.loc[X_test["age"] < 20, "ever_smoked"].fillna(2)

# 其他類別型欄位（排除 wheezing_sleep）
other_categorical = [
    "ever_smoked", "family_diabetes","moderate_excercise", "serious_excercise"
]

X_train[other_categorical] = X_train[other_categorical].fillna(3)
X_test[other_categorical] = X_test[other_categorical].fillna(3)

all_bp_cols = systolic_cols + diastolic_cols
X_train = X_train.drop(columns=all_bp_cols)
X_test = X_test.drop(columns=all_bp_cols)

In [13]:
def missing_report(df):
    missing_count = df.isnull().sum()
    missing_pct = (missing_count / len(df)) * 100
    report = pd.DataFrame({
        'MissingCount': missing_count,
        'MissingPercent': missing_pct
    }).sort_values(by='MissingPercent', ascending=False)
    report = report[report['MissingPercent'] > 0]
    return report
print("\n=== Training Set Missing Value Report ===")
print(missing_report(X_train))


print("\n=== Test Set Missing Value Report ===")
print(missing_report(X_test))


=== Training Set Missing Value Report ===
Empty DataFrame
Columns: [MissingCount, MissingPercent]
Index: []

=== Test Set Missing Value Report ===
Empty DataFrame
Columns: [MissingCount, MissingPercent]
Index: []


In [14]:
print(X_train.info())

<class 'pandas.core.frame.DataFrame'>
Index: 45170 entries, 9986 to 47610
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   age                 45170 non-null  float64
 1   gender              45170 non-null  float64
 2   height_cm           45170 non-null  float64
 3   weight_kg           45170 non-null  float64
 4   bmi                 45170 non-null  float64
 5   waist_cm            45170 non-null  float64
 6   glucose_fast        45170 non-null  float64
 7   insulin             45170 non-null  float64
 8   HbA1c               45170 non-null  float64
 9   cholesterol_total   45170 non-null  float64
 10  hdl                 45170 non-null  float64
 11  ldl                 45170 non-null  float64
 12  triglycerides       45170 non-null  float64
 13  ever_smoked         45170 non-null  float64
 14  family_diabetes     45170 non-null  float64
 15  alcohol_drink       45170 non-null  float64
 16  modera

In [15]:
print(X_train.head())

        age  gender  height_cm  weight_kg        bmi  waist_cm  glucose_fast  \
9986    9.0     2.0      132.6       33.9  19.280000      66.7          99.0   
33272  40.0     1.0      179.8      101.5  31.400000      94.6          93.0   
7086    8.0     1.0      141.4       34.3  17.160000      58.6          99.0   
15800   1.0     2.0      161.9       12.5   4.768879      88.5          99.0   
49867   3.0     1.0      100.1       16.2  16.200000      88.5          99.0   

       insulin  HbA1c  cholesterol_total  ...  triglycerides  ever_smoked  \
9986     10.16    5.5              144.0  ...           94.0          2.0   
33272     7.43    5.2              153.0  ...           64.0          1.0   
7086     10.16    5.5              152.0  ...           94.0          2.0   
15800    10.16    5.5              178.0  ...           94.0          2.0   
49867    10.16    5.5              178.0  ...           94.0          2.0   

       family_diabetes  alcohol_drink  moderate_excercis

In [16]:
print(X_test.head())

        age  gender  height_cm  weight_kg    bmi  waist_cm  glucose_fast  \
48569  20.0     1.0      165.2      100.6  36.90     110.5          99.0   
18733  65.0     1.0      175.2      100.0  32.58     116.4          99.0   
26803  20.0     1.0      170.6       84.4  29.00      93.4         105.0   
22057   7.0     2.0      121.2       28.9  19.70      64.5          99.0   
5777    7.0     2.0      133.8       41.1  22.96      81.4          99.0   

       insulin  HbA1c  cholesterol_total  ...  triglycerides  ever_smoked  \
48569    10.16    5.1              229.0  ...           94.0          2.0   
18733    10.16    8.7              160.0  ...           94.0          1.0   
26803     8.58    5.0              218.0  ...          119.0          1.0   
22057    10.16    5.5              160.0  ...           94.0          2.0   
5777     10.16    5.5              119.0  ...           94.0          2.0   

       family_diabetes  alcohol_drink  moderate_excercise  serious_excercise  \


## Preprocessing


In [17]:
minmax_columns = [
    "age", "height_cm", "weight_kg", "bmi",
    "waist_cm", "glucose_fast", "insulin", "HbA1c", "cholesterol_total", "hdl", "ldl", "triglycerides",
    "alcohol_drink", "sleep_time", "health_score", 'systolic_avg', 'diastolic_avg'
]

onehot_columns = [
    "gender", "ever_smoked", "family_diabetes","moderate_excercise", "serious_excercise"
]

# MinMax normalization
scaler = MinMaxScaler()
X_train[minmax_columns] = scaler.fit_transform(X_train[minmax_columns])
X_test[minmax_columns] = scaler.transform(X_test[minmax_columns])

for col in onehot_columns:
    X_train[col] = X_train[col].astype(str)
    X_test[col] = X_test[col].astype(str)

# One-Hot Encoding
X_train = pd.get_dummies(X_train, columns=onehot_columns)
X_test = pd.get_dummies(X_test, columns=onehot_columns)

# 補齊測試集欄位
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# 確保所有 one-hot 欄位為 int 而非 bool
X_train = X_train.astype({col: int for col in X_train.select_dtypes(bool).columns})
X_test = X_test.astype({col: int for col in X_test.select_dtypes(bool).columns})
print(X_train.head())
print(X_test.head())

            age  height_cm  weight_kg       bmi  waist_cm  glucose_fast  \
9986   0.101266   0.429365   0.118026  0.204278  0.204674      0.138544   
33272  0.493671   0.803968   0.408155  0.351315  0.402266      0.127886   
7086   0.088608   0.499206   0.119742  0.178559  0.147309      0.138544   
15800  0.000000   0.661905   0.026180  0.028233  0.359065      0.138544   
49867  0.025316   0.171429   0.042060  0.166913  0.359065      0.138544   

        insulin     HbA1c  cholesterol_total       hdl  ...  ever_smoked_3.0  \
9986   0.015478  0.225806           0.112732  0.173516  ...                0   
33272  0.011261  0.206452           0.124668  0.136986  ...                0   
7086   0.015478  0.225806           0.123342  0.255708  ...                0   
15800  0.015478  0.225806           0.157825  0.200913  ...                0   
49867  0.015478  0.225806           0.157825  0.200913  ...                0   

       family_diabetes_1.0  family_diabetes_2.0  family_diabetes_3.0

In [18]:
print(list(X_train.columns))

['age', 'height_cm', 'weight_kg', 'bmi', 'waist_cm', 'glucose_fast', 'insulin', 'HbA1c', 'cholesterol_total', 'hdl', 'ldl', 'triglycerides', 'alcohol_drink', 'sleep_time', 'health_score', 'systolic_avg', 'diastolic_avg', 'gender_1.0', 'gender_2.0', 'ever_smoked_1.0', 'ever_smoked_2.0', 'ever_smoked_3.0', 'family_diabetes_1.0', 'family_diabetes_2.0', 'family_diabetes_3.0', 'moderate_excercise_1.0', 'moderate_excercise_2.0', 'moderate_excercise_3.0', 'serious_excercise_1.0', 'serious_excercise_2.0', 'serious_excercise_3.0']


In [21]:
x = X_train.values
y = y_train.values

## balanced-accuracy

#### Random Forest(balanced_weight)

In [None]:
def compute_metrics(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    balanced_acc = (sensitivity + specificity) / 2
    return balanced_acc, accuracy, sensitivity, specificity

In [None]:
# 參數網格
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [5, 10, 15, 20],
    "min_samples_split": [2, 3, 4, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", None],
    "max_leaf_nodes": [10, 20, 30, 40]
}


param_list = list(ParameterGrid(param_grid))

print(f"Total combinations: {len(param_list)}")
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

best_score = 0
best_params = None
best_metrics = None

print("Running Grid Search with 5-Fold CV (train/val metrics)...\n")

for i, param in tqdm(enumerate(param_list)):
    fold_train_bal_acc = []
    fold_val_bal_acc = []

    fold_train_acc = []
    fold_val_acc = []

    fold_train_sens = []
    fold_val_sens = []

    fold_train_spec = []
    fold_val_spec = []

    for fold_idx, (train_idx, val_idx) in enumerate(cv.split(x, y)):
        x_train_fold, y_train_fold = x[train_idx], y[train_idx]
        x_val_fold, y_val_fold = x[val_idx], y[val_idx]

        clf = RandomForestClassifier(
            **param,
            random_state=seed,
            n_jobs=-1,
            class_weight='balanced'
        )
        clf.fit(x_train_fold, y_train_fold)

        # 預測
        y_train_pred = clf.predict(x_train_fold)
        y_val_pred = clf.predict(x_val_fold)

        # 計算指標
        train_bal_acc, train_acc, train_sens, train_spec = compute_metrics(y_train_fold, y_train_pred)
        val_bal_acc, val_acc, val_sens, val_spec = compute_metrics(y_val_fold, y_val_pred)

        # 累積
        fold_train_bal_acc.append(train_bal_acc)
        fold_val_bal_acc.append(val_bal_acc)

        fold_train_acc.append(train_acc)
        fold_val_acc.append(val_acc)

        fold_train_sens.append(train_sens)
        fold_val_sens.append(val_sens)

        fold_train_spec.append(train_spec)
        fold_val_spec.append(val_spec)

    # 平均
    avg_train_bal_acc = np.mean(fold_train_bal_acc)
    avg_val_bal_acc = np.mean(fold_val_bal_acc)

    avg_train_acc = np.mean(fold_train_acc)
    avg_val_acc = np.mean(fold_val_acc)

    avg_train_sens = np.mean(fold_train_sens)
    avg_val_sens = np.mean(fold_val_sens)

    avg_train_spec = np.mean(fold_train_spec)
    avg_val_spec = np.mean(fold_val_spec)

    if avg_val_bal_acc > best_score:
        best_score = avg_val_bal_acc
        best_params = param
        best_metrics = {
            "Balanced Accuracy": avg_val_bal_acc,
            "Accuracy": avg_val_acc,
            "Sensitivity": avg_val_sens,
            "Specificity": avg_val_spec,
            "Train Balanced Accuracy": avg_train_bal_acc,
            "Train Accuracy": avg_train_acc,
            "Train Sensitivity": avg_train_sens,
            "Train Specificity": avg_train_spec
        }




In [None]:
print("Best Params:", best_params)
print(f"Best Balanced Accuracy: {best_score:.4f}")
print(f"Best Accuracy:           {best_metrics['Accuracy']:.4f}")
print(f"Best Sensitivity:        {best_metrics['Sensitivity']:.4f}")
print(f"Best Specificity:        {best_metrics['Specificity']:.4f}")
print(f"Best Training Balanced Accuracy:   {best_metrics['Train Balanced Accuracy']:.4f}")
print(f"Best Training Accuracy:            {best_metrics['Train Accuracy']:.4f}")
print(f"Best Training Sensitivity:         {best_metrics['Train Sensitivity']:.4f}")
print(f"Best Training Specificity:         {best_metrics['Train Specificity']:.4f}")

##### Test

In [None]:
print("\nTraining final model on full training data with best parameters...")
final_model = RandomForestClassifier(
    **best_params,
    random_state=seed,
    n_jobs=-1,
    class_weight='balanced'
)
final_model.fit(x, y)

# 預測 Test Set
y_test_proba = final_model.predict_proba(X_test.values)[:, 1]
y_test_pred = (y_test_proba >= 0.5).astype(int)

cm = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm.ravel()

# 指標計算
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
accuracy = (tp + tn) / (tp + tn + fp + fn)
balanced_acc = (sensitivity + specificity) / 2
auc = roc_auc_score(y_test, y_test_proba)
f1 = f1_score(y_test, y_test_pred, zero_division=0)

# 轉成 DataFrame (Wikipedia 標準)
confusion_df = pd.DataFrame(
    data=[[tp, fn], [fp, tn]],
    index=["Actual Positive", "Actual Negative"],
    columns=["Predicted Positive", "Predicted Negative"]
)

print("\n✅ Confusion Matrix (DataFrame, Wikipedia format):")
print(confusion_df)

# 顯示結果
print(f"Sensitivity (Recall, TPR): {sensitivity:.4f}")
print(f"Specificity (TNR):         {specificity:.4f}")
print(f"Accuracy:                  {accuracy:.4f}")
print(f"Balanced Accuracy:         {balanced_acc:.4f}")
print(f"AUC:                       {auc:.4f}")
print(f"F1 Score:         {f1:.4f}")


Training final model on full training data with best parameters...

✅ Confusion Matrix (DataFrame, Wikipedia format):
                 Predicted Positive  Predicted Negative
Actual Positive                 112                  63
Actual Negative                 143                1644
Sensitivity (Recall, TPR): 0.6400
Specificity (TNR):         0.9200
Accuracy:                  0.8950
Balanced Accuracy:         0.7800
AUC:                       0.8565
F1 Score:         0.5209


#### Logistic Regression(balanced-weight)

In [71]:
C_list = [2 ** i for i in range(-10, 11)]
penalty_list = ['l1', 'l2'] # 新增正則化類型列表

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

best_score = 0
best_params = None
best_metrics = None

print("Running Grid Search over C with 5-Fold Stratified CV...\n")

# --- 雙層迴圈開始 ---
for penalty in penalty_list:
    print(f"================ Running Penalty: {penalty.upper()} ================")
    
    # 針對 L1 必須使用 saga，針對 L2 使用 lbfgs (除非使用 liblinear)
    if penalty == 'l1':
        # saga 支持 L1 和 L2
        solver_type = "saga" 
    else:
        # lbfgs 速度快，但只支持 L2
        solver_type = "lbfgs" 

    for C in tqdm(C_list, desc=f"C Search ({penalty.upper()})"):
        
        # 指標列表初始化
        fold_train_bal_acc = []
        fold_train_acc = []
        fold_train_sens = []
        fold_train_spec = []

        fold_val_bal_acc = []
        fold_val_acc = []
        fold_val_sens = []
        fold_val_spec = []

        for fold, (train_idx, val_idx) in enumerate(cv.split(x, y), 1):
            x_train_fold, y_train_fold = x[train_idx], y[train_idx]
            x_val_fold, y_val_fold = x[val_idx], y[val_idx]

            clf = LogisticRegression(
                C=C,
                penalty=penalty,
                solver=solver_type,
                class_weight="balanced",
                max_iter=5000,
                random_state=seed,
                tol=1e-4,
                n_jobs=-1
            )
            clf.fit(x_train_fold, y_train_fold)
            # --- Train
            y_train_pred = clf.predict(x_train_fold)
            tn, fp, fn, tp = confusion_matrix(y_train_fold, y_train_pred).ravel()
            train_sens = tp / (tp + fn) if (tp + fn) > 0 else 0
            train_spec = tn / (tn + fp) if (tn + fp) > 0 else 0
            train_acc = (tp + tn) / (tp + tn + fp + fn)
            train_bal_acc = (train_sens + train_spec) / 2

            fold_train_bal_acc.append(train_bal_acc)
            fold_train_acc.append(train_acc)
            fold_train_sens.append(train_sens)
            fold_train_spec.append(train_spec)

            # --- Validation
            y_val_pred = clf.predict(x_val_fold)
            tn, fp, fn, tp = confusion_matrix(y_val_fold, y_val_pred).ravel()
            val_sens = tp / (tp + fn) if (tp + fn) > 0 else 0
            val_spec = tn / (tn + fp) if (tn + fp) > 0 else 0
            val_acc = (tp + tn) / (tp + tn + fp + fn)
            val_bal_acc = (val_sens + val_spec) / 2

            fold_val_bal_acc.append(val_bal_acc)
            fold_val_acc.append(val_acc)
            fold_val_sens.append(val_sens)
            fold_val_spec.append(val_spec)

        #print(f"C={C:<10} Fold {fold}:")
        #print(f"  Train - BalAcc: {train_bal_acc:.4f} Acc: {train_acc:.4f} Sens: {train_sens:.4f} Spec: {train_spec:.4f}")
        #print(f"  Val   - BalAcc: {val_bal_acc:.4f} Acc: {val_acc:.4f} Sens: {val_sens:.4f} Spec: {val_spec:.4f}")

        # === Mean over folds
        avg_train_bal_acc = np.mean(fold_train_bal_acc)
        avg_train_acc = np.mean(fold_train_acc)
        avg_train_sens = np.mean(fold_train_sens)
        avg_train_spec = np.mean(fold_train_spec)

        avg_val_bal_acc = np.mean(fold_val_bal_acc)
        avg_val_acc = np.mean(fold_val_acc)
        avg_val_sens = np.mean(fold_val_sens)
        avg_val_spec = np.mean(fold_val_spec)

        print(f"\nMean over folds for C={C}, penalty={penalty}:")
        print(f"  Train - BalAcc: {avg_train_bal_acc:.4f} Acc: {avg_train_acc:.4f} Sens: {avg_train_sens:.4f} Spec: {avg_train_spec:.4f}")
        print(f"  Val   - BalAcc: {avg_val_bal_acc:.4f} Acc: {avg_val_acc:.4f} Sens: {avg_val_sens:.4f} Spec: {avg_val_spec:.4f}\n")

        # --- Best tracking
        if avg_val_bal_acc > best_score:
            best_score = avg_val_bal_acc
            best_C = C
            best_penalty = penalty
            best_metrics = {
            "Balanced Accuracy": avg_val_bal_acc,
            "Accuracy": avg_val_acc,
            "Sensitivity": avg_val_sens,
            "Specificity": avg_val_spec,
            "Train Balanced Accuracy": avg_train_bal_acc,
            "Train Accuracy": avg_train_acc,
            "Train Sensitivity": avg_train_sens,
            "Train Specificity": avg_train_spec,
            }

Running Grid Search over C with 5-Fold Stratified CV...



C Search (L1):   5%|▍         | 1/21 [00:07<02:36,  7.85s/it]


Mean over folds for C=0.0009765625, penalty=l1:
  Train - BalAcc: 0.8163 Acc: 0.7497 Sens: 0.8962 Spec: 0.7364
  Val   - BalAcc: 0.8163 Acc: 0.7499 Sens: 0.8960 Spec: 0.7366



C Search (L1):  10%|▉         | 2/21 [00:44<07:55, 25.03s/it]


Mean over folds for C=0.001953125, penalty=l1:
  Train - BalAcc: 0.8351 Acc: 0.7778 Sens: 0.9039 Spec: 0.7664
  Val   - BalAcc: 0.8348 Acc: 0.7777 Sens: 0.9034 Spec: 0.7663



C Search (L1):  14%|█▍        | 3/21 [01:09<07:28, 24.91s/it]


Mean over folds for C=0.00390625, penalty=l1:
  Train - BalAcc: 0.8621 Acc: 0.8253 Sens: 0.9062 Spec: 0.8180
  Val   - BalAcc: 0.8624 Acc: 0.8253 Sens: 0.9068 Spec: 0.8179



C Search (L1):  19%|█▉        | 4/21 [01:20<05:27, 19.25s/it]


Mean over folds for C=0.0078125, penalty=l1:
  Train - BalAcc: 0.8753 Acc: 0.8519 Sens: 0.9034 Spec: 0.8473
  Val   - BalAcc: 0.8749 Acc: 0.8516 Sens: 0.9029 Spec: 0.8469



C Search (L1):  24%|██▍       | 5/21 [01:28<04:03, 15.24s/it]


Mean over folds for C=0.015625, penalty=l1:
  Train - BalAcc: 0.8815 Acc: 0.8685 Sens: 0.8971 Spec: 0.8659
  Val   - BalAcc: 0.8816 Acc: 0.8683 Sens: 0.8976 Spec: 0.8656



C Search (L1):  29%|██▊       | 6/21 [01:35<03:04, 12.32s/it]


Mean over folds for C=0.03125, penalty=l1:
  Train - BalAcc: 0.8855 Acc: 0.8800 Sens: 0.8921 Spec: 0.8789
  Val   - BalAcc: 0.8853 Acc: 0.8798 Sens: 0.8920 Spec: 0.8787



C Search (L1):  33%|███▎      | 7/21 [01:41<02:26, 10.47s/it]


Mean over folds for C=0.0625, penalty=l1:
  Train - BalAcc: 0.8877 Acc: 0.8853 Sens: 0.8906 Spec: 0.8848
  Val   - BalAcc: 0.8878 Acc: 0.8852 Sens: 0.8909 Spec: 0.8847



C Search (L1):  38%|███▊      | 8/21 [01:48<02:01,  9.38s/it]


Mean over folds for C=0.125, penalty=l1:
  Train - BalAcc: 0.8886 Acc: 0.8882 Sens: 0.8891 Spec: 0.8881
  Val   - BalAcc: 0.8877 Acc: 0.8880 Sens: 0.8875 Spec: 0.8880



C Search (L1):  43%|████▎     | 9/21 [01:56<01:44,  8.74s/it]


Mean over folds for C=0.25, penalty=l1:
  Train - BalAcc: 0.8889 Acc: 0.8894 Sens: 0.8883 Spec: 0.8895
  Val   - BalAcc: 0.8873 Acc: 0.8890 Sens: 0.8854 Spec: 0.8893



C Search (L1):  48%|████▊     | 10/21 [02:03<01:31,  8.33s/it]


Mean over folds for C=0.5, penalty=l1:
  Train - BalAcc: 0.8891 Acc: 0.8901 Sens: 0.8879 Spec: 0.8903
  Val   - BalAcc: 0.8875 Acc: 0.8897 Sens: 0.8848 Spec: 0.8902



C Search (L1):  52%|█████▏    | 11/21 [02:11<01:21,  8.16s/it]


Mean over folds for C=1, penalty=l1:
  Train - BalAcc: 0.8891 Acc: 0.8902 Sens: 0.8877 Spec: 0.8905
  Val   - BalAcc: 0.8875 Acc: 0.8899 Sens: 0.8846 Spec: 0.8904



C Search (L1):  57%|█████▋    | 12/21 [02:24<01:26,  9.59s/it]


Mean over folds for C=2, penalty=l1:
  Train - BalAcc: 0.8892 Acc: 0.8904 Sens: 0.8877 Spec: 0.8906
  Val   - BalAcc: 0.8876 Acc: 0.8903 Sens: 0.8843 Spec: 0.8909



C Search (L1):  62%|██████▏   | 13/21 [02:39<01:31, 11.39s/it]


Mean over folds for C=4, penalty=l1:
  Train - BalAcc: 0.8892 Acc: 0.8904 Sens: 0.8878 Spec: 0.8907
  Val   - BalAcc: 0.8876 Acc: 0.8902 Sens: 0.8846 Spec: 0.8907



C Search (L1):  67%|██████▋   | 14/21 [03:12<02:04, 17.78s/it]


Mean over folds for C=8, penalty=l1:
  Train - BalAcc: 0.8893 Acc: 0.8905 Sens: 0.8878 Spec: 0.8908
  Val   - BalAcc: 0.8877 Acc: 0.8903 Sens: 0.8846 Spec: 0.8908



C Search (L1):  71%|███████▏  | 15/21 [03:48<02:19, 23.32s/it]


Mean over folds for C=16, penalty=l1:
  Train - BalAcc: 0.8893 Acc: 0.8905 Sens: 0.8877 Spec: 0.8908
  Val   - BalAcc: 0.8877 Acc: 0.8903 Sens: 0.8846 Spec: 0.8908



C Search (L1):  76%|███████▌  | 16/21 [04:47<02:50, 34.11s/it]


Mean over folds for C=32, penalty=l1:
  Train - BalAcc: 0.8893 Acc: 0.8906 Sens: 0.8877 Spec: 0.8909
  Val   - BalAcc: 0.8877 Acc: 0.8903 Sens: 0.8846 Spec: 0.8908



C Search (L1):  81%|████████  | 17/21 [05:01<01:51, 27.97s/it]


Mean over folds for C=64, penalty=l1:
  Train - BalAcc: 0.8892 Acc: 0.8906 Sens: 0.8876 Spec: 0.8909
  Val   - BalAcc: 0.8877 Acc: 0.8904 Sens: 0.8846 Spec: 0.8909



C Search (L1):  86%|████████▌ | 18/21 [05:13<01:09, 23.28s/it]


Mean over folds for C=128, penalty=l1:
  Train - BalAcc: 0.8893 Acc: 0.8906 Sens: 0.8877 Spec: 0.8908
  Val   - BalAcc: 0.8877 Acc: 0.8903 Sens: 0.8846 Spec: 0.8909



C Search (L1):  90%|█████████ | 19/21 [05:25<00:39, 19.96s/it]


Mean over folds for C=256, penalty=l1:
  Train - BalAcc: 0.8893 Acc: 0.8906 Sens: 0.8877 Spec: 0.8909
  Val   - BalAcc: 0.8877 Acc: 0.8903 Sens: 0.8846 Spec: 0.8909



C Search (L1):  95%|█████████▌| 20/21 [05:38<00:17, 17.69s/it]


Mean over folds for C=512, penalty=l1:
  Train - BalAcc: 0.8893 Acc: 0.8906 Sens: 0.8877 Spec: 0.8909
  Val   - BalAcc: 0.8876 Acc: 0.8903 Sens: 0.8843 Spec: 0.8909



C Search (L1): 100%|██████████| 21/21 [05:50<00:00, 16.71s/it]



Mean over folds for C=1024, penalty=l1:
  Train - BalAcc: 0.8893 Acc: 0.8906 Sens: 0.8877 Spec: 0.8909
  Val   - BalAcc: 0.8876 Acc: 0.8903 Sens: 0.8843 Spec: 0.8909



C Search (L2):   5%|▍         | 1/21 [00:50<16:41, 50.09s/it]


Mean over folds for C=0.0009765625, penalty=l2:
  Train - BalAcc: 0.8183 Acc: 0.7506 Sens: 0.8995 Spec: 0.7370
  Val   - BalAcc: 0.8172 Acc: 0.7505 Sens: 0.8973 Spec: 0.7371



C Search (L2):  10%|▉         | 2/21 [01:34<14:48, 46.77s/it]


Mean over folds for C=0.001953125, penalty=l2:
  Train - BalAcc: 0.8278 Acc: 0.7674 Sens: 0.9003 Spec: 0.7553
  Val   - BalAcc: 0.8286 Acc: 0.7678 Sens: 0.9015 Spec: 0.7556



C Search (L2):  14%|█▍        | 3/21 [01:55<10:31, 35.10s/it]


Mean over folds for C=0.00390625, penalty=l2:
  Train - BalAcc: 0.8384 Acc: 0.7858 Sens: 0.9016 Spec: 0.7753
  Val   - BalAcc: 0.8390 Acc: 0.7858 Sens: 0.9029 Spec: 0.7752



C Search (L2):  19%|█▉        | 4/21 [02:01<06:38, 23.46s/it]


Mean over folds for C=0.0078125, penalty=l2:
  Train - BalAcc: 0.8508 Acc: 0.8068 Sens: 0.9037 Spec: 0.7980
  Val   - BalAcc: 0.8507 Acc: 0.8065 Sens: 0.9037 Spec: 0.7977



C Search (L2):  24%|██▍       | 5/21 [02:07<04:33, 17.06s/it]


Mean over folds for C=0.015625, penalty=l2:
  Train - BalAcc: 0.8616 Acc: 0.8253 Sens: 0.9051 Spec: 0.8180
  Val   - BalAcc: 0.8608 Acc: 0.8247 Sens: 0.9042 Spec: 0.8174



C Search (L2):  29%|██▊       | 6/21 [02:12<03:18, 13.23s/it]


Mean over folds for C=0.03125, penalty=l2:
  Train - BalAcc: 0.8696 Acc: 0.8415 Sens: 0.9034 Spec: 0.8358
  Val   - BalAcc: 0.8690 Acc: 0.8414 Sens: 0.9021 Spec: 0.8358



C Search (L2):  33%|███▎      | 7/21 [02:18<02:31, 10.83s/it]


Mean over folds for C=0.0625, penalty=l2:
  Train - BalAcc: 0.8753 Acc: 0.8551 Sens: 0.8995 Spec: 0.8511
  Val   - BalAcc: 0.8745 Acc: 0.8551 Sens: 0.8978 Spec: 0.8512



C Search (L2):  38%|███▊      | 8/21 [02:24<02:01,  9.31s/it]


Mean over folds for C=0.125, penalty=l2:
  Train - BalAcc: 0.8813 Acc: 0.8664 Sens: 0.8991 Spec: 0.8635
  Val   - BalAcc: 0.8808 Acc: 0.8663 Sens: 0.8984 Spec: 0.8633



C Search (L2):  43%|████▎     | 9/21 [02:30<01:39,  8.33s/it]


Mean over folds for C=0.25, penalty=l2:
  Train - BalAcc: 0.8847 Acc: 0.8750 Sens: 0.8965 Spec: 0.8730
  Val   - BalAcc: 0.8839 Acc: 0.8746 Sens: 0.8952 Spec: 0.8727



C Search (L2):  48%|████▊     | 10/21 [02:37<01:24,  7.68s/it]


Mean over folds for C=0.5, penalty=l2:
  Train - BalAcc: 0.8870 Acc: 0.8813 Sens: 0.8938 Spec: 0.8802
  Val   - BalAcc: 0.8855 Acc: 0.8803 Sens: 0.8917 Spec: 0.8793



C Search (L2):  52%|█████▏    | 11/21 [02:43<01:11,  7.16s/it]


Mean over folds for C=1, penalty=l2:
  Train - BalAcc: 0.8881 Acc: 0.8857 Sens: 0.8909 Spec: 0.8852
  Val   - BalAcc: 0.8871 Acc: 0.8851 Sens: 0.8896 Spec: 0.8847



C Search (L2):  57%|█████▋    | 12/21 [02:49<01:01,  6.80s/it]


Mean over folds for C=2, penalty=l2:
  Train - BalAcc: 0.8886 Acc: 0.8882 Sens: 0.8892 Spec: 0.8881
  Val   - BalAcc: 0.8874 Acc: 0.8878 Sens: 0.8869 Spec: 0.8878



C Search (L2):  62%|██████▏   | 13/21 [02:55<00:52,  6.56s/it]


Mean over folds for C=4, penalty=l2:
  Train - BalAcc: 0.8890 Acc: 0.8895 Sens: 0.8884 Spec: 0.8896
  Val   - BalAcc: 0.8873 Acc: 0.8888 Sens: 0.8854 Spec: 0.8892



C Search (L2):  67%|██████▋   | 14/21 [03:01<00:44,  6.39s/it]


Mean over folds for C=8, penalty=l2:
  Train - BalAcc: 0.8889 Acc: 0.8898 Sens: 0.8878 Spec: 0.8900
  Val   - BalAcc: 0.8876 Acc: 0.8894 Sens: 0.8854 Spec: 0.8898



C Search (L2):  71%|███████▏  | 15/21 [03:07<00:37,  6.28s/it]


Mean over folds for C=16, penalty=l2:
  Train - BalAcc: 0.8892 Acc: 0.8904 Sens: 0.8877 Spec: 0.8907
  Val   - BalAcc: 0.8879 Acc: 0.8902 Sens: 0.8851 Spec: 0.8907



C Search (L2):  76%|███████▌  | 16/21 [03:13<00:30,  6.18s/it]


Mean over folds for C=32, penalty=l2:
  Train - BalAcc: 0.8892 Acc: 0.8905 Sens: 0.8876 Spec: 0.8908
  Val   - BalAcc: 0.8877 Acc: 0.8902 Sens: 0.8846 Spec: 0.8908



C Search (L2):  81%|████████  | 17/21 [03:19<00:24,  6.12s/it]


Mean over folds for C=64, penalty=l2:
  Train - BalAcc: 0.8894 Acc: 0.8906 Sens: 0.8879 Spec: 0.8909
  Val   - BalAcc: 0.8879 Acc: 0.8900 Sens: 0.8854 Spec: 0.8905



C Search (L2):  86%|████████▌ | 18/21 [03:25<00:18,  6.10s/it]


Mean over folds for C=128, penalty=l2:
  Train - BalAcc: 0.8894 Acc: 0.8907 Sens: 0.8877 Spec: 0.8910
  Val   - BalAcc: 0.8881 Acc: 0.8904 Sens: 0.8854 Spec: 0.8909



C Search (L2):  90%|█████████ | 19/21 [03:31<00:12,  6.09s/it]


Mean over folds for C=256, penalty=l2:
  Train - BalAcc: 0.8893 Acc: 0.8907 Sens: 0.8877 Spec: 0.8910
  Val   - BalAcc: 0.8880 Acc: 0.8903 Sens: 0.8851 Spec: 0.8908



C Search (L2):  95%|█████████▌| 20/21 [03:37<00:06,  6.06s/it]


Mean over folds for C=512, penalty=l2:
  Train - BalAcc: 0.8893 Acc: 0.8907 Sens: 0.8877 Spec: 0.8910
  Val   - BalAcc: 0.8879 Acc: 0.8904 Sens: 0.8848 Spec: 0.8909



C Search (L2): 100%|██████████| 21/21 [03:43<00:00, 10.63s/it]


Mean over folds for C=1024, penalty=l2:
  Train - BalAcc: 0.8893 Acc: 0.8907 Sens: 0.8877 Spec: 0.8910
  Val   - BalAcc: 0.8880 Acc: 0.8904 Sens: 0.8851 Spec: 0.8909






In [72]:
print("\n Best Grid Search Result:")
print(f"Best C: {best_C}")
print(f"Best Penalty: {best_penalty}")
print(f"Best Validation Balanced Accuracy: {best_metrics['Balanced Accuracy']:.4f}")
print(f"Best Validation Accuracy:          {best_metrics['Accuracy']:.4f}")
print(f"Best Validation Sensitivity:       {best_metrics['Sensitivity']:.4f}")
print(f"Best Validation Specificity:       {best_metrics['Specificity']:.4f}")
print(f"Best Training Balanced Accuracy:   {best_metrics['Train Balanced Accuracy']:.4f}")
print(f"Best Training Accuracy:            {best_metrics['Train Accuracy']:.4f}")
print(f"Best Training Sensitivity:         {best_metrics['Train Sensitivity']:.4f}")
print(f"Best Training Specificity:         {best_metrics['Train Specificity']:.4f}")


 Best Grid Search Result:
Best C: 128
Best Penalty: l2
Best Validation Balanced Accuracy: 0.8881
Best Validation Accuracy:          0.8904
Best Validation Sensitivity:       0.8854
Best Validation Specificity:       0.8909
Best Training Balanced Accuracy:   0.8894
Best Training Accuracy:            0.8907
Best Training Sensitivity:         0.8877
Best Training Specificity:         0.8910


##### Test

In [73]:
if best_penalty == 'l1':
    # saga 支持 L1 和 L2
    solver_type = "saga" 
else:
    # lbfgs 速度快，但只支持 L2
    solver_type = "lbfgs" 


# 取得最佳參數
final_model = LogisticRegression(
    C=best_C,
    penalty=best_penalty,
    max_iter=5000,
    solver=solver_type,
    class_weight='balanced',
    random_state=seed
)
final_model.fit(x, y)


# 預測
y_test_proba = final_model.predict_proba(X_test.values)[:, 1]
y_test_pred = (y_test_proba >= 0.5).astype(int)


# 混淆矩陣
from sklearn.metrics import confusion_matrix, roc_auc_score

cm = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm.ravel()

# 指標計算
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
accuracy = (tp + tn) / (tp + tn + fp + fn)
balanced_acc = (sensitivity + specificity) / 2
auc = roc_auc_score(y_test, y_test_proba)
f1 = f1_score(y_test, y_test_pred, zero_division=0)

# 轉成 DataFrame (Wikipedia 標準)
confusion_df = pd.DataFrame(
    data=[[tp, fn], [fp, tn]],
    index=["Actual Positive", "Actual Negative"],
    columns=["Predicted Positive", "Predicted Negative"]
)

print("\n✅ Confusion Matrix (DataFrame, Wikipedia format):")
print(confusion_df)

# 顯示結果
print(f"Sensitivity (Recall, TPR): {sensitivity:.4f}")
print(f"Specificity (TNR):         {specificity:.4f}")
print(f"Accuracy:                  {accuracy:.4f}")
print(f"Balanced Accuracy:         {balanced_acc:.4f}")
print(f"AUC:                       {auc:.4f}")
print(f"F1 Score:         {f1:.4f}")


✅ Confusion Matrix (DataFrame, Wikipedia format):
                 Predicted Positive  Predicted Negative
Actual Positive                 848                  94
Actual Negative                1132                9219
Sensitivity (Recall, TPR): 0.9002
Specificity (TNR):         0.8906
Accuracy:                  0.8914
Balanced Accuracy:         0.8954
AUC:                       0.9589
F1 Score:         0.5804


#### SVM(balanced-weight)

In [None]:
# 定義超參數範圍
C_list = [2 ** i for i in range(-10, -5)]
gamma_list = [2 ** i for i in range(-10, 11)]
param_grid = list(product(C_list, gamma_list))

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

best_score = 0
best_params = None
best_metrics = None


print("Running Grid Search over C and gamma with 5-Fold Stratified CV...\n")

for C, gamma in tqdm(param_grid):
    fold_train_bal_acc = []
    fold_train_acc = []
    fold_train_sens = []
    fold_train_spec = []

    fold_val_bal_acc = []
    fold_val_acc = []
    fold_val_sens = []
    fold_val_spec = []

    for fold, (train_idx, val_idx) in enumerate(cv.split(x, y), 1):
        x_train_fold, y_train_fold = x[train_idx], y[train_idx]
        x_val_fold, y_val_fold = x[val_idx], y[val_idx]

        clf = SVC(
            C=C,
            gamma=gamma,
            kernel='rbf',
            class_weight='balanced',
            probability=False,
            random_state=seed,
            cache_size=1000
        )
        clf.fit(x_train_fold, y_train_fold)

        # --- Train
        y_train_pred = clf.predict(x_train_fold)
        tn, fp, fn, tp = confusion_matrix(y_train_fold, y_train_pred).ravel()
        train_sens = tp / (tp + fn) if (tp + fn) > 0 else 0
        train_spec = tn / (tn + fp) if (tn + fp) > 0 else 0
        train_acc = (tp + tn) / (tp + tn + fp + fn)
        train_bal_acc = (train_sens + train_spec) / 2

        fold_train_bal_acc.append(train_bal_acc)
        fold_train_acc.append(train_acc)
        fold_train_sens.append(train_sens)
        fold_train_spec.append(train_spec)

        # --- Validation
        y_val_pred = clf.predict(x_val_fold)
        tn, fp, fn, tp = confusion_matrix(y_val_fold, y_val_pred).ravel()
        val_sens = tp / (tp + fn) if (tp + fn) > 0 else 0
        val_spec = tn / (tn + fp) if (tn + fp) > 0 else 0
        val_acc = (tp + tn) / (tp + tn + fp + fn)
        val_bal_acc = (val_sens + val_spec) / 2

        fold_val_bal_acc.append(val_bal_acc)
        fold_val_acc.append(val_acc)
        fold_val_sens.append(val_sens)
        fold_val_spec.append(val_spec)

    # === Mean over folds
    avg_train_bal_acc = np.mean(fold_train_bal_acc)
    avg_train_acc = np.mean(fold_train_acc)
    avg_train_sens = np.mean(fold_train_sens)
    avg_train_spec = np.mean(fold_train_spec)

    avg_val_bal_acc = np.mean(fold_val_bal_acc)
    avg_val_acc = np.mean(fold_val_acc)
    avg_val_sens = np.mean(fold_val_sens)
    avg_val_spec = np.mean(fold_val_spec)

    print(f"\nMean over folds for C={C}, gamma={gamma}:")
    print(f"  Train - BalAcc: {avg_train_bal_acc:.4f} Acc: {avg_train_acc:.4f} Sens: {avg_train_sens:.4f} Spec: {avg_train_spec:.4f}")
    print(f"  Val   - BalAcc: {avg_val_bal_acc:.4f} Acc: {avg_val_acc:.4f} Sens: {avg_val_sens:.4f} Spec: {avg_val_spec:.4f}\n")

    # --- Best tracking
    if avg_val_bal_acc > best_score:
        best_score = avg_val_bal_acc
        best_params = (C, gamma)
        best_metrics = {
            "Balanced Accuracy": avg_val_bal_acc,
            "Accuracy": avg_val_acc,
            "Sensitivity": avg_val_sens,
            "Specificity": avg_val_spec,
            "Train Balanced Accuracy": avg_train_bal_acc,
            "Train Accuracy": avg_train_acc,
            "Train Sensitivity": avg_train_sens,
            "Train Specificity": avg_train_spec,
        }

Running Grid Search over C and gamma with 5-Fold Stratified CV...



  1%|          | 1/105 [12:39<21:55:59, 759.22s/it]


Mean over folds for C=0.0009765625, gamma=0.0009765625:
  Train - BalAcc: 0.5000 Acc: 0.5833 Sens: 0.4000 Spec: 0.6000
  Val   - BalAcc: 0.5000 Acc: 0.5833 Sens: 0.4000 Spec: 0.6000



  2%|▏         | 2/105 [25:12<21:37:39, 755.92s/it]


Mean over folds for C=0.0009765625, gamma=0.001953125:
  Train - BalAcc: 0.5000 Acc: 0.5833 Sens: 0.4000 Spec: 0.6000
  Val   - BalAcc: 0.5000 Acc: 0.5833 Sens: 0.4000 Spec: 0.6000



  3%|▎         | 3/105 [37:49<21:25:26, 756.14s/it]


Mean over folds for C=0.0009765625, gamma=0.00390625:
  Train - BalAcc: 0.5000 Acc: 0.5833 Sens: 0.4000 Spec: 0.6000
  Val   - BalAcc: 0.5000 Acc: 0.5833 Sens: 0.4000 Spec: 0.6000



  4%|▍         | 4/105 [50:12<21:04:32, 751.21s/it]


Mean over folds for C=0.0009765625, gamma=0.0078125:
  Train - BalAcc: 0.5000 Acc: 0.5833 Sens: 0.4000 Spec: 0.6000
  Val   - BalAcc: 0.5000 Acc: 0.5833 Sens: 0.4000 Spec: 0.6000



  5%|▍         | 5/105 [1:02:36<20:47:38, 748.58s/it]


Mean over folds for C=0.0009765625, gamma=0.015625:
  Train - BalAcc: 0.7461 Acc: 0.5788 Sens: 0.9469 Spec: 0.5453
  Val   - BalAcc: 0.7440 Acc: 0.5770 Sens: 0.9445 Spec: 0.5435



  6%|▌         | 6/105 [1:13:33<19:43:30, 717.28s/it]


Mean over folds for C=0.0009765625, gamma=0.03125:
  Train - BalAcc: 0.7485 Acc: 0.5662 Sens: 0.9672 Spec: 0.5297
  Val   - BalAcc: 0.7480 Acc: 0.5660 Sens: 0.9666 Spec: 0.5295



  7%|▋         | 7/105 [1:23:09<18:15:58, 671.01s/it]


Mean over folds for C=0.0009765625, gamma=0.0625:
  Train - BalAcc: 0.7579 Acc: 0.5887 Sens: 0.9609 Spec: 0.5548
  Val   - BalAcc: 0.7578 Acc: 0.5886 Sens: 0.9610 Spec: 0.5547



  8%|▊         | 8/105 [1:32:04<16:54:57, 627.81s/it]


Mean over folds for C=0.0009765625, gamma=0.125:
  Train - BalAcc: 0.7778 Acc: 0.6342 Sens: 0.9502 Spec: 0.6055
  Val   - BalAcc: 0.7777 Acc: 0.6343 Sens: 0.9498 Spec: 0.6056



  9%|▊         | 9/105 [1:40:58<15:57:44, 598.59s/it]


Mean over folds for C=0.0009765625, gamma=0.25:
  Train - BalAcc: 0.7895 Acc: 0.6723 Sens: 0.9301 Spec: 0.6489
  Val   - BalAcc: 0.7898 Acc: 0.6721 Sens: 0.9310 Spec: 0.6486



In [None]:

print("\n✅ Best Grid Search Result:")
print(f"Best C: {best_params[0]}")
print(f"Best gamma: {best_params[1]}")
print(f"Best Validation Balanced Accuracy: {best_metrics['Balanced Accuracy']:.4f}")
print(f"Best Validation Accuracy:          {best_metrics['Accuracy']:.4f}")
print(f"Best Validation Sensitivity:       {best_metrics['Sensitivity']:.4f}")
print(f"Best Validation Specificity:       {best_metrics['Specificity']:.4f}")
print(f"Best Training Balanced Accuracy:   {best_metrics['Train Balanced Accuracy']:.4f}")
print(f"Best Training Accuracy:            {best_metrics['Train Accuracy']:.4f}")
print(f"Best Training Sensitivity:         {best_metrics['Train Sensitivity']:.4f}")
print(f"Best Training Specificity:         {best_metrics['Train Specificity']:.4f}")

##### Test

In [None]:
# 取得最佳參數
best_C = best_params[0]
best_gamma = best_params[1]

final_model = SVC(
    C=best_C,
    gamma=best_gamma,
    kernel='rbf',
    probability=False,
    class_weight='balanced',
    random_state=seed,
    cache_size=1000
)

final_model.fit(x, y)


# 預測
y_test_pred = final_model.predict(X_test.values)                  # 分類標籤
y_test_score = final_model.decision_function(X_test.values)
# 混淆矩陣
from sklearn.metrics import confusion_matrix, roc_auc_score

cm = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm.ravel()

# 指標計算
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
accuracy = (tp + tn) / (tp + tn + fp + fn)
balanced_acc = (sensitivity + specificity) / 2
auc = roc_auc_score(y_test, y_test_score)
f1 = f1_score(y_test, y_test_pred, zero_division=0)

# 轉成 DataFrame (Wikipedia 標準)
confusion_df = pd.DataFrame(
    data=[[tp, fn], [fp, tn]],
    index=["Actual Positive", "Actual Negative"],
    columns=["Predicted Positive", "Predicted Negative"]
)

print("\n✅ Confusion Matrix (DataFrame, Wikipedia format):")
print(confusion_df)

# 顯示結果
print(f"Sensitivity (Recall, TPR): {sensitivity:.4f}")
print(f"Specificity (TNR):         {specificity:.4f}")
print(f"Accuracy:                  {accuracy:.4f}")
print(f"Balanced Accuracy:         {balanced_acc:.4f}")
print(f"AUC:                       {auc:.4f}")
print(f"F1 Score:         {f1:.4f}")

TypeError: 'NoneType' object is not subscriptable

In [None]:
# 定義超參數範圍
C_list = [2 ** i for i in range(-5, 0)]
gamma_list = [2 ** i for i in range(-10, 11)]
param_grid = list(product(C_list, gamma_list))

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

best_score = 0
best_params = None
best_metrics = None


print("Running Grid Search over C and gamma with 5-Fold Stratified CV...\n")

for C, gamma in tqdm(param_grid):
    fold_train_bal_acc = []
    fold_train_acc = []
    fold_train_sens = []
    fold_train_spec = []

    fold_val_bal_acc = []
    fold_val_acc = []
    fold_val_sens = []
    fold_val_spec = []

    for fold, (train_idx, val_idx) in enumerate(cv.split(x, y), 1):
        x_train_fold, y_train_fold = x[train_idx], y[train_idx]
        x_val_fold, y_val_fold = x[val_idx], y[val_idx]

        clf = SVC(
            C=C,
            gamma=gamma,
            kernel='rbf',
            class_weight='balanced',
            probability=False,
            random_state=seed,
            cache_size=1000
        )
        clf.fit(x_train_fold, y_train_fold)

        # --- Train
        y_train_pred = clf.predict(x_train_fold)
        tn, fp, fn, tp = confusion_matrix(y_train_fold, y_train_pred).ravel()
        train_sens = tp / (tp + fn) if (tp + fn) > 0 else 0
        train_spec = tn / (tn + fp) if (tn + fp) > 0 else 0
        train_acc = (tp + tn) / (tp + tn + fp + fn)
        train_bal_acc = (train_sens + train_spec) / 2

        fold_train_bal_acc.append(train_bal_acc)
        fold_train_acc.append(train_acc)
        fold_train_sens.append(train_sens)
        fold_train_spec.append(train_spec)

        # --- Validation
        y_val_pred = clf.predict(x_val_fold)
        tn, fp, fn, tp = confusion_matrix(y_val_fold, y_val_pred).ravel()
        val_sens = tp / (tp + fn) if (tp + fn) > 0 else 0
        val_spec = tn / (tn + fp) if (tn + fp) > 0 else 0
        val_acc = (tp + tn) / (tp + tn + fp + fn)
        val_bal_acc = (val_sens + val_spec) / 2

        fold_val_bal_acc.append(val_bal_acc)
        fold_val_acc.append(val_acc)
        fold_val_sens.append(val_sens)
        fold_val_spec.append(val_spec)

    # === Mean over folds
    avg_train_bal_acc = np.mean(fold_train_bal_acc)
    avg_train_acc = np.mean(fold_train_acc)
    avg_train_sens = np.mean(fold_train_sens)
    avg_train_spec = np.mean(fold_train_spec)

    avg_val_bal_acc = np.mean(fold_val_bal_acc)
    avg_val_acc = np.mean(fold_val_acc)
    avg_val_sens = np.mean(fold_val_sens)
    avg_val_spec = np.mean(fold_val_spec)

    print(f"\nMean over folds for C={C}, gamma={gamma}:")
    print(f"  Train - BalAcc: {avg_train_bal_acc:.4f} Acc: {avg_train_acc:.4f} Sens: {avg_train_sens:.4f} Spec: {avg_train_spec:.4f}")
    print(f"  Val   - BalAcc: {avg_val_bal_acc:.4f} Acc: {avg_val_acc:.4f} Sens: {avg_val_sens:.4f} Spec: {avg_val_spec:.4f}\n")

    # --- Best tracking
    if avg_val_bal_acc > best_score:
        best_score = avg_val_bal_acc
        best_params = (C, gamma)
        best_metrics = {
            "Balanced Accuracy": avg_val_bal_acc,
            "Accuracy": avg_val_acc,
            "Sensitivity": avg_val_sens,
            "Specificity": avg_val_spec,
            "Train Balanced Accuracy": avg_train_bal_acc,
            "Train Accuracy": avg_train_acc,
            "Train Sensitivity": avg_train_sens,
            "Train Specificity": avg_train_spec,
        }

In [None]:

print("\n✅ Best Grid Search Result:")
print(f"Best C: {best_params[0]}")
print(f"Best gamma: {best_params[1]}")
print(f"Best Validation Balanced Accuracy: {best_metrics['Balanced Accuracy']:.4f}")
print(f"Best Validation Accuracy:          {best_metrics['Accuracy']:.4f}")
print(f"Best Validation Sensitivity:       {best_metrics['Sensitivity']:.4f}")
print(f"Best Validation Specificity:       {best_metrics['Specificity']:.4f}")
print(f"Best Training Balanced Accuracy:   {best_metrics['Train Balanced Accuracy']:.4f}")
print(f"Best Training Accuracy:            {best_metrics['Train Accuracy']:.4f}")
print(f"Best Training Sensitivity:         {best_metrics['Train Sensitivity']:.4f}")
print(f"Best Training Specificity:         {best_metrics['Train Specificity']:.4f}")

In [None]:
# 取得最佳參數
best_C = best_params[0]
best_gamma = best_params[1]

final_model = SVC(
    C=best_C,
    gamma=best_gamma,
    kernel='rbf',
    probability=False,
    class_weight='balanced',
    random_state=seed,
    cache_size=1000
)

final_model.fit(x, y)


# 預測
y_test_pred = final_model.predict(X_test.values)                  # 分類標籤
y_test_score = final_model.decision_function(X_test.values)
# 混淆矩陣
from sklearn.metrics import confusion_matrix, roc_auc_score

cm = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm.ravel()

# 指標計算
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
accuracy = (tp + tn) / (tp + tn + fp + fn)
balanced_acc = (sensitivity + specificity) / 2
auc = roc_auc_score(y_test, y_test_score)
f1 = f1_score(y_test, y_test_pred, zero_division=0)

# 轉成 DataFrame (Wikipedia 標準)
confusion_df = pd.DataFrame(
    data=[[tp, fn], [fp, tn]],
    index=["Actual Positive", "Actual Negative"],
    columns=["Predicted Positive", "Predicted Negative"]
)

print("\n✅ Confusion Matrix (DataFrame, Wikipedia format):")
print(confusion_df)

# 顯示結果
print(f"Sensitivity (Recall, TPR): {sensitivity:.4f}")
print(f"Specificity (TNR):         {specificity:.4f}")
print(f"Accuracy:                  {accuracy:.4f}")
print(f"Balanced Accuracy:         {balanced_acc:.4f}")
print(f"AUC:                       {auc:.4f}")
print(f"F1 Score:         {f1:.4f}")

#### XGboost


In [None]:
from collections import Counter

# 先計算 scale_pos_weight
counter = Counter(y)
print(counter)

n_pos = counter[1]
n_neg = counter[0]
scale_pos_weight = n_neg / n_pos
print(f"scale_pos_weight = {scale_pos_weight:.4f}")

Counter({np.int64(0): 7149, np.int64(1): 698})
scale_pos_weight = 10.2421


In [None]:
# 參數網格
param_grid = {
    "colsample_bytree": [0.3, 0.5, 0.7, 1.0],
    "gamma": [0, 0.1, 0.5, 1, 5],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "max_depth": [3, 5, 7, 10],
    "n_estimators": [200, 400, 600, 800, 1000],
    "subsample": [0.3, 0.5, 0.7]
}


param_list = list(ParameterGrid(param_grid))

print(f"Total combinations: {len(param_list)}")
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

best_score = 0
best_params = None
best_metrics = None

print("Running Grid Search with 5-Fold CV (train/val metrics)...\n")

for i, param in tqdm(enumerate(param_list)):
    fold_train_bal_acc = []
    fold_val_bal_acc = []

    fold_train_acc = []
    fold_val_acc = []

    fold_train_sens = []
    fold_val_sens = []

    fold_train_spec = []
    fold_val_spec = []

    for fold_idx, (train_idx, val_idx) in enumerate(cv.split(x, y)):
        x_train_fold, y_train_fold = x[train_idx], y[train_idx]
        x_val_fold, y_val_fold = x[val_idx], y[val_idx]

        clf = xgb.XGBClassifier(
            **param,
            eval_metric="logloss",
            tree_method="hist",
            device="cuda",
            scale_pos_weight=scale_pos_weight,
            random_state=seed
        )
        clf.fit(x_train_fold, y_train_fold)

        # 預測
        y_train_pred = clf.predict(x_train_fold)
        y_val_pred = clf.predict(x_val_fold)

        # 計算指標
        train_bal_acc, train_acc, train_sens, train_spec = compute_metrics(y_train_fold, y_train_pred)
        val_bal_acc, val_acc, val_sens, val_spec = compute_metrics(y_val_fold, y_val_pred)

        # 累積
        fold_train_bal_acc.append(train_bal_acc)
        fold_val_bal_acc.append(val_bal_acc)

        fold_train_acc.append(train_acc)
        fold_val_acc.append(val_acc)

        fold_train_sens.append(train_sens)
        fold_val_sens.append(val_sens)

        fold_train_spec.append(train_spec)
        fold_val_spec.append(val_spec)

    # 平均
    avg_train_bal_acc = np.mean(fold_train_bal_acc)
    avg_val_bal_acc = np.mean(fold_val_bal_acc)

    avg_train_acc = np.mean(fold_train_acc)
    avg_val_acc = np.mean(fold_val_acc)

    avg_train_sens = np.mean(fold_train_sens)
    avg_val_sens = np.mean(fold_val_sens)

    avg_train_spec = np.mean(fold_train_spec)
    avg_val_spec = np.mean(fold_val_spec)

    if avg_val_bal_acc > best_score:
        best_score = avg_val_bal_acc
        best_params = param
        best_metrics = {
            "Balanced Accuracy": avg_val_bal_acc,
            "Accuracy": avg_val_acc,
            "Sensitivity": avg_val_sens,
            "Specificity": avg_val_spec,
            "Train Balanced Accuracy": avg_train_bal_acc,
            "Train Accuracy": avg_train_acc,
            "Train Sensitivity": avg_train_sens,
            "Train Specificity": avg_train_spec
        }

In [None]:
print("Best Params:", best_params)
print(f"Best Balanced Accuracy: {best_score:.4f}")
print(f"Best Accuracy:           {best_metrics['Accuracy']:.4f}")
print(f"Best Sensitivity:        {best_metrics['Sensitivity']:.4f}")
print(f"Best Specificity:        {best_metrics['Specificity']:.4f}")
print(f"Best Training Balanced Accuracy:   {best_metrics['Train Balanced Accuracy']:.4f}")
print(f"Best Training Accuracy:            {best_metrics['Train Accuracy']:.4f}")
print(f"Best Training Sensitivity:         {best_metrics['Train Sensitivity']:.4f}")
print(f"Best Training Specificity:         {best_metrics['Train Specificity']:.4f}")

##### Test

In [None]:
print("\nTraining final model on full training data with best parameters...")
final_model = xgb.XGBClassifier(
    **best_params,
    eval_metric="logloss",
    tree_method="hist",
    device="cuda",
    scale_pos_weight=scale_pos_weight,
    random_state=seed
)
final_model.fit(x, y)


# 預測 Test Set
y_test_proba = final_model.predict_proba(X_test.values)[:, 1]
y_test_pred = (y_test_proba >= 0.5).astype(int)

# 混淆矩陣
from sklearn.metrics import confusion_matrix, roc_auc_score

cm = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm.ravel()

# 指標計算
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
accuracy = (tp + tn) / (tp + tn + fp + fn)
balanced_acc = (sensitivity + specificity) / 2
auc = roc_auc_score(y_test, y_test_proba)
f1 = f1_score(y_test, y_test_pred, zero_division=0)

# 轉成 DataFrame (Wikipedia 標準)
confusion_df = pd.DataFrame(
    data=[[tp, fn], [fp, tn]],
    index=["Actual Positive", "Actual Negative"],
    columns=["Predicted Positive", "Predicted Negative"]
)

print("\n✅ Confusion Matrix (DataFrame, Wikipedia format):")
print(confusion_df)

# 顯示結果
print(f"Sensitivity (Recall, TPR): {sensitivity:.4f}")
print(f"Specificity (TNR):         {specificity:.4f}")
print(f"Accuracy:                  {accuracy:.4f}")
print(f"Balanced Accuracy:         {balanced_acc:.4f}")
print(f"AUC:                       {auc:.4f}")
print(f"F1 Score:                  {f1:.4f}")

#### Adaboost(balanced-weight)

In [None]:

# === 超參數範圍 (三層 Grid Search) ===
n_estimators_list = [50, 100, 150, 200, 250, 300, 350, 400]
learning_rate_list = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0]
base_estimators = [
    DecisionTreeClassifier(max_depth=1, class_weight="balanced"),
    DecisionTreeClassifier(max_depth=2, class_weight="balanced"),
    DecisionTreeClassifier(max_depth=3, class_weight="balanced"),
]
param_grid = list(product(n_estimators_list, learning_rate_list, base_estimators))

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

best_score = 0
best_params = None
best_metrics = None

print("Running Grid Search over n_estimators, learning_rate, and base_estimator with 5-Fold Stratified CV...\n")

for n_estimators, lr, base in tqdm(param_grid):
    fold_train_bal_acc, fold_val_bal_acc = [], []
    fold_train_acc, fold_val_acc = [], []
    fold_train_sens, fold_val_sens = [], []
    fold_train_spec, fold_val_spec = [], []

    for train_idx, val_idx in cv.split(x, y):
        x_train_fold, y_train_fold = x[train_idx], y[train_idx]
        x_val_fold, y_val_fold = x[val_idx], y[val_idx]

        # === AdaBoost 模型 ===
        clf = AdaBoostClassifier(
            estimator=base,
            n_estimators=n_estimators,
            learning_rate=lr,
            random_state=seed
        )
        clf.fit(x_train_fold, y_train_fold)

        # === 訓練集
        y_train_pred = clf.predict(x_train_fold)
        tn, fp, fn, tp = confusion_matrix(y_train_fold, y_train_pred).ravel()
        train_sens = tp / (tp + fn) if (tp + fn) > 0 else 0
        train_spec = tn / (tn + fp) if (tn + fp) > 0 else 0
        train_acc = (tp + tn) / (tp + tn + fp + fn)
        train_bal_acc = (train_sens + train_spec) / 2

        fold_train_bal_acc.append(train_bal_acc)
        fold_train_acc.append(train_acc)
        fold_train_sens.append(train_sens)
        fold_train_spec.append(train_spec)

        # === 驗證集
        y_val_pred = clf.predict(x_val_fold)
        tn, fp, fn, tp = confusion_matrix(y_val_fold, y_val_pred).ravel()
        val_sens = tp / (tp + fn) if (tp + fn) > 0 else 0
        val_spec = tn / (tn + fp) if (tn + fp) > 0 else 0
        val_acc = (tp + tn) / (tp + tn + fp + fn)
        val_bal_acc = (val_sens + val_spec) / 2

        fold_val_bal_acc.append(val_bal_acc)
        fold_val_acc.append(val_acc)
        fold_val_sens.append(val_sens)
        fold_val_spec.append(val_spec)

    # === 平均結果
    avg_train_bal_acc = np.mean(fold_train_bal_acc)
    avg_train_acc = np.mean(fold_train_acc)
    avg_train_sens = np.mean(fold_train_sens)
    avg_train_spec = np.mean(fold_train_spec)

    avg_val_bal_acc = np.mean(fold_val_bal_acc)
    avg_val_acc = np.mean(fold_val_acc)
    avg_val_sens = np.mean(fold_val_sens)
    avg_val_spec = np.mean(fold_val_spec)

    print(f"\nMean over folds for n_estimators={n_estimators}, lr={lr}, base={base}:")
    print(f"  Train - BalAcc: {avg_train_bal_acc:.4f} Acc: {avg_train_acc:.4f} Sens: {avg_train_sens:.4f} Spec: {avg_train_spec:.4f}")
    print(f"  Val   - BalAcc: {avg_val_bal_acc:.4f} Acc: {avg_val_acc:.4f} Sens: {avg_val_sens:.4f} Spec: {avg_val_spec:.4f}\n")

    # === 更新最佳結果
    if avg_val_bal_acc > best_score:
        best_score = avg_val_bal_acc
        best_params = (n_estimators, lr, base)
        best_metrics = {
            "Balanced Accuracy": avg_val_bal_acc,
            "Accuracy": avg_val_acc,
            "Sensitivity": avg_val_sens,
            "Specificity": avg_val_spec,
            "Train Balanced Accuracy": avg_train_bal_acc,
            "Train Accuracy": avg_train_acc,
            "Train Sensitivity": avg_train_sens,
            "Train Specificity": avg_train_spec,
        }

print("\n✅ Best Grid Search Result:")
print(f"Best n_estimators: {best_params[0]}")
print(f"Best learning_rate: {best_params[1]}")
print(f"Best base_estimator: {best_params[2]}")
print(f"Best Train Balanced Accuracy: {best_metrics['Train Balanced Accuracy']:.4f}")
print(f"Best Train Accuracy:          {best_metrics['Train Accuracy']:.4f}")
print(f"Best Train Sensitivity:       {best_metrics['Train Sensitivity']:.4f}")
print(f"Best Train Specificity:       {best_metrics['Train Specificity']:.4f}")
print(f"Best Validation Balanced Accuracy: {best_metrics['Balanced Accuracy']:.4f}")
print(f"Best Validation Accuracy:          {best_metrics['Accuracy']:.4f}")
print(f"Best Validation Sensitivity:       {best_metrics['Sensitivity']:.4f}")
print(f"Best Validation Specificity:       {best_metrics['Specificity']:.4f}")


Running Grid Search over n_estimators, learning_rate, and base_estimator with 5-Fold Stratified CV...



  0%|          | 0/168 [00:00<?, ?it/s]


Mean over folds for n_estimators=50, lr=0.001, base=DecisionTreeClassifier(class_weight='balanced', max_depth=1):
  Train - BalAcc: 0.7995 Acc: 0.8951 Sens: 0.6834 Spec: 0.9157
  Val   - BalAcc: 0.7987 Acc: 0.8949 Sens: 0.6818 Spec: 0.9157


Mean over folds for n_estimators=50, lr=0.001, base=DecisionTreeClassifier(class_weight='balanced', max_depth=2):
  Train - BalAcc: 0.8067 Acc: 0.8536 Sens: 0.7496 Spec: 0.8638
  Val   - BalAcc: 0.8046 Acc: 0.8537 Sens: 0.7449 Spec: 0.8643


Mean over folds for n_estimators=50, lr=0.001, base=DecisionTreeClassifier(class_weight='balanced', max_depth=3):
  Train - BalAcc: 0.8096 Acc: 0.8751 Sens: 0.7300 Spec: 0.8893
  Val   - BalAcc: 0.7971 Acc: 0.8708 Sens: 0.7075 Spec: 0.8867


Mean over folds for n_estimators=50, lr=0.005, base=DecisionTreeClassifier(class_weight='balanced', max_depth=1):
  Train - BalAcc: 0.7995 Acc: 0.8951 Sens: 0.6834 Spec: 0.9157
  Val   - BalAcc: 0.7987 Acc: 0.8949 Sens: 0.6818 Spec: 0.9157


Mean over folds for n_estimator

##### Test

In [None]:
# === 用最佳參數訓練 Final Model ===
best_n_estimators, best_lr, best_base = best_params
final_model = AdaBoostClassifier(
    estimator=best_base,
    n_estimators=best_n_estimators,
    learning_rate=best_lr,
    random_state=seed
)
final_model.fit(x, y)


y_test_pred = final_model.predict(X_test.values)
y_test_proba = final_model.predict_proba(X_test.values)[:, 1]

cm = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm.ravel()

sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
accuracy = (tp + tn) / (tp + tn + fp + fn)
balanced_acc = (sensitivity + specificity) / 2
auc = roc_auc_score(y_test, y_test_proba)
f1 = f1_score(y_test, y_test_pred, zero_division=0)

# 混淆矩陣 DataFrame (Wikipedia 格式)
confusion_df = pd.DataFrame(
    data=[[tp, fn], [fp, tn]],
    index=["Actual Positive", "Actual Negative"],
    columns=["Predicted Positive", "Predicted Negative"]
)

print("\n✅ Confusion Matrix (DataFrame, Wikipedia format):")
print(confusion_df)

print(f"Sensitivity (Recall, TPR): {sensitivity:.4f}")
print(f"Specificity (TNR):         {specificity:.4f}")
print(f"Accuracy:                  {accuracy:.4f}")
print(f"Balanced Accuracy:         {balanced_acc:.4f}")
print(f"AUC:                       {auc:.4f}")
print(f"F1 Score:                  {f1:.4f}")


✅ Confusion Matrix (DataFrame, Wikipedia format):
                 Predicted Positive  Predicted Negative
Actual Positive                 127                  48
Actual Negative                 246                1541
Sensitivity (Recall, TPR): 0.7257
Specificity (TNR):         0.8623
Accuracy:                  0.8502
Balanced Accuracy:         0.7940
AUC:                       0.7940
F1 Score:                  0.4635


#### Catboost(balanced-weight)

In [None]:
from collections import Counter

# 先計算 scale_pos_weight
counter = Counter(y)
print(counter)

n_pos = counter[1]
n_neg = counter[0]
scale_pos_weight = n_neg / n_pos
print(f"scale_pos_weight = {scale_pos_weight:.4f}")

Counter({np.int64(0): 7149, np.int64(1): 698})
scale_pos_weight = 10.2421


In [None]:
x = X_train
y = y_train

In [None]:
# === 超參數範圍 (Grid Search for CatBoost) ===
n_estimators_list = [100, 200, 300, 400, 500, 600, 700, 800]
learning_rate_list = [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]
max_depth_list = [3, 5, 7, 9]
l2_leaf_reg_list = [2**i for i in range(1, 7)]
param_grid = list(product(n_estimators_list, learning_rate_list, max_depth_list, l2_leaf_reg_list))
class_weights = {0: 1.0, 1: scale_pos_weight}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

best_score = 0
best_params = None
best_metrics = None

print("Running Grid Search over n_estimators, learning_rate, max_depth with 5-Fold Stratified CV...\n")

for n_estimators, lr, max_depth, l2_leaf_reg in tqdm(param_grid):
    fold_train_bal_acc, fold_val_bal_acc = [], []
    fold_train_acc, fold_val_acc = [], []
    fold_train_sens, fold_val_sens = [], []
    fold_train_spec, fold_val_spec = [], []

    for train_idx, val_idx in cv.split(x, y):
        x_train_fold, y_train_fold = x.iloc[train_idx], y.iloc[train_idx]
        x_val_fold, y_val_fold     = x.iloc[val_idx], y.iloc[val_idx]

        # === CatBoost 模型 ===
        clf = CatBoostClassifier(
        iterations=n_estimators,
        learning_rate=lr,
        depth=max_depth,
        l2_leaf_reg=l2_leaf_reg,
        loss_function="Logloss",
        eval_metric="BalancedAccuracy",  # CPU 模式支援 BalancedAccuracy
        class_weights=class_weights,
        random_seed=seed,
        verbose=0,
        task_type="CPU"   # 改成 CPU
        )

        clf.fit(x_train_fold, y_train_fold)
        # === 訓練集
        y_train_pred = clf.predict(x_train_fold)
        tn, fp, fn, tp = confusion_matrix(y_train_fold, y_train_pred).ravel()
        train_sens = tp / (tp + fn) if (tp + fn) > 0 else 0
        train_spec = tn / (tn + fp) if (tn + fp) > 0 else 0
        train_acc = (tp + tn) / (tp + tn + fp + fn)
        train_bal_acc = (train_sens + train_spec) / 2

        fold_train_bal_acc.append(train_bal_acc)
        fold_train_acc.append(train_acc)
        fold_train_sens.append(train_sens)
        fold_train_spec.append(train_spec)

        # === 驗證集
        y_val_pred = clf.predict(x_val_fold)
        tn, fp, fn, tp = confusion_matrix(y_val_fold, y_val_pred).ravel()
        val_sens = tp / (tp + fn) if (tp + fn) > 0 else 0
        val_spec = tn / (tn + fp) if (tn + fp) > 0 else 0
        val_acc = (tp + tn) / (tp + tn + fp + fn)
        val_bal_acc = (val_sens + val_spec) / 2

        fold_val_bal_acc.append(val_bal_acc)
        fold_val_acc.append(val_acc)
        fold_val_sens.append(val_sens)
        fold_val_spec.append(val_spec)

    # === 平均結果
    avg_train_bal_acc = np.mean(fold_train_bal_acc)
    avg_train_acc = np.mean(fold_train_acc)
    avg_train_sens = np.mean(fold_train_sens)
    avg_train_spec = np.mean(fold_train_spec)

    avg_val_bal_acc = np.mean(fold_val_bal_acc)
    avg_val_acc = np.mean(fold_val_acc)
    avg_val_sens = np.mean(fold_val_sens)
    avg_val_spec = np.mean(fold_val_spec)

    print(f"\nMean over folds for n_estimators={n_estimators}, lr={lr}, max_depth={max_depth}, l2_leaf_reg={l2_leaf_reg}:")
    print(f"  Train - BalAcc: {avg_train_bal_acc:.4f} Acc: {avg_train_acc:.4f} Sens: {avg_train_sens:.4f} Spec: {avg_train_spec:.4f}")
    print(f"  Val   - BalAcc: {avg_val_bal_acc:.4f} Acc: {avg_val_acc:.4f} Sens: {avg_val_sens:.4f} Spec: {avg_val_spec:.4f}\n")

    # === 更新最佳結果
    if avg_val_bal_acc > best_score:
        best_score = avg_val_bal_acc
        best_params =  (n_estimators, lr, max_depth, l2_leaf_reg)
        best_metrics = {
            "Balanced Accuracy": avg_val_bal_acc,
            "Accuracy": avg_val_acc,
            "Sensitivity": avg_val_sens,
            "Specificity": avg_val_spec,
            "Train Balanced Accuracy": avg_train_bal_acc,
            "Train Accuracy": avg_train_acc,
            "Train Sensitivity": avg_train_sens,
            "Train Specificity": avg_train_spec,
        }

print("\n✅ Best Grid Search Result:")
print("\n✅ Best Grid Search Result:")
print(f"Best n_estimators: {best_params[0]}")
print(f"Best learning_rate: {best_params[1]}")
print(f"Best max_depth:     {best_params[2]}")
print(f"Best l2_leaf_reg:   {best_params[3]}")
print(f"Best Train Balanced Accuracy: {best_metrics['Train Balanced Accuracy']:.4f}")
print(f"Best Train Accuracy:          {best_metrics['Train Accuracy']:.4f}")
print(f"Best Train Sensitivity:       {best_metrics['Train Sensitivity']:.4f}")
print(f"Best Train Specificity:       {best_metrics['Train Specificity']:.4f}")
print(f"Best Validation Balanced Accuracy: {best_metrics['Balanced Accuracy']:.4f}")
print(f"Best Validation Accuracy:          {best_metrics['Accuracy']:.4f}")
print(f"Best Validation Sensitivity:       {best_metrics['Sensitivity']:.4f}")
print(f"Best Validation Specificity:       {best_metrics['Specificity']:.4f}")


Running Grid Search over n_estimators, learning_rate, max_depth with 5-Fold Stratified CV...



  0%|          | 0/1152 [00:00<?, ?it/s]

[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m
Mean over folds for n_estimators=200, lr=0.001, max_depth=5, l2_leaf_reg=64:
  Train - BalAcc: 0.7989 Acc: 0.8951 Sens: 0.6819 Spec: 0.9159
  Val   - BalAcc: 0.7989 Acc: 0.8951 Sens: 0.6818 Spec: 0.9159


Mean over folds for n_estimators=200, lr=0.001, max_depth=7, l2_leaf_reg=2:
  Train - BalAcc: 0.8050 Acc: 0.8962 Sens: 0.6941 Spec: 0.9159
  Val   - BalAcc: 0.7988 Acc: 0.8950 Sens: 0.6818 Spec: 0.9158


Mean over folds for n_estimators=200, lr=0.001, max_depth=7, l2_leaf_reg=4:
  Train - BalAcc: 0.8038 Acc: 0.8960 Sens: 0.6916 Spec: 0.9159
  Val   - BalAcc: 0.7987 Acc: 0.8949 Sens: 0.6818 Spec: 0.9157


Mean over folds for n_estimators=200, lr=0.001, max_depth=7, l2_leaf_reg=8:
  Train - BalAcc: 0.8029 Acc: 0.8958 Sens: 0.6898 Spec: 0.9159
  Val   - BalAcc: 0.7988 Acc: 0.8950 Sens: 0.6818 Spec: 0.9158


Mean over folds for n_estimators=200, lr=0.001, max_depth=7, l2_leaf_reg=16:
  Train - BalAcc: 0.8024 Acc: 0.8956 Sens: 0.6891 Spec: 0.9158
  Val   

##### Test

In [None]:
# === 測試集評估 ===
best_n_estimators, best_lr, best_max_depth, best_l2 = best_params
final_model = CatBoostClassifier(
    iterations=best_n_estimators,
    learning_rate=best_lr,
    depth=best_max_depth,
    l2_leaf_reg=best_l2,   #  改成最佳的 l2
    loss_function="Logloss",
    class_weights=class_weights,
    random_seed=seed,
    verbose=0,
    task_type="CPU"
)
final_model.fit(x, y)


y_test_pred = final_model.predict(X_test)
y_test_proba = final_model.predict_proba(X_test)[:, 1]

cm = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm.ravel()

sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
accuracy = (tp + tn) / (tp + tn + fp + fn)
balanced_acc = (sensitivity + specificity) / 2
auc = roc_auc_score(y_test, y_test_proba)
f1 = f1_score(y_test, y_test_pred, zero_division=0)

confusion_df = pd.DataFrame(
    data=[[tp, fn], [fp, tn]],
    index=["Actual Positive", "Actual Negative"],
    columns=["Predicted Positive", "Predicted Negative"]
)

print("\n✅ Confusion Matrix (DataFrame, Wikipedia format):")
print(confusion_df)

print(f"Sensitivity (Recall, TPR): {sensitivity:.4f}")
print(f"Specificity (TNR):         {specificity:.4f}")
print(f"Accuracy:                  {accuracy:.4f}")
print(f"Balanced Accuracy:         {balanced_acc:.4f}")
print(f"AUC:                       {auc:.4f}")
print(f"F1 Score:                  {f1:.4f}")


✅ Confusion Matrix (DataFrame, Wikipedia format):
                 Predicted Positive  Predicted Negative
Actual Positive                 119                  56
Actual Negative                 180                1607
Sensitivity (Recall, TPR): 0.6800
Specificity (TNR):         0.8993
Accuracy:                  0.8797
Balanced Accuracy:         0.7896
AUC:                       0.8662
F1 Score:                  0.5021


## Ensemble soft=>equal weight

In [None]:
from collections import Counter

# 先計算 scale_pos_weight
counter = Counter(y)
print(counter)

n_pos = counter[1]
n_neg = counter[0]
scale_pos_weight = n_neg / n_pos
print(f"scale_pos_weight = {scale_pos_weight:.4f}")

Counter({np.int64(0): 7149, np.int64(1): 698})
scale_pos_weight = 10.2421


In [None]:
rf_best_params =  {'max_depth': 10, 'max_features': 'log2', 'max_leaf_nodes': 40, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
rf_clf = RandomForestClassifier(
            **rf_best_params,
            random_state=seed,
            n_jobs=-1,
            class_weight='balanced'
)


lr_best_params = 512
lr_clf = LogisticRegression(
            C=lr_best_params,
            penalty='l2',
            max_iter=1000,
            solver='lbfgs',
            class_weight='balanced',
            random_state=seed
        )


svc_best_c =  0.0078125
svc_best_gamma = 0.25
svc_clf = SVC(
       C=svc_best_c,
       gamma=svc_best_gamma,
       kernel='rbf',
       class_weight='balanced',
       probability=True,
       random_state=seed
      )

xgb_best_params = {'colsample_bytree': 0.8831528055561403, 'gamma': 0.021020125361070625, 'learning_rate': 0.03754890989284685, 'max_depth': 3, 'n_estimators': 181, 'subsample': 0.8246355080259108}
xgb_clf = xgb.XGBClassifier(
            **xgb_best_params,
            eval_metric="logloss",
            tree_method="hist",
            scale_pos_weight=scale_pos_weight,
            random_state=seed
        )



In [None]:
from sklearn.ensemble import VotingClassifier
from itertools import combinations
# 定義 base 模型
models = {
    'rf': rf_clf,
    'lr': lr_clf,
    'svc': svc_clf,
    'xgb': xgb_clf,
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
results = []

for r in range(2, len(models) + 1):  # 組合長度 2~4
    for subset in combinations(models.items(), r):
        clf_names, clf_list = zip(*subset)

        eclf = VotingClassifier(
            estimators=list(subset),
            voting='soft',
            n_jobs=-1
        )

        train_bal_accs, train_accs, train_sens, train_spec = [], [], [], []
        val_bal_accs, val_accs, val_sens, val_spec = [], [], [], []

        for train_idx, val_idx in cv.split(x, y):
            X_train_fold, y_train_fold = x[train_idx], y[train_idx]
            X_val_fold, y_val_fold = x[val_idx], y[val_idx]

            eclf.fit(X_train_fold, y_train_fold)

            # 預測
            y_train_pred = eclf.predict(X_train_fold)
            y_val_pred = eclf.predict(X_val_fold)

            # === Train metrics ===
            train_bal_accs.append(balanced_accuracy_score(y_train_fold, y_train_pred))
            train_accs.append(accuracy_score(y_train_fold, y_train_pred))
            train_sens.append(recall_score(y_train_fold, y_train_pred, pos_label=1))
            train_spec.append(recall_score(y_train_fold, y_train_pred, pos_label=0))

            # === Val metrics ===
            val_bal_accs.append(balanced_accuracy_score(y_val_fold, y_val_pred))
            val_accs.append(accuracy_score(y_val_fold, y_val_pred))
            val_sens.append(recall_score(y_val_fold, y_val_pred, pos_label=1))
            val_spec.append(recall_score(y_val_fold, y_val_pred, pos_label=0))

        # 平均結果
        result = {
            'models': clf_names,
            'train_bal_acc': np.mean(train_bal_accs),
            'train_acc': np.mean(train_accs),
            'train_sens': np.mean(train_sens),
            'train_spec': np.mean(train_spec),
            'val_bal_acc': np.mean(val_bal_accs),
            'val_acc': np.mean(val_accs),
            'val_sens': np.mean(val_sens),
            'val_spec': np.mean(val_spec),
        }

        results.append(result)

        # 輸出格式化結果
        print(f"Models: {clf_names}")
        print(f" Train - Balanced Acc: {result['train_bal_acc']:.4f} | Acc: {result['train_acc']:.4f} | Sens: {result['train_sens']:.4f} | Spec: {result['train_spec']:.4f}")
        print(f" Val - Balanced Acc: {result['val_bal_acc']:.4f} | Acc: {result['val_acc']:.4f} | Sens: {result['val_sens']:.4f} | Spec: {result['val_spec']:.4f}")
        print("")

# 選出最佳組合（以 val_bal_acc 為主）
best_result = max(results, key=lambda r: r['val_bal_acc'])

print("=== Best Model Combination ===")
print(f"Models: {best_result['models']}")
print(f" Train - Balanced Acc: {best_result['train_bal_acc']:.4f} | Acc: {best_result['train_acc']:.4f} | Sens: {best_result['train_sens']:.4f} | Spec: {best_result['train_spec']:.4f}")
print(f"   Val - Balanced Acc: {best_result['val_bal_acc']:.4f} | Acc: {best_result['val_acc']:.4f} | Sens: {best_result['val_sens']:.4f} | Spec: {best_result['val_spec']:.4f}")


Models: ('rf', 'lr')
 Train - Balanced Acc: 0.8135 | Acc: 0.8952 | Sens: 0.7142 | Spec: 0.9129
 Val - Balanced Acc: 0.8007 | Acc: 0.8913 | Sens: 0.6904 | Spec: 0.9109

Models: ('rf', 'svc')
 Train - Balanced Acc: 0.8037 | Acc: 0.9043 | Sens: 0.6812 | Spec: 0.9261
 Val - Balanced Acc: 0.7937 | Acc: 0.8997 | Sens: 0.6647 | Spec: 0.9226

Models: ('rf', 'xgb')
 Train - Balanced Acc: 0.8289 | Acc: 0.8976 | Sens: 0.7453 | Spec: 0.9124
 Val - Balanced Acc: 0.8011 | Acc: 0.8898 | Sens: 0.6933 | Spec: 0.9089

Models: ('lr', 'svc')
 Train - Balanced Acc: 0.7974 | Acc: 0.9035 | Sens: 0.6683 | Spec: 0.9265
 Val - Balanced Acc: 0.7915 | Acc: 0.9016 | Sens: 0.6575 | Spec: 0.9254

Models: ('lr', 'xgb')
 Train - Balanced Acc: 0.8206 | Acc: 0.8928 | Sens: 0.7328 | Spec: 0.9084
 Val - Balanced Acc: 0.8000 | Acc: 0.8866 | Sens: 0.6947 | Spec: 0.9053

Models: ('svc', 'xgb')
 Train - Balanced Acc: 0.8059 | Acc: 0.9138 | Sens: 0.6748 | Spec: 0.9371
 Val - Balanced Acc: 0.7862 | Acc: 0.9061 | Sens: 0.6403 | 

##### Test

In [None]:
# 取得最佳參數
best_estimators = [(name, models[name]) for name in best_result['models']]
final_model = VotingClassifier(
            estimators=best_estimators,
            voting='soft',
            n_jobs=-1
        )


final_model.fit(x, y)


# 預測
y_test_pred = final_model.predict(X_test.values)                  # 分類標籤

# 混淆矩陣
from sklearn.metrics import confusion_matrix, roc_auc_score

cm = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm.ravel()

# 指標計算
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
accuracy = (tp + tn) / (tp + tn + fp + fn)
balanced_acc = (sensitivity + specificity) / 2
f1 = f1_score(y_test, y_test_pred, zero_division=0)

# 轉成 DataFrame (Wikipedia 標準)
confusion_df = pd.DataFrame(
    data=[[tp, fn], [fp, tn]],
    index=["Actual Positive", "Actual Negative"],
    columns=["Predicted Positive", "Predicted Negative"]
)

print("\n✅ Confusion Matrix (DataFrame, Wikipedia format):")
print(confusion_df)

# 顯示結果
print(f"Sensitivity (Recall, TPR): {sensitivity:.4f}")
print(f"Specificity (TNR):         {specificity:.4f}")
print(f"Accuracy:                  {accuracy:.4f}")
print(f"Balanced Accuracy:         {balanced_acc:.4f}")
print(f"F1 Score:         {f1:.4f}")


✅ Confusion Matrix (DataFrame, Wikipedia format):
                 Predicted Positive  Predicted Negative
Actual Positive                 119                  56
Actual Negative                 154                1633
Sensitivity (Recall, TPR): 0.6800
Specificity (TNR):         0.9138
Accuracy:                  0.8930
Balanced Accuracy:         0.7969
F1 Score:         0.5312


## Ensemble soft=>adjust weight

In [None]:
from collections import Counter

# 先計算 scale_pos_weight
counter = Counter(y)
print(counter)

n_pos = counter[1]
n_neg = counter[0]
scale_pos_weight = n_neg / n_pos
print(f"scale_pos_weight = {scale_pos_weight:.4f}")

Counter({np.int64(0): 7149, np.int64(1): 698})
scale_pos_weight = 10.2421


In [None]:
rf_best_params =  {'max_depth': 10, 'max_features': 'log2', 'max_leaf_nodes': 40, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
rf_clf = RandomForestClassifier(
            **rf_best_params,
            random_state=seed,
            n_jobs=-1,
            class_weight='balanced'
)


lr_best_params = 512
lr_clf = LogisticRegression(
            C=lr_best_params,
            penalty='l2',
            max_iter=1000,
            solver='lbfgs',
            class_weight='balanced',
            random_state=seed
        )


svc_best_c =  0.0078125
svc_best_gamma = 0.25
svc_clf = SVC(
       C=svc_best_c,
       gamma=svc_best_gamma,
       kernel='rbf',
       class_weight='balanced',
       probability=True,
       random_state=seed
      )

xgb_best_params = {'colsample_bytree': 0.8831528055561403, 'gamma': 0.021020125361070625, 'learning_rate': 0.03754890989284685, 'max_depth': 3, 'n_estimators': 181, 'subsample': 0.8246355080259108}
xgb_clf = xgb.XGBClassifier(
            **xgb_best_params,
            eval_metric="logloss",
            tree_method="hist",
            scale_pos_weight=scale_pos_weight,
            random_state=seed
        )



In [None]:
from sklearn.ensemble import VotingClassifier
from itertools import combinations
# 定義 base 模型
models = {
    'rf': rf_clf,
    'lr': lr_clf,
    'svc': svc_clf,
    'xgb': xgb_clf,
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
results = []

# 所有模型子集的長度（2~4）
for r in range(2, len(models) + 1):
    for subset in combinations(models.items(), r):
        clf_names, clf_list = zip(*subset)
        name_model_pairs = list(subset)

        # 權重搜尋範圍，例如：1 ~ 3（可以調整）
        weight_options = [1, 2, 3]

        # 枚舉所有權重排列（長度要等於模型個數）
        for weights in product(weight_options, repeat=r):
            eclf = VotingClassifier(
                estimators=name_model_pairs,
                voting='soft',
                weights=weights,
                n_jobs=-1
            )

            train_bal_accs, train_accs, train_sens, train_spec = [], [], [], []
            val_bal_accs, val_accs, val_sens, val_spec = [], [], [], []

            for train_idx, val_idx in cv.split(x, y):
                X_train_fold, y_train_fold = x[train_idx], y[train_idx]
                X_val_fold, y_val_fold = x[val_idx], y[val_idx]

                eclf.fit(X_train_fold, y_train_fold)

                y_train_pred = eclf.predict(X_train_fold)
                y_val_pred = eclf.predict(X_val_fold)

                train_bal_accs.append(balanced_accuracy_score(y_train_fold, y_train_pred))
                train_accs.append(accuracy_score(y_train_fold, y_train_pred))
                train_sens.append(recall_score(y_train_fold, y_train_pred, pos_label=1))
                train_spec.append(recall_score(y_train_fold, y_train_pred, pos_label=0))

                val_bal_accs.append(balanced_accuracy_score(y_val_fold, y_val_pred))
                val_accs.append(accuracy_score(y_val_fold, y_val_pred))
                val_sens.append(recall_score(y_val_fold, y_val_pred, pos_label=1))
                val_spec.append(recall_score(y_val_fold, y_val_pred, pos_label=0))

            result = {
                'models': clf_names,
                'weights': weights,  # 加入權重
                'train_bal_acc': np.mean(train_bal_accs),
                'train_acc': np.mean(train_accs),
                'train_sens': np.mean(train_sens),
                'train_spec': np.mean(train_spec),
                'val_bal_acc': np.mean(val_bal_accs),
                'val_acc': np.mean(val_accs),
                'val_sens': np.mean(val_sens),
                'val_spec': np.mean(val_spec),
            }

            results.append(result)

            print(f"Models: {clf_names} | Weights: {weights}")
            print(f" Train - Balanced Acc: {result['train_bal_acc']:.4f} | Acc: {result['train_acc']:.4f} | Sens: {result['train_sens']:.4f} | Spec: {result['train_spec']:.4f}")
            print(f" Val - Balanced Acc: {result['val_bal_acc']:.4f} | Acc: {result['val_acc']:.4f} | Sens: {result['val_sens']:.4f} | Spec: {result['val_spec']:.4f}")
            print("")


best_result = max(results, key=lambda r: r['val_bal_acc'])

print("=== Best Model Combination ===")
print(f"Models: {best_result['models']}")
print(f"Weights: {best_result['weights']}")
print(f" Train - Balanced Acc: {best_result['train_bal_acc']:.4f} | Acc: {best_result['train_acc']:.4f} | Sens: {best_result['train_sens']:.4f} | Spec: {best_result['train_spec']:.4f}")
print(f"   Val - Balanced Acc: {best_result['val_bal_acc']:.4f} | Acc: {best_result['val_acc']:.4f} | Sens: {best_result['val_sens']:.4f} | Spec: {best_result['val_spec']:.4f}")


Models: ('rf', 'lr') | Weights: (1, 1)
 Train - Balanced Acc: 0.8135 | Acc: 0.8952 | Sens: 0.7142 | Spec: 0.9129
 Val - Balanced Acc: 0.8007 | Acc: 0.8913 | Sens: 0.6904 | Spec: 0.9109

Models: ('rf', 'lr') | Weights: (1, 2)
 Train - Balanced Acc: 0.8135 | Acc: 0.8934 | Sens: 0.7163 | Spec: 0.9107
 Val - Balanced Acc: 0.8017 | Acc: 0.8896 | Sens: 0.6947 | Spec: 0.9087

Models: ('rf', 'lr') | Weights: (1, 3)
 Train - Balanced Acc: 0.8127 | Acc: 0.8917 | Sens: 0.7167 | Spec: 0.9088
 Val - Balanced Acc: 0.8019 | Acc: 0.8877 | Sens: 0.6976 | Spec: 0.9063

Models: ('rf', 'lr') | Weights: (2, 1)
 Train - Balanced Acc: 0.8131 | Acc: 0.8955 | Sens: 0.7127 | Spec: 0.9134
 Val - Balanced Acc: 0.8019 | Acc: 0.8924 | Sens: 0.6919 | Spec: 0.9120

Models: ('rf', 'lr') | Weights: (2, 2)
 Train - Balanced Acc: 0.8135 | Acc: 0.8952 | Sens: 0.7142 | Spec: 0.9129
 Val - Balanced Acc: 0.8007 | Acc: 0.8913 | Sens: 0.6904 | Spec: 0.9109

Models: ('rf', 'lr') | Weights: (2, 3)
 Train - Balanced Acc: 0.8138 |

##### Test

In [None]:
# 取得最佳參數
best_estimators = [(name, models[name]) for name in best_result['models']]

final_model = VotingClassifier(
    estimators=best_estimators,
    voting='soft',
    weights=best_result['weights'],  # 加上這個
    n_jobs=-1
)
final_model.fit(x, y)


# 預測
y_test_pred = final_model.predict(X_test.values)                  # 分類標籤

# 混淆矩陣
from sklearn.metrics import confusion_matrix, roc_auc_score

cm = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm.ravel()

# 指標計算
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
accuracy = (tp + tn) / (tp + tn + fp + fn)
balanced_acc = (sensitivity + specificity) / 2
f1 = f1_score(y_test, y_test_pred, zero_division=0)

# 轉成 DataFrame (Wikipedia 標準)
confusion_df = pd.DataFrame(
    data=[[tp, fn], [fp, tn]],
    index=["Actual Positive", "Actual Negative"],
    columns=["Predicted Positive", "Predicted Negative"]
)

print("\n✅ Confusion Matrix (DataFrame, Wikipedia format):")
print(confusion_df)

# 顯示結果
print(f"Sensitivity (Recall, TPR): {sensitivity:.4f}")
print(f"Specificity (TNR):         {specificity:.4f}")
print(f"Accuracy:                  {accuracy:.4f}")
print(f"Balanced Accuracy:         {balanced_acc:.4f}")
print(f"F1 Score:         {f1:.4f}")


✅ Confusion Matrix (DataFrame, Wikipedia format):
                 Predicted Positive  Predicted Negative
Actual Positive                 115                  60
Actual Negative                 141                1646
Sensitivity (Recall, TPR): 0.6571
Specificity (TNR):         0.9211
Accuracy:                  0.8976
Balanced Accuracy:         0.7891
F1 Score:         0.5336


## Ensemble hard=>equal weight

In [None]:
from collections import Counter

# 先計算 scale_pos_weight
counter = Counter(y)
print(counter)

n_pos = counter[1]
n_neg = counter[0]
scale_pos_weight = n_neg / n_pos
print(f"scale_pos_weight = {scale_pos_weight:.4f}")

Counter({np.int64(0): 7149, np.int64(1): 698})
scale_pos_weight = 10.2421


In [None]:
rf_best_params =  {'max_depth': 10, 'max_features': 'log2', 'max_leaf_nodes': 40, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
rf_clf = RandomForestClassifier(
            **rf_best_params,
            random_state=seed,
            n_jobs=-1,
            class_weight='balanced'
)


lr_best_params = 512
lr_clf = LogisticRegression(
            C=lr_best_params,
            penalty='l2',
            max_iter=1000,
            solver='lbfgs',
            class_weight='balanced',
            random_state=seed
        )


svc_best_c =  0.0078125
svc_best_gamma = 0.25
svc_clf = SVC(
       C=svc_best_c,
       gamma=svc_best_gamma,
       kernel='rbf',
       class_weight='balanced',
       probability=False,
       random_state=seed
      )

xgb_best_params = {'colsample_bytree': 0.8831528055561403, 'gamma': 0.021020125361070625, 'learning_rate': 0.03754890989284685, 'max_depth': 3, 'n_estimators': 181, 'subsample': 0.8246355080259108}
xgb_clf = xgb.XGBClassifier(
            **xgb_best_params,
            eval_metric="logloss",
            tree_method="hist",
            scale_pos_weight=scale_pos_weight,
            random_state=seed
        )



In [None]:
from sklearn.ensemble import VotingClassifier
from itertools import combinations
# 定義 base 模型
models = {
    'rf': rf_clf,
    'lr': lr_clf,
    'svc': svc_clf,
    'xgb': xgb_clf,
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
results = []

for r in range(2, len(models) + 1):  # 組合長度 2~4
    for subset in combinations(models.items(), r):
        clf_names, clf_list = zip(*subset)

        eclf = VotingClassifier(
            estimators=list(subset),
            voting='hard',
            n_jobs=-1
        )

        train_bal_accs, train_accs, train_sens, train_spec = [], [], [], []
        val_bal_accs, val_accs, val_sens, val_spec = [], [], [], []

        for train_idx, val_idx in cv.split(x, y):
            X_train_fold, y_train_fold = x[train_idx], y[train_idx]
            X_val_fold, y_val_fold = x[val_idx], y[val_idx]

            eclf.fit(X_train_fold, y_train_fold)

            # 預測
            y_train_pred = eclf.predict(X_train_fold)
            y_val_pred = eclf.predict(X_val_fold)

            # === Train metrics ===
            train_bal_accs.append(balanced_accuracy_score(y_train_fold, y_train_pred))
            train_accs.append(accuracy_score(y_train_fold, y_train_pred))
            train_sens.append(recall_score(y_train_fold, y_train_pred, pos_label=1))
            train_spec.append(recall_score(y_train_fold, y_train_pred, pos_label=0))

            # === Val metrics ===
            val_bal_accs.append(balanced_accuracy_score(y_val_fold, y_val_pred))
            val_accs.append(accuracy_score(y_val_fold, y_val_pred))
            val_sens.append(recall_score(y_val_fold, y_val_pred, pos_label=1))
            val_spec.append(recall_score(y_val_fold, y_val_pred, pos_label=0))

        # 平均結果
        result = {
            'models': clf_names,
            'train_bal_acc': np.mean(train_bal_accs),
            'train_acc': np.mean(train_accs),
            'train_sens': np.mean(train_sens),
            'train_spec': np.mean(train_spec),
            'val_bal_acc': np.mean(val_bal_accs),
            'val_acc': np.mean(val_accs),
            'val_sens': np.mean(val_sens),
            'val_spec': np.mean(val_spec),
        }

        results.append(result)

        # 輸出格式化結果
        print(f"Models: {clf_names}")
        print(f" Train - Balanced Acc: {result['train_bal_acc']:.4f} | Acc: {result['train_acc']:.4f} | Sens: {result['train_sens']:.4f} | Spec: {result['train_spec']:.4f}")
        print(f" Val - Balanced Acc: {result['val_bal_acc']:.4f} | Acc: {result['val_acc']:.4f} | Sens: {result['val_sens']:.4f} | Spec: {result['val_spec']:.4f}")
        print("")

# 選出最佳組合（以 val_bal_acc 為主）
best_result = max(results, key=lambda r: r['val_bal_acc'])

print("=== Best Model Combination ===")
print(f"Models: {best_result['models']}")
print(f" Train - Balanced Acc: {best_result['train_bal_acc']:.4f} | Acc: {best_result['train_acc']:.4f} | Sens: {best_result['train_sens']:.4f} | Spec: {best_result['train_spec']:.4f}")
print(f"   Val - Balanced Acc: {best_result['val_bal_acc']:.4f} | Acc: {best_result['val_acc']:.4f} | Sens: {best_result['val_sens']:.4f} | Spec: {best_result['val_spec']:.4f}")


Models: ('rf', 'lr')
 Train - Balanced Acc: 0.8110 | Acc: 0.8995 | Sens: 0.7034 | Spec: 0.9186
 Val - Balanced Acc: 0.7975 | Acc: 0.8961 | Sens: 0.6775 | Spec: 0.9175

Models: ('rf', 'svc')
 Train - Balanced Acc: 0.7993 | Acc: 0.8952 | Sens: 0.6827 | Spec: 0.9159
 Val - Balanced Acc: 0.7989 | Acc: 0.8951 | Sens: 0.6818 | Spec: 0.9159

Models: ('rf', 'xgb')
 Train - Balanced Acc: 0.8205 | Acc: 0.9041 | Sens: 0.7188 | Spec: 0.9222
 Val - Balanced Acc: 0.7983 | Acc: 0.8988 | Sens: 0.6761 | Spec: 0.9205

Models: ('lr', 'svc')
 Train - Balanced Acc: 0.8004 | Acc: 0.8977 | Sens: 0.6819 | Spec: 0.9188
 Val - Balanced Acc: 0.7955 | Acc: 0.8960 | Sens: 0.6732 | Spec: 0.9177

Models: ('lr', 'xgb')
 Train - Balanced Acc: 0.8148 | Acc: 0.9004 | Sens: 0.7106 | Spec: 0.9190
 Val - Balanced Acc: 0.7981 | Acc: 0.8949 | Sens: 0.6804 | Spec: 0.9158

Models: ('svc', 'xgb')
 Train - Balanced Acc: 0.8030 | Acc: 0.9013 | Sens: 0.6834 | Spec: 0.9226
 Val - Balanced Acc: 0.7965 | Acc: 0.8989 | Sens: 0.6718 | 

##### Test

In [None]:
# 取得最佳參數
best_estimators = [(name, models[name]) for name in best_result['models']]
final_model = VotingClassifier(
            estimators=best_estimators,
            voting='hard',
            n_jobs=-1
        )


final_model.fit(x, y)


# 預測
y_test_pred = final_model.predict(X_test.values)                  # 分類標籤

# 混淆矩陣
from sklearn.metrics import confusion_matrix, roc_auc_score

cm = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm.ravel()

# 指標計算
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
accuracy = (tp + tn) / (tp + tn + fp + fn)
balanced_acc = (sensitivity + specificity) / 2
f1 = f1_score(y_test, y_test_pred, zero_division=0)

# 轉成 DataFrame (Wikipedia 標準)
confusion_df = pd.DataFrame(
    data=[[tp, fn], [fp, tn]],
    index=["Actual Positive", "Actual Negative"],
    columns=["Predicted Positive", "Predicted Negative"]
)

print("\n✅ Confusion Matrix (DataFrame, Wikipedia format):")
print(confusion_df)

# 顯示結果
print(f"Sensitivity (Recall, TPR): {sensitivity:.4f}")
print(f"Specificity (TNR):         {specificity:.4f}")
print(f"Accuracy:                  {accuracy:.4f}")
print(f"Balanced Accuracy:         {balanced_acc:.4f}")
print(f"F1 Score:         {f1:.4f}")


✅ Confusion Matrix (DataFrame, Wikipedia format):
                 Predicted Positive  Predicted Negative
Actual Positive                 115                  60
Actual Negative                 150                1637
Sensitivity (Recall, TPR): 0.6571
Specificity (TNR):         0.9161
Accuracy:                  0.8930
Balanced Accuracy:         0.7866
F1 Score:         0.5227


## Ensemble hard=>equal weight(all model)

In [None]:
x = X_train
y = y_train

In [None]:
from collections import Counter

# 先計算 scale_pos_weight
counter = Counter(y)
print(counter)

n_pos = counter[1]
n_neg = counter[0]
scale_pos_weight = n_neg / n_pos
print(f"scale_pos_weight = {scale_pos_weight:.4f}")

Counter({0: 7149, 1: 698})
scale_pos_weight = 10.2421


In [None]:
import lightgbm as lgb
rf_best_params =  {'max_depth': 10, 'max_features': 'log2', 'max_leaf_nodes': 40, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
rf_clf = RandomForestClassifier(
            **rf_best_params,
            random_state=seed,
            n_jobs=-1,
            class_weight='balanced'
)


lr_best_params = 512
lr_clf = LogisticRegression(
            C=lr_best_params,
            penalty='l2',
            max_iter=1000,
            solver='lbfgs',
            class_weight='balanced',
            random_state=seed
        )


svc_best_c =  0.0078125
svc_best_gamma = 0.25
svc_clf = SVC(
       C=svc_best_c,
       gamma=svc_best_gamma,
       kernel='rbf',
       class_weight='balanced',
       probability=False,
       random_state=seed
      )

xgb_best_params = {'colsample_bytree': 0.8831528055561403, 'gamma': 0.021020125361070625, 'learning_rate': 0.03754890989284685, 'max_depth': 3, 'n_estimators': 181, 'subsample': 0.8246355080259108}
xgb_clf = xgb.XGBClassifier(
            **xgb_best_params,
            eval_metric="logloss",
            tree_method="hist",
            scale_pos_weight=scale_pos_weight,
            random_state=seed
        )

ada_best_n_estimators = 50
ada_best_learning_rate = 0.001
ada_best_base_estimator = DecisionTreeClassifier(class_weight='balanced', max_depth=2)

ada_clf = AdaBoostClassifier(
    estimator=ada_best_base_estimator,
    n_estimators=ada_best_n_estimators,
    learning_rate=ada_best_learning_rate,
    random_state=seed
)

lgb_best_n_estimators = 400
lgb_best_learning_rate = 0.01
lgb_best_num_leaves = 15
lgb_best_max_depth = 9

lgb_clf = lgb.LGBMClassifier(
    n_estimators=lgb_best_n_estimators,
    learning_rate=lgb_best_learning_rate,
    num_leaves=lgb_best_num_leaves,
    max_depth=lgb_best_max_depth,
    class_weight="balanced",
    random_state=seed,
    n_jobs=-1)

class_weights = {0: 1.0, 1: scale_pos_weight}

cat_best_n_estimators = 200
cat_best_learning_rate = 0.05
cat_best_max_depth = 3
cat_best_l2_leaf_reg = 64

cat_clf = CatBoostClassifier(
        iterations=cat_best_n_estimators,
        learning_rate=cat_best_learning_rate,
        depth=cat_best_max_depth,
        l2_leaf_reg=cat_best_l2_leaf_reg,
        loss_function="Logloss",
        eval_metric="BalancedAccuracy",  # CPU 模式支援 BalancedAccuracy
        class_weights=class_weights,
        random_seed=seed,
        verbose=0,
        task_type="CPU"   # 改成 CPU
)

In [None]:
from sklearn.ensemble import VotingClassifier
from itertools import combinations
# 定義 base 模型
models = {
    'rf': rf_clf,
    'lr': lr_clf,
    'svc': svc_clf,
    'xgb': xgb_clf,
    'ada': ada_clf,
    'lgb': lgb_clf,
    'cat': cat_clf,
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
results = []

for r in range(3, len(models) + 1, 2):
    for subset in combinations(models.items(), r):
        clf_names, clf_list = zip(*subset)

        eclf = VotingClassifier(
            estimators=list(subset),
            voting='hard',
            n_jobs=-1
        )

        train_bal_accs, train_accs, train_sens, train_spec = [], [], [], []
        val_bal_accs, val_accs, val_sens, val_spec = [], [], [], []

        for train_idx, val_idx in cv.split(x, y):
            x_train_fold, y_train_fold = x.iloc[train_idx], y.iloc[train_idx]
            x_val_fold, y_val_fold     = x.iloc[val_idx], y.iloc[val_idx]

            eclf.fit(x_train_fold, y_train_fold)

            # 預測
            y_train_pred = eclf.predict(x_train_fold)
            y_val_pred = eclf.predict(x_val_fold)

            # === Train metrics ===
            train_bal_accs.append(balanced_accuracy_score(y_train_fold, y_train_pred))
            train_accs.append(accuracy_score(y_train_fold, y_train_pred))
            train_sens.append(recall_score(y_train_fold, y_train_pred, pos_label=1))
            train_spec.append(recall_score(y_train_fold, y_train_pred, pos_label=0))

            # === Val metrics ===
            val_bal_accs.append(balanced_accuracy_score(y_val_fold, y_val_pred))
            val_accs.append(accuracy_score(y_val_fold, y_val_pred))
            val_sens.append(recall_score(y_val_fold, y_val_pred, pos_label=1))
            val_spec.append(recall_score(y_val_fold, y_val_pred, pos_label=0))

        # 平均結果
        result = {
            'models': clf_names,
            'train_bal_acc': np.mean(train_bal_accs),
            'train_acc': np.mean(train_accs),
            'train_sens': np.mean(train_sens),
            'train_spec': np.mean(train_spec),
            'val_bal_acc': np.mean(val_bal_accs),
            'val_acc': np.mean(val_accs),
            'val_sens': np.mean(val_sens),
            'val_spec': np.mean(val_spec),
        }

        results.append(result)

        # 輸出格式化結果
        print(f"Models: {clf_names}")
        print(f" Train - Balanced Acc: {result['train_bal_acc']:.4f} | Acc: {result['train_acc']:.4f} | Sens: {result['train_sens']:.4f} | Spec: {result['train_spec']:.4f}")
        print(f" Val - Balanced Acc: {result['val_bal_acc']:.4f} | Acc: {result['val_acc']:.4f} | Sens: {result['val_sens']:.4f} | Spec: {result['val_spec']:.4f}")
        print("")

# 選出最佳組合（以 val_bal_acc 為主）
best_result = max(results, key=lambda r: r['val_bal_acc'])

print("=== Best Model Combination ===")
print(f"Models: {best_result['models']}")
print(f" Train - Balanced Acc: {best_result['train_bal_acc']:.4f} | Acc: {best_result['train_acc']:.4f} | Sens: {best_result['train_sens']:.4f} | Spec: {best_result['train_spec']:.4f}")
print(f"   Val - Balanced Acc: {best_result['val_bal_acc']:.4f} | Acc: {best_result['val_acc']:.4f} | Sens: {best_result['val_sens']:.4f} | Spec: {best_result['val_spec']:.4f}")


Models: ('rf', 'lr', 'svc')
 Train - Balanced Acc: 0.8106 | Acc: 0.8970 | Sens: 0.7056 | Spec: 0.9157
 Val - Balanced Acc: 0.8009 | Acc: 0.8952 | Sens: 0.6861 | Spec: 0.9157

Models: ('rf', 'lr', 'xgb')
 Train - Balanced Acc: 0.8195 | Acc: 0.8957 | Sens: 0.7267 | Spec: 0.9122
 Val - Balanced Acc: 0.8017 | Acc: 0.8909 | Sens: 0.6933 | Spec: 0.9102

Models: ('rf', 'lr', 'ada')
 Train - Balanced Acc: 0.8155 | Acc: 0.8917 | Sens: 0.7228 | Spec: 0.9082
 Val - Balanced Acc: 0.8035 | Acc: 0.8894 | Sens: 0.6990 | Spec: 0.9080

Models: ('rf', 'lr', 'lgb')
 Train - Balanced Acc: 0.8213 | Acc: 0.8967 | Sens: 0.7296 | Spec: 0.9131
 Val - Balanced Acc: 0.8029 | Acc: 0.8918 | Sens: 0.6947 | Spec: 0.9110

Models: ('rf', 'lr', 'cat')
 Train - Balanced Acc: 0.8164 | Acc: 0.8939 | Sens: 0.7221 | Spec: 0.9107
 Val - Balanced Acc: 0.8038 | Acc: 0.8912 | Sens: 0.6976 | Spec: 0.9101

Models: ('rf', 'svc', 'xgb')
 Train - Balanced Acc: 0.8175 | Acc: 0.8981 | Sens: 0.7196 | Spec: 0.9155
 Val - Balanced Acc: 0

##### Test

In [None]:
# 取得最佳參數
best_estimators = [(name, models[name]) for name in best_result['models']]
final_model = VotingClassifier(
            estimators=best_estimators,
            voting='hard',
            n_jobs=-1
        )


final_model.fit(x, y)


# 預測
y_test_pred = final_model.predict(X_test)                  # 分類標籤

# 混淆矩陣
from sklearn.metrics import confusion_matrix, roc_auc_score

cm = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm.ravel()

# 指標計算
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
accuracy = (tp + tn) / (tp + tn + fp + fn)
balanced_acc = (sensitivity + specificity) / 2
f1 = f1_score(y_test, y_test_pred, zero_division=0)

# 轉成 DataFrame (Wikipedia 標準)
confusion_df = pd.DataFrame(
    data=[[tp, fn], [fp, tn]],
    index=["Actual Positive", "Actual Negative"],
    columns=["Predicted Positive", "Predicted Negative"]
)

print("\n✅ Confusion Matrix (DataFrame, Wikipedia format):")
print(confusion_df)

# 顯示結果
print(f"Sensitivity (Recall, TPR): {sensitivity:.4f}")
print(f"Specificity (TNR):         {specificity:.4f}")
print(f"Accuracy:                  {accuracy:.4f}")
print(f"Balanced Accuracy:         {balanced_acc:.4f}")
print(f"F1 Score:         {f1:.4f}")


✅ Confusion Matrix (DataFrame, Wikipedia format):
                 Predicted Positive  Predicted Negative
Actual Positive                 119                  56
Actual Negative                 180                1607
Sensitivity (Recall, TPR): 0.6800
Specificity (TNR):         0.8993
Accuracy:                  0.8797
Balanced Accuracy:         0.7896
F1 Score:         0.5021


In [None]:
from scipy.stats import mode
class VotingClassifierWithTieBreaker(VotingClassifier):
    def predict(self, X):
        """
        自訂 hard voting 的 predict 方法：
        - 若 0 和 1 票數相同，預測為 1。
        """
        # 預測每個子模型的結果，轉成 (n_samples, n_classifiers)
        predictions = np.asarray([clf.predict(X) for clf in self.estimators_]).T

        majority_votes = []
        for row in predictions:
            vote_counts = np.bincount(row, minlength=2)
            # 強制在平手時猜 1
            majority_votes.append(0 if vote_counts[0] > vote_counts[1] else 1)

        return np.array(majority_votes)

In [None]:
from sklearn.ensemble import VotingClassifier
from itertools import combinations
# 定義 base 模型
models = {
    'rf': rf_clf,
    'lr': lr_clf,
    'svc': svc_clf,
    'xgb': xgb_clf,
    'ada': ada_clf,
    'lgb': lgb_clf,
    'cat': cat_clf,
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
results = []

for r in range(2, len(models), 2):
    for subset in combinations(models.items(), r):
        clf_names, clf_list = zip(*subset)

        eclf = VotingClassifierWithTieBreaker(
            estimators=list(subset),
            voting='hard',
            n_jobs=-1
        )

        train_bal_accs, train_accs, train_sens, train_spec = [], [], [], []
        val_bal_accs, val_accs, val_sens, val_spec = [], [], [], []

        for train_idx, val_idx in cv.split(x, y):
            x_train_fold, y_train_fold = x.iloc[train_idx], y.iloc[train_idx]
            x_val_fold, y_val_fold     = x.iloc[val_idx], y.iloc[val_idx]

            eclf.fit(x_train_fold, y_train_fold)

            # 預測
            y_train_pred = eclf.predict(x_train_fold)
            y_val_pred = eclf.predict(x_val_fold)

            # === Train metrics ===
            train_bal_accs.append(balanced_accuracy_score(y_train_fold, y_train_pred))
            train_accs.append(accuracy_score(y_train_fold, y_train_pred))
            train_sens.append(recall_score(y_train_fold, y_train_pred, pos_label=1))
            train_spec.append(recall_score(y_train_fold, y_train_pred, pos_label=0))

            # === Val metrics ===
            val_bal_accs.append(balanced_accuracy_score(y_val_fold, y_val_pred))
            val_accs.append(accuracy_score(y_val_fold, y_val_pred))
            val_sens.append(recall_score(y_val_fold, y_val_pred, pos_label=1))
            val_spec.append(recall_score(y_val_fold, y_val_pred, pos_label=0))

        # 平均結果
        result = {
            'models': clf_names,
            'train_bal_acc': np.mean(train_bal_accs),
            'train_acc': np.mean(train_accs),
            'train_sens': np.mean(train_sens),
            'train_spec': np.mean(train_spec),
            'val_bal_acc': np.mean(val_bal_accs),
            'val_acc': np.mean(val_accs),
            'val_sens': np.mean(val_sens),
            'val_spec': np.mean(val_spec),
        }

        results.append(result)

        # 輸出格式化結果
        print(f"Models: {clf_names}")
        print(f" Train - Balanced Acc: {result['train_bal_acc']:.4f} | Acc: {result['train_acc']:.4f} | Sens: {result['train_sens']:.4f} | Spec: {result['train_spec']:.4f}")
        print(f" Val - Balanced Acc: {result['val_bal_acc']:.4f} | Acc: {result['val_acc']:.4f} | Sens: {result['val_sens']:.4f} | Spec: {result['val_spec']:.4f}")
        print("")

# 選出最佳組合（以 val_bal_acc 為主）
best_result = max(results, key=lambda r: r['val_bal_acc'])

print("=== Best Model Combination ===")
print(f"Models: {best_result['models']}")
print(f" Train - Balanced Acc: {best_result['train_bal_acc']:.4f} | Acc: {best_result['train_acc']:.4f} | Sens: {best_result['train_sens']:.4f} | Spec: {best_result['train_spec']:.4f}")
print(f"   Val - Balanced Acc: {best_result['val_bal_acc']:.4f} | Acc: {best_result['val_acc']:.4f} | Sens: {best_result['val_sens']:.4f} | Spec: {best_result['val_spec']:.4f}")


Models: ('rf', 'lr')
 Train - Balanced Acc: 0.8161 | Acc: 0.8819 | Sens: 0.7360 | Spec: 0.8961
 Val - Balanced Acc: 0.8041 | Acc: 0.8800 | Sens: 0.7119 | Spec: 0.8963

Models: ('rf', 'svc')
 Train - Balanced Acc: 0.8179 | Acc: 0.8981 | Sens: 0.7203 | Spec: 0.9155
 Val - Balanced Acc: 0.8015 | Acc: 0.8951 | Sens: 0.6876 | Spec: 0.9154

Models: ('rf', 'xgb')
 Train - Balanced Acc: 0.8319 | Acc: 0.8875 | Sens: 0.7643 | Spec: 0.8995
 Val - Balanced Acc: 0.8000 | Acc: 0.8771 | Sens: 0.7062 | Spec: 0.8938

Models: ('rf', 'ada')
 Train - Balanced Acc: 0.8074 | Acc: 0.8538 | Sens: 0.7511 | Spec: 0.8638
 Val - Balanced Acc: 0.8046 | Acc: 0.8537 | Sens: 0.7449 | Spec: 0.8643

Models: ('rf', 'lgb')
 Train - Balanced Acc: 0.8555 | Acc: 0.8943 | Sens: 0.8084 | Spec: 0.9026
 Val - Balanced Acc: 0.8031 | Acc: 0.8816 | Sens: 0.7076 | Spec: 0.8986

Models: ('rf', 'cat')
 Train - Balanced Acc: 0.8166 | Acc: 0.8835 | Sens: 0.7353 | Spec: 0.8980
 Val - Balanced Acc: 0.8058 | Acc: 0.8807 | Sens: 0.7148 | S

In [None]:
# 取得最佳參數
best_estimators = [(name, models[name]) for name in best_result['models']]
final_model = VotingClassifierWithTieBreaker(
            estimators=best_estimators,
            voting='hard',
            n_jobs=-1
        )


final_model.fit(x, y)


# 預測
y_test_pred = final_model.predict(X_test)                  # 分類標籤

# 混淆矩陣
from sklearn.metrics import confusion_matrix, roc_auc_score

cm = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm.ravel()

# 指標計算
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
accuracy = (tp + tn) / (tp + tn + fp + fn)
balanced_acc = (sensitivity + specificity) / 2
f1 = f1_score(y_test, y_test_pred, zero_division=0)

# 轉成 DataFrame (Wikipedia 標準)
confusion_df = pd.DataFrame(
    data=[[tp, fn], [fp, tn]],
    index=["Actual Positive", "Actual Negative"],
    columns=["Predicted Positive", "Predicted Negative"]
)

print("\n✅ Confusion Matrix (DataFrame, Wikipedia format):")
print(confusion_df)

# 顯示結果
print(f"Sensitivity (Recall, TPR): {sensitivity:.4f}")
print(f"Specificity (TNR):         {specificity:.4f}")
print(f"Accuracy:                  {accuracy:.4f}")
print(f"Balanced Accuracy:         {balanced_acc:.4f}")
print(f"F1 Score:         {f1:.4f}")


✅ Confusion Matrix (DataFrame, Wikipedia format):
                 Predicted Positive  Predicted Negative
Actual Positive                 119                  56
Actual Negative                 180                1607
Sensitivity (Recall, TPR): 0.6800
Specificity (TNR):         0.8993
Accuracy:                  0.8797
Balanced Accuracy:         0.7896
F1 Score:         0.5021


In [None]:
# 取得最佳參數
selected_models = ('xgb', 'lgb', 'cat')
# selected_models = ('ada', 'lgb', 'cat')
# selected_models = ('xgb', 'ada', 'cat')
best_estimators = [(name, models[name]) for name in selected_models]
final_model = VotingClassifier(
            estimators=best_estimators,
            voting='hard',
            n_jobs=-1
        )


final_model.fit(x, y)


# 預測
y_test_pred = final_model.predict(X_test)                  # 分類標籤

# 混淆矩陣
from sklearn.metrics import confusion_matrix, roc_auc_score

cm = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm.ravel()

# 指標計算
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
accuracy = (tp + tn) / (tp + tn + fp + fn)
balanced_acc = (sensitivity + specificity) / 2
f1 = f1_score(y_test, y_test_pred, zero_division=0)

# 轉成 DataFrame (Wikipedia 標準)
confusion_df = pd.DataFrame(
    data=[[tp, fn], [fp, tn]],
    index=["Actual Positive", "Actual Negative"],
    columns=["Predicted Positive", "Predicted Negative"]
)

print("\n✅ Confusion Matrix (DataFrame, Wikipedia format):")
print(confusion_df)

# 顯示結果
print(f"Sensitivity (Recall, TPR): {sensitivity:.4f}")
print(f"Specificity (TNR):         {specificity:.4f}")
print(f"Accuracy:                  {accuracy:.4f}")
print(f"Balanced Accuracy:         {balanced_acc:.4f}")
print(f"F1 Score:         {f1:.4f}")


✅ Confusion Matrix (DataFrame, Wikipedia format):
                 Predicted Positive  Predicted Negative
Actual Positive                 120                  55
Actual Negative                 172                1615
Sensitivity (Recall, TPR): 0.6857
Specificity (TNR):         0.9037
Accuracy:                  0.8843
Balanced Accuracy:         0.7947
F1 Score:         0.5139


In [None]:
# 取得最佳參數
selected_models = ('ada', 'lgb', 'cat')
# selected_models = ('xgb', 'ada', 'cat')
best_estimators = [(name, models[name]) for name in selected_models]
final_model = VotingClassifier(
            estimators=best_estimators,
            voting='hard',
            n_jobs=-1
        )


final_model.fit(x, y)


# 預測
y_test_pred = final_model.predict(X_test)                  # 分類標籤

# 混淆矩陣
from sklearn.metrics import confusion_matrix, roc_auc_score

cm = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm.ravel()

# 指標計算
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
accuracy = (tp + tn) / (tp + tn + fp + fn)
balanced_acc = (sensitivity + specificity) / 2
f1 = f1_score(y_test, y_test_pred, zero_division=0)

# 轉成 DataFrame (Wikipedia 標準)
confusion_df = pd.DataFrame(
    data=[[tp, fn], [fp, tn]],
    index=["Actual Positive", "Actual Negative"],
    columns=["Predicted Positive", "Predicted Negative"]
)

print("\n✅ Confusion Matrix (DataFrame, Wikipedia format):")
print(confusion_df)

# 顯示結果
print(f"Sensitivity (Recall, TPR): {sensitivity:.4f}")
print(f"Specificity (TNR):         {specificity:.4f}")
print(f"Accuracy:                  {accuracy:.4f}")
print(f"Balanced Accuracy:         {balanced_acc:.4f}")
print(f"F1 Score:         {f1:.4f}")


✅ Confusion Matrix (DataFrame, Wikipedia format):
                 Predicted Positive  Predicted Negative
Actual Positive                 123                  52
Actual Negative                 189                1598
Sensitivity (Recall, TPR): 0.7029
Specificity (TNR):         0.8942
Accuracy:                  0.8772
Balanced Accuracy:         0.7985
F1 Score:         0.5051


In [None]:
# 取得最佳參數
selected_models = ('xgb', 'ada', 'cat')
best_estimators = [(name, models[name]) for name in selected_models]
final_model = VotingClassifier(
            estimators=best_estimators,
            voting='hard',
            n_jobs=-1
        )


final_model.fit(x, y)


# 預測
y_test_pred = final_model.predict(X_test)                  # 分類標籤

# 混淆矩陣
from sklearn.metrics import confusion_matrix, roc_auc_score

cm = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm.ravel()

# 指標計算
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
accuracy = (tp + tn) / (tp + tn + fp + fn)
balanced_acc = (sensitivity + specificity) / 2
f1 = f1_score(y_test, y_test_pred, zero_division=0)

# 轉成 DataFrame (Wikipedia 標準)
confusion_df = pd.DataFrame(
    data=[[tp, fn], [fp, tn]],
    index=["Actual Positive", "Actual Negative"],
    columns=["Predicted Positive", "Predicted Negative"]
)

print("\n✅ Confusion Matrix (DataFrame, Wikipedia format):")
print(confusion_df)

# 顯示結果
print(f"Sensitivity (Recall, TPR): {sensitivity:.4f}")
print(f"Specificity (TNR):         {specificity:.4f}")
print(f"Accuracy:                  {accuracy:.4f}")
print(f"Balanced Accuracy:         {balanced_acc:.4f}")
print(f"F1 Score:         {f1:.4f}")


✅ Confusion Matrix (DataFrame, Wikipedia format):
                 Predicted Positive  Predicted Negative
Actual Positive                 121                  54
Actual Negative                 190                1597
Sensitivity (Recall, TPR): 0.6914
Specificity (TNR):         0.8937
Accuracy:                  0.8756
Balanced Accuracy:         0.7926
F1 Score:         0.4979


In [None]:
# 取得最佳參數
selected_models = ('ada', 'lgb')
#selected_models = ('xgb', 'lgb')
#selected_models = ('xgb', 'cat')
#selected_models = ('lgb', 'cat')
#selected_models = ('ada', 'cat')
#selected_models = ('ada', 'xgb')
#selected_models = ('ada', 'lgb', 'cat' 'xgb')
best_estimators = [(name, models[name]) for name in selected_models]
final_model = VotingClassifierWithTieBreaker(
            estimators=best_estimators,
            voting='hard',
            n_jobs=-1
        )


final_model.fit(x, y)


# 預測
y_test_pred = final_model.predict(X_test)                  # 分類標籤

# 混淆矩陣
from sklearn.metrics import confusion_matrix, roc_auc_score

cm = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm.ravel()

# 指標計算
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
accuracy = (tp + tn) / (tp + tn + fp + fn)
balanced_acc = (sensitivity + specificity) / 2
f1 = f1_score(y_test, y_test_pred, zero_division=0)

# 轉成 DataFrame (Wikipedia 標準)
confusion_df = pd.DataFrame(
    data=[[tp, fn], [fp, tn]],
    index=["Actual Positive", "Actual Negative"],
    columns=["Predicted Positive", "Predicted Negative"]
)

print("\n✅ Confusion Matrix (DataFrame, Wikipedia format):")
print(confusion_df)

# 顯示結果
print(f"Sensitivity (Recall, TPR): {sensitivity:.4f}")
print(f"Specificity (TNR):         {specificity:.4f}")
print(f"Accuracy:                  {accuracy:.4f}")
print(f"Balanced Accuracy:         {balanced_acc:.4f}")
print(f"F1 Score:         {f1:.4f}")


✅ Confusion Matrix (DataFrame, Wikipedia format):
                 Predicted Positive  Predicted Negative
Actual Positive                 127                  48
Actual Negative                 247                1540
Sensitivity (Recall, TPR): 0.7257
Specificity (TNR):         0.8618
Accuracy:                  0.8496
Balanced Accuracy:         0.7937
F1 Score:         0.4627


In [None]:
# 取得最佳參數
#selected_models = ('ada', 'lgb')
selected_models = ('xgb', 'lgb')
#selected_models = ('xgb', 'cat')
#selected_models = ('lgb', 'cat')
#selected_models = ('ada', 'cat')
#selected_models = ('ada', 'xgb')
#selected_models = ('ada', 'lgb', 'cat' 'xgb')
best_estimators = [(name, models[name]) for name in selected_models]
final_model = VotingClassifierWithTieBreaker(
            estimators=best_estimators,
            voting='hard',
            n_jobs=-1
        )


final_model.fit(x, y)


# 預測
y_test_pred = final_model.predict(X_test)                  # 分類標籤

# 混淆矩陣
from sklearn.metrics import confusion_matrix, roc_auc_score

cm = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm.ravel()

# 指標計算
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
accuracy = (tp + tn) / (tp + tn + fp + fn)
balanced_acc = (sensitivity + specificity) / 2
f1 = f1_score(y_test, y_test_pred, zero_division=0)

# 轉成 DataFrame (Wikipedia 標準)
confusion_df = pd.DataFrame(
    data=[[tp, fn], [fp, tn]],
    index=["Actual Positive", "Actual Negative"],
    columns=["Predicted Positive", "Predicted Negative"]
)

print("\n✅ Confusion Matrix (DataFrame, Wikipedia format):")
print(confusion_df)

# 顯示結果
print(f"Sensitivity (Recall, TPR): {sensitivity:.4f}")
print(f"Specificity (TNR):         {specificity:.4f}")
print(f"Accuracy:                  {accuracy:.4f}")
print(f"Balanced Accuracy:         {balanced_acc:.4f}")
print(f"F1 Score:         {f1:.4f}")


✅ Confusion Matrix (DataFrame, Wikipedia format):
                 Predicted Positive  Predicted Negative
Actual Positive                 122                  53
Actual Negative                 183                1604
Sensitivity (Recall, TPR): 0.6971
Specificity (TNR):         0.8976
Accuracy:                  0.8797
Balanced Accuracy:         0.7974
F1 Score:         0.5083


In [None]:
# 取得最佳參數
#selected_models = ('ada', 'lgb')
#selected_models = ('xgb', 'lgb')
selected_models = ('xgb', 'cat')
#selected_models = ('lgb', 'cat')
#selected_models = ('ada', 'cat')
#selected_models = ('ada', 'xgb')
#selected_models = ('ada', 'lgb', 'cat' 'xgb')
best_estimators = [(name, models[name]) for name in selected_models]
final_model = VotingClassifierWithTieBreaker(
            estimators=best_estimators,
            voting='hard',
            n_jobs=-1
        )


final_model.fit(x, y)


# 預測
y_test_pred = final_model.predict(X_test)                  # 分類標籤

# 混淆矩陣
from sklearn.metrics import confusion_matrix, roc_auc_score

cm = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm.ravel()

# 指標計算
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
accuracy = (tp + tn) / (tp + tn + fp + fn)
balanced_acc = (sensitivity + specificity) / 2
f1 = f1_score(y_test, y_test_pred, zero_division=0)

# 轉成 DataFrame (Wikipedia 標準)
confusion_df = pd.DataFrame(
    data=[[tp, fn], [fp, tn]],
    index=["Actual Positive", "Actual Negative"],
    columns=["Predicted Positive", "Predicted Negative"]
)

print("\n✅ Confusion Matrix (DataFrame, Wikipedia format):")
print(confusion_df)

# 顯示結果
print(f"Sensitivity (Recall, TPR): {sensitivity:.4f}")
print(f"Specificity (TNR):         {specificity:.4f}")
print(f"Accuracy:                  {accuracy:.4f}")
print(f"Balanced Accuracy:         {balanced_acc:.4f}")
print(f"F1 Score:         {f1:.4f}")


✅ Confusion Matrix (DataFrame, Wikipedia format):
                 Predicted Positive  Predicted Negative
Actual Positive                 121                  54
Actual Negative                 191                1596
Sensitivity (Recall, TPR): 0.6914
Specificity (TNR):         0.8931
Accuracy:                  0.8751
Balanced Accuracy:         0.7923
F1 Score:         0.4969


In [None]:
# 取得最佳參數
#selected_models = ('ada', 'lgb')
#selected_models = ('xgb', 'lgb')
#selected_models = ('xgb', 'cat')
selected_models = ('lgb', 'cat')
#selected_models = ('ada', 'cat')
#selected_models = ('ada', 'xgb')
#selected_models = ('ada', 'lgb', 'cat' 'xgb')
best_estimators = [(name, models[name]) for name in selected_models]
final_model = VotingClassifierWithTieBreaker(
            estimators=best_estimators,
            voting='hard',
            n_jobs=-1
        )


final_model.fit(x, y)


# 預測
y_test_pred = final_model.predict(X_test)                  # 分類標籤

# 混淆矩陣
from sklearn.metrics import confusion_matrix, roc_auc_score

cm = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm.ravel()

# 指標計算
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
accuracy = (tp + tn) / (tp + tn + fp + fn)
balanced_acc = (sensitivity + specificity) / 2
f1 = f1_score(y_test, y_test_pred, zero_division=0)

# 轉成 DataFrame (Wikipedia 標準)
confusion_df = pd.DataFrame(
    data=[[tp, fn], [fp, tn]],
    index=["Actual Positive", "Actual Negative"],
    columns=["Predicted Positive", "Predicted Negative"]
)

print("\n✅ Confusion Matrix (DataFrame, Wikipedia format):")
print(confusion_df)

# 顯示結果
print(f"Sensitivity (Recall, TPR): {sensitivity:.4f}")
print(f"Specificity (TNR):         {specificity:.4f}")
print(f"Accuracy:                  {accuracy:.4f}")
print(f"Balanced Accuracy:         {balanced_acc:.4f}")
print(f"F1 Score:         {f1:.4f}")


✅ Confusion Matrix (DataFrame, Wikipedia format):
                 Predicted Positive  Predicted Negative
Actual Positive                 123                  52
Actual Negative                 190                1597
Sensitivity (Recall, TPR): 0.7029
Specificity (TNR):         0.8937
Accuracy:                  0.8767
Balanced Accuracy:         0.7983
F1 Score:         0.5041


In [None]:
# 取得最佳參數
#selected_models = ('ada', 'lgb')
#selected_models = ('xgb', 'lgb')
#selected_models = ('xgb', 'cat')
#selected_models = ('lgb', 'cat')
selected_models = ('ada', 'cat')
#selected_models = ('ada', 'xgb')
#selected_models = ('ada', 'lgb', 'cat' 'xgb')
best_estimators = [(name, models[name]) for name in selected_models]
final_model = VotingClassifierWithTieBreaker(
            estimators=best_estimators,
            voting='hard',
            n_jobs=-1
        )


final_model.fit(x, y)


# 預測
y_test_pred = final_model.predict(X_test)                  # 分類標籤

# 混淆矩陣
from sklearn.metrics import confusion_matrix, roc_auc_score

cm = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm.ravel()

# 指標計算
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
accuracy = (tp + tn) / (tp + tn + fp + fn)
balanced_acc = (sensitivity + specificity) / 2
f1 = f1_score(y_test, y_test_pred, zero_division=0)

# 轉成 DataFrame (Wikipedia 標準)
confusion_df = pd.DataFrame(
    data=[[tp, fn], [fp, tn]],
    index=["Actual Positive", "Actual Negative"],
    columns=["Predicted Positive", "Predicted Negative"]
)

print("\n✅ Confusion Matrix (DataFrame, Wikipedia format):")
print(confusion_df)

# 顯示結果
print(f"Sensitivity (Recall, TPR): {sensitivity:.4f}")
print(f"Specificity (TNR):         {specificity:.4f}")
print(f"Accuracy:                  {accuracy:.4f}")
print(f"Balanced Accuracy:         {balanced_acc:.4f}")
print(f"F1 Score:         {f1:.4f}")

In [None]:
# 取得最佳參數
#selected_models = ('ada', 'lgb')
#selected_models = ('xgb', 'lgb')
#selected_models = ('xgb', 'cat')
#selected_models = ('lgb', 'cat')
#selected_models = ('ada', 'cat')
selected_models = ('ada', 'xgb')
#selected_models = ('ada', 'lgb', 'cat' 'xgb')
best_estimators = [(name, models[name]) for name in selected_models]
final_model = VotingClassifierWithTieBreaker(
            estimators=best_estimators,
            voting='hard',
            n_jobs=-1
        )


final_model.fit(x, y)


# 預測
y_test_pred = final_model.predict(X_test)                  # 分類標籤

# 混淆矩陣
from sklearn.metrics import confusion_matrix, roc_auc_score

cm = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm.ravel()

# 指標計算
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
accuracy = (tp + tn) / (tp + tn + fp + fn)
balanced_acc = (sensitivity + specificity) / 2
f1 = f1_score(y_test, y_test_pred, zero_division=0)

# 轉成 DataFrame (Wikipedia 標準)
confusion_df = pd.DataFrame(
    data=[[tp, fn], [fp, tn]],
    index=["Actual Positive", "Actual Negative"],
    columns=["Predicted Positive", "Predicted Negative"]
)

print("\n✅ Confusion Matrix (DataFrame, Wikipedia format):")
print(confusion_df)

# 顯示結果
print(f"Sensitivity (Recall, TPR): {sensitivity:.4f}")
print(f"Specificity (TNR):         {specificity:.4f}")
print(f"Accuracy:                  {accuracy:.4f}")
print(f"Balanced Accuracy:         {balanced_acc:.4f}")
print(f"F1 Score:         {f1:.4f}")


✅ Confusion Matrix (DataFrame, Wikipedia format):
                 Predicted Positive  Predicted Negative
Actual Positive                 127                  48
Actual Negative                 247                1540
Sensitivity (Recall, TPR): 0.7257
Specificity (TNR):         0.8618
Accuracy:                  0.8496
Balanced Accuracy:         0.7937
F1 Score:         0.4627


In [None]:
# 取得最佳參數
#selected_models = ('ada', 'lgb')
#selected_models = ('xgb', 'lgb')
#selected_models = ('xgb', 'cat')
#selected_models = ('lgb', 'cat')
#selected_models = ('ada', 'cat')
#selected_models = ('ada', 'xgb')
selected_models = ('ada', 'lgb', 'cat', 'xgb')
best_estimators = [(name, models[name]) for name in selected_models]
final_model = VotingClassifierWithTieBreaker(
            estimators=best_estimators,
            voting='hard',
            n_jobs=-1
        )


final_model.fit(x, y)


# 預測
y_test_pred = final_model.predict(X_test)                  # 分類標籤

# 混淆矩陣
from sklearn.metrics import confusion_matrix, roc_auc_score

cm = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm.ravel()

# 指標計算
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
accuracy = (tp + tn) / (tp + tn + fp + fn)
balanced_acc = (sensitivity + specificity) / 2
f1 = f1_score(y_test, y_test_pred, zero_division=0)

# 轉成 DataFrame (Wikipedia 標準)
confusion_df = pd.DataFrame(
    data=[[tp, fn], [fp, tn]],
    index=["Actual Positive", "Actual Negative"],
    columns=["Predicted Positive", "Predicted Negative"]
)

print("\n✅ Confusion Matrix (DataFrame, Wikipedia format):")
print(confusion_df)

# 顯示結果
print(f"Sensitivity (Recall, TPR): {sensitivity:.4f}")
print(f"Specificity (TNR):         {specificity:.4f}")
print(f"Accuracy:                  {accuracy:.4f}")
print(f"Balanced Accuracy:         {balanced_acc:.4f}")
print(f"F1 Score:         {f1:.4f}")


✅ Confusion Matrix (DataFrame, Wikipedia format):
                 Predicted Positive  Predicted Negative
Actual Positive                 123                  52
Actual Negative                 194                1593
Sensitivity (Recall, TPR): 0.7029
Specificity (TNR):         0.8914
Accuracy:                  0.8746
Balanced Accuracy:         0.7971
F1 Score:         0.5000
