In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
import xgboost as xgb
from dataset.load_dat import load_keel_dat
from kmeans_smote import KMeansSMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
import numpy as np
from loss import *
from model import FocalXGBClassifier
from optimize import *
# Debug output: prints the repr of the xgb.train object when the module runs.
print(xgb.train)
def compute_gmean(y_true, y_pred):
    """Return the geometric mean of sensitivity and specificity.

    The confusion matrix of ``y_true`` against ``y_pred`` is computed first.
    If it is not 2x2, a warning is printed and ``0.0`` is returned. A rate
    whose denominator is zero is taken to be 0.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        ``sqrt(sensitivity * specificity)``, or ``0.0`` for a non-2x2 matrix.
    """
    matrix = confusion_matrix(y_true, y_pred)
    if matrix.shape == (2, 2):
        # The flattened 2x2 matrix is unpacked as TN, FP, FN, TP.
        tn, fp, fn, tp = matrix.ravel()
        actual_pos = tp + fn
        actual_neg = tn + fp
        recall_pos = tp / actual_pos if actual_pos > 0 else 0
        recall_neg = tn / actual_neg if actual_neg > 0 else 0
        return np.sqrt(recall_pos * recall_neg)

    print("Warning: confusion matrix is not binary, cannot compute G-Mean.")
    return 0.0
# **EasyEnsemble 实现**
def easy_ensemble(X, y, n_estimators=10):
    """Generate several resampled copies of a training set.

    Despite the function name, no under-sampling is performed here: every
    copy is produced by calling ``KMeansSMOTE.fit_resample`` on the full
    ``(X, y)``.

    Args:
        X: Feature matrix.
        y: Labels.
        n_estimators: Number of resampled datasets to generate (the caller
            trains one classifier per dataset).

    Returns:
        List of ``(X_resampled, y_resampled)`` tuples with ``n_estimators``
        entries.
    """
    # One sampler is shared by all iterations. No random_state is passed.
    # NOTE(review): the copies presumably differ only through the sampler's
    # internal randomness -- confirm against the kmeans_smote documentation.
    kmeans_smote = KMeansSMOTE(
        kmeans_args={
            'n_clusters': 100
        },
        smote_args={
            'k_neighbors': 10
        }
    )
    samples = []
    print('开始过采样')
    for _ in range(n_estimators):
        X_resampled, y_resampled = kmeans_smote.fit_resample(X, y)
        samples.append((X_resampled, y_resampled))
    return samples

# **加载数据集（以 glass1 数据集为例）**
def load_keel(name):
    """Load a dataset file from the ``dataset/`` directory and split it.

    The table returned by ``load_keel_dat`` is printed, then split by column:
    every column except the last forms the features, and the last column
    forms the labels.

    Args:
        name: File name appended to ``"dataset/"`` (for example
            ``"yeast6.dat"``).

    Returns:
        Tuple ``(X, y)`` holding the ``.values`` of the feature columns and of
        the label column.
    """
    table = load_keel_dat("dataset/" + name)
    print(table)
    feature_columns = table.iloc[:, :-1]
    label_column = table.iloc[:, -1]
    return feature_columns.values, label_column.values

# 主流程（K 折）
def main(name="yeast6.dat", *, n_trials=30, n_splits=10, n_estimators=3,
         threshold=0.01):
    """Tune XGBoost once, then evaluate a resampled ensemble with K-fold CV.

    Steps:
      1. Load the dataset and, if the labels are strings, map ``positive``
         to 1 and everything else to 0.
      2. Run ``optimize_xgb_multiobjective`` on a stratified 80/20 split of
         the whole dataset to obtain one set of XGBoost parameters.
      3. For each stratified fold, build ``n_estimators`` resampled training
         sets with ``easy_ensemble``, fit one ``XGBClassifier`` per set,
         average their positive-class probabilities on the test fold, and
         report AUC-ROC and G-Mean.
      4. Print the mean AUC-ROC and G-Mean over all folds.

    Args:
        name: Dataset file name passed to ``load_keel``.
        n_trials: Number of trials for the hyperparameter search.
        n_splits: Number of stratified CV folds.
        n_estimators: Number of resampled datasets / classifiers per fold.
        threshold: Probability cut-off for predicting the positive class.

    Raises:
        ValueError: If ``n_estimators`` is smaller than 1.
    """
    if n_estimators < 1:
        raise ValueError("n_estimators must be at least 1")

    X, y = load_keel(name)
    print(X.shape)
    print(y)

    # Only string-like label arrays can contain 'positive'. Testing a numeric
    # array for a string makes NumPy emit an element-wise comparison warning,
    # so the check is restricted to object/unicode/bytes dtypes.
    if y.dtype.kind in "OUS":
        # Strip whitespace so both ' positive' and 'positive' are recognised.
        stripped = np.char.strip(y.astype(str))
        if 'positive' in stripped:
            y = (stripped == 'positive').astype(int)  # unify labels to 0 / 1

    print("🔍 正在执行全局超参数搜索（使用训练集）...")
    # NOTE(review): the tuning split is drawn from the full dataset, so it
    # overlaps the CV test folds below; the reported CV metrics may therefore
    # be optimistic.
    X_subtrain, X_val, y_subtrain, y_val = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
    best_params = optimize_xgb_multiobjective(
        X_subtrain, y_subtrain, X_val, y_val, n_trials=n_trials
    )
    print("✅ 全局最优参数：", best_params)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    all_auc = []
    all_gmean = []
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):
        print(f"\n--- Fold {fold} ---")
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]

        # Resampling is applied to the training fold only.
        samples = easy_ensemble(X_train, y_train, n_estimators=n_estimators)

        classifiers = []
        for i, (X_resampled, y_resampled) in enumerate(samples):
            print(f"Training classifier {i+1}/{n_estimators} for fold {fold}")
            model = XGBClassifier(**best_params)
            model.fit(X_resampled, y_resampled)
            classifiers.append(model)

        # Average the positive-class probability over the trained models.
        ensemble_preds = np.zeros_like(y_test, dtype=float)
        for model in classifiers:
            ensemble_preds += model.predict_proba(X_test)[:, 1]
        ensemble_preds /= len(classifiers)

        print(ensemble_preds)
        # NOTE(review): the default cut-off (0.01) is far below 0.5;
        # presumably chosen to favour the minority class -- confirm.
        y_pred = (ensemble_preds >= threshold).astype(int)
        auc = roc_auc_score(y_test, ensemble_preds)
        all_auc.append(auc)

        print(classification_report(y_test, y_pred))
        print(f"AUC-ROC for Fold {fold}: {auc:.4f}")
        gmean = compute_gmean(y_test, y_pred)
        all_gmean.append(gmean)
        print(f"G-Mean for Fold {fold}: {gmean:.4f}")

    # Averages over all folds.
    print(f"\nAverage AUC-ROC over {skf.n_splits} folds: {np.mean(all_auc):.4f}")
    print(f"\nAverage G-Mean over {skf.n_splits} folds: {np.mean(all_gmean):.4f}")

# Run the experiment only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm
  if ' positive' in y:
[I 2025-05-07 14:36:35,225] A new study created in memory with name: no-name-99184cea-d76b-4f01-ab2e-57243b1ae59a
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


<function train at 0x13e717c70>
       Mcg   Gvh   Alm   Mit  Erl  Pox   Vac   Nuc  Class
0     0.58  0.61  0.47  0.13  0.5  0.0  0.48  0.22      0
1     0.43  0.67  0.48  0.27  0.5  0.0  0.53  0.22      0
2     0.64  0.62  0.49  0.15  0.5  0.0  0.53  0.22      0
3     0.58  0.44  0.57  0.13  0.5  0.0  0.54  0.22      0
4     0.42  0.44  0.48  0.54  0.5  0.0  0.48  0.22      0
...    ...   ...   ...   ...  ...  ...   ...   ...    ...
1479  0.81  0.62  0.43  0.17  0.5  0.0  0.53  0.22      0
1480  0.47  0.43  0.61  0.40  0.5  0.0  0.48  0.47      0
1481  0.67  0.57  0.36  0.19  0.5  0.0  0.56  0.22      0
1482  0.43  0.40  0.60  0.16  0.5  0.0  0.53  0.39      0
1483  0.65  0.54  0.54  0.13  0.5  0.0  0.53  0.22      0

[1484 rows x 9 columns]
(1484, 8)
[0 0 0 ... 0 0 0]
🔍 正在执行全局超参数搜索（使用训练集）...


[I 2025-05-07 14:36:35,550] Trial 0 finished with values: [0.9502463054187191, 0.844862771900765] and parameters: {'n_estimators': 417, 'max_depth': 11, 'learning_rate': 0.27076332534668524, 'subsample': 0.9961520327656137, 'colsample_bytree': 0.9159040707564969, 'gamma': 2.2971292264731256, 'min_child_weight': 9, 'scale_pos_weight': 2.8174218757835394}.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-05-07 14:36:35,999] Trial 1 finished with values: [0.9433497536945812, 0.6158817620063862] and parameters: {'n_estimators': 382, 'max_depth': 7, 'learning_rate': 0.012164688152637083, 'subsample': 0.6781313276975014, 'colsample_bytree': 0.6620614933380435, 'gamma': 0.32171153448682044, 'min_child_weight': 10, 'scale_pos_weight': 7.284188227595061}.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-05-07 14:36:36,309] Trial 2 finished with values: [0.8665024630541872, 0.7327658648772


✅ Pareto 最优解数量: 2
  AUC=0.9670, G-Mean=0.9414, params={'n_estimators': 437, 'max_depth': 9, 'learning_rate': 0.04139699245119708, 'subsample': 0.9878175635778628, 'colsample_bytree': 0.5690907855548613, 'gamma': 1.333716052814569, 'min_child_weight': 6, 'scale_pos_weight': 0.9946986946297245}
  AUC=0.9695, G-Mean=0.0000, params={'n_estimators': 180, 'max_depth': 11, 'learning_rate': 0.011653999420831437, 'subsample': 0.8401358343218621, 'colsample_bytree': 0.7643971206067106, 'gamma': 2.828750071259394, 'min_child_weight': 10, 'scale_pos_weight': 3.41722423003412}
✅ 全局最优参数： {'n_estimators': 437, 'max_depth': 9, 'learning_rate': 0.04139699245119708, 'subsample': 0.9878175635778628, 'colsample_bytree': 0.5690907855548613, 'gamma': 1.333716052814569, 'min_child_weight': 6, 'scale_pos_weight': 0.9946986946297245}

--- Fold 1 ---
开始过采样
Training classifier 1/3 for fold 1
Training classifier 2/3 for fold 1
Training classifier 3/3 for fold 1
[0.00368624 0.00523295 0.00365244 0.00380244 0.0392