In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, average_precision_score,
                             precision_recall_curve, auc, roc_auc_score, confusion_matrix)
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import interpolate
import joblib
import warnings

warnings.filterwarnings('ignore')


def calculate_metrics(y_true, y_pred, y_scores):
    # 检查NaN值
    if np.isnan(y_scores).any():
        y_scores = np.nan_to_num(y_scores)

    # 初始化存储每个类别的指标
    class_metrics = []

    for class_idx in range(4):  # 4个类别
        # 确保y_true是numpy数组
        y_true_np = np.array(y_true)
        y_pred_np = np.array(y_pred)

        # 二分类指标计算
        y_true_class = (y_true_np == class_idx).astype(int)
        y_pred_class = (y_pred_np == class_idx).astype(int)
        y_scores_class = y_scores[:, class_idx]

        try:
            accuracy = accuracy_score(y_true_class, y_pred_class)
            f1 = f1_score(y_true_class, y_pred_class, zero_division=0)
            precision = precision_score(y_true_class, y_pred_class, zero_division=0)
            recall = recall_score(y_true_class, y_pred_class, zero_division=0)

            # 处理AUPRC计算
            if len(np.unique(y_true_class)) > 1:
                auprc = average_precision_score(y_true_class, y_scores_class)
            else:
                auprc = 0.0

            # 处理AUROC计算
            if len(np.unique(y_true_class)) > 1:
                auroc = roc_auc_score(y_true_class, y_scores_class)
            else:
                auroc = 0.0

            class_metrics.append({
                'accuracy': accuracy,
                'f1': f1,
                'precision': precision,
                'recall': recall,
                'auprc': auprc,
                'auroc': auroc
            })
        except Exception as e:
            print(f"计算类别{class_idx}指标时出错: {str(e)}")
            class_metrics.append({
                'accuracy': 0,
                'f1': 0,
                'precision': 0,
                'recall': 0,
                'auprc': 0,
                'auroc': 0
            })

    # 计算加权平均指标
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted', zero_division=0),
        'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_true, y_pred, average='weighted', zero_division=0),
        'auprc': average_precision_score(y_true, y_scores, average='weighted'),
        'auroc': roc_auc_score(y_true, y_scores, multi_class='ovr', average='weighted'),
        'class_metrics': class_metrics
    }

    return metrics


def interpolate_pr_curve(precision, recall):
    """插值PR曲线到固定长度的点"""
    f = interpolate.interp1d(recall, precision, bounds_error=False, fill_value=(1.0, 0.0))
    new_recall = np.linspace(0, 1, 100)
    new_precision = f(new_recall)
    return new_precision, new_recall


def plot_confusion_matrix(y_true, y_pred, fold, dpi=720):
    """绘制混淆矩阵"""
    cm = confusion_matrix(y_true, y_pred)
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100  # 百分比表示

    plt.figure(figsize=(6, 6))
    ax = sns.heatmap(cm_percentage, annot=False, fmt='.2f', cmap='Blues', square=True, cbar=False,
                     linewidths=2, linecolor='black')

    # 在每个格子中显示个数和百分比
    for i in range(4):
        for j in range(4):
            # 判断字体颜色，深色背景用白色字体，浅色背景用黑色字体
            text_color = 'white' if cm_percentage[i, j] > 50 else 'black'

            # 将个数和百分百分比显示在格子中
            ax.text(j + 0.5, i + 0.5, f'{cm[i, j]}\n({cm_percentage[i, j]:.2f}%)',
                    color=text_color, ha='center', va='center', fontsize=14, fontweight='bold')

    # 添加中文标签
    plt.xlabel('预测类别', fontsize=16, fontweight='bold')
    plt.ylabel('实际类别', fontsize=16, fontweight='bold')
    plt.xticks(ticks=np.arange(4) + 0.5, labels=np.arange(1, 5), fontsize=14, fontweight='bold')
    plt.yticks(ticks=np.arange(4) + 0.5, labels=np.arange(1, 5), fontsize=14, fontweight='bold')

    # 调整布局，减少空白边缘
    plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)

    # 保存混淆矩阵图
    plt.savefig(f'CI_XGB_best_fold_confusion_matrix_fold{fold}.png', dpi=dpi)
    plt.close()


def train_test_split(X, y, splits=10, batch_size=32):
    # 检查数据
    print(f"数据形状: X={X.shape}, y={y.shape}")
    print(f"类别分布: {np.bincount(y)}")

    # 处理可能的NaN值
    X = np.nan_to_num(X)
    y = np.nan_to_num(y).astype(int)

    k_fold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=2025)
    results = []
    best_model_info = {'val_score': -float('inf'), 'model': None, 'fold': -1}

    for fold, (train_idx, test_idx) in enumerate(k_fold.split(X, y)):
        print(f'\n{"=" * 50}')
        print(f'Fold {fold + 1}/{splits}')
        print(f'{"=" * 50}')

        # 数据切分
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # 标准化
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # 使用XGBoost替代RandomForest
        xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=2025, n_jobs=-1, objective='multi:softprob', num_class=4)

        # 训练模型
        xgb_model.fit(X_train, y_train)

        # 预测
        y_pred = xgb_model.predict_proba(X_test)  # 预测类别概率
        y_pred_class = np.argmax(y_pred, axis=1)

        # 计算指标
        metrics = calculate_metrics(y_test, y_pred_class, y_pred)

        # 检查是否为最佳模型
        current_score = metrics['auroc']  # 使用AUROC作为选择最佳模型的标准
        if current_score > best_model_info['val_score']:
            best_model_info['val_score'] = current_score
            best_model_info['model'] = xgb_model
            best_model_info['fold'] = fold + 1
            best_model_info['scaler'] = scaler

        # 保存结果
        results.append(metrics)

        # 打印当前折的结果
        print(f'\nFold {fold + 1} Test Metrics:')
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"F1 Score: {metrics['f1']:.4f}")
        print(f"Precision: {metrics['precision']:.4f}")
        print(f"Recall: {metrics['recall']:.4f}")
        print(f"AUPRC: {metrics['auprc']:.4f}")
        print(f"AUROC: {metrics['auroc']:.4f}")

    # 保存最佳模型
    joblib.dump(best_model_info['model'], 'cirrhosis_xgb.pkl')
    joblib.dump(best_model_info['scaler'], 'scaler.pkl')

    print(f"\nSaved best model from fold {best_model_info['fold']} with AUROC {best_model_info['val_score']:.4f} as cirrhosis_xgb.pkl")
    print("Saved corresponding scaler as scaler.pkl")

    # 绘制最佳折的混淆矩阵
    print(f"Plotting confusion matrix for best fold: {best_model_info['fold']}")
    plot_confusion_matrix(y_test, y_pred_class, best_model_info['fold'])

if __name__ == "__main__":
    # 加载数据
    data = pd.read_csv('preparations/cirrhosis_output.csv')  # 请替换为您的实际文件路径

    # 检查数据
    print("数据前5行:")
    print(data.head())
    print("\n类别分布:")
    print(data['Stage'].value_counts())

    # 分离特征和目标
    feature_cols = ['N_Days', 'Age', 'Bilirubin', 'Albumin', 'Copper', 'SGOT',
                    'Tryglicerides', 'Platelets', 'Prothrombin']
    X = data[feature_cols].values
    y = data['Stage'].values - 1  # 将类别转换为0-3

    # 转换为numpy数组
    X = X.astype(np.float32)
    y = y.astype(np.int64)

    # 运行训练和评估
    train_test_split(X, y, splits=10)

数据前5行:
   N_Days  Status  Age  Ascites  Hepatomegaly  Spiders  Edema  Bilirubin  \
0     400       2   59      2.0           2.0      2.0      2       14.5   
1    4500       0   56      0.0           2.0      2.0      0        1.1   
2    1012       2   70      0.0           0.0      0.0      1        1.4   
3    1925       2   55      0.0           2.0      2.0      1        1.8   
4    1504       1   38      0.0           2.0      2.0      0        3.4   

   Albumin  Copper    SGOT  Tryglicerides  Platelets  Prothrombin  Stage  
0     2.60   156.0  137.95          172.0      190.0         12.2    4.0  
1     4.14    54.0  113.52           88.0      221.0         10.6    3.0  
2     3.48   210.0   96.10           55.0      151.0         12.0    4.0  
3     2.54    64.0   60.63           92.0      183.0         10.3    4.0  
4     3.53   143.0  113.15           72.0      136.0         10.9    3.0  

类别分布:
Stage
3.0    161
4.0    144
2.0     92
1.0     21
Name: count, dtype: int64
数据形

In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, average_precision_score,
                             precision_recall_curve, auc, roc_auc_score)
import matplotlib.pyplot as plt
from scipy import interpolate
import joblib
import warnings
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
from sklearn.metrics import confusion_matrix

warnings.filterwarnings('ignore')

# 设置字体为黑体，确保中文可见
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

def calculate_metrics(y_true, y_pred, y_scores):
    # 检查NaN值
    if np.isnan(y_scores).any():
        y_scores = np.nan_to_num(y_scores)

    # 初始化存储每个类别的指标
    class_metrics = []

    for class_idx in range(4):  # 4个类别
        # 确保y_true是numpy数组
        y_true_np = np.array(y_true)
        y_pred_np = np.array(y_pred)

        # 二分类指标计算
        y_true_class = (y_true_np == class_idx).astype(int)
        y_pred_class = (y_pred_np == class_idx).astype(int)
        y_scores_class = y_scores[:, class_idx]

        try:
            accuracy = accuracy_score(y_true_class, y_pred_class)
            f1 = f1_score(y_true_class, y_pred_class, zero_division=0)
            precision = precision_score(y_true_class, y_pred_class, zero_division=0)
            recall = recall_score(y_true_class, y_pred_class, zero_division=0)

            # 处理AUPRC计算
            if len(np.unique(y_true_class)) > 1:
                auprc = average_precision_score(y_true_class, y_scores_class)
            else:
                auprc = 0.0

            # 处理AUROC计算
            if len(np.unique(y_true_class)) > 1:
                auroc = roc_auc_score(y_true_class, y_scores_class)
            else:
                auroc = 0.0

            class_metrics.append({
                'accuracy': accuracy,
                'f1': f1,
                'precision': precision,
                'recall': recall,
                'auprc': auprc,
                'auroc': auroc
            })
        except Exception as e:
            print(f"计算类别{class_idx}指标时出错: {str(e)}")
            class_metrics.append({
                'accuracy': 0,
                'f1': 0,
                'precision': 0,
                'recall': 0,
                'auprc': 0,
                'auroc': 0
            })

    # 计算加权平均指标
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted', zero_division=0),
        'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_true, y_pred, average='weighted', zero_division=0),
        'auprc': average_precision_score(y_true, y_scores, average='weighted'),
        'auroc': roc_auc_score(y_true, y_scores, multi_class='ovr', average='weighted'),
        'class_metrics': class_metrics
    }

    return metrics


def interpolate_pr_curve(precision, recall):
    """插值PR曲线到固定长度的点"""
    f = interpolate.interp1d(recall, precision, bounds_error=False, fill_value=(1.0, 0.0))
    new_recall = np.linspace(0, 1, 100)
    new_precision = f(new_recall)
    return new_precision, new_recall


def plot_confusion_matrix(y_true, y_pred, fold, dpi=720):
    """绘制正方形混淆矩阵"""
    cm = confusion_matrix(y_true, y_pred)
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100  # 百分比表示

    plt.figure(figsize=(6, 6))
    ax = sns.heatmap(cm_percentage, annot=False, fmt='.2f', cmap='Blues', square=True, cbar=False,
                     linewidths=2, linecolor='black')

    # 在每个格子中显示个数和百分比
    for i in range(4):
        for j in range(4):
            # 判断字体颜色，深色背景用白色字体，浅色背景用黑色字体
            text_color = 'white' if cm_percentage[i, j] > 50 else 'black'

            # 将个数和百分百分比分行显示，个数在上，百分比在下
            ax.text(j + 0.5, i + 0.5, f'{cm[i, j]}\n({cm_percentage[i, j]:.2f}%)',
                    color=text_color, ha='center', va='center', fontsize=14, fontweight='bold')

    # 添加中文标签
    plt.xlabel('预测类别', fontsize=16, fontweight='bold')
    plt.ylabel('实际类别', fontsize=16, fontweight='bold')
    plt.xticks(ticks=np.arange(4) + 0.5, labels=np.arange(1, 5), fontsize=14, fontweight='bold')
    plt.yticks(ticks=np.arange(4) + 0.5, labels=np.arange(1, 5), fontsize=14, fontweight='bold')

    # 调整布局，减少空白边缘
    plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)

    # 保存混淆矩阵图
    plt.savefig(f'CI_TabNet_best_fold_confusion_matrix_fold{fold}.png', dpi=dpi)
    plt.close()


def plot_pr_curve(y_true, y_scores, fold):
    try:
        # 多分类PR曲线 - 使用每个类别的PR曲线
        precision = dict()
        recall = dict()
        auprc = dict()

        for i in range(4):  # 4个类别
            precision[i], recall[i], _ = precision_recall_curve(y_true == i, y_scores[:, i])
            auprc[i] = auc(recall[i], precision[i])

        plt.figure()
        for i in range(4):
            plt.plot(recall[i], precision[i], label=f'Class {i} (AUPRC = {auprc[i]:.2f})')

        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title(f'Precision-Recall Curve (Fold {fold})')
        plt.legend()
        plt.savefig(f'pr_curve_fold{fold}.png')
        plt.close()
        return precision, recall
    except Exception as e:
        print(f"无法绘制Fold {fold}的PR曲线: {str(e)}")
        return None, None


def train_test_split(X, y, splits=10, batch_size=32):
    # 检查数据
    print(f"数据形状: X={X.shape}, y={y.shape}")
    print(f"类别分布: {np.bincount(y)}")

    # 处理可能的NaN值
    X = np.nan_to_num(X)
    y = np.nan_to_num(y).astype(int)

    k_fold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=2025)
    results = []
    all_class_metrics = []  # 存储所有折的每个类别的指标
    best_model_info = {'val_score': -float('inf'), 'model': None, 'fold': -1}

    for fold, (train_idx, test_idx) in enumerate(k_fold.split(X, y)):
        print(f'\n{"=" * 50}')
        print(f'Fold {fold + 1}/{splits}')
        print(f'{"=" * 50}')

        # 分割数据
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # 标准化
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # 创建TabNet模型
        model = TabNetClassifier(
            n_d=8, n_a=8, n_steps=3, gamma=1.3, lambda_sparse=0,
            optimizer_fn=torch.optim.Adam, optimizer_params=dict(lr=2e-2),
            mask_type='sparsemax', # Optional: use sparsemax or sigmoid
            scheduler_params=dict(step_size=50, gamma=0.9),
            scheduler_fn=torch.optim.lr_scheduler.StepLR
        )

        # 训练模型
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)], patience=50, batch_size=batch_size)

        # 预测
        y_pred = model.predict(X_test)
        y_scores = model.predict_proba(X_test)  # Get probabilities for all classes

        # 计算指标
        metrics = calculate_metrics(y_test, y_pred, y_scores)
        all_class_metrics.append(metrics['class_metrics'])

        # 检查是否为最佳模型
        current_score = metrics['auroc']  # 使用AUROC作为选择最佳模型的标准
        if current_score > best_model_info['val_score']:
            best_model_info['val_score'] = current_score
            best_model_info['model'] = model  # TabNet模型可以直接保存
            best_model_info['fold'] = fold + 1
            best_model_info['scaler'] = scaler

        # 绘制并保存当前折的PR曲线
        plot_pr_curve(y_test, y_scores, fold + 1)

        # 保存结果
        results.append(metrics)

        # 打印当前折的结果
        print(f'\nFold {fold + 1} Test Metrics:')
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"F1 Score: {metrics['f1']:.4f}")
        print(f"Precision: {metrics['precision']:.4f}")
        print(f"Recall: {metrics['recall']:.4f}")
        print(f"AUPRC: {metrics['auprc']:.4f}")
        print(f"AUROC: {metrics['auroc']:.4f}")

    # 保存最佳模型
    best_model_info['model'].save_model('cirrhosis_tabnet.pkl')
    joblib.dump(best_model_info['scaler'], 'scaler.pkl')

    print(
        f"\nSaved best model from fold {best_model_info['fold']} with AUROC {best_model_info['val_score']:.4f} as cirrhosis_tabnet.pkl")
    print("Saved corresponding scaler as scaler.pkl")

    # 绘制最佳模型的混淆矩阵
    plot_confusion_matrix(y_test, y_pred, best_model_info['fold'], dpi=720)

if __name__ == "__main__":
    # 加载数据
    data = pd.read_csv('preparations/cirrhosis_output.csv')  # 请替换为您的实际文件路径

    # 检查数据
    print("数据前5行:")
    print(data.head())
    print("\n类别分布:")
    print(data['Stage'].value_counts())

    # 分离特征和目标
    feature_cols = ['N_Days', 'Age', 'Bilirubin', 'Albumin', 'Copper', 'SGOT',
                    'Tryglicerides', 'Platelets', 'Prothrombin']
    X = data[feature_cols].values
    y = data['Stage'].values - 1  # 将类别转换为0-3

    # 转换为numpy数组
    X = X.astype(np.float32)
    y = y.astype(np.int64)

    # 运行训练和评估
    train_test_split(X, y, splits=10)

数据前5行:
   N_Days  Status  Age  Ascites  Hepatomegaly  Spiders  Edema  Bilirubin  \
0     400       2   59      2.0           2.0      2.0      2       14.5   
1    4500       0   56      0.0           2.0      2.0      0        1.1   
2    1012       2   70      0.0           0.0      0.0      1        1.4   
3    1925       2   55      0.0           2.0      2.0      1        1.8   
4    1504       1   38      0.0           2.0      2.0      0        3.4   

   Albumin  Copper    SGOT  Tryglicerides  Platelets  Prothrombin  Stage  
0     2.60   156.0  137.95          172.0      190.0         12.2    4.0  
1     4.14    54.0  113.52           88.0      221.0         10.6    3.0  
2     3.48   210.0   96.10           55.0      151.0         12.0    4.0  
3     2.54    64.0   60.63           92.0      183.0         10.3    4.0  
4     3.53   143.0  113.15           72.0      136.0         10.9    3.0  

类别分布:
Stage
3.0    161
4.0    144
2.0     92
1.0     21
Name: count, dtype: int64
数据形

In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, average_precision_score,
                             precision_recall_curve, auc, roc_auc_score)
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from scipy import interpolate
import joblib
import warnings

warnings.filterwarnings('ignore')

# 设置字体为黑体，确保中文可见
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False


def calculate_metrics(y_true, y_pred, y_scores):
    # 检查NaN值
    if np.isnan(y_scores).any():
        y_scores = np.nan_to_num(y_scores)

    # 初始化存储每个类别的指标
    class_metrics = []

    for class_idx in range(4):  # 4个类别
        # 确保y_true是numpy数组
        y_true_np = np.array(y_true)
        y_pred_np = np.array(y_pred)

        # 二分类指标计算
        y_true_class = (y_true_np == class_idx).astype(int)
        y_pred_class = (y_pred_np == class_idx).astype(int)
        y_scores_class = y_scores[:, class_idx]

        try:
            accuracy = accuracy_score(y_true_class, y_pred_class)
            f1 = f1_score(y_true_class, y_pred_class, zero_division=0)
            precision = precision_score(y_true_class, y_pred_class, zero_division=0)
            recall = recall_score(y_true_class, y_pred_class, zero_division=0)

            # 处理AUPRC计算
            if len(np.unique(y_true_class)) > 1:
                auprc = average_precision_score(y_true_class, y_scores_class)
            else:
                auprc = 0.0

            # 处理AUROC计算
            if len(np.unique(y_true_class)) > 1:
                auroc = roc_auc_score(y_true_class, y_scores_class)
            else:
                auroc = 0.0

            class_metrics.append({
                'accuracy': accuracy,
                'f1': f1,
                'precision': precision,
                'recall': recall,
                'auprc': auprc,
                'auroc': auroc
            })
        except Exception as e:
            print(f"计算类别{class_idx}指标时出错: {str(e)}")
            class_metrics.append({
                'accuracy': 0,
                'f1': 0,
                'precision': 0,
                'recall': 0,
                'auprc': 0,
                'auroc': 0
            })

    # 计算加权平均指标
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted', zero_division=0),
        'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_true, y_pred, average='weighted', zero_division=0),
        'auprc': average_precision_score(y_true, y_scores, average='weighted'),
        'auroc': roc_auc_score(y_true, y_scores, multi_class='ovr', average='weighted'),
        'class_metrics': class_metrics
    }

    return metrics


def interpolate_pr_curve(precision, recall):
    """插值PR曲线到固定长度的点"""
    f = interpolate.interp1d(recall, precision, bounds_error=False, fill_value=(1.0, 0.0))
    new_recall = np.linspace(0, 1, 100)
    new_precision = f(new_recall)
    return new_precision, new_recall


def plot_confusion_matrix(y_true, y_pred, fold, dpi=720):
    """绘制正方形混淆矩阵"""
    cm = confusion_matrix(y_true, y_pred)
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100  # 百分比表示

    plt.figure(figsize=(6, 6))
    ax = sns.heatmap(cm_percentage, annot=False, fmt='.2f', cmap='Blues', square=True, cbar=False,
                     linewidths=2, linecolor='black')

    # 在每个格子中显示个数和百分比
    for i in range(4):
        for j in range(4):
            # 判断字体颜色，深色背景用白色字体，浅色背景用黑色字体
            text_color = 'white' if cm_percentage[i, j] > 50 else 'black'

            # 将个数和百分比分行显示，个数在上，百分比在下
            ax.text(j + 0.5, i + 0.5, f'{cm[i, j]}\n({cm_percentage[i, j]:.2f}%)',
                    color=text_color, ha='center', va='center', fontsize=14, fontweight='bold')

    # 添加中文标签
    plt.xlabel('预测类别', fontsize=16, fontweight='bold')
    plt.ylabel('实际类别', fontsize=16, fontweight='bold')
    plt.xticks(ticks=np.arange(4) + 0.5, labels=np.arange(1, 5), fontsize=14, fontweight='bold')
    plt.yticks(ticks=np.arange(4) + 0.5, labels=np.arange(1, 5), fontsize=14, fontweight='bold')

    # 调整布局，减少空白边缘
    plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)

    # 保存混淆矩阵图
    plt.savefig(f'best_fold_confusion_matrix_fold{fold}.png', dpi=dpi)
    plt.close()


def plot_pr_curve(y_true, y_scores, fold):
    try:
        # 多分类PR曲线 - 使用每个类别的PR曲线
        precision = dict()
        recall = dict()
        auprc = dict()

        for i in range(4):  # 4个类别
            precision[i], recall[i], _ = precision_recall_curve(y_true == i, y_scores[:, i])
            auprc[i] = auc(recall[i], precision[i])

        plt.figure()
        for i in range(4):
            plt.plot(recall[i], precision[i], label=f'Class {i} (AUPRC = {auprc[i]:.2f})')

        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title(f'Precision-Recall Curve (Fold {fold})')
        plt.legend()
        plt.savefig(f'pr_curve_fold{fold}.png')
        plt.close()
        return precision, recall
    except Exception as e:
        print(f"无法绘制Fold {fold}的PR曲线: {str(e)}")
        return None, None


def train_test_split(X, y, splits=10, batch_size=32):
    # 检查数据
    print(f"数据形状: X={X.shape}, y={y.shape}")
    print(f"类别分布: {np.bincount(y)}")

    # 处理可能的NaN值
    X = np.nan_to_num(X)
    y = np.nan_to_num(y).astype(int)

    k_fold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=2025)
    results = []
    best_model_info = {'val_score': -float('inf'), 'model': None, 'fold': -1}

    for fold, (train_idx, test_idx) in enumerate(k_fold.split(X, y)):
        print(f'\n{"=" * 50}')
        print(f'Fold {fold + 1}/{splits}')
        print(f'{"=" * 50}')

        # 数据切分
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # 标准化
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # 创建SVM模型
        model = SVC(kernel='rbf', C=1, gamma='scale', probability=True, random_state=2025)

        # 训练模型
        model.fit(X_train, y_train)

        # 预测
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)

        # 计算指标
        metrics = calculate_metrics(y_test, y_pred, y_pred_proba)

        # 检查是否为最佳模型
        current_score = metrics['auroc']  # 使用AUROC作为选择最佳模型的标准
        if current_score > best_model_info['val_score']:
            best_model_info['val_score'] = current_score
            best_model_info['model'] = model
            best_model_info['fold'] = fold + 1
            best_model_info['scaler'] = scaler

        # 绘制并保存当前折的PR曲线
        plot_pr_curve(y_test, y_pred_proba, fold + 1)

        # 保存结果
        results.append(metrics)

        # 打印当前折的结果
        print(f'\nFold {fold + 1} Test Metrics:')
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"F1 Score: {metrics['f1']:.4f}")
        print(f"Precision: {metrics['precision']:.4f}")
        print(f"Recall: {metrics['recall']:.4f}")
        print(f"AUPRC: {metrics['auprc']:.4f}")
        print(f"AUROC: {metrics['auroc']:.4f}")

    # 保存最佳模型
    joblib.dump(best_model_info['model'], 'svm_model.pkl')
    joblib.dump(best_model_info['scaler'], 'scaler.pkl')

    # 打印最佳模型信息
    print(f"\nSaved best model from fold {best_model_info['fold']} with AUROC {best_model_info['val_score']:.4f} as svm_model.pkl")
    print("Saved corresponding scaler as scaler.pkl")

    # 绘制并保存最佳模型的混淆矩阵图
    plot_confusion_matrix(y_test, y_pred, best_model_info['fold'], dpi=720)


if __name__ == "__main__":
    # 加载数据
    data = pd.read_csv('preparations/cirrhosis_output.csv')  # 请替换为您的实际文件路径

    # 检查数据
    print("数据前5行:")
    print(data.head())
    print("\n类别分布:")
    print(data['Stage'].value_counts())

    # 分离特征和目标
    feature_cols = ['N_Days', 'Age', 'Bilirubin', 'Albumin', 'Copper', 'SGOT',
                    'Tryglicerides', 'Platelets', 'Prothrombin']
    X = data[feature_cols].values
    y = data['Stage'].values - 1  # 将类别转换为0-3

    # 转换为numpy数组
    X = X.astype(np.float32)
    y = y.astype(np.int64)

    # 运行训练和评估
    train_test_split(X, y, splits=10)


数据前5行:
   N_Days  Status  Age  Ascites  Hepatomegaly  Spiders  Edema  Bilirubin  \
0     400       2   59      2.0           2.0      2.0      2       14.5   
1    4500       0   56      0.0           2.0      2.0      0        1.1   
2    1012       2   70      0.0           0.0      0.0      1        1.4   
3    1925       2   55      0.0           2.0      2.0      1        1.8   
4    1504       1   38      0.0           2.0      2.0      0        3.4   

   Albumin  Copper    SGOT  Tryglicerides  Platelets  Prothrombin  Stage  
0     2.60   156.0  137.95          172.0      190.0         12.2    4.0  
1     4.14    54.0  113.52           88.0      221.0         10.6    3.0  
2     3.48   210.0   96.10           55.0      151.0         12.0    4.0  
3     2.54    64.0   60.63           92.0      183.0         10.3    4.0  
4     3.53   143.0  113.15           72.0      136.0         10.9    3.0  

类别分布:
Stage
3.0    161
4.0    144
2.0     92
1.0     21
Name: count, dtype: int64
数据形

In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, average_precision_score,
                             precision_recall_curve, auc, roc_auc_score, confusion_matrix)
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from scipy import interpolate
import joblib
import warnings

warnings.filterwarnings('ignore')


def calculate_metrics(y_true, y_pred, y_scores):
    # 检查NaN值
    if np.isnan(y_scores).any():
        y_scores = np.nan_to_num(y_scores)

    # 初始化存储每个类别的指标
    class_metrics = []

    for class_idx in range(4):  # 4个类别
        # 确保y_true是numpy数组
        y_true_np = np.array(y_true)
        y_pred_np = np.array(y_pred)

        # 二分类指标计算
        y_true_class = (y_true_np == class_idx).astype(int)
        y_pred_class = (y_pred_np == class_idx).astype(int)
        y_scores_class = y_scores[:, class_idx]

        try:
            accuracy = accuracy_score(y_true_class, y_pred_class)
            f1 = f1_score(y_true_class, y_pred_class, zero_division=0)
            precision = precision_score(y_true_class, y_pred_class, zero_division=0)
            recall = recall_score(y_true_class, y_pred_class, zero_division=0)

            # 处理AUPRC计算
            if len(np.unique(y_true_class)) > 1:
                auprc = average_precision_score(y_true_class, y_scores_class)
            else:
                auprc = 0.0

            # 处理AUROC计算
            if len(np.unique(y_true_class)) > 1:
                auroc = roc_auc_score(y_true_class, y_scores_class)
            else:
                auroc = 0.0

            class_metrics.append({
                'accuracy': accuracy,
                'f1': f1,
                'precision': precision,
                'recall': recall,
                'auprc': auprc,
                'auroc': auroc
            })
        except Exception as e:
            print(f"计算类别{class_idx}指标时出错: {str(e)}")
            class_metrics.append({
                'accuracy': 0,
                'f1': 0,
                'precision': 0,
                'recall': 0,
                'auprc': 0,
                'auroc': 0
            })

    # 计算加权平均指标
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted', zero_division=0),
        'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_true, y_pred, average='weighted', zero_division=0),
        'auprc': average_precision_score(y_true, y_scores, average='weighted'),
        'auroc': roc_auc_score(y_true, y_scores, multi_class='ovr', average='weighted'),
        'class_metrics': class_metrics
    }

    return metrics


def interpolate_pr_curve(precision, recall):
    """插值PR曲线到固定长度的点"""
    f = interpolate.interp1d(recall, precision, bounds_error=False, fill_value=(1.0, 0.0))
    new_recall = np.linspace(0, 1, 100)
    new_precision = f(new_recall)
    return new_precision, new_recall


def plot_confusion_matrix(y_true, y_pred, fold, dpi=720):
    """绘制正方形混淆矩阵"""
    cm = confusion_matrix(y_true, y_pred)
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100  # 百分比表示

    plt.figure(figsize=(6, 6))
    ax = sns.heatmap(cm_percentage, annot=False, fmt='.2f', cmap='Blues', square=True, cbar=False,
                     linewidths=2, linecolor='black')

    # 在每个格子中显示个数和百分比
    for i in range(4):
        for j in range(4):
            # 判断字体颜色，深色背景用白色字体，浅色背景用黑色字体
            text_color = 'white' if cm_percentage[i, j] > 50 else 'black'

            # 将个数和百分比分行显示，个数在上，百分比在下
            ax.text(j + 0.5, i + 0.5, f'{cm[i, j]}\n({cm_percentage[i, j]:.2f}%)',
                    color=text_color, ha='center', va='center', fontsize=14, fontweight='bold')

    # 添加中文标签
    plt.xlabel('预测类别', fontsize=16, fontweight='bold')
    plt.ylabel('实际类别', fontsize=16, fontweight='bold')
    plt.xticks(ticks=np.arange(4) + 0.5, labels=np.arange(1, 5), fontsize=14, fontweight='bold')
    plt.yticks(ticks=np.arange(4) + 0.5, labels=np.arange(1, 5), fontsize=14, fontweight='bold')

    # 调整布局，减少空白边缘
    plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)

    # 保存混淆矩阵图
    plt.savefig(f'CI_RF_best_fold_confusion_matrix_fold{fold}.png', dpi=dpi)
    plt.close()


def train_test_split(X, y, splits=10, batch_size=32):
    # 检查数据
    print(f"数据形状: X={X.shape}, y={y.shape}")
    print(f"类别分布: {np.bincount(y)}")

    # 处理可能的NaN值
    X = np.nan_to_num(X)
    y = np.nan_to_num(y).astype(int)

    k_fold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=2025)
    results = []
    best_model_info = {'val_score': -float('inf'), 'model': None, 'fold': -1}

    for fold, (train_idx, test_idx) in enumerate(k_fold.split(X, y)):
        print(f'\n{"=" * 50}')
        print(f'Fold {fold + 1}/{splits}')
        print(f'{"=" * 50}')

        # 分割数据
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # 标准化
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # 使用随机森林替代LightGBM
        rf_model = RandomForestClassifier(n_estimators=100, random_state=2025, n_jobs=-1)

        # 训练模型
        rf_model.fit(X_train, y_train)

        # 预测
        y_pred = rf_model.predict_proba(X_test)  # 预测类别概率
        y_pred_class = np.argmax(y_pred, axis=1)

        # 计算指标
        metrics = calculate_metrics(y_test, y_pred_class, y_pred)

        # 检查是否为最佳模型
        current_score = metrics['auroc']  # 使用AUROC作为选择最佳模型的标准
        if current_score > best_model_info['val_score']:
            best_model_info['val_score'] = current_score
            best_model_info['model'] = rf_model
            best_model_info['fold'] = fold + 1
            best_model_info['scaler'] = scaler

        # 绘制并保存最佳折的混淆矩阵
        plot_confusion_matrix(y_test, y_pred_class, best_model_info['fold'])

        results.append(metrics)

        # 打印当前折的结果
        print(f'\nFold {fold + 1} Test Metrics:')
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"F1 Score: {metrics['f1']:.4f}")
        print(f"Precision: {metrics['precision']:.4f}")
        print(f"Recall: {metrics['recall']:.4f}")
        print(f"AUPRC: {metrics['auprc']:.4f}")
        print(f"AUROC: {metrics['auroc']:.4f}")

    # 保存最佳模型
    joblib.dump(best_model_info['model'], 'cirrhosis_rf.pkl')
    joblib.dump(best_model_info['scaler'], 'scaler.pkl')

    print(
        f"\nSaved best model from fold {best_model_info['fold']} with AUROC {best_model_info['val_score']:.4f} as cirrhosis_rf.pkl")
    print("Saved corresponding scaler as scaler.pkl")

    # 计算并打印平均指标
    avg_metrics = {
        'accuracy': np.mean([r['accuracy'] for r in results]),
        'f1': np.mean([r['f1'] for r in results]),
        'precision': np.mean([r['precision'] for r in results]),
        'recall': np.mean([r['recall'] for r in results]),
        'auprc': np.mean([r['auprc'] for r in results]),
        'auroc': np.mean([r['auroc'] for r in results])
    }

    print('\n' + '=' * 50)
    print('Final Cross-Validation Results:')
    print('=' * 50)
    print(f"Average Accuracy: {avg_metrics['accuracy']:.4f}")
    print(f"Average F1 Score: {avg_metrics['f1']:.4f}")
    print(f"Average Precision: {avg_metrics['precision']:.4f}")
    print(f"Average Recall: {avg_metrics['recall']:.4f}")
    print(f"Average AUPRC: {avg_metrics['auprc']:.4f}")
    print(f"Average AUROC: {avg_metrics['auroc']:.4f}")


if __name__ == "__main__":
    # 加载数据
    data = pd.read_csv('preparations/cirrhosis_output.csv')  # 请替换为您的实际文件路径

    # 检查数据
    print("数据前5行:")
    print(data.head())
    print("\n类别分布:")
    print(data['Stage'].value_counts())

    # 分离特征和目标
    feature_cols = ['N_Days', 'Age', 'Bilirubin', 'Albumin', 'Copper', 'SGOT',
                    'Tryglicerides', 'Platelets', 'Prothrombin']
    X = data[feature_cols].values
    y = data['Stage'].values - 1  # 将类别转换为0-3

    # 转换为numpy数组
    X = X.astype(np.float32)
    y = y.astype(np.int64)

    # 运行训练和评估
    train_test_split(X, y, splits=10)


数据前5行:
   N_Days  Status  Age  Ascites  Hepatomegaly  Spiders  Edema  Bilirubin  \
0     400       2   59      2.0           2.0      2.0      2       14.5   
1    4500       0   56      0.0           2.0      2.0      0        1.1   
2    1012       2   70      0.0           0.0      0.0      1        1.4   
3    1925       2   55      0.0           2.0      2.0      1        1.8   
4    1504       1   38      0.0           2.0      2.0      0        3.4   

   Albumin  Copper    SGOT  Tryglicerides  Platelets  Prothrombin  Stage  
0     2.60   156.0  137.95          172.0      190.0         12.2    4.0  
1     4.14    54.0  113.52           88.0      221.0         10.6    3.0  
2     3.48   210.0   96.10           55.0      151.0         12.0    4.0  
3     2.54    64.0   60.63           92.0      183.0         10.3    4.0  
4     3.53   143.0  113.15           72.0      136.0         10.9    3.0  

类别分布:
Stage
3.0    161
4.0    144
2.0     92
1.0     21
Name: count, dtype: int64
数据形

In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, average_precision_score,
                             precision_recall_curve, auc, roc_auc_score, confusion_matrix)
import copy
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import interpolate
import joblib  # 用于保存scaler


# TeLU激活函数
class TeLU(nn.Module):
    def __init__(self, alpha=0.15):
        super(TeLU, self).__init__()
        self.alpha = alpha

    def forward(self, x):
        return torch.where(x >= 0, x, self.alpha * (torch.exp(x) - 1))


# 前馈神经网络
class FFNN(nn.Module):
    def __init__(self, input_size):
        super(FFNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 32)
        self.telu1 = TeLU(alpha=0.15)
        self.fc2 = nn.Linear(32, 64)
        self.telu2 = TeLU(alpha=0.1)
        self.fc3 = nn.Linear(64, 4)  # 4个输出类别

        # 初始化权重
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        x = self.fc1(x)
        x = self.telu1(x)
        x = self.fc2(x)
        x = self.telu2(x)
        x = self.fc3(x)
        return x


# 焦点损失函数
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        BCE_loss = nn.CrossEntropyLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1 - pt) ** self.gamma * BCE_loss
        return F_loss.mean()


def calculate_metrics(y_true, y_pred, y_scores):
    # 检查NaN值
    if np.isnan(y_scores).any():
        y_scores = np.nan_to_num(y_scores)

    # 初始化存储每个类别的指标
    class_metrics = []

    for class_idx in range(4):  # 4个类别
        # 确保y_true是numpy数组
        y_true_np = np.array(y_true)
        y_pred_np = np.array(y_pred)

        # 二分类指标计算
        y_true_class = (y_true_np == class_idx).astype(int)
        y_pred_class = (y_pred_np == class_idx).astype(int)
        y_scores_class = y_scores[:, class_idx]

        try:
            accuracy = accuracy_score(y_true_class, y_pred_class)
            f1 = f1_score(y_true_class, y_pred_class, zero_division=0)
            precision = precision_score(y_true_class, y_pred_class, zero_division=0)
            recall = recall_score(y_true_class, y_pred_class, zero_division=0)

            # 处理AUPRC计算
            if len(np.unique(y_true_class)) > 1:
                auprc = average_precision_score(y_true_class, y_scores_class)
            else:
                auprc = 0.0

            # 处理AUROC计算
            if len(np.unique(y_true_class)) > 1:
                auroc = roc_auc_score(y_true_class, y_scores_class)
            else:
                auroc = 0.0

            class_metrics.append({
                'accuracy': accuracy,
                'f1': f1,
                'precision': precision,
                'recall': recall,
                'auprc': auprc,
                'auroc': auroc
            })
        except Exception as e:
            print(f"计算类别{class_idx}指标时出错: {str(e)}")
            class_metrics.append({
                'accuracy': 0,
                'f1': 0,
                'precision': 0,
                'recall': 0,
                'auprc': 0,
                'auroc': 0
            })

    # 计算加权平均指标
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted', zero_division=0),
        'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_true, y_pred, average='weighted', zero_division=0),
        'auprc': average_precision_score(y_true, y_scores, average='weighted'),
        'auroc': roc_auc_score(y_true, y_scores, multi_class='ovr', average='weighted'),
        'class_metrics': class_metrics
    }

    return metrics


def plot_confusion_matrix(y_true, y_pred, dpi=720):
    """绘制正方形混淆矩阵"""
    cm = confusion_matrix(y_true, y_pred)
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100  # 百分比表示

    plt.figure(figsize=(6, 6))
    ax = sns.heatmap(cm_percentage, annot=False, fmt='.2f', cmap='Blues', square=True, cbar=False,
                     linewidths=2, linecolor='black')

    # 在每个格子中显示个数和百分比
    for i in range(4):
        for j in range(4):
            # 判断字体颜色，深色背景用白色字体，浅色背景用黑色字体
            text_color = 'white' if cm_percentage[i, j] > 50 else 'black'

            # 将个数和百分百分比显示
            ax.text(j + 0.5, i + 0.5, f'{cm[i, j]}\n({cm_percentage[i, j]:.2f}%)',
                    color=text_color, ha='center', va='center', fontsize=14, fontweight='bold')

    # 添加中文标签
    plt.xlabel('预测类别', fontsize=16, fontweight='bold')
    plt.ylabel('实际类别', fontsize=16, fontweight='bold')
    plt.xticks(ticks=np.arange(4) + 0.5, labels=np.arange(1, 5), fontsize=14, fontweight='bold')
    plt.yticks(ticks=np.arange(4) + 0.5, labels=np.arange(1, 5), fontsize=14, fontweight='bold')

    # 调整布局
    plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)

    # 保存最佳折叠混淆矩阵图
    plt.savefig(f'best_fold_confusion_matrix.png', dpi=dpi)
    plt.close()


def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=500, patience=20):
    best_val_loss = float('inf')
    best_model = None
    patience_counter = 0

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()

            # 梯度裁剪防止爆炸
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            train_loss += loss.item() * inputs.size(0)

        # 验证
        model.eval()
        val_loss = 0.0
        all_preds = []
        all_labels = []
        all_scores = []
        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * inputs.size(0)

                # 使用softmax获取概率
                scores = torch.softmax(outputs, dim=1)
                _, predicted = torch.max(scores.data, 1)

                all_preds.extend(predicted.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
                all_scores.extend(scores.cpu().numpy())

        # 检查NaN值
        if np.isnan(np.array(all_scores)).any():
            all_scores = np.nan_to_num(all_scores)

        # 计算指标
        train_loss = train_loss / len(train_loader.dataset)
        val_loss = val_loss / len(val_loader.dataset)
        metrics = calculate_metrics(all_labels, all_preds, np.array(all_scores))

        # 早停
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = copy.deepcopy(model.state_dict())
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f'Early stopping at epoch {epoch + 1}')
                break

        # 打印进度
        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch + 1}/{epochs} - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
            print(f"Val Metrics - Acc: {metrics['accuracy']:.4f}, F1: {metrics['f1']:.4f}, "
                  f"Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, "
                  f"AUPRC: {metrics['auprc']:.4f}, AUROC: {metrics['auroc']:.4f}")

    # 加载最佳模型
    model.load_state_dict(best_model)
    return model, best_val_loss  # 返回最佳模型和对应的验证损失


def train_test_split(X, y, splits=10, epochs=500, batch_size=32, lr=0.001):
    # 检查数据
    print(f"数据形状: X={X.shape}, y={y.shape}")
    print(f"类别分布: {np.bincount(y)}")

    # 处理可能的NaN值
    X = np.nan_to_num(X)
    y = np.nan_to_num(y).astype(int)

    k_fold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=2025)
    results = []
    best_model_info = {'val_loss': float('inf'), 'model_state': None}

    for fold, (train_idx, test_idx) in enumerate(k_fold.split(X, y)):
        print(f'\n{"=" * 50}')
        print(f'Fold {fold + 1}/{splits}')
        print(f'{"=" * 50}')

        # 分割数据
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # 标准化
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # 转换为张量
        X_train_tensor = torch.FloatTensor(X_train)
        y_train_tensor = torch.LongTensor(y_train)
        X_test_tensor = torch.FloatTensor(X_test)
        y_test_tensor = torch.LongTensor(y_test)

        # 创建数据集和数据加载器
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size)

        # 初始化模型、损失函数和优化器
        model = FFNN(input_size=X.shape[1])
        criterion = FocalLoss(alpha=0.25, gamma=2)
        optimizer = optim.Adam(model.parameters(), lr=lr)

        # 训练模型
        model, val_loss = train_model(model, train_loader, test_loader, criterion, optimizer,
                                      epochs=epochs, patience=20)

        # 检查是否为最佳模型
        if val_loss < best_model_info['val_loss']:
            best_model_info['val_loss'] = val_loss
            best_model_info['model_state'] = copy.deepcopy(model.state_dict())

        # 评估测试集
        y_true, y_pred, y_scores = evaluate_model(model, test_loader)
        metrics = calculate_metrics(y_true, y_pred, y_scores)

        # 保存结果
        results.append(metrics)

        # 打印当前折的结果
        print(f'\nFold {fold + 1} Test Metrics:')
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"F1 Score: {metrics['f1']:.4f}")
        print(f"Precision: {metrics['precision']:.4f}")
        print(f"Recall: {metrics['recall']:.4f}")
        print(f"AUPRC: {metrics['auprc']:.4f}")
        print(f"AUROC: {metrics['auroc']:.4f}")

    # 保存最佳模型
    best_model = FFNN(input_size=X.shape[1])
    best_model.load_state_dict(best_model_info['model_state'])
    torch.save(best_model.state_dict(), 'cirrhosis_mlp.pth')

    # 保存最佳模型的混淆矩阵
    y_true, y_pred, y_scores = evaluate_model(best_model, test_loader)
    plot_confusion_matrix(y_true, y_pred)

    print(f"\nSaved best model with val loss {best_model_info['val_loss']:.4f} as cirrhosis_mlp.pth")
    print("Saved corresponding confusion matrix as best_fold_confusion_matrix.png")


if __name__ == "__main__":
    # 加载数据
    data = pd.read_csv('preparations/cirrhosis_output.csv')  # 请替换为您的实际文件路径

    # 检查数据
    print("数据前5行:")
    print(data.head())
    print("\n类别分布:")
    print(data['Stage'].value_counts())

    # 分离特征和目标
    feature_cols = ['N_Days', 'Age', 'Bilirubin', 'Albumin', 'Copper', 'SGOT',
                    'Tryglicerides', 'Platelets', 'Prothrombin']
    X = data[feature_cols].values
    y = data['Stage'].values - 1  # 将类别转换为0-3

    # 转换为numpy数组
    X = X.astype(np.float32)
    y = y.astype(np.int64)

    # 运行训练和评估
    train_test_split(X, y, splits=10, epochs=500, batch_size=32, lr=0.001)

数据前5行:
   N_Days  Status  Age  Ascites  Hepatomegaly  Spiders  Edema  Bilirubin  \
0     400       2   59      2.0           2.0      2.0      2       14.5   
1    4500       0   56      0.0           2.0      2.0      0        1.1   
2    1012       2   70      0.0           0.0      0.0      1        1.4   
3    1925       2   55      0.0           2.0      2.0      1        1.8   
4    1504       1   38      0.0           2.0      2.0      0        3.4   

   Albumin  Copper    SGOT  Tryglicerides  Platelets  Prothrombin  Stage  
0     2.60   156.0  137.95          172.0      190.0         12.2    4.0  
1     4.14    54.0  113.52           88.0      221.0         10.6    3.0  
2     3.48   210.0   96.10           55.0      151.0         12.0    4.0  
3     2.54    64.0   60.63           92.0      183.0         10.3    4.0  
4     3.53   143.0  113.15           72.0      136.0         10.9    3.0  

类别分布:
Stage
3.0    161
4.0    144
2.0     92
1.0     21
Name: count, dtype: int64
数据形

In [47]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, average_precision_score,
                             precision_recall_curve, auc, roc_auc_score, confusion_matrix)
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings

warnings.filterwarnings('ignore')


def calculate_metrics(y_true, y_pred, y_scores):
    # 检查NaN值
    if np.isnan(y_scores).any():
        y_scores = np.nan_to_num(y_scores)

    # 计算加权平均指标
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted', zero_division=0),
        'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_true, y_pred, average='weighted', zero_division=0),
        'auprc': average_precision_score(y_true, y_scores, average='weighted'),
        'auroc': roc_auc_score(y_true, y_scores, multi_class='ovr', average='weighted'),
    }

    return metrics


def plot_confusion_matrix(y_true, y_pred, fold, dpi=720):
    """绘制正方形混淆矩阵"""
    cm = confusion_matrix(y_true, y_pred)
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100  # 百分比表示

    plt.figure(figsize=(6, 6))
    ax = sns.heatmap(cm_percentage, annot=False, fmt='.2f', cmap='Blues', square=True, cbar=False,
                     linewidths=2, linecolor='black')

    # 在每个格子中显示个数和百分比
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            # 计算背景颜色的亮度（灰度）
            background_brightness = cm_percentage[i, j] / 100

            # 根据亮度选择字体颜色：浅色背景使用黑色字体，深色背景使用白色字体
            text_color = 'black' if background_brightness < 0.5 else 'white'

            # 将个数和百分百分比分行显示，个数在上，百分比在下
            ax.text(j + 0.5, i + 0.5, f'{cm[i, j]}\n({cm_percentage[i, j]:.2f}%)',
                    color=text_color, ha='center', va='center', fontsize=14, fontweight='bold')

    # 添加中文标签
    plt.xlabel('预测类别', fontsize=16, fontweight='bold')
    plt.ylabel('实际类别', fontsize=16, fontweight='bold')
    plt.xticks(ticks=np.arange(cm.shape[1]) + 0.5, labels=np.arange(1, cm.shape[1] + 1), fontsize=14, fontweight='bold')
    plt.yticks(ticks=np.arange(cm.shape[0]) + 0.5, labels=np.arange(1, cm.shape[0] + 1), fontsize=14, fontweight='bold')

    # 调整布局，减少空白边缘
    plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)

    # 保存混淆矩阵图
    plt.savefig(f'CI_LR_best_fold_confusion_matrix_fold{fold}.png', dpi=dpi)
    plt.close()


def train_test_split(X, y, splits=10):
    # 检查数据
    print(f"数据形状: X={X.shape}, y={y.shape}")
    print(f"类别分布: {np.bincount(y)}")

    # 处理可能的NaN值
    X = np.nan_to_num(X)
    y = np.nan_to_num(y).astype(int)

    best_model_info = {'val_score': -float('inf'), 'model': None, 'fold': -1}

    for fold, (train_idx, test_idx) in enumerate(StratifiedKFold(n_splits=splits, shuffle=True, random_state=2025).split(X, y)):
        print(f'\n{"=" * 50}')
        print(f'Fold {fold + 1}/{splits}')
        print(f'{"=" * 50}')

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # 创建并训练逻辑回归模型
        model = LogisticRegression(
            multi_class='multinomial',
            solver='lbfgs',
            max_iter=1000,
            class_weight='balanced',  # 处理类别不平衡
            random_state=2025
        )
        model.fit(X_train, y_train)

        # 预测
        y_pred = model.predict(X_test)
        y_scores = model.predict_proba(X_test)

        # 计算指标
        metrics = calculate_metrics(y_test, y_pred, y_scores)

        current_score = metrics['auroc']
        if current_score > best_model_info['val_score']:
            best_model_info['val_score'] = current_score
            best_model_info['model'] = model
            best_model_info['fold'] = fold + 1
            best_model_info['scaler'] = scaler

    # 保存最佳模型
    joblib.dump({
        'model': best_model_info['model'],
        'scaler': best_model_info['scaler']
    }, 'best_cirrhosis_logreg.pkl')

    print(f"\nSaved best model from fold {best_model_info['fold']} with AUROC {best_model_info['val_score']:.4f} as best_cirrhosis_logreg.pkl")

    # 绘制并保存最佳模型的混淆矩阵
    plot_confusion_matrix(y_test, best_model_info['model'].predict(X_test), best_model_info['fold'])


if __name__ == "__main__":
    data = pd.read_csv('preparations/cirrhosis_output.csv')
    print("数据前5行:")
    print(data.head())
    print("\n类别分布:")
    print(data['Stage'].value_counts())

    feature_cols = ['N_Days', 'Age', 'Bilirubin', 'Albumin', 'Copper', 'SGOT',
                    'Tryglicerides', 'Platelets', 'Prothrombin']
    X = data[feature_cols].values
    y = data['Stage'].values - 1

    X = X.astype(np.float32)
    y = y.astype(np.int64)

    train_test_split(X, y, splits=10)

数据前5行:
   N_Days  Status  Age  Ascites  Hepatomegaly  Spiders  Edema  Bilirubin  \
0     400       2   59      2.0           2.0      2.0      2       14.5   
1    4500       0   56      0.0           2.0      2.0      0        1.1   
2    1012       2   70      0.0           0.0      0.0      1        1.4   
3    1925       2   55      0.0           2.0      2.0      1        1.8   
4    1504       1   38      0.0           2.0      2.0      0        3.4   

   Albumin  Copper    SGOT  Tryglicerides  Platelets  Prothrombin  Stage  
0     2.60   156.0  137.95          172.0      190.0         12.2    4.0  
1     4.14    54.0  113.52           88.0      221.0         10.6    3.0  
2     3.48   210.0   96.10           55.0      151.0         12.0    4.0  
3     2.54    64.0   60.63           92.0      183.0         10.3    4.0  
4     3.53   143.0  113.15           72.0      136.0         10.9    3.0  

类别分布:
Stage
3.0    161
4.0    144
2.0     92
1.0     21
Name: count, dtype: int64
数据形

In [20]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, average_precision_score,
                             precision_recall_curve, auc, confusion_matrix)
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings

warnings.filterwarnings('ignore')

def calculate_metrics(y_true, y_pred, y_scores):
    # Check for NaN values
    if np.isnan(y_scores).any():
        y_scores = np.nan_to_num(y_scores)

    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, zero_division=0),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
    }

    # Calculate AUPRC
    try:
        metrics['auprc'] = average_precision_score(y_true, y_scores)
    except:
        print("无法计算AUPRC，使用默认值0")
        metrics['auprc'] = 0

    return metrics

def plot_confusion_matrix(y_true, y_pred):
    """绘制并保存混淆矩阵"""
    cm = confusion_matrix(y_true, y_pred)
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100  # 百分比表示

    plt.figure(figsize=(6, 6))
    ax = sns.heatmap(cm_percentage, annot=False, fmt='.2f', cmap='Blues', square=True, cbar=False,
                     linewidths=2, linecolor='black')

    # 在每个格子中显示个数和百分比
    for i in range(2):  # Assuming binary classification
        for j in range(2):
            text_color = 'white' if cm_percentage[i, j] > 50 else 'black'
            ax.text(j + 0.5, i + 0.5, f'{cm[i, j]}\n({cm_percentage[i, j]:.2f}%)',
                    color=text_color, ha='center', va='center', fontsize=14, fontweight='bold')

    # 添加中文标签
    plt.xlabel('预测类别', fontsize=16, fontweight='bold')
    plt.ylabel('实际类别', fontsize=16, fontweight='bold')
    plt.xticks(ticks=np.arange(2) + 0.5, labels=np.arange(1, 3), fontsize=14, fontweight='bold')
    plt.yticks(ticks=np.arange(2) + 0.5, labels=np.arange(1, 3), fontsize=14, fontweight='bold')

    # 调整布局
    plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)

    # 保存混淆矩阵
    plt.savefig(f'best_confusion_matrix.png', dpi=720)
    plt.close()

def train_test_split(X, y, splits=10, batch_size=32):
    # Check data
    print(f"数据形状: X={X.shape}, y={y.shape}")
    print(f"类别分布: {np.bincount(y)}")

    # Handle possible NaN values
    X = np.nan_to_num(X)
    y = np.nan_to_num(y).astype(int)

    k_fold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=2025)
    results = []
    
    best_model_info = {'val_score': -float('inf'), 'model': None, 'fold': -1}

    for fold, (train_idx, test_idx) in enumerate(k_fold.split(X, y)):
        print(f'\n{"=" * 50}')
        print(f'Fold {fold + 1}/{splits}')
        print(f'{"=" * 50}')

        # Split data
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Standardization
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Create LightGBM datasets
        train_data = lgb.Dataset(X_train, label=y_train)
        test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

        # LightGBM parameters
        params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'boosting_type': 'gbdt',
            'learning_rate': 0.05,
            'num_leaves': 31,
            'max_depth': -1,
            'min_child_samples': 20,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'lambda_l1': 0.1,
            'lambda_l2': 0.1,
            'verbose': -1,
            'random_state': 42
        }

        # Train model
        model = lgb.train(
            params,
            train_data,
            num_boost_round=1000,
            valid_sets=[test_data],
            callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False),
                       lgb.log_evaluation(period=50)]
        )

        # Evaluate on test set
        y_scores = model.predict(X_test)
        y_pred = (y_scores > 0.5).astype(int)
        metrics = calculate_metrics(y_test, y_pred, y_scores)

        current_score = metrics['auprc']
        if current_score > best_model_info['val_score']:
            best_model_info['val_score'] = current_score
            best_model_info['model'] = model
            best_model_info['fold'] = fold + 1
            best_model_info['scaler'] = scaler

    # Save best model
    joblib.dump({
        'model': best_model_info['model'],
        'scaler': best_model_info['scaler']
    }, 'best_heart_disease_model.pkl')

    print(f"\nSaved best model from fold {best_model_info['fold']} with AUPRC {best_model_info['val_score']:.4f} as best_heart_disease_model.pkl")

    # Plot and save the confusion matrix for the best model
    best_y_pred = (best_model_info['model'].predict(X_test) > 0.5).astype(int)
    plot_confusion_matrix(y_test, best_y_pred)


if __name__ == "__main__":
    data = pd.read_csv('preparations/heart_output.csv')  # Replace with your actual file path

    # Check data
    print("数据前5行:")
    print(data.head())
    print("\n类别分布:")
    print(data['HeartDisease'].value_counts())

    # Separate features and target
    X = data.drop('HeartDisease', axis=1).values
    y = data['HeartDisease'].values

    # Convert to numpy arrays
    X = X.astype(np.float32)
    y = y.astype(np.int64)

    # Run training and evaluation
    train_test_split(X, y, splits=10, batch_size=32)


数据前5行:
   Age  Sex  ChestPainType  RestingBP  Cholesterol  FastingBS  RestingECG  \
0   40    0              2        140          289          0           0   
1   49    1              1        160          180          0           0   
2   37    0              2        130          283          0           1   
3   48    1              0        138          214          0           0   
4   54    0              1        150          195          0           0   

   MaxHR  ExerciseAngina  Oldpeak  ST_Slope  HeartDisease  
0    172               0      0.0         0             0  
1    156               0      1.0         1             1  
2     98               0      0.0         0             0  
3    108               1      1.5         1             1  
4    122               0      0.0         0             0  

类别分布:
HeartDisease
1    508
0    410
Name: count, dtype: int64
数据形状: X=(918, 11), y=(918,)
类别分布: [410 508]

Fold 1/10
[50]	valid_0's binary_logloss: 0.343711
[100]	valid

In [26]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, average_precision_score,
                             precision_recall_curve, auc, confusion_matrix)
import copy
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import interpolate
import warnings

# 忽略警告
warnings.filterwarnings('ignore', category=UserWarning)

# 设置字体为黑体，确保中文可见
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# TeLU激活函数
class TeLU(nn.Module):
    def __init__(self, alpha=0.15):
        super(TeLU, self).__init__()
        self.alpha = alpha

    def forward(self, x):
        return torch.where(x >= 0, x, self.alpha * (torch.exp(x) - 1))

# 前馈神经网络
class FFNN(nn.Module):
    def __init__(self, input_size):
        super(FFNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 32)
        self.telu1 = TeLU(alpha=0.15)
        self.fc2 = nn.Linear(32, 64)
        self.telu2 = TeLU(alpha=0.1)
        self.fc3 = nn.Linear(64, 2)

        # 初始化权重
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        x = self.fc1(x)
        x = self.telu1(x)
        x = self.fc2(x)
        x = self.telu2(x)
        x = self.fc3(x)
        return x

# 焦点损失函数
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        BCE_loss = nn.CrossEntropyLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1 - pt) ** self.gamma * BCE_loss
        return F_loss.mean()

def calculate_metrics(y_true, y_pred, y_scores):
    # 检查NaN值
    if np.isnan(y_scores).any():
        y_scores = np.nan_to_num(y_scores)

    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, zero_division=0),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
    }

    # 只有在y_scores有效时才计算AUPRC
    try:
        metrics['auprc'] = average_precision_score(y_true, y_scores[:, 1])
    except:
        print("无法计算AUPRC，使用默认值0")
        metrics['auprc'] = 0

    return metrics

def interpolate_pr_curve(precision, recall):
    """插值PR曲线到固定长度的点"""
    f = interpolate.interp1d(recall, precision, bounds_error=False, fill_value=(1.0, 0.0))
    new_recall = np.linspace(0, 1, 100)
    new_precision = f(new_recall)
    return new_precision, new_recall

def plot_confusion_matrix(y_true, y_pred, dpi=720):
    """绘制最佳模型的混淆矩阵"""
    cm = confusion_matrix(y_true, y_pred)
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100  # 百分比表示

    plt.figure(figsize=(6, 6))
    ax = sns.heatmap(cm_percentage, annot=False, fmt='.2f', cmap='Blues', square=True, cbar=False,
                     linewidths=2, linecolor='black')

    # 在每个格子中显示个数和百分比
    for i in range(2):
        for j in range(2):
            # 判断字体颜色，深色背景用白色字体，浅色背景用黑色字体
            text_color = 'white' if cm_percentage[i, j] > 50 else 'black'

            # 将个数和百分比分行显示，个数在上，百分比在下
            ax.text(j + 0.5, i + 0.5, f'{cm[i, j]}\n({cm_percentage[i, j]:.2f}%)',
                    color=text_color, ha='center', va='center', fontsize=14, fontweight='bold')

    # 添加中文标签
    plt.xlabel('预测类别', fontsize=16, fontweight='bold')
    plt.ylabel('实际类别', fontsize=16, fontweight='bold')
    plt.xticks(ticks=np.arange(2) + 0.5, labels=np.arange(1, 3), fontsize=14, fontweight='bold')
    plt.yticks(ticks=np.arange(2) + 0.5, labels=np.arange(1, 3), fontsize=14, fontweight='bold')

    # 调整布局，减少空白边缘
    plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)

    # 保存最佳模型的混淆矩阵图
    plt.savefig('best_model_confusion_matrix.png', dpi=dpi)
    plt.close()

def plot_pr_curve(y_true, y_scores, fold):
    try:
        precision, recall, _ = precision_recall_curve(y_true, y_scores[:, 1])
        auprc = auc(recall, precision)

        plt.figure()
        plt.plot(recall, precision, label=f'Fold {fold} (AUPRC = {auprc:.2f})')
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title('Precision-Recall Curve')
        plt.legend()
        plt.savefig(f'pr_curve_fold{fold}.png')
        plt.close()
        return precision, recall
    except Exception as e:
        print(f"无法绘制Fold {fold}的PR曲线: {str(e)}")
        return None, None

def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=500, patience=20):
    best_val_loss = float('inf')
    best_model = None
    patience_counter = 0

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()

            # 梯度裁剪防止爆炸
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            train_loss += loss.item() * inputs.size(0)

        # 验证
        model.eval()
        val_loss = 0.0
        all_preds = []
        all_labels = []
        all_scores = []
        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * inputs.size(0)

                # 使用softmax获取概率
                scores = torch.softmax(outputs, dim=1)
                _, predicted = torch.max(scores.data, 1)

                all_preds.extend(predicted.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
                all_scores.extend(scores.cpu().numpy())

        # 检查NaN值
        if np.isnan(np.array(all_scores)).any():
            all_scores = np.nan_to_num(all_scores)

        # 计算指标
        train_loss = train_loss / len(train_loader.dataset)
        val_loss = val_loss / len(val_loader.dataset)
        metrics = calculate_metrics(all_labels, all_preds, np.array(all_scores))

        # 早停
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = copy.deepcopy(model.state_dict())
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f'Early stopping at epoch {epoch + 1}')
                break

        # 打印进度
        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch + 1}/{epochs} - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
            print(f"Val Metrics - Acc: {metrics['accuracy']:.4f}, F1: {metrics['f1']:.4f}, "
                  f"Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, "
                  f"AUPRC: {metrics['auprc']:.4f}")

    # 加载最佳模型
    model.load_state_dict(best_model)
    return model

def train_test_split(X, y, splits=10, epochs=500, batch_size=32, lr=0.001):
    # 检查数据
    print(f"数据形状: X={X.shape}, y={y.shape}")
    print(f"类别分布: {np.bincount(y)}")

    # 处理可能的NaN值
    X = np.nan_to_num(X)
    y = np.nan_to_num(y).astype(int)

    k_fold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=2025)
    results = []

    # 存储所有折的PR曲线数据（插值后的）
    interp_precisions = []
    interp_recalls = np.linspace(0, 1, 100)  # 固定100个recall点

    for fold, (train_idx, test_idx) in enumerate(k_fold.split(X, y)):
        print(f'\n{"=" * 50}')
        print(f'Fold {fold + 1}/{splits}')
        print(f'{"=" * 50}')

        # 分割数据
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # 标准化
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # 转换为张量
        X_train_tensor = torch.FloatTensor(X_train)
        y_train_tensor = torch.LongTensor(y_train)
        X_test_tensor = torch.FloatTensor(X_test)
        y_test_tensor = torch.LongTensor(y_test)

        # 创建数据集和数据加载器
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size)

        # 初始化模型、损失函数和优化器
        model = FFNN(input_size=X.shape[1])
        criterion = FocalLoss(alpha=0.25, gamma=2)
        optimizer = optim.Adam(model.parameters(), lr=lr)

        # 训练模型
        model = train_model(model, train_loader, test_loader, criterion, optimizer,
                            epochs=epochs, patience=20)

        # 评估测试集
        y_true, y_pred, y_scores = evaluate_model(model, test_loader)
        metrics = calculate_metrics(y_true, y_pred, y_scores)

        # 只保存最佳模型的混淆矩阵
        if fold == splits - 1:  # 最后一折的最佳模型
            plot_confusion_matrix(y_true, y_pred)

        # 绘制并保存当前折的PR曲线
        precision, recall = plot_pr_curve(y_true, y_scores, fold + 1)

        # 插值PR曲线到固定长度
        if precision is not None and recall is not None:
            interp_precision, _ = interpolate_pr_curve(precision, recall)
            interp_precisions.append(interp_precision)

        # 保存结果
        results.append(metrics)

    # 计算并打印平均指标
    avg_metrics = {
        'accuracy': np.mean([r['accuracy'] for r in results]),
        'f1': np.mean([r['f1'] for r in results]),
        'precision': np.mean([r['precision'] for r in results]),
        'recall': np.mean([r['recall'] for r in results]),
        'auprc': np.mean([r['auprc'] for r in results])
    }

    std_metrics = {
        'accuracy': np.std([r['accuracy'] for r in results]),
        'f1': np.std([r['f1'] for r in results]),
        'precision': np.std([r['precision'] for r in results]),
        'recall': np.std([r['recall'] for r in results]),
        'auprc': np.std([r['auprc'] for r in results])
    }

    print('\n' + '=' * 50)
    print('Final Cross-Validation Results:')
    print('=' * 50)
    print(f"Average Accuracy: {avg_metrics['accuracy']:.4f} ± {std_metrics['accuracy']:.4f}")
    print(f"Average F1 Score: {avg_metrics['f1']:.4f} ± {std_metrics['f1']:.4f}")
    print(f"Average Precision: {avg_metrics['precision']:.4f} ± {std_metrics['precision']:.4f}")
    print(f"Average Recall: {avg_metrics['recall']:.4f} ± {std_metrics['recall']:.4f}")
    print(f"Average AUPRC: {avg_metrics['auprc']:.4f} ± {std_metrics['auprc']:.4f}")


# 加载数据
data = pd.read_csv('preparations/heart_output.csv')  # 请替换为您的实际文件路径

# 检查数据
print("数据前5行:")
print(data.head())
print("\n类别分布:")
print(data['HeartDisease'].value_counts())

# 分离特征和目标
X = data.drop('HeartDisease', axis=1).values
y = data['HeartDisease'].values

# 转换为numpy数组
X = X.astype(np.float32)
y = y.astype(np.int64)

# 运行训练和评估
train_test_split(X, y, splits=10, epochs=500, batch_size=32, lr=0.001)



数据前5行:
   Age  Sex  ChestPainType  RestingBP  Cholesterol  FastingBS  RestingECG  \
0   40    0              2        140          289          0           0   
1   49    1              1        160          180          0           0   
2   37    0              2        130          283          0           1   
3   48    1              0        138          214          0           0   
4   54    0              1        150          195          0           0   

   MaxHR  ExerciseAngina  Oldpeak  ST_Slope  HeartDisease  
0    172               0      0.0         0             0  
1    156               0      1.0         1             1  
2     98               0      0.0         0             0  
3    108               1      1.5         1             1  
4    122               0      0.0         0             0  

类别分布:
HeartDisease
1    508
0    410
Name: count, dtype: int64
数据形状: X=(918, 11), y=(918,)
类别分布: [410 508]

Fold 1/10
Epoch 10/500 - Train Loss: 0.0193, Val Loss: 0.0258

In [28]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, average_precision_score,
                             precision_recall_curve, auc)
from imblearn.over_sampling import SMOTE
import copy
import matplotlib.pyplot as plt
import seaborn as sns


# TeLU激活函数
class TeLU(nn.Module):
    def __init__(self, alpha=0.15):
        super(TeLU, self).__init__()
        self.alpha = alpha

    def forward(self, x):
        return torch.where(x >= 0, x, self.alpha * (torch.exp(x) - 1))


# 前馈神经网络
class FFNN(nn.Module):
    def __init__(self, input_size):
        super(FFNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 32)
        self.telu1 = TeLU(alpha=0.15)
        self.fc2 = nn.Linear(32, 64)
        self.telu2 = TeLU(alpha=0.1)
        self.fc3 = nn.Linear(64, 2)

        # 初始化权重
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        x = self.fc1(x)
        x = self.telu1(x)
        x = self.fc2(x)
        x = self.telu2(x)
        x = self.fc3(x)
        return x


# 焦点损失函数
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        BCE_loss = nn.CrossEntropyLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1 - pt) ** self.gamma * BCE_loss
        return F_loss.mean()


def calculate_metrics(y_true, y_pred, y_scores):
    # 检查NaN值
    if np.isnan(y_scores).any():
        y_scores = np.nan_to_num(y_scores)

    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, zero_division=0),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
    }

    # 只有在y_scores有效时才计算AUPRC
    try:
        metrics['auprc'] = average_precision_score(y_true, y_scores[:, 1])
    except:
        print("无法计算AUPRC，使用默认值0")
        metrics['auprc'] = 0

    return metrics


def plot_confusion_matrix(y_true, y_pred, dpi=720):
    """绘制正方形混淆矩阵"""
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_true, y_pred)
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100  # 百分比表示

    plt.figure(figsize=(6, 6))
    ax = sns.heatmap(cm_percentage, annot=False, fmt='.2f', cmap='Blues', square=True, cbar=False,
                     linewidths=2, linecolor='black')

    # 在每个格子中显示个数和百分比
    for i in range(2):  # 因为是二分类
        for j in range(2):
            # 判断字体颜色，深色背景用白色字体，浅色背景用黑色字体
            text_color = 'white' if cm_percentage[i, j] > 50 else 'black'

            # 将个数和百分百分比行显示，个数在上，百分比在下
            ax.text(j + 0.5, i + 0.5, f'{cm[i, j]}\n({cm_percentage[i, j]:.2f}%)',
                    color=text_color, ha='center', va='center', fontsize=14, fontweight='bold')

    # 添加中文标签
    plt.xlabel('预测类别', fontsize=16, fontweight='bold')
    plt.ylabel('实际类别', fontsize=16, fontweight='bold')
    plt.xticks(ticks=np.arange(2) + 0.5, labels=np.arange(1, 3), fontsize=14, fontweight='bold')
    plt.yticks(ticks=np.arange(2) + 0.5, labels=np.arange(1, 3), fontsize=14, fontweight='bold')

    # 调整布局，减少空白边缘
    plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)

    # 保存混淆矩阵图
    plt.savefig(f'best_fold_confusion_matrix.png', dpi=dpi)
    plt.close()


def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=500, patience=20):
    best_val_loss = float('inf')
    best_model = None
    patience_counter = 0

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()

            # 梯度裁剪防止爆炸
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            train_loss += loss.item() * inputs.size(0)

        # 验证
        model.eval()
        val_loss = 0.0
        all_preds = []
        all_labels = []
        all_scores = []
        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * inputs.size(0)

                # 使用softmax获取概率
                scores = torch.softmax(outputs, dim=1)
                _, predicted = torch.max(scores.data, 1)

                all_preds.extend(predicted.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
                all_scores.extend(scores.cpu().numpy())

        # 检查NaN值
        if np.isnan(np.array(all_scores)).any():
            all_scores = np.nan_to_num(all_scores)

        # 计算指标
        train_loss = train_loss / len(train_loader.dataset)
        val_loss = val_loss / len(val_loader.dataset)
        metrics = calculate_metrics(all_labels, all_preds, np.array(all_scores))

        # 早停
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = copy.deepcopy(model.state_dict())
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f'Early stopping at epoch {epoch + 1}')
                break

        # 打印进度
        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch + 1}/{epochs} - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
            print(f"Val Metrics - Acc: {metrics['accuracy']:.4f}, F1: {metrics['f1']:.4f}, "
                  f"Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, "
                  f"AUPRC: {metrics['auprc']:.4f}")

    # 加载最佳模型
    model.load_state_dict(best_model)
    return model


def evaluate_model(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []
    all_scores = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            scores = torch.softmax(outputs, dim=1)
            _, predicted = torch.max(scores.data, 1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            all_scores.extend(scores.cpu().numpy())

    # 检查NaN值
    if np.isnan(np.array(all_scores)).any():
        all_scores = np.nan_to_num(all_scores)

    return all_labels, all_preds, np.array(all_scores)


def train_test_split(X, y, splits=10, epochs=500, batch_size=32, lr=0.001, use_smote=False):
    # 检查数据
    print(f"数据形状: X={X.shape}, y={y.shape}")
    print(f"类别分布: {np.bincount(y)}")
    print(f"使用SMOTE: {'是' if use_smote else '否'}")

    # 处理可能的NaN值
    X = np.nan_to_num(X)
    y = np.nan_to_num(y).astype(int)

    k_fold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=2025)
    results = []

    for fold, (train_idx, test_idx) in enumerate(k_fold.split(X, y)):
        print(f'\n{"=" * 50}')
        print(f'Fold {fold + 1}/{splits}')
        print(f'{"=" * 50}')

        # 分割数据
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # 标准化
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # 应用SMOTE过采样（仅对训练数据）
        if use_smote:
            smote = SMOTE(random_state=42)
            X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)
            print(f"SMOTE应用后训练数据形状: X={X_train_res.shape}, y={y_train_res.shape}")

            # 使用过采样后的数据
            X_train_final = X_train_res
            y_train_final = y_train_res
        else:
            X_train_final = X_train_scaled
            y_train_final = y_train

        # 转换为张量
        X_train_tensor = torch.FloatTensor(X_train_final)
        y_train_tensor = torch.LongTensor(y_train_final)
        X_test_tensor = torch.FloatTensor(X_test_scaled)
        y_test_tensor = torch.LongTensor(y_test)

        # 创建数据集和数据加载器
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size)

        # 初始化模型、损失函数和优化器
        model = FFNN(input_size=X.shape[1])
        criterion = FocalLoss(alpha=0.25, gamma=2)
        optimizer = optim.Adam(model.parameters(), lr=lr)

        # 训练模型
        model = train_model(model, train_loader, test_loader, criterion, optimizer,
                            epochs=epochs, patience=20)

        # 评估测试集
        y_true, y_pred, y_scores = evaluate_model(model, test_loader)

        # 绘制并保存最佳折的混淆矩阵
        if fold == 0:
            plot_confusion_matrix(y_true, y_pred)

        # 计算并保存结果
        metrics = calculate_metrics(y_true, y_pred, y_scores)
        results.append(metrics)

    # 计算并打印平均指标
    avg_metrics = {
        'accuracy': np.mean([r['accuracy'] for r in results]),
        'f1': np.mean([r['f1'] for r in results]),
        'precision': np.mean([r['precision'] for r in results]),
        'recall': np.mean([r['recall'] for r in results]),
        'auprc': np.mean([r['auprc'] for r in results])
    }

    print(f'\nAverage Metrics:')
    print(f"Average Accuracy: {avg_metrics['accuracy']:.4f}")
    print(f"Average F1: {avg_metrics['f1']:.4f}")
    print(f"Average Precision: {avg_metrics['precision']:.4f}")
    print(f"Average Recall: {avg_metrics['recall']:.4f}")
    print(f"Average AUPRC: {avg_metrics['auprc']:.4f}")


# 加载数据
data = pd.read_csv('preparations/heart_output.csv')  # 请替换为您的实际文件路径

# 检查数据
print("数据前5行:")
print(data.head())
print("\n类别分布:")
print(data['HeartDisease'].value_counts())

# 分离特征和目标
X = data.drop('HeartDisease', axis=1).values
y = data['HeartDisease'].values

# 转换为numpy数组
X = X.astype(np.float32)
y = y.astype(np.int64)

# 运行训练和评估（不带SMOTE）
print("\n训练模型（不带SMOTE）:")
train_test_split(X, y, splits=10, epochs=500, batch_size=32, lr=0.001, use_smote=False)

# 运行训练和评估（带SMOTE）
print("\n训练模型（带SMOTE）:")
train_test_split(X, y, splits=10, epochs=500, batch_size=32, lr=0.001, use_smote=True)



数据前5行:
   Age  Sex  ChestPainType  RestingBP  Cholesterol  FastingBS  RestingECG  \
0   40    0              2        140          289          0           0   
1   49    1              1        160          180          0           0   
2   37    0              2        130          283          0           1   
3   48    1              0        138          214          0           0   
4   54    0              1        150          195          0           0   

   MaxHR  ExerciseAngina  Oldpeak  ST_Slope  HeartDisease  
0    172               0      0.0         0             0  
1    156               0      1.0         1             1  
2     98               0      0.0         0             0  
3    108               1      1.5         1             1  
4    122               0      0.0         0             0  

类别分布:
HeartDisease
1    508
0    410
Name: count, dtype: int64

训练模型（不带SMOTE）:
数据形状: X=(918, 11), y=(918,)
类别分布: [410 508]
使用SMOTE: 否

Fold 1/10
Epoch 10/500 - Train Los

In [32]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

# 设置中文字体
plt.rcParams['font.sans-serif'] = ['SimHei']  # 设置中文字体为黑体
plt.rcParams['axes.unicode_minus'] = False  # 解决负号显示问题

def plot_confusion_matrix(y_true, y_pred, dpi=720):
    """绘制最佳模型混淆矩阵"""
    cm = confusion_matrix(y_true, y_pred)
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100  # 百分比表示

    plt.figure(figsize=(6, 6))
    ax = sns.heatmap(cm_percentage, annot=False, fmt='.2f', cmap='Blues', square=True, cbar=False,
                     linewidths=2, linecolor='black')

    # 在每个格子中显示个数和百分比
    for i in range(2):  # 假设是二分类，可以根据需要调整
        for j in range(2):
            text_color = 'white' if cm_percentage[i, j] > 50 else 'black'
            ax.text(j + 0.5, i + 0.5, f'{cm[i, j]}\n({cm_percentage[i, j]:.2f}%)',
                    color=text_color, ha='center', va='center', fontsize=14, fontweight='bold')

    # 添加中文标签
    plt.xlabel('预测类别', fontsize=16, fontweight='bold')
    plt.ylabel('实际类别', fontsize=16, fontweight='bold')
    plt.xticks(ticks=np.arange(2) + 0.5, labels=np.arange(1, 3), fontsize=14, fontweight='bold')
    plt.yticks(ticks=np.arange(2) + 0.5, labels=np.arange(1, 3), fontsize=14, fontweight='bold')

    # 调整布局，减少空白边缘
    plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)

    # 保存混淆矩阵图
    plt.savefig(f'best_model_confusion_matrix.png', dpi=dpi)
    plt.close()


def train_test_split(X, y, splits=10):
    X = np.nan_to_num(X)
    y = np.nan_to_num(y).astype(int)

    k_fold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=2025)
    
    best_fold_metrics = None
    best_fold = -1
    best_y_true = None
    best_y_pred = None

    for fold, (train_idx, test_idx) in enumerate(k_fold.split(X, y)):
        print(f'\n{"=" * 50}')
        print(f'Fold {fold + 1}/{splits}')
        print(f'{"=" * 50}')

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear', random_state=42)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        metrics = {
            'accuracy': accuracy_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'auroc': roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
        }

        # 跟踪最佳折叠（根据 AUROC）
        if best_fold_metrics is None or metrics['auroc'] > best_fold_metrics['auroc']:
            best_fold_metrics = metrics
            best_fold = fold
            best_y_true = y_test
            best_y_pred = y_pred

    # 绘制最佳折叠的混淆矩阵并保存
    print(f"绘制最佳折叠的混淆矩阵: {best_fold + 1}")
    plot_confusion_matrix(best_y_true, best_y_pred)


# 加载数据
data = pd.read_csv('preparations/heart_output.csv')

# 检查数据
print("数据前5行:")
print(data.head())

# 分离特征和目标
X = data.drop('HeartDisease', axis=1).values
y = data['HeartDisease'].values

# 运行训练和评估
train_test_split(X, y, splits=10)

数据前5行:
   Age  Sex  ChestPainType  RestingBP  Cholesterol  FastingBS  RestingECG  \
0   40    0              2        140          289          0           0   
1   49    1              1        160          180          0           0   
2   37    0              2        130          283          0           1   
3   48    1              0        138          214          0           0   
4   54    0              1        150          195          0           0   

   MaxHR  ExerciseAngina  Oldpeak  ST_Slope  HeartDisease  
0    172               0      0.0         0             0  
1    156               0      1.0         1             1  
2     98               0      0.0         0             0  
3    108               1      1.5         1             1  
4    122               0      0.0         0             0  

Fold 1/10

Fold 2/10

Fold 3/10

Fold 4/10

Fold 5/10

Fold 6/10

Fold 7/10

Fold 8/10

Fold 9/10

Fold 10/10
绘制最佳折叠的混淆矩阵: 2


In [33]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, auc
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

def plot_confusion_matrix(y_true, y_pred, dpi=720):
    """绘制最佳模型混淆矩阵"""
    cm = confusion_matrix(y_true, y_pred)
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100  # 百分比表示

    plt.figure(figsize=(6, 6))
    ax = sns.heatmap(cm_percentage, annot=False, fmt='.2f', cmap='Blues', square=True, cbar=False,
                     linewidths=2, linecolor='black')

    # 在每个格子中显示个数和百分比
    for i in range(2):  # 假设是二分类，可以根据需要调整
        for j in range(2):
            text_color = 'white' if cm_percentage[i, j] > 50 else 'black'
            ax.text(j + 0.5, i + 0.5, f'{cm[i, j]}\n({cm_percentage[i, j]:.2f}%)',
                    color=text_color, ha='center', va='center', fontsize=14, fontweight='bold')

    # 添加中文标签
    plt.xlabel('预测类别', fontsize=16, fontweight='bold')
    plt.ylabel('实际类别', fontsize=16, fontweight='bold')
    plt.xticks(ticks=np.arange(2) + 0.5, labels=np.arange(1, 3), fontsize=14, fontweight='bold')
    plt.yticks(ticks=np.arange(2) + 0.5, labels=np.arange(1, 3), fontsize=14, fontweight='bold')

    # 调整布局，减少空白边缘
    plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)

    # 保存混淆矩阵图
    plt.savefig(f'best_model_confusion_matrix.png', dpi=dpi)
    plt.close()


def calculate_metrics(y_true, y_pred, y_scores):
    """计算评估指标"""
    # 检查NaN值
    if np.isnan(y_scores).any():
        y_scores = np.nan_to_num(y_scores)

    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, zero_division=0),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
    }

    # 计算AUPRC
    try:
        metrics['auprc'] = average_precision_score(y_true, y_scores)
    except:
        print("无法计算AUPRC，使用默认值0")
        metrics['auprc'] = 0

    return metrics


def train_test_split(X, y, splits=10, batch_size=32):
    # 检查数据
    print(f"数据形状: X={X.shape}, y={y.shape}")
    print(f"类别分布: {np.bincount(y)}")

    # 处理可能的NaN值
    X = np.nan_to_num(X)
    y = np.nan_to_num(y).astype(int)

    k_fold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=2025)
    results = []

    best_fold_metrics = None
    best_fold = -1
    best_y_true = None
    best_y_pred = None

    for fold, (train_idx, test_idx) in enumerate(k_fold.split(X, y)):
        print(f'\n{"=" * 50}')
        print(f'Fold {fold + 1}/{splits}')
        print(f'{"=" * 50}')

        # 切分数据
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # 标准化
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # 创建RandomForest模型
        rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

        # 训练模型
        rf_model.fit(X_train, y_train)

        # 测试集评估
        y_scores = rf_model.predict_proba(X_test)[:, 1]  # 获取类1（正类）的概率
        y_pred = (y_scores > 0.5).astype(int)
        metrics = calculate_metrics(y_test, y_pred, y_scores)

        # 跟踪最佳折叠（根据AUPRC）
        if best_fold_metrics is None or metrics['auprc'] > best_fold_metrics['auprc']:
            best_fold_metrics = metrics
            best_fold = fold
            best_y_true = y_test
            best_y_pred = y_pred

        # 保存每折的结果
        results.append(metrics)

        # 打印当前折的结果
        print(f'\nFold {fold + 1} 测试集指标:')
        print(f"准确度: {metrics['accuracy']:.4f}")
        print(f"F1分数: {metrics['f1']:.4f}")
        print(f"精确率: {metrics['precision']:.4f}")
        print(f"召回率: {metrics['recall']:.4f}")
        print(f"AUPRC: {metrics['auprc']:.4f}")

    # 绘制最佳折叠的混淆矩阵并保存
    print(f"绘制最佳折叠的混淆矩阵: {best_fold + 1}")
    plot_confusion_matrix(best_y_true, best_y_pred)


# 加载数据
data = pd.read_csv('preparations/heart_output.csv')  # 替换为实际文件路径

# 检查数据
print("数据前5行:")
print(data.head())

# 分离特征和目标
X = data.drop('HeartDisease', axis=1).values
y = data['HeartDisease'].values

# 转换为numpy数组
X = X.astype(np.float32)
y = y.astype(np.int64)

# 运行训练和评估
train_test_split(X, y, splits=10)


数据前5行:
   Age  Sex  ChestPainType  RestingBP  Cholesterol  FastingBS  RestingECG  \
0   40    0              2        140          289          0           0   
1   49    1              1        160          180          0           0   
2   37    0              2        130          283          0           1   
3   48    1              0        138          214          0           0   
4   54    0              1        150          195          0           0   

   MaxHR  ExerciseAngina  Oldpeak  ST_Slope  HeartDisease  
0    172               0      0.0         0             0  
1    156               0      1.0         1             1  
2     98               0      0.0         0             0  
3    108               1      1.5         1             1  
4    122               0      0.0         0             0  
数据形状: X=(918, 11), y=(918,)
类别分布: [410 508]

Fold 1/10

Fold 1 测试集指标:
准确度: 0.9022
F1分数: 0.9109
精确率: 0.9200
召回率: 0.9020
AUPRC: 0.9299

Fold 2/10

Fold 2 测试集指标:
准确度: 0.8913

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, auc
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

def plot_confusion_matrix(y_true, y_pred, dpi=720):
    """绘制最佳模型混淆矩阵"""
    cm = confusion_matrix(y_true, y_pred)
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100  # 百分比表示

    plt.figure(figsize=(6, 6))
    ax = sns.heatmap(cm_percentage, annot=False, fmt='.2f', cmap='Blues', square=True, cbar=False,
                     linewidths=2, linecolor='black')

    # 在每个格子中显示个数和百分比
    for i in range(2):  # 假设是二分类，可以根据需要调整
        for j in range(2):
            text_color = 'white' if cm_percentage[i, j] > 50 else 'black'
            ax.text(j + 0.5, i + 0.5, f'{cm[i, j]}\n({cm_percentage[i, j]:.2f}%)',
                    color=text_color, ha='center', va='center', fontsize=14, fontweight='bold')

    # 添加中文标签
    plt.xlabel('预测类别', fontsize=16, fontweight='bold')
    plt.ylabel('实际类别', fontsize=16, fontweight='bold')
    plt.xticks(ticks=np.arange(2) + 0.5, labels=np.arange(1, 3), fontsize=14, fontweight='bold')
    plt.yticks(ticks=np.arange(2) + 0.5, labels=np.arange(1, 3), fontsize=14, fontweight='bold')

    # 调整布局，减少空白边缘
    plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)

    # 保存混淆矩阵图
    plt.savefig(f'best_model_confusion_matrix.png', dpi=dpi)
    plt.close()


def calculate_metrics(y_true, y_pred, y_scores):
    """计算评估指标"""
    # 检查NaN值
    if np.isnan(y_scores).any():
        y_scores = np.nan_to_num(y_scores)

    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, zero_division=0),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
    }

    # 计算AUPRC
    try:
        metrics['auprc'] = average_precision_score(y_true, y_scores)
    except:
        print("无法计算AUPRC，使用默认值0")
        metrics['auprc'] = 0

    return metrics


def train_test_split(X, y, splits=10, batch_size=32):
    # 检查数据
    print(f"数据形状: X={X.shape}, y={y.shape}")
    print(f"类别分布: {np.bincount(y)}")

    # 处理可能的NaN值
    X = np.nan_to_num(X)
    y = np.nan_to_num(y).astype(int)

    k_fold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=2025)
    results = []

    # 跟踪最佳折叠（根据AUPRC）
    best_fold_metrics = None
    best_fold = -1
    best_y_true = None
    best_y_pred = None

    for fold, (train_idx, test_idx) in enumerate(k_fold.split(X, y)):
        print(f'\n{"=" * 50}')
        print(f'Fold {fold + 1}/{splits}')
        print(f'{"=" * 50}')

        # 切分数据
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # 标准化
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # 创建SVM模型
        model = SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42)

        # 训练模型
        model.fit(X_train, y_train)

        # 测试集评估
        y_scores = model.predict_proba(X_test)[:, 1]  # 获取类1（正类）的概率
        y_pred = (y_scores > 0.5).astype(int)
        metrics = calculate_metrics(y_test, y_pred, y_scores)

        # 跟踪最佳折叠
        if best_fold_metrics is None or metrics['auprc'] > best_fold_metrics['auprc']:
            best_fold_metrics = metrics
            best_fold = fold
            best_y_true = y_test
            best_y_pred = y_pred

        # 保存每折的结果
        results.append(metrics)

        # 打印当前折的结果
        print(f'\nFold {fold + 1} 测试集指标:')
        print(f"准确度: {metrics['accuracy']:.4f}")
        print(f"F1分数: {metrics['f1']:.4f}")
        print(f"精确率: {metrics['precision']:.4f}")
        print(f"召回率: {metrics['recall']:.4f}")
        print(f"AUPRC: {metrics['auprc']:.4f}")

    # 绘制最佳折叠的混淆矩阵并保存
    print(f"绘制最佳折叠的混淆矩阵: {best_fold + 1}")
    plot_confusion_matrix(best_y_true, best_y_pred)


# 加载数据
data = pd.read_csv('preparations/heart_output.csv')  # 替换为实际文件路径

# 检查数据
print("数据前5行:")
print(data.head())

# 分离特征和目标
X = data.drop('HeartDisease', axis=1).values
y = data['HeartDisease'].values

# 转换为numpy数组
X = X.astype(np.float32)
y = y.astype(np.int64)

# 运行训练和评估
train_test_split(X, y, splits=10)

数据前5行:
   Age  Sex  ChestPainType  RestingBP  Cholesterol  FastingBS  RestingECG  \
0   40    0              2        140          289          0           0   
1   49    1              1        160          180          0           0   
2   37    0              2        130          283          0           1   
3   48    1              0        138          214          0           0   
4   54    0              1        150          195          0           0   

   MaxHR  ExerciseAngina  Oldpeak  ST_Slope  HeartDisease  
0    172               0      0.0         0             0  
1    156               0      1.0         1             1  
2     98               0      0.0         0             0  
3    108               1      1.5         1             1  
4    122               0      0.0         0             0  
数据形状: X=(918, 11), y=(918,)
类别分布: [410 508]

Fold 1/10

Fold 1 测试集指标:
准确度: 0.8478
F1分数: 0.8679
精确率: 0.8364
召回率: 0.9020
AUPRC: 0.9291

Fold 2/10

Fold 2 测试集指标:
准确度: 0.9022

In [40]:
import warnings
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, average_precision_score,
                             confusion_matrix, auc)
from pytorch_tabnet.tab_model import TabNetClassifier
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import torch

warnings.filterwarnings('ignore')

def calculate_metrics(y_true, y_pred, y_scores):
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, zero_division=0),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'auprc': average_precision_score(y_true, y_scores[:, 1])  # 只取正类概率
    }
    return metrics

def plot_confusion_matrix(y_true, y_pred, dpi=720):
    """绘制最佳模型混淆矩阵"""
    cm = confusion_matrix(y_true, y_pred)
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100  # 百分比表示

    plt.figure(figsize=(6, 6))
    ax = sns.heatmap(cm_percentage, annot=False, fmt='.2f', cmap='Blues', square=True, cbar=False,
                     linewidths=2, linecolor='black')

    # 在每个格子中显示个数和百分比
    for i in range(2):  # 假设是二分类，可以根据需要调整
        for j in range(2):
            text_color = 'white' if cm_percentage[i, j] > 50 else 'black'
            ax.text(j + 0.5, i + 0.5, f'{cm[i, j]}\n({cm_percentage[i, j]:.2f}%)',
                    color=text_color, ha='center', va='center', fontsize=14, fontweight='bold')

    # 添加中文标签
    plt.xlabel('预测类别', fontsize=16, fontweight='bold')
    plt.ylabel('实际类别', fontsize=16, fontweight='bold')
    plt.xticks(ticks=np.arange(2) + 0.5, labels=np.arange(1, 3), fontsize=14, fontweight='bold')
    plt.yticks(ticks=np.arange(2) + 0.5, labels=np.arange(1, 3), fontsize=14, fontweight='bold')

    # 调整布局，减少空白边缘
    plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)

    # 保存混淆矩阵图
    plt.savefig(f'best_model_confusion_matrix.png', dpi=dpi)
    plt.close()


def interpolate_pr_curve(precision, recall):
    """插值PR曲线到固定长度的点"""
    f = interpolate.interp1d(recall, precision, bounds_error=False, fill_value=(1.0, 0.0))
    new_recall = np.linspace(0, 1, 100)
    new_precision = f(new_recall)
    return new_precision, new_recall

def plot_pr_curve(y_true, y_scores, fold):
    try:
        precision, recall, _ = precision_recall_curve(y_true, y_scores[:, 1])  # 只取正类概率
        auprc = auc(recall, precision)

        plt.figure()
        plt.plot(recall, precision, label=f'Fold {fold} (AUPRC = {auprc:.2f})')
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title(f'Precision-Recall Curve (Fold {fold})')
        plt.legend()
        plt.savefig(f'pr_curve_fold{fold}.png')
        plt.close()
        return precision, recall
    except Exception as e:
        print(f"无法绘制Fold {fold}的PR曲线: {str(e)}")
        return None, None

def train_test_split(X, y, splits=10, batch_size=32):
    print(f"数据形状: X={X.shape}, y={y.shape}")
    print(f"类别分布: {np.bincount(y)}")

    X = np.nan_to_num(X)
    y = np.nan_to_num(y).astype(int)

    k_fold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=2025)
    results = []
    best_model_info = {'val_score': -float('inf'), 'model': None, 'y_true': None, 'y_pred': None}

    interp_precisions = []
    interp_recalls = np.linspace(0, 1, 100)

    for fold, (train_idx, test_idx) in enumerate(k_fold.split(X, y)):
        print(f'\n{"=" * 50}')
        print(f'Fold {fold + 1}/{splits}')
        print(f'{"=" * 50}')

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Use SMOTE for balancing the dataset
        smote = SMOTE(random_state=42)
        X_res, y_res = smote.fit_resample(X_train, y_train)

        model = TabNetClassifier(
            n_d=8, n_a=8, n_steps=3, gamma=1.3, lambda_sparse=0,
            optimizer_fn=torch.optim.Adam, optimizer_params=dict(lr=2e-2),
            mask_type='sparsemax', scheduler_params=dict(step_size=50, gamma=0.9),
            scheduler_fn=torch.optim.lr_scheduler.StepLR
        )

        model.fit(X_res, y_res, eval_set=[(X_test, y_test)], patience=50, batch_size=batch_size)

        y_scores = model.predict_proba(X_test)
        y_pred = np.argmax(y_scores, axis=1)
        metrics = calculate_metrics(y_test, y_pred, y_scores)

        precision, recall = plot_pr_curve(y_test, y_scores, fold + 1)

        if precision is not None and recall is not None:
            interp_precision, _ = interpolate_pr_curve(precision, recall)
            interp_precisions.append(interp_precision)

        results.append(metrics)

        if metrics['auprc'] > best_model_info['val_score']:
            best_model_info['val_score'] = metrics['auprc']
            best_model_info['model'] = model
            best_model_info['y_true'] = y_test
            best_model_info['y_pred'] = y_pred

        print(f'\nFold {fold + 1} Test Metrics:')
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"F1 Score: {metrics['f1']:.4f}")
        print(f"Precision: {metrics['precision']:.4f}")
        print(f"Recall: {metrics['recall']:.4f}")
        print(f"AUPRC: {metrics['auprc']:.4f}")

    # Save the best model
    best_model_info['model'].save_model('best_model.pkl')
    print(f"\nSaved best model with AUPRC {best_model_info['val_score']:.4f}")

    # Plot confusion matrix for the best model
    print(f"绘制最佳模型的混淆矩阵")
    plot_confusion_matrix(best_model_info['y_true'], best_model_info['y_pred'])

    avg_metrics = {
        'accuracy': np.mean([r['accuracy'] for r in results]),
        'f1': np.mean([r['f1'] for r in results]),
        'precision': np.mean([r['precision'] for r in results]),
        'recall': np.mean([r['recall'] for r in results]),
        'auprc': np.mean([r['auprc'] for r in results])
    }

    std_metrics = {
        'accuracy': np.std([r['accuracy'] for r in results]),
        'f1': np.std([r['f1'] for r in results]),
        'precision': np.std([r['precision'] for r in results]),
        'recall': np.std([r['recall'] for r in results]),
        'auprc': np.std([r['auprc'] for r in results])
    }

    print('\n' + '=' * 50)
    print('Final Cross-Validation Results:')
    print('=' * 50)
    print(f"Average Accuracy: {avg_metrics['accuracy']:.4f} ± {std_metrics['accuracy']:.4f}")
    print(f"Average F1 Score: {avg_metrics['f1']:.4f} ± {std_metrics['f1']:.4f}")
    print(f"Average Precision: {avg_metrics['precision']:.4f} ± {std_metrics['precision']:.4f}")
    print(f"Average Recall: {avg_metrics['recall']:.4f} ± {std_metrics['recall']:.4f}")
    print(f"Average AUPRC: {avg_metrics['auprc']:.4f} ± {std_metrics['auprc']:.4f}")

    if interp_precisions:
        plt.figure(figsize=(8, 6))
        mean_precision = np.mean(interp_precisions, axis=0)
        mean_auprc = auc(interp_recalls, mean_precision)

        for i, prec in enumerate(interp_precisions):
            plt.plot(interp_recalls, prec, alpha=0.3, label=f'Fold {i + 1}')

        plt.plot(interp_recalls, mean_precision, 'k-',
                 label=f'Mean (AUPRC = {mean_auprc:.2f})', linewidth=2)
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title('Average Precision-Recall Curve')
        plt.legend()
        plt.savefig('average_pr_curve.png')
        plt.close()

# Load and preprocess data
data = pd.read_csv('preparations/heart_output.csv')

X = data.drop('HeartDisease', axis=1).values
y = data['HeartDisease'].values

X = X.astype(np.float32)
y = y.astype(np.int64)

train_test_split(X, y, splits=10, batch_size=32)

数据形状: X=(918, 11), y=(918,)
类别分布: [410 508]

Fold 1/10
epoch 0  | loss: 0.5596  | val_0_auc: 0.92779 |  0:00:00s
epoch 1  | loss: 0.42643 | val_0_auc: 0.88953 |  0:00:01s
epoch 2  | loss: 0.41216 | val_0_auc: 0.901   |  0:00:02s
epoch 3  | loss: 0.42895 | val_0_auc: 0.8582  |  0:00:02s
epoch 4  | loss: 0.39687 | val_0_auc: 0.90866 |  0:00:03s
epoch 5  | loss: 0.38664 | val_0_auc: 0.8637  |  0:00:04s
epoch 6  | loss: 0.35683 | val_0_auc: 0.89455 |  0:00:04s
epoch 7  | loss: 0.37754 | val_0_auc: 0.91248 |  0:00:05s
epoch 8  | loss: 0.3558  | val_0_auc: 0.89144 |  0:00:06s
epoch 9  | loss: 0.35001 | val_0_auc: 0.89383 |  0:00:06s
epoch 10 | loss: 0.32909 | val_0_auc: 0.86705 |  0:00:07s
epoch 11 | loss: 0.37618 | val_0_auc: 0.88809 |  0:00:07s
epoch 12 | loss: 0.37751 | val_0_auc: 0.89766 |  0:00:08s
epoch 13 | loss: 0.35447 | val_0_auc: 0.90483 |  0:00:09s
epoch 14 | loss: 0.37833 | val_0_auc: 0.91296 |  0:00:10s
epoch 15 | loss: 0.33428 | val_0_auc: 0.94165 |  0:00:11s
epoch 16 | loss: 

In [39]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, average_precision_score,
                             confusion_matrix, auc)
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import interpolate
import warnings

warnings.filterwarnings('ignore')

def calculate_metrics(y_true, y_pred, y_scores):
    # Check for NaN values
    if np.isnan(y_scores).any():
        y_scores = np.nan_to_num(y_scores)

    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, zero_division=0),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
    }

    # Calculate AUPRC
    try:
        metrics['auprc'] = average_precision_score(y_true, y_scores)
    except:
        print("无法计算AUPRC，使用默认值0")
        metrics['auprc'] = 0

    return metrics

def plot_confusion_matrix(y_true, y_pred, dpi=720):
    """绘制最佳模型混淆矩阵"""
    cm = confusion_matrix(y_true, y_pred)
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100  # 百分比表示

    plt.figure(figsize=(6, 6))
    ax = sns.heatmap(cm_percentage, annot=False, fmt='.2f', cmap='Blues', square=True, cbar=False,
                     linewidths=2, linecolor='black')

    # 在每个格子中显示个数和百分比
    for i in range(2):  # 假设是二分类，可以根据需要调整
        for j in range(2):
            text_color = 'white' if cm_percentage[i, j] > 50 else 'black'
            ax.text(j + 0.5, i + 0.5, f'{cm[i, j]}\n({cm_percentage[i, j]:.2f}%)',
                    color=text_color, ha='center', va='center', fontsize=14, fontweight='bold')

    # 添加中文标签
    plt.xlabel('预测类别', fontsize=16, fontweight='bold')
    plt.ylabel('实际类别', fontsize=16, fontweight='bold')
    plt.xticks(ticks=np.arange(2) + 0.5, labels=np.arange(1, 3), fontsize=14, fontweight='bold')
    plt.yticks(ticks=np.arange(2) + 0.5, labels=np.arange(1, 3), fontsize=14, fontweight='bold')

    # 调整布局，减少空白边缘
    plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)

    # 保存混淆矩阵图
    plt.savefig(f'best_model_confusion_matrix.png', dpi=dpi)
    plt.close()

def interpolate_pr_curve(precision, recall):
    """插值PR曲线到固定长度的点"""
    f = interpolate.interp1d(recall, precision, bounds_error=False, fill_value=(1.0, 0.0))
    new_recall = np.linspace(0, 1, 100)
    new_precision = f(new_recall)
    return new_precision, new_recall

def plot_pr_curve(y_true, y_scores, fold):
    try:
        precision, recall, _ = precision_recall_curve(y_true, y_scores)
        auprc = auc(recall, precision)

        plt.figure()
        plt.plot(recall, precision, label=f'Fold {fold} (AUPRC = {auprc:.2f})')
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title('Precision-Recall Curve')
        plt.legend()
        plt.savefig(f'pr_curve_fold{fold}.png')
        plt.close()
        return precision, recall
    except Exception as e:
        print(f"无法绘制Fold {fold}的PR曲线: {str(e)}")
        return None, None

def train_test_split(X, y, splits=10, batch_size=32):
    # Check data
    print(f"数据形状: X={X.shape}, y={y.shape}")
    print(f"类别分布: {np.bincount(y)}")

    # Handle possible NaN values
    X = np.nan_to_num(X)
    y = np.nan_to_num(y).astype(int)

    k_fold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=2025)
    results = []
    best_model_info = {'val_score': -float('inf'), 'model': None, 'y_true': None, 'y_pred': None}

    # Store PR curve data for all folds (interpolated)
    interp_precisions = []
    interp_recalls = np.linspace(0, 1, 100)  # Fixed 100 recall points

    for fold, (train_idx, test_idx) in enumerate(k_fold.split(X, y)):
        print(f'\n{"=" * 50}')
        print(f'Fold {fold + 1}/{splits}')
        print(f'{"=" * 50}')

        # Split data
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Standardization
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Create XGBoost model
        xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42, n_jobs=-1, use_label_encoder=False)

        # Train model
        xgb_model.fit(X_train, y_train)

        # Evaluate on test set
        y_scores = xgb_model.predict_proba(X_test)[:, 1]  # Get the probability for class 1 (positive class)
        y_pred = (y_scores > 0.5).astype(int)
        metrics = calculate_metrics(y_test, y_pred, y_scores)

        # Plot and save PR curve for current fold
        precision, recall = plot_pr_curve(y_test, y_scores, fold + 1)

        # Interpolate PR curve to fixed length
        if precision is not None and recall is not None:
            interp_precision, _ = interpolate_pr_curve(precision, recall)
            interp_precisions.append(interp_precision)

        # Save results
        results.append(metrics)

        # Update the best model based on AUPRC
        if metrics['auprc'] > best_model_info['val_score']:
            best_model_info['val_score'] = metrics['auprc']
            best_model_info['model'] = xgb_model
            best_model_info['y_true'] = y_test
            best_model_info['y_pred'] = y_pred

        # Print current fold results
        print(f'\nFold {fold + 1} Test Metrics:')
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"F1 Score: {metrics['f1']:.4f}")
        print(f"Precision: {metrics['precision']:.4f}")
        print(f"Recall: {metrics['recall']:.4f}")
        print(f"AUPRC: {metrics['auprc']:.4f}")

    # Save the best model
    best_model_info['model'].save_model('best_model.xgb')
    print(f"\nSaved best model with AUPRC {best_model_info['val_score']:.4f}")

    # Plot confusion matrix for the best model
    print(f"绘制最佳模型的混淆矩阵")
    plot_confusion_matrix(best_model_info['y_true'], best_model_info['y_pred'])

    avg_metrics = {
        'accuracy': np.mean([r['accuracy'] for r in results]),
        'f1': np.mean([r['f1'] for r in results]),
        'precision': np.mean([r['precision'] for r in results]),
        'recall': np.mean([r['recall'] for r in results]),
        'auprc': np.mean([r['auprc'] for r in results])
    }

    std_metrics = {
        'accuracy': np.std([r['accuracy'] for r in results]),
        'f1': np.std([r['f1'] for r in results]),
        'precision': np.std([r['precision'] for r in results]),
        'recall': np.std([r['recall'] for r in results]),
        'auprc': np.std([r['auprc'] for r in results])
    }

    print('\n' + '=' * 50)
    print('Final Cross-Validation Results:')
    print('=' * 50)
    print(f"Average Accuracy: {avg_metrics['accuracy']:.4f} ± {std_metrics['accuracy']:.4f}")
    print(f"Average F1 Score: {avg_metrics['f1']:.4f} ± {std_metrics['f1']:.4f}")
    print(f"Average Precision: {avg_metrics['precision']:.4f} ± {std_metrics['precision']:.4f}")
    print(f"Average Recall: {avg_metrics['recall']:.4f} ± {std_metrics['recall']:.4f}")
    print(f"Average AUPRC: {avg_metrics['auprc']:.4f} ± {std_metrics['auprc']:.4f}")

    # Plot average PR curve (using interpolated data)
    if interp_precisions:
        plt.figure(figsize=(8, 6))
        mean_precision = np.mean(interp_precisions, axis=0)
        mean_auprc = auc(interp_recalls, mean_precision)

        for i, prec in enumerate(interp_precisions):
            plt.plot(interp_recalls, prec, alpha=0.3, label=f'Fold {i + 1}')

        plt.plot(interp_recalls, mean_precision, 'k-',
                 label=f'Mean (AUPRC = {mean_auprc:.2f})', linewidth=2)
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title('Average Precision-Recall Curve')
        plt.legend()
        plt.savefig('average_pr_curve.png')
        plt.close()

# Load and preprocess data
data = pd.read_csv('preparations/heart_output.csv')

X = data.drop('HeartDisease', axis=1).values
y = data['HeartDisease'].values

X = X.astype(np.float32)
y = y.astype(np.int64)

train_test_split(X, y, splits=10, batch_size=32)


数据形状: X=(918, 11), y=(918,)
类别分布: [410 508]

Fold 1/10

Fold 1 Test Metrics:
Accuracy: 0.8152
F1 Score: 0.8317
Precision: 0.8400
Recall: 0.8235
AUPRC: 0.8615

Fold 2/10

Fold 2 Test Metrics:
Accuracy: 0.8696
F1 Score: 0.8824
Precision: 0.8824
Recall: 0.8824
AUPRC: 0.9441

Fold 3/10

Fold 3 Test Metrics:
Accuracy: 0.8261
F1 Score: 0.8431
Precision: 0.8431
Recall: 0.8431
AUPRC: 0.8651

Fold 4/10

Fold 4 Test Metrics:
Accuracy: 0.8696
F1 Score: 0.8776
Precision: 0.9149
Recall: 0.8431
AUPRC: 0.9607

Fold 5/10

Fold 5 Test Metrics:
Accuracy: 0.8261
F1 Score: 0.8462
Precision: 0.8302
Recall: 0.8627
AUPRC: 0.9323

Fold 6/10

Fold 6 Test Metrics:
Accuracy: 0.8261
F1 Score: 0.8462
Precision: 0.8302
Recall: 0.8627
AUPRC: 0.8772

Fold 7/10

Fold 7 Test Metrics:
Accuracy: 0.8913
F1 Score: 0.9038
Precision: 0.8868
Recall: 0.9216
AUPRC: 0.8818

Fold 8/10

Fold 8 Test Metrics:
Accuracy: 0.8478
F1 Score: 0.8704
Precision: 0.8246
Recall: 0.9216
AUPRC: 0.9310

Fold 9/10

Fold 9 Test Metrics:
Accuracy: 0

In [50]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, average_precision_score,
                             precision_recall_curve, auc, confusion_matrix)
from imblearn.over_sampling import SMOTE
import copy
import matplotlib.pyplot as plt
import seaborn as sns

# 设置字体为黑体，确保中文可见
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# TeLU激活函数
class TeLU(nn.Module):
    def __init__(self, alpha=0.15):
        super(TeLU, self).__init__()
        self.alpha = alpha

    def forward(self, x):
        return torch.where(x >= 0, x, self.alpha * (torch.exp(x) - 1))


# 前馈神经网络
class FFNN(nn.Module):
    def __init__(self, input_size):
        super(FFNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 32)
        self.telu1 = TeLU(alpha=0.15)
        self.fc2 = nn.Linear(32, 64)
        self.telu2 = TeLU(alpha=0.1)
        self.fc3 = nn.Linear(64, 2)

    def forward(self, x):
        x = self.fc1(x)
        x = self.telu1(x)
        x = self.fc2(x)
        x = self.telu2(x)
        x = self.fc3(x)
        return x


# 焦点损失函数
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        BCE_loss = nn.CrossEntropyLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1 - pt) ** self.gamma * BCE_loss
        return F_loss.mean()


def calculate_metrics(y_true, y_pred, y_scores):
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, zero_division=0),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'auprc': average_precision_score(y_true, y_scores[:, 1])
    }
    return metrics


def plot_confusion_matrix(y_true, y_pred, fold, dpi=720):
    """绘制正方形混淆矩阵"""
    cm = confusion_matrix(y_true, y_pred)
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100  # 百分比表示

    plt.figure(figsize=(6, 6))
    ax = sns.heatmap(cm_percentage, annot=False, fmt='.2f', cmap='Blues', square=True, cbar=False,
                     linewidths=2, linecolor='black')

    # 在每个格子中显示个数和百分比
    for i in range(2):
        for j in range(2):
            # 判断字体颜色，深色背景用白色字体，浅色背景用黑色字体
            text_color = 'white' if cm_percentage[i, j] > 50 else 'black'

            # 将个数和百分比分行显示，个数在上，百分比在下
            ax.text(j + 0.5, i + 0.5, f'{cm[i, j]}\n({cm_percentage[i, j]:.2f}%)',
                    color=text_color, ha='center', va='center', fontsize=14, fontweight='bold')

    # 添加中文标签
    plt.xlabel('预测类别', fontsize=16, fontweight='bold')
    plt.ylabel('实际类别', fontsize=16, fontweight='bold')
    plt.xticks(ticks=np.arange(2) + 0.5, labels=['0', '1'], fontsize=14, fontweight='bold')
    plt.yticks(ticks=np.arange(2) + 0.5, labels=['0', '1'], fontsize=14, fontweight='bold')

    # 调整布局，减少空白边缘
    plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)

    # 保存混淆矩阵图
    plt.savefig(f'CI_FFNN_best_fold_confusion_matrix_fold{fold}.png', dpi=dpi)
    plt.close()


def plot_pr_curve(y_true, y_scores, fold):
    precision, recall, _ = precision_recall_curve(y_true, y_scores[:, 1])
    auprc = auc(recall, precision)

    plt.figure()
    plt.plot(recall, precision, label=f'Fold {fold} (AUPRC = {auprc:.2f})')
    plt.xlabel('召回率', fontsize=14, fontweight='bold')
    plt.ylabel('精确率', fontsize=14, fontweight='bold')
    plt.title('Precision-Recall 曲线', fontsize=16, fontweight='bold')
    plt.legend()
    plt.savefig(f'pr_curve_fold{fold}.png')
    plt.close()


def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=500, patience=20):
    best_val_loss = float('inf')
    best_model = None
    patience_counter = 0

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * inputs.size(0)

        # 验证
        model.eval()
        val_loss = 0.0
        all_preds = []
        all_labels = []
        all_scores = []
        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * inputs.size(0)
                _, predicted = torch.max(outputs.data, 1)
                all_preds.extend(predicted.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
                all_scores.extend(torch.softmax(outputs, dim=1).cpu().numpy())

        # 计算指标
        train_loss = train_loss / len(train_loader.dataset)
        val_loss = val_loss / len(val_loader.dataset)
        metrics = calculate_metrics(all_labels, all_preds, np.array(all_scores))

        # 早停
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = copy.deepcopy(model.state_dict())
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f'Early stopping at epoch {epoch + 1}')
                break

        # 打印进度
        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch + 1}/{epochs} - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
            print(f"Val Metrics - Acc: {metrics['accuracy']:.4f}, F1: {metrics['f1']:.4f}, "
                  f"Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, "
                  f"AUPRC: {metrics['auprc']:.4f}")

    # 加载最佳模型
    model.load_state_dict(best_model)
    return model


def evaluate_model(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []
    all_scores = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            all_scores.extend(torch.softmax(outputs, dim=1).cpu().numpy())

    return all_labels, all_preds, np.array(all_scores)


def train_test_split(X, y, splits=10, epochs=500, batch_size=512, lr=0.001):
    k_fold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=2025)
    results = []

    # 存储所有折的PR曲线数据
    all_precisions = []
    all_recalls = []
    
    best_fold_metrics = None
    best_fold = -1
    best_y_true = None
    best_y_pred = None

    for fold, (train_idx, test_idx) in enumerate(k_fold.split(X, y)):
        print(f'\n{"=" * 50}')
        print(f'Fold {fold + 1}/{splits}')
        print(f'{"=" * 50}')

        # 分割数据
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # 1. 先标准化数据
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # 2. 应用SMOTE过采样（仅对训练数据）
        # 检查少数类样本数量是否足够
        min_samples = sum(y_train == 1)
        k_neighbors = min(5, min_samples - 1) if min_samples > 1 else 1

        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

        # 转换为张量
        X_train_tensor = torch.FloatTensor(X_train_res)
        y_train_tensor = torch.LongTensor(y_train_res)
        X_test_tensor = torch.FloatTensor(X_test_scaled)
        y_test_tensor = torch.LongTensor(y_test)

        # 创建数据集和数据加载器
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size)

        # 初始化模型、损失函数和优化器
        model = FFNN(input_size=X.shape[1])
        criterion = FocalLoss(alpha=0.25, gamma=2)
        optimizer = optim.Adam(model.parameters(), lr=lr)

        # 学习率调度器
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=10, factor=0.5)

        # 训练模型
        model = train_model(model, train_loader, test_loader, criterion, optimizer,
                            epochs=epochs, patience=20)

        # 评估测试集
        y_true, y_pred, y_scores = evaluate_model(model, test_loader)
        metrics = calculate_metrics(y_true, y_pred, y_scores)

        # 跟踪最佳折叠（根据AUPRC）
        if best_fold_metrics is None or metrics['auprc'] > best_fold_metrics['auprc']:
            best_fold_metrics = metrics
            best_fold = fold + 1
            best_y_true = y_true
            best_y_pred = y_pred

        # 保存PR曲线数据
        precision, recall, _ = precision_recall_curve(y_true, y_scores[:, 1])
        all_precisions.append(precision)
        all_recalls.append(recall)

        # 绘制当前折的PR曲线
        plot_pr_curve(y_true, y_scores, fold + 1)

        # 保存结果
        results.append(metrics)

        # 打印当前折的结果
        print(f'\nFold {fold + 1} Test Metrics:')
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"F1 Score: {metrics['f1']:.4f}")
        print(f"Precision: {metrics['precision']:.4f}")
        print(f"Recall: {metrics['recall']:.4f}")
        print(f"AUPRC: {metrics['auprc']:.4f}")

    # 计算并打印平均指标
    avg_metrics = {
        'accuracy': np.mean([r['accuracy'] for r in results]),
        'f1': np.mean([r['f1'] for r in results]),
        'precision': np.mean([r['precision'] for r in results]),
        'recall': np.mean([r['recall'] for r in results]),
        'auprc': np.mean([r['auprc'] for r in results])
    }

    std_metrics = {
        'accuracy': np.std([r['accuracy'] for r in results]),
        'f1': np.std([r['f1'] for r in results]),
        'precision': np.std([r['precision'] for r in results]),
        'recall': np.std([r['recall'] for r in results]),
        'auprc': np.std([r['auprc'] for r in results])
    }

    print('\n' + '=' * 50)
    print('Final Cross-Validation Results:')
    print('=' * 50)
    print(f"Average Accuracy: {avg_metrics['accuracy']:.4f} ± {std_metrics['accuracy']:.4f}")
    print(f"Average F1 Score: {avg_metrics['f1']:.4f} ± {std_metrics['f1']:.4f}")
    print(f"Average Precision: {avg_metrics['precision']:.4f} ± {std_metrics['precision']:.4f}")
    print(f"Average Recall: {avg_metrics['recall']:.4f} ± {std_metrics['recall']:.4f}")
    print(f"Average AUPRC: {avg_metrics['auprc']:.4f} ± {std_metrics['auprc']:.4f}")

    # 绘制最佳折的混淆矩阵
    if best_y_true is not None and best_y_pred is not None:
        print(f"\nPlotting confusion matrix for best fold: {best_fold}")
        plot_confusion_matrix(best_y_true, best_y_pred, best_fold, dpi=720)

    # 绘制平均PR曲线
    plt.figure(figsize=(8, 6))
    for i in range(splits):
        plt.plot(all_recalls[i], all_precisions[i], alpha=0.3, label=f'Fold {i + 1}')

    mean_precision = np.mean([p for p in all_precisions], axis=0)
    mean_recall = np.mean([r for r in all_recalls], axis=0)
    mean_auprc = auc(mean_recall, mean_precision)

    plt.plot(mean_recall, mean_precision, 'k-',
             label=f'Mean (AUPRC = {mean_auprc:.2f})', linewidth=2)
    plt.xlabel('召回率', fontsize=14, fontweight='bold')
    plt.ylabel('精确率', fontsize=14, fontweight='bold')
    plt.title('平均 Precision-Recall 曲线', fontsize=16, fontweight='bold')
    plt.legend()
    plt.savefig('average_pr_curve.png')
    plt.close()


# 加载数据
data = pd.read_csv('preparations/stroke_output.csv')

# 预处理分类变量
categorical_cols = ['ever_married', 'work_type', 'smoking_status']
data[categorical_cols] = data[categorical_cols].astype('category')

# 分离特征和目标
X = data.drop('stroke', axis=1)
y = data['stroke'].values

# 对分类变量进行独热编码
X = pd.get_dummies(X, columns=categorical_cols)

# 转换为numpy数组
X = X.values.astype(np.float32)

# 运行训练和评估
train_test_split(X, y, splits=10, epochs=500, batch_size=512, lr=0.001)


Fold 1/10
Epoch 10/500 - Train Loss: 0.0262, Val Loss: 0.0296
Val Metrics - Acc: 0.7378, F1: 0.2209, Precision: 0.1293, Recall: 0.7600, AUPRC: 0.1644
Epoch 20/500 - Train Loss: 0.0237, Val Loss: 0.0308
Val Metrics - Acc: 0.7397, F1: 0.2130, Precision: 0.1250, Recall: 0.7200, AUPRC: 0.1681
Epoch 30/500 - Train Loss: 0.0220, Val Loss: 0.0267
Val Metrics - Acc: 0.7769, F1: 0.2192, Precision: 0.1322, Recall: 0.6400, AUPRC: 0.1763
Epoch 40/500 - Train Loss: 0.0204, Val Loss: 0.0275
Val Metrics - Acc: 0.7750, F1: 0.2282, Precision: 0.1371, Recall: 0.6800, AUPRC: 0.1834
Epoch 50/500 - Train Loss: 0.0189, Val Loss: 0.0274
Val Metrics - Acc: 0.7906, F1: 0.2517, Precision: 0.1525, Recall: 0.7200, AUPRC: 0.1929
Epoch 60/500 - Train Loss: 0.0179, Val Loss: 0.0291
Val Metrics - Acc: 0.7926, F1: 0.2535, Precision: 0.1538, Recall: 0.7200, AUPRC: 0.1777
Epoch 70/500 - Train Loss: 0.0170, Val Loss: 0.0303
Val Metrics - Acc: 0.7828, F1: 0.2345, Precision: 0.1417, Recall: 0.6800, AUPRC: 0.1884
Early sto

In [66]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, average_precision_score,
                             precision_recall_curve, auc, confusion_matrix)
import copy
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# 设置字体为黑体，确保中文可见
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
warnings.filterwarnings('ignore')

# TeLU激活函数
class TeLU(nn.Module):
    def __init__(self, alpha=0.15):
        super(TeLU, self).__init__()
        self.alpha = alpha

    def forward(self, x):
        return torch.where(x >= 0, x, self.alpha * (torch.exp(x) - 1))


# 前馈神经网络
class FFNN(nn.Module):
    def __init__(self, input_size):
        super(FFNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 32)
        self.telu1 = TeLU(alpha=0.15)
        self.fc2 = nn.Linear(32, 64)
        self.telu2 = TeLU(alpha=0.1)
        self.fc3 = nn.Linear(64, 2)

    def forward(self, x):
        x = self.fc1(x)
        x = self.telu1(x)
        x = self.fc2(x)
        x = self.telu2(x)
        x = self.fc3(x)
        return x


# 焦点损失函数
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        BCE_loss = nn.CrossEntropyLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1 - pt) ** self.gamma * BCE_loss
        return F_loss.mean()


def calculate_metrics(y_true, y_pred, y_scores):
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'auprc': average_precision_score(y_true, y_scores[:, 1])
    }
    return metrics


def plot_confusion_matrix(y_true, y_pred, fold, dpi=720):
    """绘制正方形混淆矩阵"""
    cm = confusion_matrix(y_true, y_pred)
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100  # 百分比表示

    plt.figure(figsize=(6, 6))
    ax = sns.heatmap(cm_percentage, annot=False, fmt='.2f', cmap='Blues', square=True, cbar=False,
                     linewidths=2, linecolor='black')

    # 在每个格子中显示个数和百分比
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            # 判断字体颜色，深色背景用白色字体，浅色背景用黑色字体
            text_color = 'white' if cm_percentage[i, j] > 50 else 'black'
            ax.text(j + 0.5, i + 0.5, f'{cm[i, j]}\n({cm_percentage[i, j]:.2f}%)',
                    color=text_color, ha='center', va='center', fontsize=14, fontweight='bold')

    # 添加中文标签
    plt.xlabel('预测类别', fontsize=16, fontweight='bold')
    plt.ylabel('实际类别', fontsize=16, fontweight='bold')
    plt.xticks(ticks=np.arange(cm.shape[1]) + 0.5, labels=np.arange(cm.shape[1]), fontsize=14, fontweight='bold')
    plt.yticks(ticks=np.arange(cm.shape[0]) + 0.5, labels=np.arange(cm.shape[0]), fontsize=14, fontweight='bold')

    # 调整布局，减少空白边缘
    plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)

    # 保存混淆矩阵图
    plt.savefig(f'FFNN_best_fold_confusion_matrix_fold{fold}.png', dpi=dpi)
    plt.close()


def plot_pr_curve(y_true, y_scores, fold):
    precision, recall, _ = precision_recall_curve(y_true, y_scores[:, 1])
    auprc = auc(recall, precision)

    plt.figure()
    plt.plot(recall, precision, label=f'Fold {fold} (AUPRC = {auprc:.2f})')
    plt.xlabel('召回率', fontsize=14, fontweight='bold')
    plt.ylabel('精确率', fontsize=14, fontweight='bold')
    plt.title('Precision-Recall 曲线', fontsize=16, fontweight='bold')
    plt.legend()
    plt.savefig(f'pr_curve_fold{fold}.png')
    plt.close()


def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=500, patience=20):
    best_val_loss = float('inf')
    best_model = None
    patience_counter = 0

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * inputs.size(0)

        # 验证
        model.eval()
        val_loss = 0.0
        all_preds = []
        all_labels = []
        all_scores = []
        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * inputs.size(0)
                _, predicted = torch.max(outputs.data, 1)
                all_preds.extend(predicted.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
                all_scores.extend(torch.softmax(outputs, dim=1).cpu().numpy())

        # 计算指标
        train_loss = train_loss / len(train_loader.dataset)
        val_loss = val_loss / len(val_loader.dataset)
        metrics = calculate_metrics(all_labels, all_preds, np.array(all_scores))

        # 早停
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = copy.deepcopy(model.state_dict())
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f'Early stopping at epoch {epoch + 1}')
                break

        # 打印进度
        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch + 1}/{epochs} - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
            print(f"Val Metrics - Acc: {metrics['accuracy']:.4f}, F1: {metrics['f1']:.4f}, "
                  f"Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, "
                  f"AUPRC: {metrics['auprc']:.4f}")

    # 加载最佳模型
    model.load_state_dict(best_model)
    return model


def evaluate_model(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []
    all_scores = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            all_scores.extend(torch.softmax(outputs, dim=1).cpu().numpy())

    return all_labels, all_preds, np.array(all_scores)


def train_test_split(X, y, splits=10, epochs=500, batch_size=512, lr=0.001):
    k_fold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=2025)
    results = []

    # 存储所有折的PR曲线数据
    all_precisions = []
    all_recalls = []
    
    best_fold_metrics = None
    best_fold = -1
    best_y_true = None
    best_y_pred = None

    for fold, (train_idx, test_idx) in enumerate(k_fold.split(X, y)):
        print(f'\n{"=" * 50}')
        print(f'Fold {fold + 1}/{splits}')
        print(f'{"=" * 50}')

        # 分割数据
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # 标准化
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # 转换为张量
        X_train_tensor = torch.FloatTensor(X_train)
        y_train_tensor = torch.LongTensor(y_train)
        X_test_tensor = torch.FloatTensor(X_test)
        y_test_tensor = torch.LongTensor(y_test)

        # 创建数据集和数据加载器
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size)

        # 初始化模型、损失函数和优化器
        model = FFNN(input_size=X.shape[1])
        criterion = FocalLoss(alpha=0.25, gamma=2)
        optimizer = optim.Adam(model.parameters(), lr=lr)

        # 学习率调度器
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=10, factor=0.5)

        # 训练模型
        model = train_model(model, train_loader, test_loader, criterion, optimizer,
                            epochs=epochs, patience=20)

        # 评估测试集
        y_true, y_pred, y_scores = evaluate_model(model, test_loader)
        metrics = calculate_metrics(y_true, y_pred, y_scores)

        # 跟踪最佳折叠（根据AUPRC）
        if best_fold_metrics is None or metrics['auprc'] > best_fold_metrics['auprc']:
            best_fold_metrics = metrics
            best_fold = fold + 1
            best_y_true = y_true
            best_y_pred = y_pred

        # 保存PR曲线数据
        precision, recall, _ = precision_recall_curve(y_true, y_scores[:, 1])
        all_precisions.append(precision)
        all_recalls.append(recall)

        # 绘制当前折的PR曲线
        plot_pr_curve(y_true, y_scores, fold + 1)

        # 保存结果
        results.append(metrics)

        # 打印当前折的结果
        print(f'\nFold {fold + 1} Test Metrics:')
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"F1 Score: {metrics['f1']:.4f}")
        print(f"Precision: {metrics['precision']:.4f}")
        print(f"Recall: {metrics['recall']:.4f}")
        print(f"AUPRC: {metrics['auprc']:.4f}")

    # 计算并打印平均指标
    avg_metrics = {
        'accuracy': np.mean([r['accuracy'] for r in results]),
        'f1': np.mean([r['f1'] for r in results]),
        'precision': np.mean([r['precision'] for r in results]),
        'recall': np.mean([r['recall'] for r in results]),
        'auprc': np.mean([r['auprc'] for r in results])
    }

    std_metrics = {
        'accuracy': np.std([r['accuracy'] for r in results]),
        'f1': np.std([r['f1'] for r in results]),
        'precision': np.std([r['precision'] for r in results]),
        'recall': np.std([r['recall'] for r in results]),
        'auprc': np.std([r['auprc'] for r in results])
    }

    print('\n' + '=' * 50)
    print('Final Cross-Validation Results:')
    print('=' * 50)
    print(f"Average Accuracy: {avg_metrics['accuracy']:.4f} ± {std_metrics['accuracy']:.4f}")
    print(f"Average F1 Score: {avg_metrics['f1']:.4f} ± {std_metrics['f1']:.4f}")
    print(f"Average Precision: {avg_metrics['precision']:.4f} ± {std_metrics['precision']:.4f}")
    print(f"Average Recall: {avg_metrics['recall']:.4f} ± {std_metrics['recall']:.4f}")
    print(f"Average AUPRC: {avg_metrics['auprc']:.4f} ± {std_metrics['auprc']:.4f}")

    # 绘制最佳折的混淆矩阵
    if best_y_true is not None and best_y_pred is not None:
        print(f"\nPlotting confusion matrix for best fold: {best_fold}")
        plot_confusion_matrix(best_y_true, best_y_pred, best_fold, dpi=720)

    # 绘制平均PR曲线
    plt.figure(figsize=(8, 6))
    for i in range(splits):
        plt.plot(all_recalls[i], all_precisions[i], alpha=0.3, label=f'Fold {i + 1}')

    mean_precision = np.mean([p for p in all_precisions], axis=0)
    mean_recall = np.mean([r for r in all_recalls], axis=0)
    mean_auprc = auc(mean_recall, mean_precision)

    plt.plot(mean_recall, mean_precision, 'k-',
             label=f'Mean (AUPRC = {mean_auprc:.2f})', linewidth=2)
    plt.xlabel('召回率', fontsize=14, fontweight='bold')
    plt.ylabel('精确率', fontsize=14, fontweight='bold')
    plt.title('平均 Precision-Recall 曲线', fontsize=16, fontweight='bold')
    plt.legend()
    plt.savefig('average_pr_curve.png')
    plt.close()


# 加载数据
data = pd.read_csv('preparations/stroke_output.csv')

# 预处理分类变量
categorical_cols = ['ever_married', 'work_type', 'smoking_status']
data[categorical_cols] = data[categorical_cols].astype('category')

# 分离特征和目标
X = data.drop('stroke', axis=1)
y = data['stroke'].values

# 对分类变量进行独热编码
X = pd.get_dummies(X, columns=categorical_cols)

# 转换为numpy数组
X = X.values.astype(np.float32)

# 运行训练和评估
train_test_split(X, y, splits=10, epochs=500, batch_size=512, lr=0.001)


Fold 1/10
Epoch 10/500 - Train Loss: 0.0115, Val Loss: 0.0117
Val Metrics - Acc: 0.9511, F1: 0.0000, Precision: 0.0000, Recall: 0.0000, AUPRC: 0.1855
Epoch 20/500 - Train Loss: 0.0105, Val Loss: 0.0109
Val Metrics - Acc: 0.9511, F1: 0.0000, Precision: 0.0000, Recall: 0.0000, AUPRC: 0.2081
Epoch 30/500 - Train Loss: 0.0101, Val Loss: 0.0110
Val Metrics - Acc: 0.9511, F1: 0.0000, Precision: 0.0000, Recall: 0.0000, AUPRC: 0.1870
Epoch 40/500 - Train Loss: 0.0100, Val Loss: 0.0111
Val Metrics - Acc: 0.9491, F1: 0.0000, Precision: 0.0000, Recall: 0.0000, AUPRC: 0.1816
Early stopping at epoch 42

Fold 1 Test Metrics:
Accuracy: 0.9511
F1 Score: 0.0000
Precision: 0.0000
Recall: 0.0000
AUPRC: 0.2012

Fold 2/10
Epoch 10/500 - Train Loss: 0.0112, Val Loss: 0.0107
Val Metrics - Acc: 0.9511, F1: 0.0000, Precision: 0.0000, Recall: 0.0000, AUPRC: 0.1902
Epoch 20/500 - Train Loss: 0.0105, Val Loss: 0.0103
Val Metrics - Acc: 0.9511, F1: 0.0000, Precision: 0.0000, Recall: 0.0000, AUPRC: 0.1974
Epoch 30

In [67]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, average_precision_score,
                             precision_recall_curve, auc, confusion_matrix)
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import interpolate

# 设置字体为黑体，确保中文可见
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

def plot_confusion_matrix(y_true, y_pred, fold, dpi=720):
    """绘制正方形混淆矩阵"""
    cm = confusion_matrix(y_true, y_pred)
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100  # 百分比表示

    plt.figure(figsize=(6, 6))
    ax = sns.heatmap(cm_percentage, annot=False, fmt='.2f', cmap='Blues', square=True, cbar=False,
                     linewidths=2, linecolor='black')

    # 在每个格子中显示个数和百分比
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            # 判断字体颜色，深色背景用白色字体，浅色背景用黑色字体
            text_color = 'white' if cm_percentage[i, j] > 50 else 'black'
            ax.text(j + 0.5, i + 0.5, f'{cm[i, j]}\n({cm_percentage[i, j]:.2f}%)',
                    color=text_color, ha='center', va='center', fontsize=14, fontweight='bold')

    # 添加中文标签
    plt.xlabel('预测类别', fontsize=16, fontweight='bold')
    plt.ylabel('实际类别', fontsize=16, fontweight='bold')
    plt.xticks(ticks=np.arange(cm.shape[1]) + 0.5, labels=['0', '1'], fontsize=14, fontweight='bold')
    plt.yticks(ticks=np.arange(cm.shape[0]) + 0.5, labels=['0', '1'], fontsize=14, fontweight='bold')

    # 调整布局，减少空白边缘
    plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)

    # 保存混淆矩阵图
    plt.savefig(f'LGB_best_fold_confusion_matrix_fold{fold}.png', dpi=dpi)
    plt.close()

# 解决PR曲线长度不一致的问题
def interpolate_pr_curve(precision, recall, num_points=100):
    """将PR曲线插值到固定长度的点"""
    if len(precision) < 2 or len(recall) < 2:
        return np.linspace(0, 1, num_points), np.linspace(1, 0, num_points)

    # 确保recall是单调递增的
    sorted_indices = np.argsort(recall)
    recall = np.array(recall)[sorted_indices]
    precision = np.array(precision)[sorted_indices]

    # 插值
    f = interpolate.interp1d(recall, precision, bounds_error=False, fill_value=(precision[0], precision[-1]))
    new_recall = np.linspace(0, 1, num_points)
    new_precision = f(new_recall)
    return new_precision, new_recall


def calculate_metrics(y_true, y_pred, y_scores):
    """计算评估指标"""
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, zero_division=0),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'auprc': average_precision_score(y_true, y_scores)
    }

    # 添加混淆矩阵信息
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    metrics.update({
        'confusion_matrix': {
            'TN': tn, 'FP': fp, 'FN': fn, 'TP': tp
        },
        'specificity': tn / (tn + fp) if (tn + fp) > 0 else 0
    })
    return metrics


def plot_pr_curve(y_true, y_scores, fold, save_path=None):
    """绘制PR曲线并返回插值后的数据"""
    precision, recall, _ = precision_recall_curve(y_true, y_scores)
    auprc = auc(recall, precision)

    # 插值到固定长度
    interp_precision, interp_recall = interpolate_pr_curve(precision, recall)

    plt.figure()
    plt.plot(recall, precision, label=f'Fold {fold} (AUPRC = {auprc:.3f})')
    plt.xlabel('召回率', fontsize=14, fontweight='bold')
    plt.ylabel('精确率', fontsize=14, fontweight='bold')
    plt.title(f'PR Curve (Fold {fold})', fontsize=16, fontweight='bold')
    plt.legend()
    if save_path:
        plt.savefig(save_path)
    plt.close()

    return interp_precision, interp_recall, auprc


def train_lgb_model(X_train, y_train, X_val, y_val):
    """训练LightGBM模型"""
    params = {
        'objective': 'binary',
        'metric': ['binary_logloss', 'auc'],
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1,
        'seed': 42,
        'is_unbalance': True,  # 处理类别不平衡
        'min_child_samples': 20
    }

    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val)

    model = lgb.train(
        params,
        train_data,
        valid_sets=[val_data],
        num_boost_round=1000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=50, verbose=False),
            lgb.log_evaluation(period=50)
        ]
    )
    return model


def cross_validate(X, y, n_splits=10):
    """执行交叉验证"""
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    results = []
    all_interp_precisions = []
    interp_recall = np.linspace(0, 1, 100)  # 固定100个recall点
    
    best_fold_metrics = None
    best_fold = -1
    best_y_true = None
    best_y_pred = None

    for fold, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
        print(f'\n{"=" * 40} Fold {fold}/{n_splits} {"=" * 40}')

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # 标准化
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # SMOTE过采样
        smote = SMOTE(random_state=42, k_neighbors=min(5, sum(y_train == 1) - 1))
        X_res, y_res = smote.fit_resample(X_train_scaled, y_train)

        # 训练模型
        model = train_lgb_model(X_res, y_res, X_test_scaled, y_test)

        # 预测
        y_scores = model.predict(X_test_scaled)
        y_pred = (y_scores >= 0.5).astype(int)

        # 计算指标
        metrics = calculate_metrics(y_test, y_pred, y_scores)
        results.append(metrics)
        
        # 跟踪最佳折叠（根据AUPRC）
        if best_fold_metrics is None or metrics['auprc'] > best_fold_metrics['auprc']:
            best_fold_metrics = metrics
            best_fold = fold
            best_y_true = y_test
            best_y_pred = y_pred

        # PR曲线
        interp_precision, _, _ = plot_pr_curve(
            y_test, y_scores, fold,
            save_path=f'pr_curve_fold{fold}.png'
        )
        all_interp_precisions.append(interp_precision)

        # 打印结果
        print(f"\nFold {fold} Metrics:")
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"F1 Score: {metrics['f1']:.4f}")
        print(f"Precision: {metrics['precision']:.4f}")
        print(f"Recall/Sensitivity: {metrics['recall']:.4f}")
        print(f"Specificity: {metrics['specificity']:.4f}")
        print(f"AUPRC: {metrics['auprc']:.4f}")
        print(f"Confusion Matrix:")
        print(f"TN: {metrics['confusion_matrix']['TN']} | FP: {metrics['confusion_matrix']['FP']}")
        print(f"FN: {metrics['confusion_matrix']['FN']} | TP: {metrics['confusion_matrix']['TP']}")

    # 绘制最佳折的混淆矩阵
    if best_y_true is not None and best_y_pred is not None:
        print(f"\nPlotting confusion matrix for best fold: {best_fold}")
        plot_confusion_matrix(best_y_true, best_y_pred, best_fold, dpi=720)

    # 计算平均指标
    avg_metrics = {
        'accuracy': np.mean([r['accuracy'] for r in results]),
        'f1': np.mean([r['f1'] for r in results]),
        'precision': np.mean([r['precision'] for r in results]),
        'recall': np.mean([r['recall'] for r in results]),
        'specificity': np.mean([r['specificity'] for r in results]),
        'auprc': np.mean([r['auprc'] for r in results]),
    }

    std_metrics = {
        'accuracy': np.std([r['accuracy'] for r in results]),
        'f1': np.std([r['f1'] for r in results]),
        'precision': np.std([r['precision'] for r in results]),
        'recall': np.std([r['recall'] for r in results]),
        'specificity': np.std([r['specificity'] for r in results]),
        'auprc': np.std([r['auprc'] for r in results]),
    }

    # 绘制平均PR曲线
    if all_interp_precisions:
        mean_precision = np.mean(all_interp_precisions, axis=0)
        mean_auprc = auc(interp_recall, mean_precision)

        plt.figure(figsize=(10, 6))
        for i, prec in enumerate(all_interp_precisions, 1):
            plt.plot(interp_recall, prec, alpha=0.2, label=f'Fold {i}')

        plt.plot(interp_recall, mean_precision, 'r-',
                 linewidth=3, label=f'Mean (AUPRC = {mean_auprc:.3f})')
        plt.xlabel('召回率', fontsize=14, fontweight='bold')
        plt.ylabel('精确率', fontsize=14, fontweight='bold')
        plt.title('平均 Precision-Recall 曲线', fontsize=16, fontweight='bold')
        plt.legend()
        plt.savefig('average_pr_curve.png')
        plt.close()

    return avg_metrics, std_metrics


# 主程序
def main():
    # 加载数据
    data = pd.read_csv('preparations/stroke_output.csv')

    # 预处理
    categorical_cols = ['ever_married', 'work_type', 'smoking_status']
    data[categorical_cols] = data[categorical_cols].astype('category')
    X = data.drop('stroke', axis=1)
    y = data['stroke'].values

    # 独热编码
    X = pd.get_dummies(X, columns=categorical_cols)
    X = X.values.astype(np.float32)

    # 执行交叉验证
    print("Starting cross-validation...")
    avg_metrics, std_metrics = cross_validate(X, y, n_splits=10)

    # 打印最终结果
    print('\n' + '=' * 50)
    print('Final Cross-Validation Results:')
    print('=' * 50)
    print(f"Average Accuracy: {avg_metrics['accuracy']:.4f} ± {std_metrics['accuracy']:.4f}")
    print(f"Average F1 Score: {avg_metrics['f1']:.4f} ± {std_metrics['f1']:.4f}")
    print(f"Average Precision: {avg_metrics['precision']:.4f} ± {std_metrics['precision']:.4f}")
    print(f"Average Recall/Sensitivity: {avg_metrics['recall']:.4f} ± {std_metrics['recall']:.4f}")
    print(f"Average Specificity: {avg_metrics['specificity']:.4f} ± {std_metrics['specificity']:.4f}")
    print(f"Average AUPRC: {avg_metrics['auprc']:.4f} ± {std_metrics['auprc']:.4f}")


if __name__ == '__main__':
    main()

Starting cross-validation...

[50]	valid_0's binary_logloss: 0.303693	valid_0's auc: 0.839095

Fold 1 Metrics:
Accuracy: 0.8356
F1 Score: 0.2759
Precision: 0.1758
Recall/Sensitivity: 0.6400
Specificity: 0.8457
AUPRC: 0.2036
Confusion Matrix:
TN: 411 | FP: 75
FN: 9 | TP: 16

[50]	valid_0's binary_logloss: 0.30531	valid_0's auc: 0.789712

Fold 2 Metrics:
Accuracy: 0.8121
F1 Score: 0.2500
Precision: 0.1553
Recall/Sensitivity: 0.6400
Specificity: 0.8210
AUPRC: 0.1868
Confusion Matrix:
TN: 399 | FP: 87
FN: 9 | TP: 16

[50]	valid_0's binary_logloss: 0.320179	valid_0's auc: 0.878107

Fold 3 Metrics:
Accuracy: 0.7671
F1 Score: 0.2699
Precision: 0.1594
Recall/Sensitivity: 0.8800
Specificity: 0.7613
AUPRC: 0.2353
Confusion Matrix:
TN: 370 | FP: 116
FN: 3 | TP: 22

[50]	valid_0's binary_logloss: 0.318758	valid_0's auc: 0.832716
[100]	valid_0's binary_logloss: 0.240525	valid_0's auc: 0.816214

Fold 4 Metrics:
Accuracy: 0.8787
F1 Score: 0.2791
Precision: 0.1967
Recall/Sensitivity: 0.4800
Specificit

In [69]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, average_precision_score,
                             precision_recall_curve, auc, roc_auc_score,
                             confusion_matrix, classification_report)
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import interpolate

# 设置中文字体
plt.rcParams['font.sans-serif'] = ['SimHei']  # 设置中文字体为黑体
plt.rcParams['axes.unicode_minus'] = False  # 解决负号显示问题

def plot_confusion_matrix(y_true, y_pred, fold, dpi=720):
    """绘制正方形混淆矩阵"""
    cm = confusion_matrix(y_true, y_pred)
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100  # 百分比表示

    plt.figure(figsize=(6, 6))
    ax = sns.heatmap(cm_percentage, annot=False, fmt='.2f', cmap='Blues', square=True, cbar=False,
                     linewidths=2, linecolor='black')

    # 在每个格子中显示个数和百分比
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            # 判断字体颜色，深色背景用白色字体，浅色背景用黑色字体
            text_color = 'white' if cm_percentage[i, j] > 50 else 'black'
            ax.text(j + 0.5, i + 0.5, f'{cm[i, j]}\n({cm_percentage[i, j]:.2f}%)',
                    color=text_color, ha='center', va='center', fontsize=14, fontweight='bold')

    # 添加中文标签
    plt.xlabel('预测类别', fontsize=16, fontweight='bold')
    plt.ylabel('实际类别', fontsize=16, fontweight='bold')
    plt.xticks(ticks=np.arange(cm.shape[1]) + 0.5, labels=['0', '1'], fontsize=14, fontweight='bold')
    plt.yticks(ticks=np.arange(cm.shape[0]) + 0.5, labels=['0', '1'], fontsize=14, fontweight='bold')

    # 调整布局，减少空白边缘
    plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)

    # 保存混淆矩阵图
    plt.savefig(f'LR_best_fold_confusion_matrix_fold{fold}.png', dpi=dpi)
    plt.close()

def calculate_metrics(y_true, y_pred, y_scores):
    if np.isnan(y_scores).any():
        y_scores = np.nan_to_num(y_scores)

    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, zero_division=0),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
    }

    try:
        metrics['auprc'] = average_precision_score(y_true, y_scores)
        metrics['auroc'] = roc_auc_score(y_true, y_scores)
    except:
        print("无法计算AUPRC/AUROC，使用默认值0")
        metrics['auprc'] = 0
        metrics['auroc'] = 0

    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    metrics['specificity'] = tn / (tn + fp) if (tn + fp) > 0 else 0
    metrics['sensitivity'] = tp / (tp + fn) if (tp + fn) > 0 else 0

    return metrics

def interpolate_pr_curve(precision, recall):
    f = interpolate.interp1d(recall, precision, bounds_error=False, fill_value=(1.0, 0.0))
    new_recall = np.linspace(0, 1, 100)
    new_precision = f(new_recall)
    return new_precision, new_recall

def plot_pr_curve(y_true, y_scores, fold):
    try:
        precision, recall, _ = precision_recall_curve(y_true, y_scores)
        auprc = auc(recall, precision)

        plt.figure()
        plt.plot(recall, precision, label=f'Fold {fold} (AUPRC = {auprc:.2f})')
        plt.xlabel('召回率', fontsize=14, fontweight='bold')
        plt.ylabel('精确率', fontsize=14, fontweight='bold')
        plt.title('精确率-召回率曲线', fontsize=16, fontweight='bold')
        plt.legend()
        plt.savefig(f'pr_curve_fold{fold}.png')
        plt.close()
        return precision, recall
    except Exception as e:
        print(f"无法绘制Fold {fold}的PR曲线: {str(e)}")
        return None, None

def train_test_split(X, y, splits=10):
    print(f"数据形状: X={X.shape}, y={y.shape}")
    print(f"类别分布: {np.bincount(y)}")

    try:
        feature_names = data.drop('stroke', axis=1).columns.tolist()
    except:
        feature_names = [f'Feature_{i}' for i in range(X.shape[1])]

    X = np.nan_to_num(X)
    y = np.nan_to_num(y).astype(int)

    k_fold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=2025)
    results = []
    weights = []
    all_y_true = []
    all_y_pred = []
    all_y_scores = []

    interp_precisions = []
    interp_recalls = np.linspace(0, 1, 100)
    
    best_fold_metrics = None
    best_fold = -1
    best_y_true = None
    best_y_pred = None

    for fold, (train_idx, test_idx) in enumerate(k_fold.split(X, y)):
        print(f'\n{"=" * 50}')
        print(f'Fold {fold + 1}/{splits}')
        print(f'{"=" * 50}')

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear', random_state=42)
        model.fit(X_train, y_train)

        fold_weights = {
            'intercept': model.intercept_[0],
            'coefficients': model.coef_[0]
        }
        weights.append(fold_weights)

        y_scores = model.predict_proba(X_test)[:, 1]
        y_pred = model.predict(X_test)

        all_y_true.extend(y_test)
        all_y_pred.extend(y_pred)
        all_y_scores.extend(y_scores)

        metrics = calculate_metrics(y_test, y_pred, y_scores)
        
        # 跟踪最佳折叠（根据AUPRC）
        if best_fold_metrics is None or metrics['auprc'] > best_fold_metrics['auprc']:
            best_fold_metrics = metrics
            best_fold = fold + 1
            best_y_true = y_test
            best_y_pred = y_pred

        precision, recall = plot_pr_curve(y_test, y_scores, fold + 1)
        if precision is not None and recall is not None:
            interp_precision, _ = interpolate_pr_curve(precision, recall)
            interp_precisions.append(interp_precision)

        results.append(metrics)

        print(f'\nFold {fold + 1} 测试集指标:')
        print(f"准确度: {metrics['accuracy']:.4f}")
        print(f"F1分数: {metrics['f1']:.4f}")
        print(f"精确率: {metrics['precision']:.4f}")
        print(f"召回率: {metrics['recall']:.4f}")
        print(f"特异性: {metrics['specificity']:.4f}")
        print(f"AUPRC: {metrics['auprc']:.4f}")
        print(f"AUROC: {metrics['auroc']:.4f}")
        print("\n分类报告:")
        print(classification_report(y_test, y_pred, zero_division=0))

    # 绘制最佳折的混淆矩阵
    if best_y_true is not None and best_y_pred is not None:
        print(f"\n绘制最佳折的混淆矩阵 (Fold {best_fold})")
        plot_confusion_matrix(best_y_true, best_y_pred, best_fold, dpi=720)

    avg_metrics = {
        'accuracy': np.mean([r['accuracy'] for r in results]),
        'f1': np.mean([r['f1'] for r in results]),
        'precision': np.mean([r['precision'] for r in results]),
        'recall': np.mean([r['recall'] for r in results]),
        'specificity': np.mean([r['specificity'] for r in results]),
        'auprc': np.mean([r['auprc'] for r in results]),
        'auroc': np.mean([r['auroc'] for r in results])
    }

    std_metrics = {
        'accuracy': np.std([r['accuracy'] for r in results]),
        'f1': np.std([r['f1'] for r in results]),
        'precision': np.std([r['precision'] for r in results]),
        'recall': np.std([r['recall'] for r in results]),
        'specificity': np.std([r['specificity'] for r in results]),
        'auprc': np.std([r['auprc'] for r in results]),
        'auroc': np.std([r['auroc'] for r in results])
    }

    print('\n' + '=' * 50)
    print('交叉验证最终结果:')
    print('=' * 50)
    print(f"平均准确度: {avg_metrics['accuracy']:.4f} ± {std_metrics['accuracy']:.4f}")
    print(f"平均F1分数: {avg_metrics['f1']:.4f} ± {std_metrics['f1']:.4f}")
    print(f"平均精确率: {avg_metrics['precision']:.4f} ± {std_metrics['precision']:.4f}")
    print(f"平均召回率: {avg_metrics['recall']:.4f} ± {std_metrics['recall']:.4f}")
    print(f"平均特异性: {avg_metrics['specificity']:.4f} ± {std_metrics['specificity']:.4f}")
    print(f"平均AUPRC: {avg_metrics['auprc']:.4f} ± {std_metrics['auprc']:.4f}")
    print(f"平均AUROC: {avg_metrics['auroc']:.4f} ± {std_metrics['auroc']:.4f}")

    print('\n整体分类报告:')
    print(classification_report(all_y_true, all_y_pred, zero_division=0))

    print('\n' + '=' * 50)
    print('各折模型权重:')
    print('=' * 50)
    for i, fold_weight in enumerate(weights):
        print(f'\nFold {i + 1} 权重:')
        print(f"截距项 (bias): {fold_weight['intercept']:.4f}")
        for name, coef in zip(feature_names, fold_weight['coefficients']):
            print(f"{name}: {coef:.4f}")

    avg_intercept = np.mean([w['intercept'] for w in weights])
    avg_coefficients = np.mean([w['coefficients'] for w in weights], axis=0)

    print('\n' + '=' * 50)
    print('跨折平均权重:')
    print('=' * 50)
    print(f"平均截距项: {avg_intercept:.4f}")
    for name, coef in zip(feature_names, avg_coefficients):
        print(f"{name}: {coef:.4f}")

    if interp_precisions:
        plt.figure(figsize=(8, 6))
        mean_precision = np.mean(interp_precisions, axis=0)
        mean_auprc = auc(interp_recalls, mean_precision)

        for i, prec in enumerate(interp_precisions):
            plt.plot(interp_recalls, prec, alpha=0.3, label=f'Fold {i + 1}')

        plt.plot(interp_recalls, mean_precision, 'k-',
                 label=f'平均 (AUPRC = {mean_auprc:.2f})', linewidth=2)
        plt.xlabel('召回率', fontsize=14, fontweight='bold')
        plt.ylabel('精确率', fontsize=14, fontweight='bold')
        plt.title('平均精确率-召回率曲线', fontsize=16, fontweight='bold')
        plt.legend()
        plt.savefig('average_pr_curve.png')
        plt.close()

# 加载数据
data = pd.read_csv('preparations/stroke_output.csv')  # 请替换为您的实际文件路径

# 检查数据
print("数据前5行:")
print(data.head())
print("\n类别分布:")
print(data['stroke'].value_counts())

# 分离特征和目标
X = data.drop('stroke', axis=1).values
y = data['stroke'].values

# 运行训练和评估
train_test_split(X, y, splits=10)

数据前5行:
    age  hypertension  heart_disease  ever_married  work_type  \
0  67.0             0              1             1        3.0   
1  61.0             0              0             1        2.0   
2  80.0             0              1             1        3.0   
3  49.0             0              0             1        3.0   
4  79.0             1              0             1        2.0   

   avg_glucose_level        bmi  smoking_status  stroke  
0             228.69  36.600000               1       1  
1             202.21  28.893237               0       1  
2             105.92  32.500000               0       1  
3             171.23  34.400000               2       1  
4             174.12  24.000000               0       1  

类别分布:
stroke
0    4861
1     249
Name: count, dtype: int64
数据形状: X=(5110, 8), y=(5110,)
类别分布: [4861  249]

Fold 1/10

Fold 1 测试集指标:
准确度: 0.9511
F1分数: 0.0000
精确率: 0.0000
召回率: 0.0000
特异性: 1.0000
AUPRC: 0.2353
AUROC: 0.8531

分类报告:
              precision  

In [70]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, average_precision_score,
                             precision_recall_curve, auc, confusion_matrix)
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import interpolate
from sklearn.ensemble import RandomForestClassifier

# 设置中文字体
plt.rcParams['font.sans-serif'] = ['SimHei']  # 设置中文字体为黑体
plt.rcParams['axes.unicode_minus'] = False  # 解决负号显示问题

def plot_confusion_matrix(y_true, y_pred, fold, dpi=720):
    """绘制正方形混淆矩阵"""
    cm = confusion_matrix(y_true, y_pred)
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100  # 百分比表示

    plt.figure(figsize=(6, 6))
    ax = sns.heatmap(cm_percentage, annot=False, fmt='.2f', cmap='Blues', square=True, cbar=False,
                     linewidths=2, linecolor='black')

    # 在每个格子中显示个数和百分比
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            # 判断字体颜色，深色背景用白色字体，浅色背景用黑色字体
            text_color = 'white' if cm_percentage[i, j] > 50 else 'black'
            ax.text(j + 0.5, i + 0.5, f'{cm[i, j]}\n({cm_percentage[i, j]:.2f}%)',
                    color=text_color, ha='center', va='center', fontsize=14, fontweight='bold')

    # 添加中文标签
    plt.xlabel('预测类别', fontsize=16, fontweight='bold')
    plt.ylabel('实际类别', fontsize=16, fontweight='bold')
    plt.xticks(ticks=np.arange(cm.shape[1]) + 0.5, labels=['0', '1'], fontsize=14, fontweight='bold')
    plt.yticks(ticks=np.arange(cm.shape[0]) + 0.5, labels=['0', '1'], fontsize=14, fontweight='bold')

    # 调整布局，减少空白边缘
    plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)

    # 保存混淆矩阵图
    plt.savefig(f'RF_best_fold_confusion_matrix_fold{fold}.png', dpi=dpi)
    plt.close()

def interpolate_pr_curve(precision, recall, num_points=100):
    """将PR曲线插值到固定长度的点"""
    if len(precision) < 2 or len(recall) < 2:
        return np.linspace(0, 1, num_points), np.linspace(1, 0, num_points)

    # 确保recall是单调递增的
    sorted_indices = np.argsort(recall)
    recall = np.array(recall)[sorted_indices]
    precision = np.array(precision)[sorted_indices]

    # 插值
    f = interpolate.interp1d(recall, precision, bounds_error=False, fill_value=(precision[0], precision[-1]))
    new_recall = np.linspace(0, 1, num_points)
    new_precision = f(new_recall)
    return new_precision, new_recall

def calculate_metrics(y_true, y_pred, y_scores):
    """计算评估指标"""
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, zero_division=0),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'auprc': average_precision_score(y_true, y_scores)
    }

    # 添加混淆矩阵信息
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    metrics.update({
        'confusion_matrix': {
            'TN': tn, 'FP': fp, 'FN': fn, 'TP': tp
        },
        'specificity': tn / (tn + fp) if (tn + fp) > 0 else 0
    })
    return metrics

def plot_pr_curve(y_true, y_scores, fold, save_path=None):
    """绘制PR曲线并返回插值后的数据"""
    precision, recall, _ = precision_recall_curve(y_true, y_scores)
    auprc = auc(recall, precision)

    # 插值到固定长度
    interp_precision, interp_recall = interpolate_pr_curve(precision, recall)

    plt.figure()
    plt.plot(recall, precision, label=f'Fold {fold} (AUPRC = {auprc:.3f})')
    plt.xlabel('召回率', fontsize=14, fontweight='bold')
    plt.ylabel('精确率', fontsize=14, fontweight='bold')
    plt.title(f'PR Curve (Fold {fold})', fontsize=16, fontweight='bold')
    plt.legend()
    if save_path:
        plt.savefig(save_path)
    plt.close()

    return interp_precision, interp_recall, auprc

def train_rf_model(X_train, y_train, X_val, y_val):
    """训练Random Forest模型"""
    model = RandomForestClassifier(
        n_estimators=100, 
        random_state=42, 
        class_weight='balanced', 
        n_jobs=-1,
        max_depth=5,
        min_samples_split=5,
        min_samples_leaf=2
    )
    
    # 训练模型
    model.fit(X_train, y_train)
    return model

def cross_validate(X, y, n_splits=10):
    """执行交叉验证"""
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    results = []
    all_interp_precisions = []
    interp_recall = np.linspace(0, 1, 100)  # 固定100个recall点
    
    best_fold_metrics = None
    best_fold = -1
    best_y_true = None
    best_y_pred = None

    for fold, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
        print(f'\n{"=" * 40} Fold {fold}/{n_splits} {"=" * 40}')

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # 标准化
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # SMOTE过采样
        smote = SMOTE(random_state=42, k_neighbors=min(5, sum(y_train == 1) - 1))
        X_res, y_res = smote.fit_resample(X_train_scaled, y_train)

        # 训练模型
        model = train_rf_model(X_res, y_res, X_test_scaled, y_test)

        # 预测
        y_scores = model.predict_proba(X_test_scaled)[:, 1]  # 获取正类的预测概率
        y_pred = model.predict(X_test_scaled)

        # 计算指标
        metrics = calculate_metrics(y_test, y_pred, y_scores)
        results.append(metrics)
        
        # 跟踪最佳折叠（根据AUPRC）
        if best_fold_metrics is None or metrics['auprc'] > best_fold_metrics['auprc']:
            best_fold_metrics = metrics
            best_fold = fold
            best_y_true = y_test
            best_y_pred = y_pred

        # PR曲线
        interp_precision, _, _ = plot_pr_curve(
            y_test, y_scores, fold,
            save_path=f'pr_curve_fold{fold}.png'
        )
        all_interp_precisions.append(interp_precision)

        # 打印结果
        print(f"\nFold {fold} Metrics:")
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"F1 Score: {metrics['f1']:.4f}")
        print(f"Precision: {metrics['precision']:.4f}")
        print(f"Recall/Sensitivity: {metrics['recall']:.4f}")
        print(f"Specificity: {metrics['specificity']:.4f}")
        print(f"AUPRC: {metrics['auprc']:.4f}")
        print(f"Confusion Matrix:")
        print(f"TN: {metrics['confusion_matrix']['TN']} | FP: {metrics['confusion_matrix']['FP']}")
        print(f"FN: {metrics['confusion_matrix']['FN']} | TP: {metrics['confusion_matrix']['TP']}")

    # 绘制最佳折的混淆矩阵
    if best_y_true is not None and best_y_pred is not None:
        print(f"\nPlotting confusion matrix for best fold: {best_fold}")
        plot_confusion_matrix(best_y_true, best_y_pred, best_fold, dpi=720)

    # 计算平均指标
    avg_metrics = {
        'accuracy': np.mean([r['accuracy'] for r in results]),
        'f1': np.mean([r['f1'] for r in results]),
        'precision': np.mean([r['precision'] for r in results]),
        'recall': np.mean([r['recall'] for r in results]),
        'specificity': np.mean([r['specificity'] for r in results]),
        'auprc': np.mean([r['auprc'] for r in results]),
    }

    std_metrics = {
        'accuracy': np.std([r['accuracy'] for r in results]),
        'f1': np.std([r['f1'] for r in results]),
        'precision': np.std([r['precision'] for r in results]),
        'recall': np.std([r['recall'] for r in results]),
        'specificity': np.std([r['specificity'] for r in results]),
        'auprc': np.std([r['auprc'] for r in results]),
    }

    # 绘制平均PR曲线
    if all_interp_precisions:
        mean_precision = np.mean(all_interp_precisions, axis=0)
        mean_auprc = auc(interp_recall, mean_precision)

        plt.figure(figsize=(10, 6))
        for i, prec in enumerate(all_interp_precisions, 1):
            plt.plot(interp_recall, prec, alpha=0.2, label=f'Fold {i}')

        plt.plot(interp_recall, mean_precision, 'r-',
                 linewidth=3, label=f'Mean (AUPRC = {mean_auprc:.3f})')
        plt.xlabel('召回率', fontsize=14, fontweight='bold')
        plt.ylabel('精确率', fontsize=14, fontweight='bold')
        plt.title('平均 Precision-Recall 曲线', fontsize=16, fontweight='bold')
        plt.legend()
        plt.savefig('average_pr_curve.png')
        plt.close()

    return avg_metrics, std_metrics

def main():
    # 加载数据
    data = pd.read_csv('preparations/stroke_output.csv')

    # 预处理
    categorical_cols = ['ever_married', 'work_type', 'smoking_status']
    data[categorical_cols] = data[categorical_cols].astype('category')
    X = data.drop('stroke', axis=1)
    y = data['stroke'].values

    # 独热编码
    X = pd.get_dummies(X, columns=categorical_cols)
    X = X.values.astype(np.float32)

    # 执行交叉验证
    print("Starting cross-validation...")
    avg_metrics, std_metrics = cross_validate(X, y, n_splits=10)

    # 打印最终结果
    print('\n' + '=' * 50)
    print('Final Cross-Validation Results:')
    print('=' * 50)
    print(f"Average Accuracy: {avg_metrics['accuracy']:.4f} ± {std_metrics['accuracy']:.4f}")
    print(f"Average F1 Score: {avg_metrics['f1']:.4f} ± {std_metrics['f1']:.4f}")
    print(f"Average Precision: {avg_metrics['precision']:.4f} ± {std_metrics['precision']:.4f}")
    print(f"Average Recall/Sensitivity: {avg_metrics['recall']:.4f} ± {std_metrics['recall']:.4f}")
    print(f"Average Specificity: {avg_metrics['specificity']:.4f} ± {std_metrics['specificity']:.4f}")
    print(f"Average AUPRC: {avg_metrics['auprc']:.4f} ± {std_metrics['auprc']:.4f}")

if __name__ == '__main__':
    main()

Starting cross-validation...


Fold 1 Metrics:
Accuracy: 0.7123
F1 Score: 0.2054
Precision: 0.1187
Recall/Sensitivity: 0.7600
Specificity: 0.7099
AUPRC: 0.2017
Confusion Matrix:
TN: 345 | FP: 141
FN: 6 | TP: 19


Fold 2 Metrics:
Accuracy: 0.7025
F1 Score: 0.1828
Precision: 0.1056
Recall/Sensitivity: 0.6800
Specificity: 0.7037
AUPRC: 0.2360
Confusion Matrix:
TN: 342 | FP: 144
FN: 8 | TP: 17


Fold 3 Metrics:
Accuracy: 0.6908
F1 Score: 0.2255
Precision: 0.1285
Recall/Sensitivity: 0.9200
Specificity: 0.6790
AUPRC: 0.2447
Confusion Matrix:
TN: 330 | FP: 156
FN: 2 | TP: 23


Fold 4 Metrics:
Accuracy: 0.7182
F1 Score: 0.2088
Precision: 0.1210
Recall/Sensitivity: 0.7600
Specificity: 0.7160
AUPRC: 0.1400
Confusion Matrix:
TN: 348 | FP: 138
FN: 6 | TP: 19


Fold 5 Metrics:
Accuracy: 0.7123
F1 Score: 0.2304
Precision: 0.1325
Recall/Sensitivity: 0.8800
Specificity: 0.7037
AUPRC: 0.1738
Confusion Matrix:
TN: 342 | FP: 144
FN: 3 | TP: 22


Fold 6 Metrics:
Accuracy: 0.6986
F1 Score: 0.1979
Precision

In [71]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, average_precision_score,
                             precision_recall_curve, auc, confusion_matrix)
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import interpolate
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# 设置中文字体
plt.rcParams['font.sans-serif'] = ['SimHei']  # 设置中文字体为黑体
plt.rcParams['axes.unicode_minus'] = False  # 解决负号显示问题

def plot_confusion_matrix(y_true, y_pred, fold, dpi=720):
    """绘制正方形混淆矩阵"""
    cm = confusion_matrix(y_true, y_pred)
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100  # 百分比表示

    plt.figure(figsize=(6, 6))
    ax = sns.heatmap(cm_percentage, annot=False, fmt='.2f', cmap='Blues', square=True, cbar=False,
                     linewidths=2, linecolor='black')

    # 在每个格子中显示个数和百分比
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            # 判断字体颜色，深色背景用白色字体，浅色背景用黑色字体
            text_color = 'white' if cm_percentage[i, j] > 50 else 'black'
            ax.text(j + 0.5, i + 0.5, f'{cm[i, j]}\n({cm_percentage[i, j]:.2f}%)',
                    color=text_color, ha='center', va='center', fontsize=14, fontweight='bold')

    # 添加中文标签
    plt.xlabel('预测类别', fontsize=16, fontweight='bold')
    plt.ylabel('实际类别', fontsize=16, fontweight='bold')
    plt.xticks(ticks=np.arange(cm.shape[1]) + 0.5, labels=['0', '1'], fontsize=14, fontweight='bold')
    plt.yticks(ticks=np.arange(cm.shape[0]) + 0.5, labels=['0', '1'], fontsize=14, fontweight='bold')

    # 调整布局，减少空白边缘
    plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)

    # 保存混淆矩阵图
    plt.savefig(f'SVM_best_fold_confusion_matrix_fold{fold}.png', dpi=dpi)
    plt.close()

def interpolate_pr_curve(precision, recall, num_points=100):
    """将PR曲线插值到固定长度的点"""
    if len(precision) < 2 or len(recall) < 2:
        return np.linspace(0, 1, num_points), np.linspace(1, 0, num_points)

    # 确保recall是单调递增的
    sorted_indices = np.argsort(recall)
    recall = np.array(recall)[sorted_indices]
    precision = np.array(precision)[sorted_indices]

    # 插值
    f = interpolate.interp1d(recall, precision, bounds_error=False, fill_value=(precision[0], precision[-1]))
    new_recall = np.linspace(0, 1, num_points)
    new_precision = f(new_recall)
    return new_precision, new_recall

def calculate_metrics(y_true, y_pred, y_scores):
    """计算评估指标"""
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, zero_division=0),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'auprc': average_precision_score(y_true, y_scores)
    }

    # 添加混淆矩阵信息
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    metrics.update({
        'confusion_matrix': {
            'TN': tn, 'FP': fp, 'FN': fn, 'TP': tp
        },
        'specificity': tn / (tn + fp) if (tn + fp) > 0 else 0
    })
    return metrics

def plot_pr_curve(y_true, y_scores, fold, save_path=None):
    """绘制PR曲线并返回插值后的数据"""
    precision, recall, _ = precision_recall_curve(y_true, y_scores)
    auprc = auc(recall, precision)

    # 插值到固定长度
    interp_precision, interp_recall = interpolate_pr_curve(precision, recall)

    plt.figure()
    plt.plot(recall, precision, label=f'Fold {fold} (AUPRC = {auprc:.3f})')
    plt.xlabel('召回率', fontsize=14, fontweight='bold')
    plt.ylabel('精确率', fontsize=14, fontweight='bold')
    plt.title(f'PR Curve (Fold {fold})', fontsize=16, fontweight='bold')
    plt.legend()
    if save_path:
        plt.savefig(save_path)
    plt.close()

    return interp_precision, interp_recall, auprc

def train_svm_model(X_train, y_train, X_val, y_val):
    """训练SVM模型"""
    svm = SVC(probability=True, random_state=42)

    # 使用网格搜索调参来寻找最优参数
    param_grid = {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    }
    grid_search = GridSearchCV(svm, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    print(f"Best Parameters: {grid_search.best_params_}")

    # 使用最优参数训练SVM
    best_model = grid_search.best_estimator_
    best_model.fit(X_train, y_train)

    return best_model

def cross_validate(X, y, n_splits=10):
    """执行交叉验证"""
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    results = []
    all_interp_precisions = []
    interp_recall = np.linspace(0, 1, 100)  # 固定100个recall点
    
    best_fold_metrics = None
    best_fold = -1
    best_y_true = None
    best_y_pred = None

    for fold, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
        print(f'\n{"=" * 40} Fold {fold}/{n_splits} {"=" * 40}')

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # 标准化
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # SMOTE过采样
        smote = SMOTE(random_state=42, k_neighbors=min(5, sum(y_train == 1) - 1))
        X_res, y_res = smote.fit_resample(X_train_scaled, y_train)

        # 训练SVM模型
        model = train_svm_model(X_res, y_res, X_test_scaled, y_test)

        # 预测
        y_scores = model.predict_proba(X_test_scaled)[:, 1]  # 取正类的预测概率
        y_pred = (y_scores >= 0.5).astype(int)

        # 计算指标
        metrics = calculate_metrics(y_test, y_pred, y_scores)
        results.append(metrics)
        
        # 跟踪最佳折叠（根据AUPRC）
        if best_fold_metrics is None or metrics['auprc'] > best_fold_metrics['auprc']:
            best_fold_metrics = metrics
            best_fold = fold
            best_y_true = y_test
            best_y_pred = y_pred

        # PR曲线
        interp_precision, _, _ = plot_pr_curve(
            y_test, y_scores, fold,
            save_path=f'pr_curve_fold{fold}.png'
        )
        all_interp_precisions.append(interp_precision)

        # 打印结果
        print(f"\nFold {fold} Metrics:")
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"F1 Score: {metrics['f1']:.4f}")
        print(f"Precision: {metrics['precision']:.4f}")
        print(f"Recall/Sensitivity: {metrics['recall']:.4f}")
        print(f"Specificity: {metrics['specificity']:.4f}")
        print(f"AUPRC: {metrics['auprc']:.4f}")
        print(f"Confusion Matrix: {metrics['confusion_matrix']}")

    # 绘制最佳折的混淆矩阵
    if best_y_true is not None and best_y_pred is not None:
        print(f"\nPlotting confusion matrix for best fold: {best_fold}")
        plot_confusion_matrix(best_y_true, best_y_pred, best_fold, dpi=720)

    # 计算平均指标
    avg_metrics = {
        'accuracy': np.mean([r['accuracy'] for r in results]),
        'f1': np.mean([r['f1'] for r in results]),
        'precision': np.mean([r['precision'] for r in results]),
        'recall': np.mean([r['recall'] for r in results]),
        'specificity': np.mean([r['specificity'] for r in results]),
        'auprc': np.mean([r['auprc'] for r in results]),
    }

    std_metrics = {
        'accuracy': np.std([r['accuracy'] for r in results]),
        'f1': np.std([r['f1'] for r in results]),
        'precision': np.std([r['precision'] for r in results]),
        'recall': np.std([r['recall'] for r in results]),
        'specificity': np.std([r['specificity'] for r in results]),
        'auprc': np.std([r['auprc'] for r in results]),
    }

    # 绘制平均PR曲线
    if all_interp_precisions:
        mean_precision = np.mean(all_interp_precisions, axis=0)
        mean_auprc = auc(interp_recall, mean_precision)

        plt.figure(figsize=(10, 6))
        for i, prec in enumerate(all_interp_precisions, 1):
            plt.plot(interp_recall, prec, alpha=0.2, label=f'Fold {i}')

        plt.plot(interp_recall, mean_precision, 'r-',
                 linewidth=3, label=f'Mean (AUPRC = {mean_auprc:.3f})')
        plt.xlabel('召回率', fontsize=14, fontweight='bold')
        plt.ylabel('精确率', fontsize=14, fontweight='bold')
        plt.title('平均 Precision-Recall 曲线', fontsize=16, fontweight='bold')
        plt.legend()
        plt.savefig('average_pr_curve.png')
        plt.close()

    return avg_metrics, std_metrics

def main():
    # 加载数据
    data = pd.read_csv('preparations/stroke_output.csv')

    # 预处理
    categorical_cols = ['ever_married', 'work_type', 'smoking_status']
    data[categorical_cols] = data[categorical_cols].astype('category')
    X = data.drop('stroke', axis=1)
    y = data['stroke'].values

    # 独热编码
    X = pd.get_dummies(X, columns=categorical_cols)
    X = X.values.astype(np.float32)

    # 执行交叉验证
    print("Starting cross-validation...")
    avg_metrics, std_metrics = cross_validate(X, y, n_splits=10)

    # 打印最终结果
    print('\n' + '=' * 50)
    print('Final Cross-Validation Results:')
    print('=' * 50)
    print(f"Average Accuracy: {avg_metrics['accuracy']:.4f} ± {std_metrics['accuracy']:.4f}")
    print(f"Average F1 Score: {avg_metrics['f1']:.4f} ± {std_metrics['f1']:.4f}")
    print(f"Average Precision: {avg_metrics['precision']:.4f} ± {std_metrics['precision']:.4f}")
    print(f"Average Recall/Sensitivity: {avg_metrics['recall']:.4f} ± {std_metrics['recall']:.4f}")
    print(f"Average Specificity: {avg_metrics['specificity']:.4f} ± {std_metrics['specificity']:.4f}")
    print(f"Average AUPRC: {avg_metrics['auprc']:.4f} ± {std_metrics['auprc']:.4f}")

if __name__ == '__main__':
    main()

Starting cross-validation...

Best Parameters: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}

Fold 1 Metrics:
Accuracy: 0.8141
F1 Score: 0.2149
Precision: 0.1354
Recall/Sensitivity: 0.5200
Specificity: 0.8292
AUPRC: 0.2136
Confusion Matrix: {'TN': 403, 'FP': 83, 'FN': 12, 'TP': 13}

Best Parameters: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}

Fold 2 Metrics:
Accuracy: 0.8121
F1 Score: 0.1864
Precision: 0.1183
Recall/Sensitivity: 0.4400
Specificity: 0.8313
AUPRC: 0.1647
Confusion Matrix: {'TN': 404, 'FP': 82, 'FN': 14, 'TP': 11}

Best Parameters: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}

Fold 3 Metrics:
Accuracy: 0.8278
F1 Score: 0.2542
Precision: 0.1613
Recall/Sensitivity: 0.6000
Specificity: 0.8395
AUPRC: 0.1843
Confusion Matrix: {'TN': 408, 'FP': 78, 'FN': 10, 'TP': 15}

Best Parameters: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}

Fold 4 Metrics:
Accuracy: 0.8239
F1 Score: 0.1176
Precision: 0.0779
Recall/Sensitivity: 0.2400
Specificity: 0.8539
AUPRC: 0.1411
Confusion Matrix: {'TN'

In [72]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                           recall_score, average_precision_score,
                           precision_recall_curve, auc, confusion_matrix)
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import interpolate

# 设置中文字体
plt.rcParams['font.sans-serif'] = ['SimHei']  # 设置中文字体为黑体
plt.rcParams['axes.unicode_minus'] = False  # 解决负号显示问题

def plot_confusion_matrix(y_true, y_pred, fold, dpi=720):
    """绘制正方形混淆矩阵"""
    cm = confusion_matrix(y_true, y_pred)
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100  # 百分比表示

    plt.figure(figsize=(6, 6))
    ax = sns.heatmap(cm_percentage, annot=False, fmt='.2f', cmap='Blues', square=True, cbar=False,
                     linewidths=2, linecolor='black')

    # 在每个格子中显示个数和百分比
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            # 判断字体颜色，深色背景用白色字体，浅色背景用黑色字体
            text_color = 'white' if cm_percentage[i, j] > 50 else 'black'
            ax.text(j + 0.5, i + 0.5, f'{cm[i, j]}\n({cm_percentage[i, j]:.2f}%)',
                    color=text_color, ha='center', va='center', fontsize=14, fontweight='bold')

    # 添加中文标签
    plt.xlabel('预测类别', fontsize=16, fontweight='bold')
    plt.ylabel('实际类别', fontsize=16, fontweight='bold')
    plt.xticks(ticks=np.arange(cm.shape[1]) + 0.5, labels=['0', '1'], fontsize=14, fontweight='bold')
    plt.yticks(ticks=np.arange(cm.shape[0]) + 0.5, labels=['0', '1'], fontsize=14, fontweight='bold')

    # 调整布局，减少空白边缘
    plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)

    # 保存混淆矩阵图
    plt.savefig(f'XGB_best_fold_confusion_matrix_fold{fold}.png', dpi=dpi)
    plt.close()

def interpolate_pr_curve(precision, recall, num_points=100):
    """将PR曲线插值到固定长度的点"""
    if len(precision) < 2 or len(recall) < 2:
        return np.linspace(0, 1, num_points), np.linspace(1, 0, num_points)

    # 确保recall是单调递增的
    sorted_indices = np.argsort(recall)
    recall = np.array(recall)[sorted_indices]
    precision = np.array(precision)[sorted_indices]

    # 插值
    f = interpolate.interp1d(recall, precision, bounds_error=False, fill_value=(precision[0], precision[-1]))
    new_recall = np.linspace(0, 1, num_points)
    new_precision = f(new_recall)
    return new_precision, new_recall

def calculate_metrics(y_true, y_pred, y_scores):
    """计算评估指标"""
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, zero_division=0),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'auprc': average_precision_score(y_true, y_scores)
    }

    # 添加混淆矩阵信息
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    metrics.update({
        'confusion_matrix': {
            'TN': tn, 'FP': fp, 'FN': fn, 'TP': tp
        },
        'specificity': tn / (tn + fp) if (tn + fp) > 0 else 0
    })
    return metrics

def plot_pr_curve(y_true, y_scores, fold, save_path=None):
    """绘制PR曲线并返回插值后的数据"""
    precision, recall, _ = precision_recall_curve(y_true, y_scores)
    auprc = auc(recall, precision)

    # 插值到固定长度
    interp_precision, interp_recall = interpolate_pr_curve(precision, recall)

    plt.figure()
    plt.plot(recall, precision, label=f'Fold {fold} (AUPRC = {auprc:.3f})')
    plt.xlabel('召回率', fontsize=14, fontweight='bold')
    plt.ylabel('精确率', fontsize=14, fontweight='bold')
    plt.title(f'PR Curve (Fold {fold})', fontsize=16, fontweight='bold')
    plt.legend()
    if save_path:
        plt.savefig(save_path)
    plt.close()

    return interp_precision, interp_recall, auprc

def train_xgb_model(X_train, y_train, X_val, y_val):
    """训练XGBoost模型"""
    params = {
        'objective': 'binary:logistic',
        'eval_metric': ['auc', 'logloss'],
        'learning_rate': 0.05,
        'max_depth': 6,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'scale_pos_weight': float(np.sum(y_train == 0)) / np.sum(y_train == 1),  # 处理类别不平衡
        'seed': 42,
        'n_jobs': -1
    }

    # 将数据转换为DMatrix格式
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    # 训练XGBoost模型
    watchlist = [(dtrain, 'train'), (dval, 'eval')]
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        evals=watchlist,
        early_stopping_rounds=50,
        verbose_eval=50
    )
    return model

def cross_validate(X, y, n_splits=10):
    """执行交叉验证"""
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    results = []
    all_interp_precisions = []
    interp_recall = np.linspace(0, 1, 100)  # 固定100个recall点
    
    best_fold_metrics = None
    best_fold = -1
    best_y_true = None
    best_y_pred = None

    for fold, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
        print(f'\n{"=" * 40} Fold {fold}/{n_splits} {"=" * 40}')

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # 标准化
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # SMOTE过采样
        smote = SMOTE(random_state=42, k_neighbors=min(5, sum(y_train == 1) - 1))
        X_res, y_res = smote.fit_resample(X_train_scaled, y_train)

        # 训练模型
        model = train_xgb_model(X_res, y_res, X_test_scaled, y_test)

        # 预测
        dtest = xgb.DMatrix(X_test_scaled)
        y_scores = model.predict(dtest)
        y_pred = (y_scores >= 0.5).astype(int)

        # 计算指标
        metrics = calculate_metrics(y_test, y_pred, y_scores)
        results.append(metrics)
        
        # 跟踪最佳折叠（根据AUPRC）
        if best_fold_metrics is None or metrics['auprc'] > best_fold_metrics['auprc']:
            best_fold_metrics = metrics
            best_fold = fold
            best_y_true = y_test
            best_y_pred = y_pred

        # PR曲线
        interp_precision, _, _ = plot_pr_curve(
            y_test, y_scores, fold,
            save_path=f'pr_curve_fold{fold}.png'
        )
        all_interp_precisions.append(interp_precision)

        # 打印结果
        print(f"\nFold {fold} Metrics:")
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"F1 Score: {metrics['f1']:.4f}")
        print(f"Precision: {metrics['precision']:.4f}")
        print(f"Recall/Sensitivity: {metrics['recall']:.4f}")
        print(f"Specificity: {metrics['specificity']:.4f}")
        print(f"AUPRC: {metrics['auprc']:.4f}")
        print(f"Confusion Matrix:")
        print(f"TN: {metrics['confusion_matrix']['TN']} | FP: {metrics['confusion_matrix']['FP']}")
        print(f"FN: {metrics['confusion_matrix']['FN']} | TP: {metrics['confusion_matrix']['TP']}")

    # 绘制最佳折的混淆矩阵
    if best_y_true is not None and best_y_pred is not None:
        print(f"\nPlotting confusion matrix for best fold: {best_fold}")
        plot_confusion_matrix(best_y_true, best_y_pred, best_fold, dpi=720)

    # 计算平均指标
    avg_metrics = {
        'accuracy': np.mean([r['accuracy'] for r in results]),
        'f1': np.mean([r['f1'] for r in results]),
        'precision': np.mean([r['precision'] for r in results]),
        'recall': np.mean([r['recall'] for r in results]),
        'specificity': np.mean([r['specificity'] for r in results]),
        'auprc': np.mean([r['auprc'] for r in results]),
    }

    std_metrics = {
        'accuracy': np.std([r['accuracy'] for r in results]),
        'f1': np.std([r['f1'] for r in results]),
        'precision': np.std([r['precision'] for r in results]),
        'recall': np.std([r['recall'] for r in results]),
        'specificity': np.std([r['specificity'] for r in results]),
        'auprc': np.std([r['auprc'] for r in results]),
    }

    # 绘制平均PR曲线
    if all_interp_precisions:
        mean_precision = np.mean(all_interp_precisions, axis=0)
        mean_auprc = auc(interp_recall, mean_precision)

        plt.figure(figsize=(10, 6))
        for i, prec in enumerate(all_interp_precisions, 1):
            plt.plot(interp_recall, prec, alpha=0.2, label=f'Fold {i}')

        plt.plot(interp_recall, mean_precision, 'r-',
                 linewidth=3, label=f'Mean (AUPRC = {mean_auprc:.3f})')
        plt.xlabel('召回率', fontsize=14, fontweight='bold')
        plt.ylabel('精确率', fontsize=14, fontweight='bold')
        plt.title('平均 Precision-Recall 曲线', fontsize=16, fontweight='bold')
        plt.legend()
        plt.savefig('average_pr_curve.png')
        plt.close()

    return avg_metrics, std_metrics

def main():
    # 加载数据
    data = pd.read_csv('preparations/stroke_output.csv')

    # 预处理
    categorical_cols = ['ever_married', 'work_type', 'smoking_status']
    data[categorical_cols] = data[categorical_cols].astype('category')
    X = data.drop('stroke', axis=1)
    y = data['stroke'].values

    # 独热编码
    X = pd.get_dummies(X, columns=categorical_cols)
    X = X.values.astype(np.float32)

    # 执行交叉验证
    print("Starting cross-validation...")
    avg_metrics, std_metrics = cross_validate(X, y, n_splits=10)

    # 打印最终结果
    print('\n' + '=' * 50)
    print('Final Cross-Validation Results:')
    print('=' * 50)
    print(f"Average Accuracy: {avg_metrics['accuracy']:.4f} ± {std_metrics['accuracy']:.4f}")
    print(f"Average F1 Score: {avg_metrics['f1']:.4f} ± {std_metrics['f1']:.4f}")
    print(f"Average Precision: {avg_metrics['precision']:.4f} ± {std_metrics['precision']:.4f}")
    print(f"Average Recall/Sensitivity: {avg_metrics['recall']:.4f} ± {std_metrics['recall']:.4f}")
    print(f"Average Specificity: {avg_metrics['specificity']:.4f} ± {std_metrics['specificity']:.4f}")
    print(f"Average AUPRC: {avg_metrics['auprc']:.4f} ± {std_metrics['auprc']:.4f}")

if __name__ == '__main__':
    main()

Starting cross-validation...

[0]	train-auc:0.88631	train-logloss:0.66888	eval-auc:0.82753	eval-logloss:0.67050
[50]	train-auc:0.95287	train-logloss:0.33305	eval-auc:0.83539	eval-logloss:0.39765
[100]	train-auc:0.97344	train-logloss:0.26400	eval-auc:0.82749	eval-logloss:0.34244
[150]	train-auc:0.98386	train-logloss:0.21969	eval-auc:0.82074	eval-logloss:0.30879
[200]	train-auc:0.98950	train-logloss:0.19017	eval-auc:0.80996	eval-logloss:0.28692
[250]	train-auc:0.99321	train-logloss:0.16653	eval-auc:0.80222	eval-logloss:0.27429
[300]	train-auc:0.99541	train-logloss:0.14761	eval-auc:0.79514	eval-logloss:0.26700
[350]	train-auc:0.99678	train-logloss:0.13198	eval-auc:0.79761	eval-logloss:0.25917
[400]	train-auc:0.99773	train-logloss:0.11844	eval-auc:0.79366	eval-logloss:0.25722
[450]	train-auc:0.99833	train-logloss:0.10852	eval-auc:0.79078	eval-logloss:0.25423
[500]	train-auc:0.99881	train-logloss:0.09896	eval-auc:0.78840	eval-logloss:0.25217
[550]	train-auc:0.99915	train-logloss:0.09071	eva