# 随机森林

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import tensorflow as tf
import statsmodels.api as sm
from sklearn.svm import SVC
from sklearn.manifold import TSNE
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import pearsonr, pointbiserialr, spearmanr
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer, mean_squared_error, r2_score, mean_absolute_error, classification_report, accuracy_score, roc_auc_score, confusion_matrix, roc_curve, auc, silhouette_score, precision_recall_curve, f1_score, precision_score, recall_score
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Embedding, Flatten, Concatenate
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

In [2]:
# 设置随机种子以确保结果的可重复性
np.random.seed(42)
tf.random.set_seed(42)

In [3]:
# 加载数据
file_path = "已知数据.xlsx"
df = pd.read_excel(file_path)

In [4]:
X = df[['PR','腋窝淋巴结状态','手术前怀孕','目前月经情况','ki-67','HER2+FISH','治疗后怀孕','治疗后生产','LN转移个数','放疗', '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','手术方式','化疗方案','内分泌治疗方案']]

In [5]:
y = df['标签']

In [6]:
# 连续变量
continuous_vars = ['PR', 'ki-67']

In [7]:
# 使用分层划分以保持类别比例
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [8]:
# 2. 连续变量：进行标准化
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])

In [9]:
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

In [10]:
# 计算类别权重
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

In [11]:
# 定义随机森林分类器
rf = RandomForestClassifier(random_state=42, class_weight='balanced')

In [12]:
# 重建参数网格 - 严格限制复杂度
param_grid = {
    'n_estimators': [100, 150],       # 减少树数量
    'max_depth': [3, 5, 7],           # 严格限制深度!
    'min_samples_split': [10, 20],     # 大幅提高
    'min_samples_leaf': [10, 20],      # 最低10个样本
    'max_features': ['sqrt'],          # 固定推荐值
    'class_weight': [None, 'balanced', class_weight_dict]    # 强制类别平衡
}

In [13]:
# 创建F1评估器
f1_scorer = make_scorer(f1_score, pos_label=1)

In [14]:
# 使用网格搜索和交叉验证来寻找最佳参数
#grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='roc_auc')
grid_search = GridSearchCV(
                    estimator=rf,
                    param_grid=param_grid,
                    scoring=f1_scorer,  # 关键修改！用F1替代准确率
                    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
                    n_jobs=-1
)

In [15]:
# 拟合模型
grid_search.fit(X_train, y_train)

In [16]:
# 在测试集上评估
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# 输出评估指标
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print("最佳参数：", grid_search.best_params_)
print(f"测试集准确率 Accuracy: {accuracy:.4f}")
print(f"测试集召回率 Recall: {recall:.4f}")
print(f"测试集F1分数: {f1:.4f}")
print(f"测试集AUC值: {auc:.4f}")
print("\n测试集分类报告：\n", classification_report(y_test, y_pred))

最佳参数： {'class_weight': {0: 0.6183431952662722, 1: 2.6125}, 'max_depth': 7, 'max_features': 'sqrt', 'min_samples_leaf': 20, 'min_samples_split': 10, 'n_estimators': 100}
测试集准确率 Accuracy: 0.6667
测试集召回率 Recall: 0.2917
测试集F1分数: 0.2500
测试集AUC值: 0.6111

测试集分类报告：
               precision    recall  f1-score   support

           0       0.82      0.75      0.79       102
           1       0.22      0.29      0.25        24

    accuracy                           0.67       126
   macro avg       0.52      0.52      0.52       126
weighted avg       0.70      0.67      0.68       126



In [17]:
# 导入ROC曲线函数
from sklearn.metrics import roc_curve

# 计算FPR, TPR, 阈值
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

# 可保存为文件用于后续绘图
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("rf_roc_curve.csv", index=False)
print("ROC数据已保存为 rf_roc_curve.csv")

ROC数据已保存为 rf_roc_curve.csv


In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, classification_report, make_scorer

# 设置随机种子
np.random.seed(42)
tf.random.set_seed(42)

# 加载数据
file_path = "预测.xlsx"
df = pd.read_excel(file_path)

# 特征与标签
X = df[['PR', '腋窝淋巴结状态', '手术前怀孕', '目前月经情况', 'ki-67', 'HER2+FISH', '治疗后怀孕',
        '治疗后生产', 'LN转移个数', '放疗', '化疗期间是否应用诺雷德',
        '靶向治疗（赫赛汀或赫赛汀+帕捷特）', '手术方式', '化疗方案', '内分泌治疗方案']]
y = df['标签']

# 连续变量标准化
continuous_vars = ['PR', 'ki-67']

# 从前627行中划分出20%作为测试集
X_front = X.iloc[:627]
y_front = y.iloc[:627]
X_front_train, X_test, y_front_train, y_test = train_test_split(
    X_front, y_front, test_size=0.2, stratify=y_front, random_state=42
)

# 剩下的数据（从第628行开始）作为训练集的一部分
X_rest = X.iloc[627:]
y_rest = y.iloc[627:]

# 拼接训练数据：前627行的80% + 剩下所有行
X_train = pd.concat([X_front_train, X_rest], axis=0)
y_train = pd.concat([y_front_train, y_rest], axis=0)

# 2. 连续变量：进行标准化
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])

X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

# 计算类别权重
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

# 定义随机森林
rf = RandomForestClassifier(random_state=42, class_weight='balanced')

# 参数网格
param_grid = {
    'n_estimators': [100, 150],
    'max_depth': [3, 5, 7],
    'min_samples_split': [10, 20],
    'min_samples_leaf': [10, 20],
    'max_features': ['sqrt'],
    'class_weight': [None, 'balanced', class_weight_dict]
}

# F1 scorer
f1_scorer = make_scorer(f1_score, pos_label=1)

# 网格搜索 + 交叉验证（在训练集上）
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

# 测试集评估
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# 输出评估指标
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print("最佳参数：", grid_search.best_params_)
print(f"测试集准确率 Accuracy: {accuracy:.4f}")
print(f"测试集召回率 Recall: {recall:.4f}")
print(f"测试集F1分数: {f1:.4f}")
print(f"测试集AUC值: {auc:.4f}")
print("\n测试集分类报告：\n", classification_report(y_test, y_pred))

最佳参数： {'class_weight': 'balanced', 'max_depth': 7, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 10, 'n_estimators': 100}
测试集准确率 Accuracy: 0.6667
测试集召回率 Recall: 0.5417
测试集F1分数: 0.3824
测试集AUC值: 0.7124

测试集分类报告：
               precision    recall  f1-score   support

           0       0.87      0.70      0.77       102
           1       0.30      0.54      0.38        24

    accuracy                           0.67       126
   macro avg       0.58      0.62      0.58       126
weighted avg       0.76      0.67      0.70       126



In [2]:
# 导入ROC曲线函数
from sklearn.metrics import roc_curve

# 计算FPR, TPR, 阈值
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

# 可保存为文件用于后续绘图
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("rf_bjd_roc_curve.csv", index=False)
print("ROC数据已保存为 rf_bjd_roc_curve.csv")

ROC数据已保存为 rf_bjd_roc_curve.csv


# XGBoost

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, classification_report, make_scorer
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

# 设置随机种子以确保结果可重复
np.random.seed(42)
tf.random.set_seed(42)

# 加载数据
file_path = "已知数据.xlsx"
df = pd.read_excel(file_path)

# 特征和标签
X = df[['PR','腋窝淋巴结状态','手术前怀孕','目前月经情况','ki-67','HER2+FISH','治疗后怀孕','治疗后生产','LN转移个数','放疗', 
        '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','手术方式','化疗方案','内分泌治疗方案']]
y = df['标签']

# 连续变量
continuous_vars = ['PR', 'ki-67']

# 分层划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# 标准化连续变量
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

# 计算类别权重（XGBoost 不直接支持 class_weight，但我们可以通过 scale_pos_weight 参数近似实现）
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}
# 如果是二分类：类别为0和1，设置 scale_pos_weight = neg/pos
pos_weight = class_weight_dict[0] / class_weight_dict[1]

# 定义XGBoost分类器
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# 参数网格（根据XGBoost调参经验设置）
param_grid = {
    'n_estimators': [100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'min_child_weight': [1, 5],
    'scale_pos_weight': [pos_weight],  # 加入类别不平衡处理
}

# F1作为评估指标
f1_scorer = make_scorer(f1_score, pos_label=1)

# 网格搜索
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1
)

# 模型训练
grid_search.fit(X_train, y_train)

# 最佳模型评估
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# 评估指标
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print("最佳参数：", grid_search.best_params_)
print(f"测试集准确率 Accuracy: {accuracy:.4f}")
print(f"测试集召回率 Recall: {recall:.4f}")
print(f"测试集F1分数: {f1:.4f}")
print(f"测试集AUC值: {auc:.4f}")
print("\n测试集分类报告：\n", classification_report(y_test, y_pred))

最佳参数： {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 150, 'scale_pos_weight': 0.23668639053254442, 'subsample': 0.8}
测试集准确率 Accuracy: 0.7937
测试集召回率 Recall: 0.0417
测试集F1分数: 0.0714
测试集AUC值: 0.6462

测试集分类报告：
               precision    recall  f1-score   support

           0       0.81      0.97      0.88       102
           1       0.25      0.04      0.07        24

    accuracy                           0.79       126
   macro avg       0.53      0.51      0.48       126
weighted avg       0.70      0.79      0.73       126



In [2]:
# 导入ROC曲线函数
from sklearn.metrics import roc_curve

# 计算FPR, TPR, 阈值
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

# 可保存为文件用于后续绘图
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("xgb_roc_curve.csv", index=False)
print("ROC数据已保存为 xgb_roc_curve.csv")

ROC数据已保存为 xgb_roc_curve.csv


In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, classification_report, make_scorer
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

# 设置随机种子以确保结果可重复
np.random.seed(42)
tf.random.set_seed(42)

# 加载数据
file_path = "预测.xlsx"
df = pd.read_excel(file_path)

# 特征和标签
X = df[['PR','腋窝淋巴结状态','手术前怀孕','目前月经情况','ki-67','HER2+FISH','治疗后怀孕','治疗后生产','LN转移个数','放疗', 
        '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','手术方式','化疗方案','内分泌治疗方案']]
y = df['标签']

# 连续变量
continuous_vars = ['PR', 'ki-67']

# 从前627行中划分出20%作为测试集
X_front = X.iloc[:627]
y_front = y.iloc[:627]
X_front_train, X_test, y_front_train, y_test = train_test_split(
    X_front, y_front, test_size=0.2, stratify=y_front, random_state=42
)

# 剩下的数据（从第628行开始）作为训练集的一部分
X_rest = X.iloc[627:]
y_rest = y.iloc[627:]

# 拼接训练数据：前627行的80% + 剩下所有行
X_train = pd.concat([X_front_train, X_rest], axis=0)
y_train = pd.concat([y_front_train, y_rest], axis=0)

# 标准化连续变量
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

# 计算类别权重（XGBoost 不直接支持 class_weight，但我们可以通过 scale_pos_weight 参数近似实现）
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}
# 如果是二分类：类别为0和1，设置 scale_pos_weight = neg/pos
pos_weight = class_weight_dict[0] / class_weight_dict[1]

# 定义XGBoost分类器
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# 参数网格（根据XGBoost调参经验设置）
param_grid = {
    'n_estimators': [100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'min_child_weight': [1, 5],
    'scale_pos_weight': [pos_weight],  # 加入类别不平衡处理
}

# F1作为评估指标
f1_scorer = make_scorer(f1_score, pos_label=1)

# 网格搜索
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1
)

# 模型训练
grid_search.fit(X_train, y_train)

# 最佳模型评估
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# 评估指标
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print("最佳参数：", grid_search.best_params_)
print(f"测试集准确率 Accuracy: {accuracy:.4f}")
print(f"测试集召回率 Recall: {recall:.4f}")
print(f"测试集F1分数: {f1:.4f}")
print(f"测试集AUC值: {auc:.4f}")
print("\n测试集分类报告：\n", classification_report(y_test, y_pred))

最佳参数： {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 150, 'scale_pos_weight': 0.5810185185185186, 'subsample': 0.8}
测试集准确率 Accuracy: 0.7540
测试集召回率 Recall: 0.5000
测试集F1分数: 0.4364
测试集AUC值: 0.6977

测试集分类报告：
               precision    recall  f1-score   support

           0       0.87      0.81      0.84       102
           1       0.39      0.50      0.44        24

    accuracy                           0.75       126
   macro avg       0.63      0.66      0.64       126
weighted avg       0.78      0.75      0.77       126



In [2]:
# 导入ROC曲线函数
from sklearn.metrics import roc_curve

# 计算FPR, TPR, 阈值
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

# 可保存为文件用于后续绘图
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("xgb_bjd_roc_curve.csv", index=False)
print("ROC数据已保存为 xgb_bjd_roc_curve.csv")

ROC数据已保存为 xgb_bjd_roc_curve.csv


# BP神经网络

In [1]:
import random
import numpy as np
import pandas as pd
import xgboost as xgb
import seaborn as sns
import lightgbm as lgb
import tensorflow as tf
import statsmodels.api as sm
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, classification_report, accuracy_score, roc_auc_score, confusion_matrix, roc_curve, auc, silhouette_score, precision_recall_curve, f1_score, precision_score, recall_score
from scipy.stats import pearsonr, pointbiserialr, spearmanr
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Embedding, Flatten, Concatenate
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

In [2]:
# 设置随机种子以确保结果的可重复性
random.seed(42)
np.random.seed(42)
tf.keras.utils.set_random_seed(42)

In [3]:
# 加载数据
file_path = "已知数据.xlsx"
df = pd.read_excel(file_path)
X = df[['LN转移个数','腋窝淋巴结状态','PR', 'HER2+FISH', 'ki-67', '手术前怀孕', '治疗后怀孕', '治疗后生产','目前月经情况','手术方式','放疗', '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','化疗方案', '内分泌治疗方案']]
y = df['标签']
# 连续变量
continuous_vars = ['PR', 'ki-67']
# 使用分层划分以保持类别比例
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
# 2. 连续变量：进行标准化
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])
# 计算类别权重
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

# 定义神经网络模型
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=X_train.shape[1]))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# 编译模型
adam = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

# 定义早停回调
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# 训练模型
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, 
                    class_weight=class_weight_dict, callbacks=[early_stopping],shuffle=False,verbose=1)

# 评估模型
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')

# 进行预测
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int).flatten()

# 导入必要的评估指标库
from sklearn.metrics import f1_score, roc_auc_score

# 计算F1值（加权平均适用于二分类）
f1 = f1_score(y_test, y_pred, average='binary')

# 计算AUC值
auc = roc_auc_score(y_test, y_pred_prob)

# 输出结果
print(f'Test F1 Score: {f1:.4f}')
print(f'Test AUC: {auc:.4f}')

# 输出分类报告和混淆矩阵
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [4]:
# 计算FPR、TPR和阈值
from sklearn.metrics import roc_curve
import pandas as pd

# 注意：y_pred_prob 是模型对 X_test 的概率预测结果
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)


# 保存为CSV文件供后续绘图使用
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("bp_roc_curve.csv", index=False)
print("ROC 曲线数据已保存至 bp_roc_curve.csv")

ROC 曲线数据已保存至 bp_roc_curve.csv


In [5]:
import random
import numpy as np
import pandas as pd
import xgboost as xgb
import seaborn as sns
import lightgbm as lgb
import tensorflow as tf
import statsmodels.api as sm
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, classification_report, accuracy_score, roc_auc_score, confusion_matrix, roc_curve, auc, silhouette_score, precision_recall_curve, f1_score, precision_score, recall_score
from scipy.stats import pearsonr, pointbiserialr, spearmanr
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Embedding, Flatten, Concatenate
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# 设置随机种子以确保结果的可重复性
random.seed(42)
np.random.seed(42)
tf.keras.utils.set_random_seed(42)

# 加载数据
file_path = "预测.xlsx"
df = pd.read_excel(file_path)
X = df[['LN转移个数','腋窝淋巴结状态','PR', 'HER2+FISH', 'ki-67', '手术前怀孕', '治疗后怀孕', '治疗后生产','目前月经情况','手术方式','放疗', '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','化疗方案', '内分泌治疗方案']]
y = df['标签']
# 连续变量
continuous_vars = ['PR', 'ki-67']

# 从前627行中划分出20%作为测试集
X_front = X.iloc[:627]
y_front = y.iloc[:627]
X_front_train, X_test, y_front_train, y_test = train_test_split(
    X_front, y_front, test_size=0.2, stratify=y_front, random_state=42
)

# 剩下的数据（从第628行开始）作为训练集的一部分
X_rest = X.iloc[627:]
y_rest = y.iloc[627:]

# 拼接训练数据：前627行的80% + 剩下所有行
X_train = pd.concat([X_front_train, X_rest], axis=0)
y_train = pd.concat([y_front_train, y_rest], axis=0)

# 2. 连续变量：进行标准化
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])
# 计算类别权重
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

# 定义神经网络模型
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=X_train.shape[1]))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# 编译模型
adam = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

# 定义早停回调
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# 训练模型
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, 
                    class_weight=class_weight_dict, callbacks=[early_stopping],shuffle=False,verbose=1)

# 评估模型
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')

# 进行预测
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int).flatten()

# 导入必要的评估指标库
from sklearn.metrics import f1_score, roc_auc_score

# 计算F1值（加权平均适用于二分类）
f1 = f1_score(y_test, y_pred, average='binary')

# 计算AUC值
auc = roc_auc_score(y_test, y_pred_prob)

# 输出结果
print(f'Test F1 Score: {f1:.4f}')
print(f'Test AUC: {auc:.4f}')

# 输出分类报告和混淆矩阵
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [6]:
# 计算FPR、TPR和阈值
from sklearn.metrics import roc_curve
import pandas as pd

# 注意：y_pred_prob 是模型对 X_test 的概率预测结果
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)


# 保存为CSV文件供后续绘图使用
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("bp_bjd_roc_curve.csv", index=False)
print("ROC 曲线数据已保存至 bp_bjd_roc_curve.csv")

ROC 曲线数据已保存至 bp_bjd_roc_curve.csv


# MLP

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report

# 设置随机种子
np.random.seed(42)

# 读取数据
file_path = "已知数据.xlsx"
df = pd.read_excel(file_path)

# 特征与标签
X = df[['LN转移个数', '腋窝淋巴结状态', 'PR', 'HER2+FISH', 'ki-67', 
        '手术前怀孕', '治疗后怀孕', '治疗后生产', '目前月经情况', '手术方式',
        '放疗', '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）',
        '化疗方案', '内分泌治疗方案']]
y = df['标签']

# 连续变量标准化
continuous_vars = ['PR', 'ki-67']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

# 样本权重处理类别不平衡
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# 定义MLP模型
mlp = MLPClassifier(max_iter=500, random_state=42)

# 参数搜索空间（适合小样本）
param_grid = {
    'hidden_layer_sizes': [(32,), (64,), (32, 32), (64, 32)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam'],  # 'lbfgs' 也可尝试，但不支持 sample_weight
    'alpha': [0.0001, 0.001],  # L2正则项
    'learning_rate': ['constant', 'adaptive'],
}

# 分层K折交叉验证
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 网格搜索
grid_search = GridSearchCV(
    estimator=mlp,
    param_grid=param_grid,
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

# 拟合
grid_search.fit(X_train, y_train)

# 最佳模型
best_mlp = grid_search.best_estimator_
y_pred = best_mlp.predict(X_test)
y_proba = best_mlp.predict_proba(X_test)[:, 1]

# 评估
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best Parameters: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (64,), 'learning_rate': 'constant', 'solver': 'adam'}
Accuracy: 0.7619
F1 Score: 0.1176
ROC AUC: 0.6818

Confusion Matrix:
[[94  8]
 [22  2]]

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.92      0.86       102
           1       0.20      0.08      0.12        24

    accuracy                           0.76       126
   macro avg       0.51      0.50      0.49       126
weighted avg       0.69      0.76      0.72       126





In [2]:
# 输出FPR、TPR和Thresholds
from sklearn.metrics import roc_curve
import pandas as pd

fpr, tpr, thresholds = roc_curve(y_test, y_proba)


# 保存为CSV文件
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("mlp_roc_curve.csv", index=False)
print("ROC 曲线数据已保存为 mlp_roc_curve.csv")

ROC 曲线数据已保存为 mlp_roc_curve.csv


In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report

# 设置随机种子
np.random.seed(42)

# 读取数据
file_path = "预测.xlsx"
df = pd.read_excel(file_path)

# 特征与标签
X = df[['LN转移个数', '腋窝淋巴结状态', 'PR', 'HER2+FISH', 'ki-67', 
        '手术前怀孕', '治疗后怀孕', '治疗后生产', '目前月经情况', '手术方式',
        '放疗', '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）',
        '化疗方案', '内分泌治疗方案']]
y = df['标签']

# 连续变量标准化
continuous_vars = ['PR', 'ki-67']
# 从前627行中划分出20%作为测试集
X_front = X.iloc[:627]
y_front = y.iloc[:627]
X_front_train, X_test, y_front_train, y_test = train_test_split(
    X_front, y_front, test_size=0.2, stratify=y_front, random_state=42
)

# 剩下的数据（从第628行开始）作为训练集的一部分
X_rest = X.iloc[627:]
y_rest = y.iloc[627:]

# 拼接训练数据：前627行的80% + 剩下所有行
X_train = pd.concat([X_front_train, X_rest], axis=0)
y_train = pd.concat([y_front_train, y_rest], axis=0)

scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

# 样本权重处理类别不平衡
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# 定义MLP模型
mlp = MLPClassifier(max_iter=500, random_state=42)

# 参数搜索空间（适合小样本）
param_grid = {
    'hidden_layer_sizes': [(32,), (64,), (32, 32), (64, 32)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam'],  # 'lbfgs' 也可尝试，但不支持 sample_weight
    'alpha': [0.0001, 0.001],  # L2正则项
    'learning_rate': ['constant', 'adaptive'],
}

# 分层K折交叉验证
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 网格搜索
grid_search = GridSearchCV(
    estimator=mlp,
    param_grid=param_grid,
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

# 拟合
grid_search.fit(X_train, y_train)

# 最佳模型
best_mlp = grid_search.best_estimator_
y_pred = best_mlp.predict(X_test)
y_proba = best_mlp.predict_proba(X_test)[:, 1]

# 评估
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best Parameters: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (32,), 'learning_rate': 'constant', 'solver': 'adam'}
Accuracy: 0.7381
F1 Score: 0.4211
ROC AUC: 0.7324

Confusion Matrix:
[[81 21]
 [12 12]]

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.79      0.83       102
           1       0.36      0.50      0.42        24

    accuracy                           0.74       126
   macro avg       0.62      0.65      0.63       126
weighted avg       0.77      0.74      0.75       126





In [4]:
# 输出FPR、TPR和Thresholds
from sklearn.metrics import roc_curve
import pandas as pd

fpr, tpr, thresholds = roc_curve(y_test, y_proba)


# 保存为CSV文件
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("mlp_bjd_roc_curve.csv", index=False)
print("ROC 曲线数据已保存为 mlp_bjd_roc_curve.csv")

ROC 曲线数据已保存为 mlp_bjd_roc_curve.csv


# 图神经网络

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.neighbors import kneighbors_graph
import umap
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dropout, Dense

from spektral.data import Dataset, Graph
from spektral.data.loaders import SingleLoader
from spektral.layers import GCNConv

#random.seed(42)
np.random.seed(42)
tf.keras.utils.set_random_seed(42)

# 1. —— 数据加载与预处理 —— 
file_path = "已知数据.xlsx"
df = pd.read_excel(file_path)

# 原始特征和标签
X = df[['LN转移个数','腋窝淋巴结状态','PR', 'HER2+FISH', 'ki-67', 
        '手术前怀孕', '治疗后怀孕', '治疗后生产','目前月经情况','手术方式','放疗', 
        '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','化疗方案', '内分泌治疗方案']]
y = df['标签']

# 连续变量要做标准化
continuous_vars = ['PR', 'ki-67']

# 分层采样划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 标准化连续变量
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

# 计算训练集中每个类别的权重，用来给节点级别的损失加权
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(y_train),
                                     y=y_train)
class_weight_dict = {i: w for i, w in enumerate(class_weights)}

# 将标签转换为 one-hot 编码（二分类）
y_train_oh = tf.keras.utils.to_categorical(y_train, num_classes=2)
y_test_oh = tf.keras.utils.to_categorical(y_test, num_classes=2)

k = 49

knn_train = kneighbors_graph(
    X_train.values, 
    n_neighbors=k, 
    mode='connectivity', 
    include_self=False
).toarray().astype(np.float32)

knn_train = np.maximum(knn_train, knn_train.T)

knn_test = kneighbors_graph(
    X_test.values, 
    n_neighbors=k, 
    mode='connectivity', 
    include_self=False
).toarray().astype(np.float32)
knn_test = np.maximum(knn_test, knn_test.T)

class CustomGraphDataset(Dataset):
    """
    接收 X(节点特征矩阵)、y(one-hot 标签矩阵)、adj(邻接矩阵)，
    返回一个只有单张图的 Dataset。
    """
    def __init__(self, X, y, adj, **kwargs):
        self.X = X.astype(np.float32)
        self.y = y.astype(np.float32)
        self.adj = adj.astype(np.float32)
        super().__init__(**kwargs)

    def read(self):
        # 只返回一张 Graph：节点数目 = 样本数
        graph = Graph(x=self.X, a=self.adj, y=self.y)
        return [graph]
    
train_dataset = CustomGraphDataset(
    X_train.values,   # (N_train, n_features)
    y_train_oh,       # (N_train, 2)
    knn_train         # (N_train, N_train)
)
test_dataset = CustomGraphDataset(
    X_test.values, 
    y_test_oh, 
    knn_test
)

train_loader = SingleLoader(train_dataset, epochs=30)
test_loader = SingleLoader(test_dataset)

class GCNModel(Model):
    def __init__(self):
        super().__init__()
        # 输入维度不需要在这里显式写，Spektral 会自动推断（从数据集中 Graph.x.shape）
        self.gcn1 = GCNConv(32, activation='relu')
        self.dropout = Dropout(0.5)
        self.gcn2 = GCNConv(16, activation='relu')
        self.dense = Dense(2, activation='softmax')

    def call(self, inputs, training=False):
        x, a = inputs  # x: (N, n_features)，a: (N, N)
        x = self.gcn1([x, a])
        x = self.dropout(x, training=training)
        x = self.gcn2([x, a])
        output = self.dense(x)  # 节点级别输出 (N, 2)
        return output
    
model = GCNModel()
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.fit(
    train_loader.load(), 
    steps_per_epoch=1, 
    epochs=30,
    class_weight=class_weight_dict,
    verbose=1
)

loader_iter = iter(test_loader.load())  
(x_test_graph, a_test_graph), y_true_onehot = next(loader_iter)
y_prob = model([x_test_graph, a_test_graph], training=False).numpy()
y_pred = np.argmax(y_prob, axis=-1)
y_true_labels = np.argmax(y_true_onehot, axis=-1)
accuracy = accuracy_score(y_true_labels, y_pred)
f1 = f1_score(y_true_labels, y_pred)
auc = roc_auc_score(y_true_labels, y_prob[:, 1])

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC: {auc:.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_true_labels, y_pred))
print("\nClassification Report:")
print(classification_report(y_true_labels, y_pred))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Accuracy: 0.5635
F1 Score: 0.3820
AUC: 0.6379
Confusion Matrix:
[[54 48]
 [ 7 17]]

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.53      0.66       102
           1       0.26      0.71      0.38        24

    accuracy                           0.56       126
   macro avg       0.57      0.62      0.52       126
weighted avg       0.77      0.56      0.61       126



In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.neighbors import kneighbors_graph
import umap
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dropout, Dense

from spektral.data import Dataset, Graph
from spektral.data.loaders import SingleLoader
from spektral.layers import GCNConv

#random.seed(42)
np.random.seed(42)
tf.keras.utils.set_random_seed(42)

# 1. —— 数据加载与预处理 —— 
file_path = "预测.xlsx"
df = pd.read_excel(file_path)

# 原始特征和标签
X = df[['LN转移个数','腋窝淋巴结状态','PR', 'HER2+FISH', 'ki-67', 
        '手术前怀孕', '治疗后怀孕', '治疗后生产','目前月经情况','手术方式','放疗', 
        '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','化疗方案', '内分泌治疗方案']]
y = df['标签']

# 连续变量要做标准化
continuous_vars = ['PR', 'ki-67']

# 从前627行中划分出20%作为测试集
X_front = X.iloc[:627]
y_front = y.iloc[:627]
X_front_train, X_test, y_front_train, y_test = train_test_split(
    X_front, y_front, test_size=0.2, stratify=y_front, random_state=42
)

# 剩下的数据（从第628行开始）作为训练集的一部分
X_rest = X.iloc[627:]
y_rest = y.iloc[627:]

# 拼接训练数据：前627行的80% + 剩下所有行
X_train = pd.concat([X_front_train, X_rest], axis=0)
y_train = pd.concat([y_front_train, y_rest], axis=0)

# 标准化连续变量
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

# 计算训练集中每个类别的权重，用来给节点级别的损失加权
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(y_train),
                                     y=y_train)
class_weight_dict = {i: w for i, w in enumerate(class_weights)}

# 将标签转换为 one-hot 编码（二分类）
y_train_oh = tf.keras.utils.to_categorical(y_train, num_classes=2)
y_test_oh = tf.keras.utils.to_categorical(y_test, num_classes=2)

k = 28

knn_train = kneighbors_graph(
    X_train.values, 
    n_neighbors=k, 
    mode='connectivity', 
    include_self=False
).toarray().astype(np.float32)

knn_train = np.maximum(knn_train, knn_train.T)

knn_test = kneighbors_graph(
    X_test.values, 
    n_neighbors=k, 
    mode='connectivity', 
    include_self=False
).toarray().astype(np.float32)
knn_test = np.maximum(knn_test, knn_test.T)

class CustomGraphDataset(Dataset):
    """
    接收 X(节点特征矩阵)、y(one-hot 标签矩阵)、adj(邻接矩阵)，
    返回一个只有单张图的 Dataset。
    """
    def __init__(self, X, y, adj, **kwargs):
        self.X = X.astype(np.float32)
        self.y = y.astype(np.float32)
        self.adj = adj.astype(np.float32)
        super().__init__(**kwargs)

    def read(self):
        # 只返回一张 Graph：节点数目 = 样本数
        graph = Graph(x=self.X, a=self.adj, y=self.y)
        return [graph]
    
train_dataset = CustomGraphDataset(
    X_train.values,   # (N_train, n_features)
    y_train_oh,       # (N_train, 2)
    knn_train         # (N_train, N_train)
)
test_dataset = CustomGraphDataset(
    X_test.values, 
    y_test_oh, 
    knn_test
)

train_loader = SingleLoader(train_dataset, epochs=30)
test_loader = SingleLoader(test_dataset)

class GCNModel(Model):
    def __init__(self):
        super().__init__()
        # 输入维度不需要在这里显式写，Spektral 会自动推断（从数据集中 Graph.x.shape）
        self.gcn1 = GCNConv(32, activation='relu')
        self.dropout = Dropout(0.5)
        self.gcn2 = GCNConv(16, activation='relu')
        self.dense = Dense(2, activation='softmax')

    def call(self, inputs, training=False):
        x, a = inputs  # x: (N, n_features)，a: (N, N)
        x = self.gcn1([x, a])
        x = self.dropout(x, training=training)
        x = self.gcn2([x, a])
        output = self.dense(x)  # 节点级别输出 (N, 2)
        return output
    
model = GCNModel()
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.fit(
    train_loader.load(), 
    steps_per_epoch=1, 
    epochs=30,
    class_weight=class_weight_dict,
    verbose=1
)

loader_iter = iter(test_loader.load())  
(x_test_graph, a_test_graph), y_true_onehot = next(loader_iter)
y_prob = model([x_test_graph, a_test_graph], training=False).numpy()
y_pred = np.argmax(y_prob, axis=-1)
y_true_labels = np.argmax(y_true_onehot, axis=-1)
accuracy = accuracy_score(y_true_labels, y_pred)
f1 = f1_score(y_true_labels, y_pred)
auc = roc_auc_score(y_true_labels, y_prob[:, 1])

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC: {auc:.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_true_labels, y_pred))
print("\nClassification Report:")
print(classification_report(y_true_labels, y_pred))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Accuracy: 0.4762
F1 Score: 0.3400
AUC: 0.5760
Confusion Matrix:
[[43 59]
 [ 7 17]]

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.42      0.57       102
           1       0.22      0.71      0.34        24

    accuracy                           0.48       126
   macro avg       0.54      0.56      0.45       126
weighted avg       0.74      0.48      0.52       126



In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import umap
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.neighbors import kneighbors_graph

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dropout, Dense

from spektral.data import Dataset, Graph
from spektral.data.loaders import SingleLoader
from spektral.layers import GCNConv

# 尝试不同的 K
k_values = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50]
results = []

for k in k_values:
    print(f"\n=== 正在评估 K = {k} 的模型 ===")
    
    #random.seed(42)
    np.random.seed(42)
    tf.keras.utils.set_random_seed(42)

    # 1. —— 数据加载与预处理 —— 
    file_path = "预测.xlsx"
    df = pd.read_excel(file_path)

    # 原始特征和标签
    X = df[['LN转移个数','腋窝淋巴结状态','PR', 'HER2+FISH', 'ki-67', '手术前怀孕', '治疗后怀孕', '治疗后生产','目前月经情况','手术方式','放疗', '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','化疗方案', '内分泌治疗方案']]
    y = df['标签']

    # 连续变量要做标准化
    continuous_vars = ['PR', 'ki-67']

    # 从前627行中划分出20%作为测试集
    X_front = X.iloc[:627]
    y_front = y.iloc[:627]
    X_front_train, X_test, y_front_train, y_test = train_test_split(X_front, y_front, test_size=0.2, stratify=y_front, random_state=42)

    # 剩下的数据（从第628行开始）作为训练集的一部分
    X_rest = X.iloc[627:]
    y_rest = y.iloc[627:]

    # 拼接训练数据：前627行的80% + 剩下所有行
    X_train = pd.concat([X_front_train, X_rest], axis=0)
    y_train = pd.concat([y_front_train, y_rest], axis=0)

    # 标准化连续变量
    scaler = StandardScaler()
    X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
    X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

    # 计算训练集中每个类别的权重，用来给节点级别的损失加权
    class_weights = compute_class_weight(class_weight='balanced',classes=np.unique(y_train),y=y_train)
    class_weight_dict = {i: w for i, w in enumerate(class_weights)}
       
    # 将标签转换为 one-hot 编码（二分类）
    y_train_oh = tf.keras.utils.to_categorical(y_train, num_classes=2)
    y_test_oh = tf.keras.utils.to_categorical(y_test, num_classes=2)

    class CustomGraphDataset(Dataset):
        """
        接收 X(节点特征矩阵)、y(one-hot 标签矩阵)、adj(邻接矩阵)，
        返回一个只有单张图的 Dataset。
        """
        def __init__(self, X, y, adj, **kwargs):
            self.X = X.astype(np.float32)
            self.y = y.astype(np.float32)
            self.adj = adj.astype(np.float32)
            super().__init__(**kwargs)

        def read(self):
            # 只返回一张 Graph：节点数目 = 样本数
            graph = Graph(x=self.X, a=self.adj, y=self.y)
            return [graph]

    class GCNModel(Model):
        def __init__(self):
            super().__init__()
            # 输入维度不需要在这里显式写，Spektral 会自动推断（从数据集中 Graph.x.shape）
            self.gcn1 = GCNConv(32, activation='relu')
            self.dropout = Dropout(0.5)
            self.gcn2 = GCNConv(16, activation='relu')
            self.dense = Dense(2, activation='softmax')

        def call(self, inputs, training=False):
            x, a = inputs  # x: (N, n_features)，a: (N, N)
            x = self.gcn1([x, a])
            x = self.dropout(x, training=training)
            x = self.gcn2([x, a])
            output = self.dense(x)  # 节点级别输出 (N, 2)
            return output

    knn_train = kneighbors_graph(X_train.values, n_neighbors=k, mode='connectivity', include_self=False).toarray().astype(np.float32)
    knn_train = np.maximum(knn_train, knn_train.T)
    knn_test = kneighbors_graph(X_test.values, n_neighbors=k, mode='connectivity', include_self=False).toarray().astype(np.float32)
    knn_test = np.maximum(knn_test, knn_test.T)
    
    train_dataset = CustomGraphDataset(X_train.values,y_train_oh,knn_train)
    test_dataset = CustomGraphDataset(X_test.values, y_test_oh, knn_test)
    
    train_loader = SingleLoader(train_dataset, epochs=30)
    test_loader = SingleLoader(test_dataset)
    
    model = GCNModel()
    model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
    model.fit(train_loader.load(), steps_per_epoch=1, epochs=30,class_weight=class_weight_dict,verbose=1)

    loader_iter = iter(test_loader.load())  
    (x_test_graph, a_test_graph), y_true_onehot = next(loader_iter)
    y_prob = model([x_test_graph, a_test_graph], training=False).numpy()
    y_pred = np.argmax(y_prob, axis=-1)
    y_true_labels = np.argmax(y_true_onehot, axis=-1)
    accuracy = accuracy_score(y_true_labels, y_pred)
    f1 = f1_score(y_true_labels, y_pred)
    auc = roc_auc_score(y_true_labels, y_prob[:, 1])
    results.append((k, auc))
    print(f"K = {k} -> AUC: {auc:.4f}")

# 排序输出结果
sorted_results = sorted(results, key=lambda x: x[1], reverse=True)

print("\n=== AUC 分数从高到低排序 ===")
for k_val, auc_val in sorted_results:
    print(f"K = {k_val:<2} -> AUC = {auc_val:.4f}")

best_k, best_auc = sorted_results[0]
print(f"\n✅ 最佳 K 值为 {best_k}，对应 AUC = {best_auc:.4f}")

  from .autonotebook import tqdm as notebook_tqdm



=== 正在评估 K = 1 的模型 ===
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
K = 1 -> AUC: 0.5539

=== 正在评估 K = 2 的模型 ===
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
K = 2 -> AUC: 0.5065

=== 正在评估 K = 3 的模型 ===
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
E

Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
K = 6 -> AUC: 0.5129

=== 正在评估 K = 7 的模型 ===
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
K = 7 -> AUC: 0.4949

=== 正在评估 K = 8 的模型 ===
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
K = 8 -> AUC: 0.5172

=== 正在

Epoch 29/30
Epoch 30/30
K = 11 -> AUC: 0.5223

=== 正在评估 K = 12 的模型 ===
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
K = 12 -> AUC: 0.5458

=== 正在评估 K = 13 的模型 ===
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
K = 13 -> AUC: 0.5605

=== 正在评估 K = 14 的模型 ===
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/3

Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
K = 17 -> AUC: 0.5564

=== 正在评估 K = 18 的模型 ===
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
K = 18 -> AUC: 0.5568

=== 正在评估 K = 19 的模型 ===
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30


Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
K = 22 -> AUC: 0.5525

=== 正在评估 K = 23 的模型 ===
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
K = 23 -> AUC: 0.5462

=== 正在评估 K = 24 的模型 ===
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
K = 24 -> AUC: 0.5476

=== 正在评估 K = 25 的模型 ===
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 

Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
K = 25 -> AUC: 0.5449

=== 正在评估 K = 26 的模型 ===
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
K = 26 -> AUC: 0.5545

=== 正在评估 K = 27 的模型 ===
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
K = 30 -> AUC: 0.5756

=== 正在评估 K = 31 的模型 ===
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
K = 31 -> AUC: 0.5735

=== 正在评估 K = 32 的模型 ===
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
K = 32 -> AUC: 0.5729

=== 正在评估 K = 33 的模型 ===
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/

Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
K = 33 -> AUC: 0.5574

=== 正在评估 K = 34 的模型 ===
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
K = 34 -> AUC: 0.5441

=== 正在评估 K = 35 的模型 ===
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
K = 35 -> AU

Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
K = 38 -> AUC: 0.5617

=== 正在评估 K = 39 的模型 ===
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
K = 39 -> AUC: 0.5531

=== 正在评估 K = 40 的模型 ===
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
K = 40 -> AUC: 0.5605

=== 正在评估 K = 41 的模型 ===
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/3

Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
K = 44 -> AUC: 0.5239

=== 正在评估 K = 45 的模型 ===
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
K = 45 -> AUC: 0.5082

=== 正在评估 K = 46 的模型 ===
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epo

Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
K = 49 -> AUC: 0.5274

=== 正在评估 K = 50 的模型 ===
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
K = 50 -> AUC: 0.5404

=== AUC 分数从高到低排序 ===
K = 28 -> AUC = 0.5760
K = 29 -> AUC = 0.5760
K = 30 -> AUC = 0.5756
K = 27 -> AUC = 0.5735
K = 31 -> AUC = 0.5735
K = 32 -> AUC = 0.5729
K = 16 -> AUC = 0.5686
K = 21 -> AUC = 0.5668
K = 20 -> AUC = 0.5623
K = 38 -> AUC = 0.5617
K = 13 -> AUC = 0.5605
K = 40 -> AUC = 0.5605
K = 37 -> AUC = 0.5582
K = 33 -> AUC = 0.5574
K = 18 -> AUC = 0.5568
K = 17 -> AUC = 0.5564
K = 41 -> AUC = 0.5564
K = 15 -> A

# 梯度提升树

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, classification_report, make_scorer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import GradientBoostingClassifier
import warnings
warnings.filterwarnings("ignore")

# 设置随机种子以确保结果可重复
np.random.seed(42)
tf.random.set_seed(42)

# 加载数据
file_path = "已知数据.xlsx"
df = pd.read_excel(file_path)

# 特征和标签
X = df[['PR','腋窝淋巴结状态','手术前怀孕','目前月经情况','ki-67','HER2+FISH','治疗后怀孕','治疗后生产','LN转移个数','放疗', 
        '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','手术方式','化疗方案','内分泌治疗方案']]
y = df['标签']

# 连续变量
continuous_vars = ['PR', 'ki-67']

# 分层划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# 标准化连续变量
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

# 计算类别权重
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}
# 计算正类/负类样本比例（手动调整 sample_weight 时可能用得到）
pos_weight = class_weight_dict[0] / class_weight_dict[1]

# 定义梯度提升分类器
gbdt = GradientBoostingClassifier(random_state=42)

# 设置参数网格
param_grid = {
    'n_estimators': [100, 150],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5],
    'min_samples_split': [10, 20],
    'min_samples_leaf': [10, 20],
    'subsample': [0.8, 1.0]
}

# F1评分函数
f1_scorer = make_scorer(f1_score, pos_label=1)

# 网格搜索交叉验证
grid_search = GridSearchCV(
    estimator=gbdt,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1
)

# 拟合模型
grid_search.fit(X_train, y_train)

# 最佳模型评估
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# 评估指标
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

# 输出结果
print("最佳参数：", grid_search.best_params_)
print(f"测试集准确率 Accuracy: {accuracy:.4f}")
print(f"测试集召回率 Recall: {recall:.4f}")
print(f"测试集F1分数: {f1:.4f}")
print(f"测试集AUC值: {auc:.4f}")
print("\n测试集分类报告：\n", classification_report(y_test, y_pred))

最佳参数： {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 10, 'n_estimators': 150, 'subsample': 1.0}
测试集准确率 Accuracy: 0.7460
测试集召回率 Recall: 0.2083
测试集F1分数: 0.2381
测试集AUC值: 0.6095

测试集分类报告：
               precision    recall  f1-score   support

           0       0.82      0.87      0.85       102
           1       0.28      0.21      0.24        24

    accuracy                           0.75       126
   macro avg       0.55      0.54      0.54       126
weighted avg       0.72      0.75      0.73       126



In [2]:
# 导入ROC曲线函数
from sklearn.metrics import roc_curve

# 计算FPR, TPR, 阈值
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

# 可保存为文件用于后续绘图
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("gbt_roc_curve.csv", index=False)
print("ROC数据已保存为 gbt_roc_curve.csv")

ROC数据已保存为 gbt_roc_curve.csv


In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, classification_report, make_scorer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import GradientBoostingClassifier
import warnings
warnings.filterwarnings("ignore")

# 设置随机种子以确保结果可重复
np.random.seed(42)
tf.random.set_seed(42)

# 加载数据
file_path = "预测.xlsx"
df = pd.read_excel(file_path)

# 特征和标签
X = df[['PR','腋窝淋巴结状态','手术前怀孕','目前月经情况','ki-67','HER2+FISH','治疗后怀孕','治疗后生产','LN转移个数','放疗', 
        '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','手术方式','化疗方案','内分泌治疗方案']]
y = df['标签']

# 连续变量
continuous_vars = ['PR', 'ki-67']

# 从前627行中划分出20%作为测试集
X_front = X.iloc[:627]
y_front = y.iloc[:627]
X_front_train, X_test, y_front_train, y_test = train_test_split(
    X_front, y_front, test_size=0.2, stratify=y_front, random_state=42
)

# 剩下的数据（从第628行开始）作为训练集的一部分
X_rest = X.iloc[627:]
y_rest = y.iloc[627:]

# 拼接训练数据：前627行的80% + 剩下所有行
X_train = pd.concat([X_front_train, X_rest], axis=0)
y_train = pd.concat([y_front_train, y_rest], axis=0)

# 标准化连续变量
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

# 计算类别权重
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}
# 计算正类/负类样本比例（手动调整 sample_weight 时可能用得到）
pos_weight = class_weight_dict[0] / class_weight_dict[1]

# 定义梯度提升分类器
gbdt = GradientBoostingClassifier(random_state=42)

# 设置参数网格
param_grid = {
    'n_estimators': [100, 150],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5],
    'min_samples_split': [10, 20],
    'min_samples_leaf': [10, 20],
    'subsample': [0.8, 1.0]
}

# F1评分函数
f1_scorer = make_scorer(f1_score, pos_label=1)

# 网格搜索交叉验证
grid_search = GridSearchCV(
    estimator=gbdt,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1
)

# 拟合模型
grid_search.fit(X_train, y_train)

# 最佳模型评估
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# 评估指标
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

# 输出结果
print("最佳参数：", grid_search.best_params_)
print(f"测试集准确率 Accuracy: {accuracy:.4f}")
print(f"测试集召回率 Recall: {recall:.4f}")
print(f"测试集F1分数: {f1:.4f}")
print(f"测试集AUC值: {auc:.4f}")
print("\n测试集分类报告：\n", classification_report(y_test, y_pred))

最佳参数： {'learning_rate': 0.05, 'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 10, 'n_estimators': 100, 'subsample': 0.8}
测试集准确率 Accuracy: 0.7381
测试集召回率 Recall: 0.5417
测试集F1分数: 0.4407
测试集AUC值: 0.7083

测试集分类报告：
               precision    recall  f1-score   support

           0       0.88      0.78      0.83       102
           1       0.37      0.54      0.44        24

    accuracy                           0.74       126
   macro avg       0.63      0.66      0.63       126
weighted avg       0.78      0.74      0.76       126



In [2]:
# 导入ROC曲线函数
from sklearn.metrics import roc_curve

# 计算FPR, TPR, 阈值
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

# 可保存为文件用于后续绘图
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("gbt_bjd_roc_curve.csv", index=False)
print("ROC数据已保存为 gbt_bjd_roc_curve.csv")

ROC数据已保存为 gbt_bjd_roc_curve.csv


# SVM

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, classification_report, make_scorer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.svm import SVC
import warnings
warnings.filterwarnings("ignore")

# 设置随机种子以确保结果可重复
np.random.seed(42)
tf.random.set_seed(42)

# 加载数据
file_path = "已知数据.xlsx"
df = pd.read_excel(file_path)

# 特征和标签
X = df[['PR','腋窝淋巴结状态','手术前怀孕','目前月经情况','ki-67','HER2+FISH','治疗后怀孕','治疗后生产','LN转移个数','放疗', 
        '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','手术方式','化疗方案','内分泌治疗方案']]
y = df['标签']

# 连续变量
continuous_vars = ['PR', 'ki-67']

# 分层划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# 标准化连续变量
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

# 计算类别权重（SVM 支持 class_weight）
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

# 定义 SVM 分类器（使用概率输出）
svm = SVC(probability=True, class_weight='balanced', random_state=42)

# 参数网格
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# F1评分函数
f1_scorer = make_scorer(f1_score, pos_label=1)

# 网格搜索
grid_search = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1
)

# 模型训练
grid_search.fit(X_train, y_train)

# 最佳模型预测
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# 评估指标
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

# 输出结果
print("最佳参数：", grid_search.best_params_)
print(f"测试集准确率 Accuracy: {accuracy:.4f}")
print(f"测试集召回率 Recall: {recall:.4f}")
print(f"测试集F1分数: {f1:.4f}")
print(f"测试集AUC值: {auc:.4f}")
print("\n测试集分类报告：\n", classification_report(y_test, y_pred))

最佳参数： {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
测试集准确率 Accuracy: 0.6429
测试集召回率 Recall: 0.4583
测试集F1分数: 0.3284
测试集AUC值: 0.6552

测试集分类报告：
               precision    recall  f1-score   support

           0       0.84      0.69      0.76       102
           1       0.26      0.46      0.33        24

    accuracy                           0.64       126
   macro avg       0.55      0.57      0.54       126
weighted avg       0.73      0.64      0.68       126



In [2]:
# 导入ROC曲线函数
from sklearn.metrics import roc_curve

# 计算FPR, TPR, 阈值
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

# 可保存为文件用于后续绘图
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("svm_roc_curve.csv", index=False)
print("ROC数据已保存为 svm_roc_curve.csv")

ROC数据已保存为 svm_roc_curve.csv


In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, classification_report, make_scorer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.svm import SVC
import warnings
warnings.filterwarnings("ignore")

# 设置随机种子以确保结果可重复
np.random.seed(42)
tf.random.set_seed(42)

# 加载数据
file_path = "预测.xlsx"
df = pd.read_excel(file_path)

# 特征和标签
X = df[['PR','腋窝淋巴结状态','手术前怀孕','目前月经情况','ki-67','HER2+FISH','治疗后怀孕','治疗后生产','LN转移个数','放疗', 
        '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','手术方式','化疗方案','内分泌治疗方案']]
y = df['标签']

# 连续变量
continuous_vars = ['PR', 'ki-67']

# 从前627行中划分出20%作为测试集
X_front = X.iloc[:627]
y_front = y.iloc[:627]
X_front_train, X_test, y_front_train, y_test = train_test_split(
    X_front, y_front, test_size=0.2, stratify=y_front, random_state=42
)

# 剩下的数据（从第628行开始）作为训练集的一部分
X_rest = X.iloc[627:]
y_rest = y.iloc[627:]

# 拼接训练数据：前627行的80% + 剩下所有行
X_train = pd.concat([X_front_train, X_rest], axis=0)
y_train = pd.concat([y_front_train, y_rest], axis=0)

# 标准化连续变量
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

# 计算类别权重（SVM 支持 class_weight）
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

# 定义 SVM 分类器（使用概率输出）
svm = SVC(probability=True, class_weight='balanced', random_state=42)

# 参数网格
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# F1评分函数
f1_scorer = make_scorer(f1_score, pos_label=1)

# 网格搜索
grid_search = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1
)

# 模型训练
grid_search.fit(X_train, y_train)

# 最佳模型预测
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# 评估指标
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

# 输出结果
print("最佳参数：", grid_search.best_params_)
print(f"测试集准确率 Accuracy: {accuracy:.4f}")
print(f"测试集召回率 Recall: {recall:.4f}")
print(f"测试集F1分数: {f1:.4f}")
print(f"测试集AUC值: {auc:.4f}")
print("\n测试集分类报告：\n", classification_report(y_test, y_pred))

最佳参数： {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
测试集准确率 Accuracy: 0.7222
测试集召回率 Recall: 0.5417
测试集F1分数: 0.4262
测试集AUC值: 0.6712

测试集分类报告：
               precision    recall  f1-score   support

           0       0.88      0.76      0.82       102
           1       0.35      0.54      0.43        24

    accuracy                           0.72       126
   macro avg       0.61      0.65      0.62       126
weighted avg       0.78      0.72      0.74       126



In [4]:
# 导入ROC曲线函数
from sklearn.metrics import roc_curve

# 计算FPR, TPR, 阈值
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

# 可保存为文件用于后续绘图
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("svm_bjd_roc_curve.csv", index=False)
print("ROC数据已保存为 svm_bjd_roc_curve.csv")

ROC数据已保存为 svm_bjd_roc_curve.csv


# LightGBM

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, classification_report, make_scorer
from sklearn.utils.class_weight import compute_class_weight
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings("ignore")

# 设置随机种子以确保可重复
np.random.seed(42)
tf.random.set_seed(42)

# 加载数据
file_path = "已知数据.xlsx"
df = pd.read_excel(file_path)

# 特征与标签
X = df[['PR','腋窝淋巴结状态','手术前怀孕','目前月经情况','ki-67','HER2+FISH','治疗后怀孕','治疗后生产','LN转移个数','放疗',
        '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','手术方式','化疗方案','内分泌治疗方案']]
y = df['标签']

# 连续变量
continuous_vars = ['PR', 'ki-67']

# 分层划分训练集与测试集
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 连续变量标准化
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

# 计算类别权重（正类比例调整）
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}
# LightGBM 用 scale_pos_weight 来处理类别不平衡（仅对二分类有效）
scale_pos_weight = class_weight_dict[0] / class_weight_dict[1]

# 定义 LGBM 分类器
lgbm = LGBMClassifier(
    random_state=42,
    objective='binary',
    scale_pos_weight=scale_pos_weight,
    boosting_type='gbdt',
    n_jobs=-1,
    verbose=-1
)

# 设置参数搜索网格
param_grid = {
    'n_estimators': [100, 150],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7],
    'num_leaves': [15, 31],  # 和 max_depth 联动
    'min_child_samples': [10, 20],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# F1 分数作为调参指标
f1_scorer = make_scorer(f1_score, pos_label=1)

# 网格搜索
grid_search = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1
)

# 模型训练
grid_search.fit(X_train, y_train)

# 最佳模型预测
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# 模型评估
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

# 输出结果
print("最佳参数：", grid_search.best_params_)
print(f"测试集准确率 Accuracy: {accuracy:.4f}")
print(f"测试集召回率 Recall: {recall:.4f}")
print(f"测试集F1分数: {f1:.4f}")
print(f"测试集AUC值: {auc:.4f}")
print("\n测试集分类报告：\n", classification_report(y_test, y_pred))

最佳参数： {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_samples': 10, 'n_estimators': 150, 'num_leaves': 31, 'subsample': 0.8}
测试集准确率 Accuracy: 0.7778
测试集召回率 Recall: 0.1667
测试集F1分数: 0.2222
测试集AUC值: 0.6348

测试集分类报告：
               precision    recall  f1-score   support

           0       0.82      0.92      0.87       102
           1       0.33      0.17      0.22        24

    accuracy                           0.78       126
   macro avg       0.58      0.54      0.55       126
weighted avg       0.73      0.78      0.75       126



In [2]:
# 导入ROC曲线函数
from sklearn.metrics import roc_curve

# 计算FPR, TPR, 阈值
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

# 可保存为文件用于后续绘图
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("lgbm_roc_curve.csv", index=False)
print("ROC数据已保存为 lgbm_roc_curve.csv")

ROC数据已保存为 lgbm_roc_curve.csv


In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, classification_report, make_scorer
from sklearn.utils.class_weight import compute_class_weight
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings("ignore")

# 设置随机种子以确保可重复
np.random.seed(42)
tf.random.set_seed(42)

# 加载数据
file_path = "预测.xlsx"
df = pd.read_excel(file_path)

# 特征与标签
X = df[['PR','腋窝淋巴结状态','手术前怀孕','目前月经情况','ki-67','HER2+FISH','治疗后怀孕','治疗后生产','LN转移个数','放疗',
        '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','手术方式','化疗方案','内分泌治疗方案']]
y = df['标签']

# 连续变量
continuous_vars = ['PR', 'ki-67']

# 从前627行中划分出20%作为测试集
X_front = X.iloc[:627]
y_front = y.iloc[:627]
X_front_train, X_test, y_front_train, y_test = train_test_split(
    X_front, y_front, test_size=0.2, stratify=y_front, random_state=42
)

# 剩下的数据（从第628行开始）作为训练集的一部分
X_rest = X.iloc[627:]
y_rest = y.iloc[627:]

# 拼接训练数据：前627行的80% + 剩下所有行
X_train = pd.concat([X_front_train, X_rest], axis=0)
y_train = pd.concat([y_front_train, y_rest], axis=0)

# 连续变量标准化
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

# 计算类别权重（正类比例调整）
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}
# LightGBM 用 scale_pos_weight 来处理类别不平衡（仅对二分类有效）
scale_pos_weight = class_weight_dict[0] / class_weight_dict[1]

# 定义 LGBM 分类器
lgbm = LGBMClassifier(
    random_state=42,
    objective='binary',
    scale_pos_weight=scale_pos_weight,
    boosting_type='gbdt',
    n_jobs=-1,
    verbose=-1
)

# 设置参数搜索网格
param_grid = {
    'n_estimators': [100, 150],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7],
    'num_leaves': [15, 31],  # 和 max_depth 联动
    'min_child_samples': [10, 20],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# F1 分数作为调参指标
f1_scorer = make_scorer(f1_score, pos_label=1)

# 网格搜索
grid_search = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1
)

# 模型训练
grid_search.fit(X_train, y_train)

# 最佳模型预测
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# 模型评估
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

# 输出结果
print("最佳参数：", grid_search.best_params_)
print(f"测试集准确率 Accuracy: {accuracy:.4f}")
print(f"测试集召回率 Recall: {recall:.4f}")
print(f"测试集F1分数: {f1:.4f}")
print(f"测试集AUC值: {auc:.4f}")
print("\n测试集分类报告：\n", classification_report(y_test, y_pred))

最佳参数： {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_samples': 10, 'n_estimators': 100, 'num_leaves': 15, 'subsample': 0.8}
测试集准确率 Accuracy: 0.7778
测试集召回率 Recall: 0.5000
测试集F1分数: 0.4615
测试集AUC值: 0.7177

测试集分类报告：
               precision    recall  f1-score   support

           0       0.88      0.84      0.86       102
           1       0.43      0.50      0.46        24

    accuracy                           0.78       126
   macro avg       0.65      0.67      0.66       126
weighted avg       0.79      0.78      0.78       126



In [2]:
# 导入ROC曲线函数
from sklearn.metrics import roc_curve

# 计算FPR, TPR, 阈值
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

# 可保存为文件用于后续绘图
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("lgbm_bjd_roc_curve.csv", index=False)
print("ROC数据已保存为 lgbm_bjd_roc_curve.csv")

ROC数据已保存为 lgbm_bjd_roc_curve.csv


# AdaBoost

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, classification_report, make_scorer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings("ignore")

# 设置随机种子
np.random.seed(42)
tf.random.set_seed(42)

# 加载数据
file_path = "已知数据.xlsx"
df = pd.read_excel(file_path)

# 特征与标签
X = df[['PR','腋窝淋巴结状态','手术前怀孕','目前月经情况','ki-67','HER2+FISH','治疗后怀孕','治疗后生产','LN转移个数','放疗',
        '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','手术方式','化疗方案','内分泌治疗方案']]
y = df['标签']

# 连续变量
continuous_vars = ['PR', 'ki-67']

# 数据划分
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 标准化连续变量
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

# 类别权重（用于基础学习器）
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

# 定义基础学习器：弱分类器（如深度为1的决策树）
base_estimator = DecisionTreeClassifier(max_depth=1, class_weight='balanced', random_state=42)

# 定义 AdaBoost 分类器
adaboost = AdaBoostClassifier(estimator=base_estimator, random_state=42)

# 设置参数搜索空间
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 1.0]
}

# 评分函数：F1
f1_scorer = make_scorer(f1_score, pos_label=1)

# 网格搜索
grid_search = GridSearchCV(
    estimator=adaboost,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1
)

# 模型拟合
grid_search.fit(X_train, y_train)

# 测试集评估
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# 输出评估指标
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print("最佳参数：", grid_search.best_params_)
print(f"测试集准确率 Accuracy: {accuracy:.4f}")
print(f"测试集召回率 Recall: {recall:.4f}")
print(f"测试集F1分数: {f1:.4f}")
print(f"测试集AUC值: {auc:.4f}")
print("\n测试集分类报告：\n", classification_report(y_test, y_pred))

最佳参数： {'learning_rate': 0.1, 'n_estimators': 50}
测试集准确率 Accuracy: 0.6905
测试集召回率 Recall: 0.4167
测试集F1分数: 0.3390
测试集AUC值: 0.6389

测试集分类报告：
               precision    recall  f1-score   support

           0       0.85      0.75      0.80       102
           1       0.29      0.42      0.34        24

    accuracy                           0.69       126
   macro avg       0.57      0.59      0.57       126
weighted avg       0.74      0.69      0.71       126



In [2]:
# 导入ROC曲线函数
from sklearn.metrics import roc_curve

# 计算FPR, TPR, 阈值
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

# 可保存为文件用于后续绘图
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("adab_roc_curve.csv", index=False)
print("ROC数据已保存为 adab_roc_curve.csv")

ROC数据已保存为 adab_roc_curve.csv


In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, classification_report, make_scorer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings("ignore")

# 设置随机种子
np.random.seed(42)
tf.random.set_seed(42)

# 加载数据
file_path = "预测.xlsx"
df = pd.read_excel(file_path)

# 特征与标签
X = df[['PR','腋窝淋巴结状态','手术前怀孕','目前月经情况','ki-67','HER2+FISH','治疗后怀孕','治疗后生产','LN转移个数','放疗',
        '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','手术方式','化疗方案','内分泌治疗方案']]
y = df['标签']

# 连续变量
continuous_vars = ['PR', 'ki-67']

# 从前627行中划分出20%作为测试集
X_front = X.iloc[:627]
y_front = y.iloc[:627]
X_front_train, X_test, y_front_train, y_test = train_test_split(
    X_front, y_front, test_size=0.2, stratify=y_front, random_state=42
)

# 剩下的数据（从第628行开始）作为训练集的一部分
X_rest = X.iloc[627:]
y_rest = y.iloc[627:]

# 拼接训练数据：前627行的80% + 剩下所有行
X_train = pd.concat([X_front_train, X_rest], axis=0)
y_train = pd.concat([y_front_train, y_rest], axis=0)

# 标准化连续变量
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

# 类别权重（用于基础学习器）
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

# 定义基础学习器：弱分类器（如深度为1的决策树）
base_estimator = DecisionTreeClassifier(max_depth=1, class_weight='balanced', random_state=42)

# 定义 AdaBoost 分类器
adaboost = AdaBoostClassifier(estimator=base_estimator, random_state=42)

# 设置参数搜索空间
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 1.0]
}

# 评分函数：F1
f1_scorer = make_scorer(f1_score, pos_label=1)

# 网格搜索
grid_search = GridSearchCV(
    estimator=adaboost,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1
)

# 模型拟合
grid_search.fit(X_train, y_train)

# 测试集评估
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# 输出评估指标
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print("最佳参数：", grid_search.best_params_)
print(f"测试集准确率 Accuracy: {accuracy:.4f}")
print(f"测试集召回率 Recall: {recall:.4f}")
print(f"测试集F1分数: {f1:.4f}")
print(f"测试集AUC值: {auc:.4f}")
print("\n测试集分类报告：\n", classification_report(y_test, y_pred))

最佳参数： {'learning_rate': 0.1, 'n_estimators': 150}
测试集准确率 Accuracy: 0.7302
测试集召回率 Recall: 0.4167
测试集F1分数: 0.3704
测试集AUC值: 0.6624

测试集分类报告：
               precision    recall  f1-score   support

           0       0.85      0.80      0.83       102
           1       0.33      0.42      0.37        24

    accuracy                           0.73       126
   macro avg       0.59      0.61      0.60       126
weighted avg       0.75      0.73      0.74       126



In [4]:
# 导入ROC曲线函数
from sklearn.metrics import roc_curve

# 计算FPR, TPR, 阈值
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

# 可保存为文件用于后续绘图
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("adab_bjd_roc_curve.csv", index=False)
print("ROC数据已保存为 adab_bjd_roc_curve.csv")

ROC数据已保存为 adab_bjd_roc_curve.csv


# CatBoost

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, classification_report, make_scorer
from sklearn.utils.class_weight import compute_class_weight
from catboost import CatBoostClassifier, Pool
import warnings
warnings.filterwarnings("ignore")

# 设置随机种子
np.random.seed(42)
tf.random.set_seed(42)

# 加载数据
file_path = "已知数据.xlsx"
df = pd.read_excel(file_path)

# 特征与标签
X = df[['PR','腋窝淋巴结状态','手术前怀孕','目前月经情况','ki-67','HER2+FISH','治疗后怀孕','治疗后生产','LN转移个数','放疗',
        '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','手术方式','化疗方案','内分泌治疗方案']]
y = df['标签']

# 连续变量
continuous_vars = ['PR', 'ki-67']

# 类别变量索引（CatBoost支持类别型变量索引列表）
categorical_features = [i for i, col in enumerate(X.columns) if col not in continuous_vars]

# 分层划分训练测试集
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 标准化连续变量（注意：CatBoost支持原始输入，但标准化仍然可提升性能）
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

# 类别权重：CatBoost支持 class_weights 参数
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

# 定义 CatBoost 分类器（设置 silent 和 class_weights）
catboost = CatBoostClassifier(
    random_state=42,
    verbose=0,
    class_weights=class_weight_dict,
    loss_function='Logloss',
    eval_metric='F1',
    task_type='CPU'
)

# 参数网格
param_grid = {
    'iterations': [100, 200],
    'learning_rate': [0.01, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5]
}

# F1 评分器
f1_scorer = make_scorer(f1_score, pos_label=1)

# 网格搜索
grid_search = GridSearchCV(
    estimator=catboost,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1
)

# 拟合模型（指定类别特征索引）
grid_search.fit(X_train, y_train, cat_features=categorical_features)

# 最佳模型评估
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# 输出评估结果
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print("最佳参数：", grid_search.best_params_)
print(f"测试集准确率 Accuracy: {accuracy:.4f}")
print(f"测试集召回率 Recall: {recall:.4f}")
print(f"测试集F1分数: {f1:.4f}")
print(f"测试集AUC值: {auc:.4f}")
print("\n测试集分类报告：\n", classification_report(y_test, y_pred))

最佳参数： {'depth': 4, 'iterations': 200, 'l2_leaf_reg': 1, 'learning_rate': 0.01}
测试集准确率 Accuracy: 0.6111
测试集召回率 Recall: 0.5000
测试集F1分数: 0.3288
测试集AUC值: 0.6001

测试集分类报告：
               precision    recall  f1-score   support

           0       0.84      0.64      0.73       102
           1       0.24      0.50      0.33        24

    accuracy                           0.61       126
   macro avg       0.54      0.57      0.53       126
weighted avg       0.73      0.61      0.65       126



In [2]:
# 导入ROC曲线函数
from sklearn.metrics import roc_curve

# 计算FPR, TPR, 阈值
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

# 可保存为文件用于后续绘图
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("catb_roc_curve.csv", index=False)
print("ROC数据已保存为 catb_roc_curve.csv")

ROC数据已保存为 catb_roc_curve.csv


In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, classification_report, make_scorer
from sklearn.utils.class_weight import compute_class_weight
from catboost import CatBoostClassifier, Pool
import warnings
warnings.filterwarnings("ignore")

# 设置随机种子
np.random.seed(42)
tf.random.set_seed(42)

# 加载数据
file_path = "预测.xlsx"
df = pd.read_excel(file_path)

# 特征与标签
X = df[['PR','腋窝淋巴结状态','手术前怀孕','目前月经情况','ki-67','HER2+FISH','治疗后怀孕','治疗后生产','LN转移个数','放疗',
        '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','手术方式','化疗方案','内分泌治疗方案']]
y = df['标签']

# 连续变量
continuous_vars = ['PR', 'ki-67']

# 类别变量索引（CatBoost支持类别型变量索引列表）
categorical_features = [i for i, col in enumerate(X.columns) if col not in continuous_vars]

# 从前627行中划分出20%作为测试集
X_front = X.iloc[:627]
y_front = y.iloc[:627]
X_front_train, X_test, y_front_train, y_test = train_test_split(
    X_front, y_front, test_size=0.2, stratify=y_front, random_state=42
)

# 剩下的数据（从第628行开始）作为训练集的一部分
X_rest = X.iloc[627:]
y_rest = y.iloc[627:]

# 拼接训练数据：前627行的80% + 剩下所有行
X_train = pd.concat([X_front_train, X_rest], axis=0)
y_train = pd.concat([y_front_train, y_rest], axis=0)

# 标准化连续变量（注意：CatBoost支持原始输入，但标准化仍然可提升性能）
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

# 类别权重：CatBoost支持 class_weights 参数
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

# 定义 CatBoost 分类器（设置 silent 和 class_weights）
catboost = CatBoostClassifier(
    random_state=42,
    verbose=0,
    class_weights=class_weight_dict,
    loss_function='Logloss',
    eval_metric='F1',
    task_type='CPU'
)

# 参数网格
param_grid = {
    'iterations': [100, 200],
    'learning_rate': [0.01, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5]
}

# F1 评分器
f1_scorer = make_scorer(f1_score, pos_label=1)

# 网格搜索
grid_search = GridSearchCV(
    estimator=catboost,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1
)

# 拟合模型（指定类别特征索引）
grid_search.fit(X_train, y_train, cat_features=categorical_features)

# 最佳模型评估
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# 输出评估结果
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print("最佳参数：", grid_search.best_params_)
print(f"测试集准确率 Accuracy: {accuracy:.4f}")
print(f"测试集召回率 Recall: {recall:.4f}")
print(f"测试集F1分数: {f1:.4f}")
print(f"测试集AUC值: {auc:.4f}")
print("\n测试集分类报告：\n", classification_report(y_test, y_pred))

最佳参数： {'depth': 8, 'iterations': 100, 'l2_leaf_reg': 5, 'learning_rate': 0.1}
测试集准确率 Accuracy: 0.7063
测试集召回率 Recall: 0.5000
测试集F1分数: 0.3934
测试集AUC值: 0.6671

测试集分类报告：
               precision    recall  f1-score   support

           0       0.87      0.75      0.81       102
           1       0.32      0.50      0.39        24

    accuracy                           0.71       126
   macro avg       0.59      0.63      0.60       126
weighted avg       0.76      0.71      0.73       126



In [2]:
# 导入ROC曲线函数
from sklearn.metrics import roc_curve

# 计算FPR, TPR, 阈值
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

# 可保存为文件用于后续绘图
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("catb_bjd_roc_curve.csv", index=False)
print("ROC数据已保存为 catb_bjd_roc_curve.csv")

ROC数据已保存为 catb_bjd_roc_curve.csv


# 决策树

In [5]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, classification_report, make_scorer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings("ignore")

# 设置随机种子
np.random.seed(42)
tf.random.set_seed(42)

# 加载数据
file_path = "已知数据.xlsx"
df = pd.read_excel(file_path)

# 特征与标签
X = df[['PR','腋窝淋巴结状态','手术前怀孕','目前月经情况','ki-67','HER2+FISH','治疗后怀孕','治疗后生产','LN转移个数','放疗',
        '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','手术方式','化疗方案','内分泌治疗方案']]
y = df['标签']

# 连续变量
continuous_vars = ['PR', 'ki-67']

# 分层划分训练/测试集
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 标准化连续变量
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

# 类别权重（自动平衡类别）
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

# 定义决策树分类器
dt = DecisionTreeClassifier(random_state=42, class_weight='balanced')

# 参数搜索空间
param_grid = {
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10],
    'criterion': ['gini', 'entropy']
}

# 使用 F1 分数作为评分指标
f1_scorer = make_scorer(f1_score, pos_label=1)

# 网格搜索
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1
)

# 模型训练
grid_search.fit(X_train, y_train)

# 模型预测
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# 输出评估结果
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print("最佳参数：", grid_search.best_params_)
print(f"测试集准确率 Accuracy: {accuracy:.4f}")
print(f"测试集召回率 Recall: {recall:.4f}")
print(f"测试集F1分数: {f1:.4f}")
print(f"测试集AUC值: {auc:.4f}")
print("\n测试集分类报告：\n", classification_report(y_test, y_pred))

最佳参数： {'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 10}
测试集准确率 Accuracy: 0.6349
测试集召回率 Recall: 0.4167
测试集F1分数: 0.3030
测试集AUC值: 0.6315

测试集分类报告：
               precision    recall  f1-score   support

           0       0.83      0.69      0.75       102
           1       0.24      0.42      0.30        24

    accuracy                           0.63       126
   macro avg       0.54      0.55      0.53       126
weighted avg       0.72      0.63      0.67       126



In [6]:
# 导入ROC曲线函数
from sklearn.metrics import roc_curve

# 计算FPR, TPR, 阈值
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

# 可保存为文件用于后续绘图
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("dt_roc_curve.csv", index=False)
print("ROC数据已保存为 dt_roc_curve.csv")

ROC数据已保存为 dt_roc_curve.csv


In [7]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, classification_report, make_scorer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings("ignore")

# 设置随机种子
np.random.seed(42)
tf.random.set_seed(42)

# 加载数据
file_path = "预测.xlsx"
df = pd.read_excel(file_path)

# 特征与标签
X = df[['PR','腋窝淋巴结状态','手术前怀孕','目前月经情况','ki-67','HER2+FISH','治疗后怀孕','治疗后生产','LN转移个数','放疗',
        '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','手术方式','化疗方案','内分泌治疗方案']]
y = df['标签']

# 连续变量
continuous_vars = ['PR', 'ki-67']

# 从前627行中划分出20%作为测试集
X_front = X.iloc[:627]
y_front = y.iloc[:627]
X_front_train, X_test, y_front_train, y_test = train_test_split(
    X_front, y_front, test_size=0.2, stratify=y_front, random_state=42
)

# 剩下的数据（从第628行开始）作为训练集的一部分
X_rest = X.iloc[627:]
y_rest = y.iloc[627:]

# 拼接训练数据：前627行的80% + 剩下所有行
X_train = pd.concat([X_front_train, X_rest], axis=0)
y_train = pd.concat([y_front_train, y_rest], axis=0)

# 标准化连续变量
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

# 类别权重（自动平衡类别）
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

# 定义决策树分类器
dt = DecisionTreeClassifier(random_state=42, class_weight='balanced')

# 参数搜索空间
param_grid = {
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10],
    'criterion': ['gini', 'entropy']
}

# 使用 F1 分数作为评分指标
f1_scorer = make_scorer(f1_score, pos_label=1)

# 网格搜索
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1
)

# 模型训练
grid_search.fit(X_train, y_train)

# 模型预测
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# 输出评估结果
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print("最佳参数：", grid_search.best_params_)
print(f"测试集准确率 Accuracy: {accuracy:.4f}")
print(f"测试集召回率 Recall: {recall:.4f}")
print(f"测试集F1分数: {f1:.4f}")
print(f"测试集AUC值: {auc:.4f}")
print("\n测试集分类报告：\n", classification_report(y_test, y_pred))

最佳参数： {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 5, 'min_samples_split': 20}
测试集准确率 Accuracy: 0.6667
测试集召回率 Recall: 0.5417
测试集F1分数: 0.3824
测试集AUC值: 0.6391

测试集分类报告：
               precision    recall  f1-score   support

           0       0.87      0.70      0.77       102
           1       0.30      0.54      0.38        24

    accuracy                           0.67       126
   macro avg       0.58      0.62      0.58       126
weighted avg       0.76      0.67      0.70       126



In [8]:
# 导入ROC曲线函数
from sklearn.metrics import roc_curve

# 计算FPR, TPR, 阈值
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

# 可保存为文件用于后续绘图
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("dt_bjd_roc_curve.csv", index=False)
print("ROC数据已保存为 dt_bjd_roc_curve.csv")

ROC数据已保存为 dt_bjd_roc_curve.csv


# 逻辑回归

In [9]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, classification_report, make_scorer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

# 设置随机种子
np.random.seed(42)
tf.random.set_seed(42)

# 加载数据
file_path = "已知数据.xlsx"
df = pd.read_excel(file_path)

# 特征与标签
X = df[['PR','腋窝淋巴结状态','手术前怀孕','目前月经情况','ki-67','HER2+FISH','治疗后怀孕','治疗后生产','LN转移个数','放疗',
        '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','手术方式','化疗方案','内分泌治疗方案']]
y = df['标签']

# 连续变量
continuous_vars = ['PR', 'ki-67']

# 数据划分
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 连续变量标准化
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

# 类别权重处理
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

# 定义逻辑回归分类器
log_reg = LogisticRegression(
    solver='liblinear',  # 更稳定的求解器，适合小数据
    class_weight='balanced',
    random_state=42
)

# 参数网格
param_grid = {
    'C': [0.01, 0.1, 1, 10],            # 正则化强度
    'penalty': ['l1', 'l2']             # L1 或 L2 正则
}

# 使用 F1 分数作为评分指标
f1_scorer = make_scorer(f1_score, pos_label=1)

# 网格搜索
grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1
)

# 拟合模型
grid_search.fit(X_train, y_train)

# 最佳模型预测
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# 评估结果
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print("最佳参数：", grid_search.best_params_)
print(f"测试集准确率 Accuracy: {accuracy:.4f}")
print(f"测试集召回率 Recall: {recall:.4f}")
print(f"测试集F1分数: {f1:.4f}")
print(f"测试集AUC值: {auc:.4f}")
print("\n测试集分类报告：\n", classification_report(y_test, y_pred))

最佳参数： {'C': 0.1, 'penalty': 'l1'}
测试集准确率 Accuracy: 0.6905
测试集召回率 Recall: 0.5000
测试集F1分数: 0.3810
测试集AUC值: 0.6181

测试集分类报告：
               precision    recall  f1-score   support

           0       0.86      0.74      0.79       102
           1       0.31      0.50      0.38        24

    accuracy                           0.69       126
   macro avg       0.58      0.62      0.59       126
weighted avg       0.76      0.69      0.72       126



In [10]:
# 导入ROC曲线函数
from sklearn.metrics import roc_curve

# 计算FPR, TPR, 阈值
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

# 可保存为文件用于后续绘图
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("lr_roc_curve.csv", index=False)
print("ROC数据已保存为 lr_roc_curve.csv")

ROC数据已保存为 lr_roc_curve.csv


In [11]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, classification_report, make_scorer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

# 设置随机种子
np.random.seed(42)
tf.random.set_seed(42)

# 加载数据
file_path = "预测.xlsx"
df = pd.read_excel(file_path)

# 特征与标签
X = df[['PR','腋窝淋巴结状态','手术前怀孕','目前月经情况','ki-67','HER2+FISH','治疗后怀孕','治疗后生产','LN转移个数','放疗',
        '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','手术方式','化疗方案','内分泌治疗方案']]
y = df['标签']

# 连续变量
continuous_vars = ['PR', 'ki-67']

# 从前627行中划分出20%作为测试集
X_front = X.iloc[:627]
y_front = y.iloc[:627]
X_front_train, X_test, y_front_train, y_test = train_test_split(
    X_front, y_front, test_size=0.2, stratify=y_front, random_state=42
)

# 剩下的数据（从第628行开始）作为训练集的一部分
X_rest = X.iloc[627:]
y_rest = y.iloc[627:]

# 拼接训练数据：前627行的80% + 剩下所有行
X_train = pd.concat([X_front_train, X_rest], axis=0)
y_train = pd.concat([y_front_train, y_rest], axis=0)

# 连续变量标准化
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

# 类别权重处理
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

# 定义逻辑回归分类器
log_reg = LogisticRegression(
    solver='liblinear',  # 更稳定的求解器，适合小数据
    class_weight='balanced',
    random_state=42
)

# 参数网格
param_grid = {
    'C': [0.01, 0.1, 1, 10],            # 正则化强度
    'penalty': ['l1', 'l2']             # L1 或 L2 正则
}

# 使用 F1 分数作为评分指标
f1_scorer = make_scorer(f1_score, pos_label=1)

# 网格搜索
grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1
)

# 拟合模型
grid_search.fit(X_train, y_train)

# 最佳模型预测
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# 评估结果
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print("最佳参数：", grid_search.best_params_)
print(f"测试集准确率 Accuracy: {accuracy:.4f}")
print(f"测试集召回率 Recall: {recall:.4f}")
print(f"测试集F1分数: {f1:.4f}")
print(f"测试集AUC值: {auc:.4f}")
print("\n测试集分类报告：\n", classification_report(y_test, y_pred))


最佳参数： {'C': 0.1, 'penalty': 'l1'}
测试集准确率 Accuracy: 0.6984
测试集召回率 Recall: 0.5417
测试集F1分数: 0.4062
测试集AUC值: 0.6417

测试集分类报告：
               precision    recall  f1-score   support

           0       0.87      0.74      0.80       102
           1       0.33      0.54      0.41        24

    accuracy                           0.70       126
   macro avg       0.60      0.64      0.60       126
weighted avg       0.77      0.70      0.72       126



In [12]:
# 导入ROC曲线函数
from sklearn.metrics import roc_curve

# 计算FPR, TPR, 阈值
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

# 可保存为文件用于后续绘图
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("lr_bjd_roc_curve.csv", index=False)
print("ROC数据已保存为 lr_bjd_roc_curve.csv")

ROC数据已保存为 lr_bjd_roc_curve.csv


# 朴素贝叶斯

In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns

# 设置随机种子以确保结果的可重复性
np.random.seed(42)

# 加载数据
file_path = "已知数据.xlsx"
df = pd.read_excel(file_path)

X = df[['LN转移个数','腋窝淋巴结状态','PR', 'HER2+FISH', 'ki-67', '手术前怀孕', '治疗后怀孕', '治疗后生产','目前月经情况','手术方式','放疗', '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','化疗方案', '内分泌治疗方案']]
y = df['标签']

# 使用分层划分以保持类别比例
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# 连续变量
continuous_vars = ['PR', 'ki-67']

# 朴素贝叶斯处理连续变量的关键优化：离散化
# 使用分箱将连续变量转换为离散变量
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
X_train[continuous_vars] = discretizer.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = discretizer.transform(X_test[continuous_vars])

# 定义朴素贝叶斯分类器
nb = GaussianNB()

# 朴素贝叶斯的超参数网格（相对较少）
param_grid = {
    'var_smoothing': np.logspace(-10, -5, 20)  # 方差平滑参数
}

# 使用网格搜索和交叉验证
grid_search = GridSearchCV(
    estimator=nb,
    param_grid=param_grid,
    cv=5,  # 使用5折交叉验证
    n_jobs=-1,  # 使用所有CPU核心
    verbose=1,  # 适度输出日志
    scoring='roc_auc'
)

grid_search.fit(X_train, y_train)

# 使用最佳模型进行预测
best_nb = grid_search.best_estimator_
y_pred = best_nb.predict(X_test)
y_test_proba = best_nb.predict_proba(X_test)[:, 1]

# 评估模型性能
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, pos_label=1)
roc_auc = roc_auc_score(y_test, y_test_proba)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

# 评估模型
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'var_smoothing': 1e-10}
Accuracy: 0.7381
F1 Score: 0.2667
ROC AUC: 0.6565

Confusion Matrix:
[[87 15]
 [18  6]]

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.85      0.84       102
           1       0.29      0.25      0.27        24

    accuracy                           0.74       126
   macro avg       0.56      0.55      0.55       126
weighted avg       0.73      0.74      0.73       126



In [14]:
# 导入ROC曲线函数
from sklearn.metrics import roc_curve

# 计算FPR, TPR, 阈值
fpr, tpr, thresholds = roc_curve(y_test, y_test_proba)

# 可保存为文件用于后续绘图
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("nb_roc_curve.csv", index=False)
print("ROC数据已保存为 nb_roc_curve.csv")

ROC数据已保存为 nb_roc_curve.csv


In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns

# 设置随机种子以确保结果的可重复性
np.random.seed(42)

# 加载数据
file_path = "预测.xlsx"
df = pd.read_excel(file_path)

X = df[['LN转移个数','腋窝淋巴结状态','PR', 'HER2+FISH', 'ki-67', '手术前怀孕', '治疗后怀孕', '治疗后生产','目前月经情况','手术方式','放疗', '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','化疗方案', '内分泌治疗方案']]
y = df['标签']

# 从前627行中划分出20%作为测试集
X_front = X.iloc[:627]
y_front = y.iloc[:627]
X_front_train, X_test, y_front_train, y_test = train_test_split(
    X_front, y_front, test_size=0.2, stratify=y_front, random_state=42
)

# 剩下的数据（从第628行开始）作为训练集的一部分
X_rest = X.iloc[627:]
y_rest = y.iloc[627:]

# 拼接训练数据：前627行的80% + 剩下所有行
X_train = pd.concat([X_front_train, X_rest], axis=0)
y_train = pd.concat([y_front_train, y_rest], axis=0)

# 连续变量
continuous_vars = ['PR', 'ki-67']

# 朴素贝叶斯处理连续变量的关键优化：离散化
# 使用分箱将连续变量转换为离散变量
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
X_train[continuous_vars] = discretizer.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = discretizer.transform(X_test[continuous_vars])

# 定义朴素贝叶斯分类器
nb = GaussianNB()

# 朴素贝叶斯的超参数网格（相对较少）
param_grid = {
    'var_smoothing': np.logspace(-10, -5, 20)  # 方差平滑参数
}

# 使用网格搜索和交叉验证
grid_search = GridSearchCV(
    estimator=nb,
    param_grid=param_grid,
    cv=5,  # 使用5折交叉验证
    n_jobs=-1,  # 使用所有CPU核心
    verbose=1,  # 适度输出日志
    scoring='roc_auc'
)

grid_search.fit(X_train, y_train)

# 使用最佳模型进行预测
best_nb = grid_search.best_estimator_
y_pred = best_nb.predict(X_test)
y_test_proba = best_nb.predict_proba(X_test)[:, 1]

# 评估模型性能
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, pos_label=1)
roc_auc = roc_auc_score(y_test, y_test_proba)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

# 评估模型
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'var_smoothing': 1e-05}
Accuracy: 0.6984
F1 Score: 0.4062
ROC AUC: 0.7014

Confusion Matrix:
[[75 27]
 [11 13]]

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.74      0.80       102
           1       0.33      0.54      0.41        24

    accuracy                           0.70       126
   macro avg       0.60      0.64      0.60       126
weighted avg       0.77      0.70      0.72       126



In [16]:
# 导入ROC曲线函数
from sklearn.metrics import roc_curve

# 计算FPR, TPR, 阈值
fpr, tpr, thresholds = roc_curve(y_test, y_test_proba)

# 可保存为文件用于后续绘图
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("nb_bjd_roc_curve.csv", index=False)
print("ROC数据已保存为 nb_bjd_roc_curve.csv")

ROC数据已保存为 nb_bjd_roc_curve.csv
