# 随机森林

In [1]:
import numpy as np
import umap
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import tensorflow as tf
import statsmodels.api as sm
from sklearn.svm import SVC
from sklearn.manifold import TSNE
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer

from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import pearsonr, pointbiserialr, spearmanr
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, classification_report, accuracy_score, roc_auc_score, confusion_matrix, roc_curve, auc, silhouette_score, precision_recall_curve, f1_score, precision_score, recall_score
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Embedding, Flatten, Concatenate
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 设置随机种子以确保结果的可重复性
np.random.seed(42)
tf.random.set_seed(42)

In [3]:
# 加载数据
file_path = "预测.xlsx"
df = pd.read_excel(file_path)

In [4]:
# 加载文本特征
text_embeddings = np.load('PubMedBERT.npy')

In [5]:
# scaler = StandardScaler()
# text_embeddings_scaled = scaler.fit_transform(text_embeddings)

# pca = PCA(n_components=10)
# text_features_pca = pca.fit_transform(text_embeddings_scaled)
text_features_pca = umap.UMAP(n_components=2, random_state=42).fit_transform(MinMaxScaler().fit_transform(text_embeddings))
# 动态生成列名
n_pca_features = text_features_pca.shape[1]
pca_columns = [f'Text_Embedding_Combined{i+1}' for i in range(n_pca_features)]

# 转换为DataFrame
text_feature_pca_df = pd.DataFrame(text_features_pca, columns=pca_columns)

In [6]:
X = df[['LN转移个数','腋窝淋巴结状态','PR', 'HER2+FISH', 'ki-67', 
        '手术前怀孕', '治疗后怀孕', '治疗后生产','目前月经情况',
        '手术方式','放疗', '化疗期间是否应用诺雷德', 
        '靶向治疗（赫赛汀或赫赛汀+帕捷特）','化疗方案', '内分泌治疗方案']]
#X = df[['LN转移个数','腋窝淋巴结状态','PR', 'HER2+FISH', 'ki-67', '手术前怀孕', '治疗后怀孕', '治疗后生产','目前月经情况','手术方式','放疗', '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','化疗方案', '内分泌治疗方案','Text_Embedding_Combined1','Text_Embedding_Combined2']]

In [7]:
X = pd.concat([X, text_feature_pca_df], axis=1)

In [8]:
X

Unnamed: 0,LN转移个数,腋窝淋巴结状态,PR,HER2+FISH,ki-67,手术前怀孕,治疗后怀孕,治疗后生产,目前月经情况,手术方式,放疗,化疗期间是否应用诺雷德,靶向治疗（赫赛汀或赫赛汀+帕捷特）,化疗方案,内分泌治疗方案,Text_Embedding_Combined1,Text_Embedding_Combined2
0,5,1,10,1,0.20,2,0,0,6,2,1,0,1,2,1,14.980383,5.013096
1,2,1,95,2,0.10,1,0,0,1,2,1,0,0,4,1,12.068896,8.871240
2,0,0,70,1,0.05,3,0,0,5,1,1,0,0,8,1,16.276093,8.185767
3,6,1,3,0,0.60,2,0,0,7,5,1,0,0,4,1,16.475365,6.555526
4,0,0,90,3,0.05,4,0,0,6,5,0,0,1,2,1,12.591355,5.566292
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1361,0,0,90,0,0.40,2,0,0,2,1,1,0,0,0,1,13.169783,8.524548
1362,1,1,95,0,0.45,0,0,0,1,1,1,0,0,0,3,12.164911,5.536772
1363,8,1,0,1,0.40,1,0,0,1,4,1,0,0,7,0,17.065783,6.023079
1364,0,0,95,2,0.15,3,0,0,2,1,1,0,0,8,0,14.051453,7.935132


In [9]:
y = df['标签']

In [10]:
# 连续变量
#continuous_vars = ['PR', 'ki-67'] + pca_columns
continuous_vars = ['PR', 'ki-67','Text_Embedding_Combined1','Text_Embedding_Combined2']

In [11]:
# 从前627行中划分出20%作为测试集
X_front = X.iloc[:627]
y_front = y.iloc[:627]
X_front_train, X_test, y_front_train, y_test = train_test_split(
    X_front, y_front, test_size=0.2, stratify=y_front, random_state=42
)

# 剩下的数据（从第628行开始）作为训练集的一部分
X_rest = X.iloc[627:]
y_rest = y.iloc[627:]

# 拼接训练数据：前627行的80% + 剩下所有行
X_train = pd.concat([X_front_train, X_rest], axis=0)
y_train = pd.concat([y_front_train, y_rest], axis=0)

In [12]:
# # 使用分层划分以保持类别比例
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [13]:
# 2. 连续变量：进行标准化
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])

In [14]:
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

In [15]:
# 计算类别权重
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

In [16]:
class_weight_dict

{0: 0.8136482939632546, 1: 1.297071129707113}

In [17]:
# 定义随机森林分类器
rf = RandomForestClassifier(random_state=42, class_weight='balanced')

In [18]:
#定义超参数网格
param_grid = {
    'n_estimators': [100, 200, 300],  # 树的数量
    'max_depth': [None, 10, 20, 30],  # 树的最大深度
    'min_samples_split': [2, 5, 10],  # 内部节点再划分所需最小样本数
    'min_samples_leaf': [1, 2, 4],  # 叶子节点所需最小样本数
    'max_features': ['sqrt', 'log2', None],  # 划分时考虑的最大特征数
    'class_weight': [None, 'balanced', class_weight_dict]  # 类别权重选项
}

In [19]:
# 使用网格搜索和交叉验证来寻找最佳参数
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='roc_auc')
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 972 candidates, totalling 4860 fits


In [20]:
# 输出最佳参数和对应得分
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

Best parameters found:  {'class_weight': None, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Best cross-validation score:  0.9150919655142757


In [21]:
# 使用最佳模型进行预测
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

In [22]:
y_test_proba = best_rf.predict_proba(X_test)[:, 1]
# 评估模型性能
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, pos_label=1)
roc_auc = roc_auc_score(y_test, y_test_proba)
print(f"Accuracy (Adjusted Threshold): {accuracy}")
print(f"F1 Score: {f1}")
print(f"ROC AUC: {roc_auc}")

Accuracy (Adjusted Threshold): 0.7142857142857143
F1 Score: 0.41935483870967744
ROC AUC: 0.7393790849673203


In [23]:
# 评估模型
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[77 25]
 [11 13]]

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.75      0.81       102
           1       0.34      0.54      0.42        24

    accuracy                           0.71       126
   macro avg       0.61      0.65      0.61       126
weighted avg       0.77      0.71      0.74       126



In [24]:
# 导入ROC曲线函数
from sklearn.metrics import roc_curve

# 计算FPR, TPR, 阈值
fpr, tpr, thresholds = roc_curve(y_test, y_test_proba)

# 可保存为文件用于后续绘图
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("rf_yy_roc_curve.csv", index=False)
print("ROC数据已保存为 rf_yy_roc_curve.csv")

ROC数据已保存为 rf_yy_roc_curve.csv


In [25]:
import joblib

# 保存最优模型和标准化器
joblib.dump(best_rf, 'bf_w.pkl')
joblib.dump(scaler, 'scaler_bf_w.pkl')

print("✅ 模型和标准化器已保存。")

✅ 模型和标准化器已保存。


# XGBoost

In [1]:
import numpy as np
import umap
import pandas as pd
import xgboost as xgb
import seaborn as sns
import lightgbm as lgb
import tensorflow as tf
import statsmodels.api as sm
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import pearsonr, pointbiserialr, spearmanr
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, classification_report, accuracy_score, roc_auc_score, confusion_matrix, roc_curve, auc, silhouette_score, precision_recall_curve, f1_score, precision_score, recall_score
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Embedding, Flatten, Concatenate
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 设置随机种子以确保结果的可重复性
np.random.seed(42)
tf.random.set_seed(42)

In [3]:
# 加载数据
file_path = "预测.xlsx"
df = pd.read_excel(file_path)
# 加载文本特征
text_embeddings = np.load('PubMedBERT.npy')
# scaler = StandardScaler()
# text_embeddings_scaled = scaler.fit_transform(text_embeddings)

# pca = PCA(n_components=2)
# text_features_pca = pca.fit_transform(text_embeddings_scaled)
text_features_pca = umap.UMAP(n_components=2, random_state=42).fit_transform(MinMaxScaler().fit_transform(text_embeddings))
# 动态生成列名
n_pca_features = text_features_pca.shape[1]
pca_columns = [f'Text_Embedding_Combined{i+1}' for i in range(n_pca_features)]

# 转换为DataFrame
text_feature_pca_df = pd.DataFrame(text_features_pca, columns=pca_columns)
X = df[['LN转移个数','腋窝淋巴结状态','PR', 'HER2+FISH', 'ki-67', '手术前怀孕', '治疗后怀孕', '治疗后生产','目前月经情况','手术方式','放疗', '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','化疗方案', '内分泌治疗方案']]
X = pd.concat([X, text_feature_pca_df], axis=1)
y = df['标签']
# 连续变量
continuous_vars = ['PR', 'ki-67'] + pca_columns

# 从前627行中划分出20%作为测试集
X_front = X.iloc[:627]
y_front = y.iloc[:627]
X_front_train, X_test, y_front_train, y_test = train_test_split(
    X_front, y_front, test_size=0.2, stratify=y_front, random_state=42
)

# 剩下的数据（从第628行开始）作为训练集的一部分
X_rest = X.iloc[627:]
y_rest = y.iloc[627:]

# 拼接训练数据：前627行的80% + 剩下所有行
X_train = pd.concat([X_front_train, X_rest], axis=0)
y_train = pd.concat([y_front_train, y_rest], axis=0)

# 2. 连续变量：进行标准化
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])
# 计算类别权重
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

In [4]:
scale_pos_weight = class_weight_dict.get(1, 1)

In [5]:
# 定义 XGBoost 分类器
xgb_classifier = xgb.XGBClassifier(
    objective='binary:logistic',  # 二分类逻辑回归
    scale_pos_weight=scale_pos_weight,
    random_state=42
)

In [6]:
# 定义超参数网格
param_grid = {
    'n_estimators': [100, 200, 300],  # 树的数量
    'max_depth': [3, 5, 7],  # 树的最大深度
    'learning_rate': [0.01, 0.1, 0.2],  # 学习率
    'subsample': [0.7, 0.8, 1.0],  # 每次迭代时随机选择的样本比例
    'colsample_bytree': [0.5, 0.7, 1.0],  # 每棵树随机选择的特征比例
    'gamma': [0, 0.1, 0.2],  # 树的叶子节点上进行进一步分裂所需的最小损失减少量
    'min_child_weight': [1, 2, 3]  # 子节点中最小的样本权重和
}

In [7]:
# 使用网格搜索和交叉验证来寻找最佳参数
grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='roc_auc')
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 2187 candidates, totalling 10935 fits


In [8]:
# 输出最佳参数和对应得分
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

Best parameters found:  {'colsample_bytree': 0.5, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 1.0}
Best cross-validation score:  0.9164290935672514


In [9]:
# 获取最佳模型
best_xgb = grid_search.best_estimator_

In [10]:
# 使用最佳模型进行预测
y_pred = best_xgb.predict(X_test)

In [11]:
y_test_proba = best_xgb.predict_proba(X_test)[:, 1]
# 评估模型性能
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, pos_label=1)
roc_auc = roc_auc_score(y_test, y_test_proba)
print(f"Accuracy (Adjusted Threshold): {accuracy}")
print(f"F1 Score: {f1}")
print(f"ROC AUC: {roc_auc}")

Accuracy (Adjusted Threshold): 0.7222222222222222
F1 Score: 0.4262295081967213
ROC AUC: 0.7234477124183006


In [12]:
# 计算 FPR, TPR, Thresholds
from sklearn.metrics import roc_curve
import pandas as pd

fpr, tpr, thresholds = roc_curve(y_test, y_test_proba)

# 保存为CSV，供画图使用
roc_df = pd.DataFrame({
    'FPR': fpr,
    'TPR': tpr,
    'Threshold': thresholds
})
roc_df.to_csv("xgb_yy_roc_curve.csv", index=False)
print("ROC 曲线数据已保存至 xgb_yy_roc_curve.csv")

ROC 曲线数据已保存至 xgb_yy_roc_curve.csv


In [13]:
import joblib

# 保存最优模型和标准化器
joblib.dump(best_xgb, 'best_xgb.pkl')
joblib.dump(scaler, 'scaler_xgb.pkl')

print("✅ 模型和标准化器已保存。")

✅ 模型和标准化器已保存。


# SVM

In [34]:
import numpy as np
import umap
import pandas as pd
import xgboost as xgb
import seaborn as sns
import lightgbm as lgb
import tensorflow as tf
import statsmodels.api as sm
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, classification_report, accuracy_score, roc_auc_score, confusion_matrix, roc_curve, auc, silhouette_score, precision_recall_curve, f1_score, precision_score, recall_score
from scipy.stats import pearsonr, pointbiserialr, spearmanr
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Embedding, Flatten, Concatenate
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

In [35]:
# 设置随机种子以确保结果的可重复性
np.random.seed(42)
tf.random.set_seed(42)

In [36]:
# 加载数据
file_path = "预测.xlsx"
df = pd.read_excel(file_path)
# 加载文本特征
text_embeddings = np.load('PubMedBERT.npy')
# scaler = StandardScaler()
# text_embeddings_scaled = scaler.fit_transform(text_embeddings)

# pca = PCA(n_components=2)
# text_features_pca = pca.fit_transform(text_embeddings_scaled)
text_features_pca = umap.UMAP(n_components=2, random_state=42).fit_transform(MinMaxScaler().fit_transform(text_embeddings))
# 动态生成列名
n_pca_features = text_features_pca.shape[1]
pca_columns = [f'Text_Embedding_Combined{i+1}' for i in range(n_pca_features)]

# 转换为DataFrame
text_feature_pca_df = pd.DataFrame(text_features_pca, columns=pca_columns)
X = df[['LN转移个数','腋窝淋巴结状态','PR', 'HER2+FISH', 'ki-67', '手术前怀孕', '治疗后怀孕', '治疗后生产','目前月经情况','手术方式','放疗', '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','化疗方案', '内分泌治疗方案']]
X = pd.concat([X, text_feature_pca_df], axis=1)
y = df['标签']
# 连续变量
continuous_vars = ['PR', 'ki-67'] + pca_columns
# 从前627行中划分出20%作为测试集
X_front = X.iloc[:627]
y_front = y.iloc[:627]
X_front_train, X_test, y_front_train, y_test = train_test_split(
    X_front, y_front, test_size=0.2, stratify=y_front, random_state=42
)

# 剩下的数据（从第628行开始）作为训练集的一部分
X_rest = X.iloc[627:]
y_rest = y.iloc[627:]

# 拼接训练数据：前627行的80% + 剩下所有行
X_train = pd.concat([X_front_train, X_rest], axis=0)
y_train = pd.concat([y_front_train, y_rest], axis=0)

# 2. 连续变量：进行标准化
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])
# 计算类别权重
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

In [37]:
# 定义支持向量机分类器
svm = SVC(random_state=42)

In [38]:
# 定义超参数网格
param_grid = {
    'C': [0.1, 1, 10],  # 正则化参数
    'kernel': ['linear', 'rbf', 'poly'],  # 核函数
    'gamma': ['scale', 'auto', 0.1, 1],  # 核函数系数
    'class_weight': [None, 'balanced', class_weight_dict]  # 类别权重
}

In [39]:
# 使用网格搜索和交叉验证来寻找最佳参数
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='roc_auc')
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [40]:
# 输出最佳参数和对应得分
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

Best parameters found:  {'C': 10, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'rbf'}
Best cross-validation score:  0.8845542010663914


In [41]:
best_svm = grid_search.best_estimator_

In [42]:
# 使用最佳模型进行预测
y_pred = best_svm.predict(X_test)

In [43]:
# 计算F1分数（针对二分类）
# 计算准确率
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.4f}")
# 计算AUC值
auc = roc_auc_score(y_test, best_svm.decision_function(X_test))
print(f"AUC: {auc:.4f}")


Accuracy: 0.7142857142857143
F1 Score: 0.4194
AUC: 0.6609


In [44]:
# 评估模型
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[77 25]
 [11 13]]

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.75      0.81       102
           1       0.34      0.54      0.42        24

    accuracy                           0.71       126
   macro avg       0.61      0.65      0.61       126
weighted avg       0.77      0.71      0.74       126



# 梯度提升树

In [1]:
import numpy as np
import umap
import pandas as pd
import xgboost as xgb
import seaborn as sns
import lightgbm as lgb
import tensorflow as tf
import statsmodels.api as sm
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import pearsonr, pointbiserialr, spearmanr
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, classification_report, accuracy_score, roc_auc_score, confusion_matrix, roc_curve, auc, silhouette_score, precision_recall_curve, f1_score, precision_score, recall_score
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Embedding, Flatten, Concatenate
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 设置随机种子以确保结果的可重复性
np.random.seed(42)
tf.random.set_seed(42)

In [3]:
# 加载数据
file_path = "预测.xlsx"
df = pd.read_excel(file_path)
# 加载文本特征
text_embeddings = np.load('PubMedBERT.npy')
# scaler = StandardScaler()
# text_embeddings_scaled = scaler.fit_transform(text_embeddings)

# pca = PCA(n_components=2)
# text_features_pca = pca.fit_transform(text_embeddings_scaled)
text_features_pca = umap.UMAP(n_components=2, random_state=42).fit_transform(MinMaxScaler().fit_transform(text_embeddings))
# 动态生成列名
n_pca_features = text_features_pca.shape[1]
pca_columns = [f'Text_Embedding_Combined{i+1}' for i in range(n_pca_features)]

# 转换为DataFrame
text_feature_pca_df = pd.DataFrame(text_features_pca, columns=pca_columns)
X = df[['LN转移个数','腋窝淋巴结状态','PR', 'HER2+FISH', 'ki-67', '手术前怀孕', '治疗后怀孕', '治疗后生产','目前月经情况','手术方式','放疗', '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','化疗方案', '内分泌治疗方案']]
X = pd.concat([X, text_feature_pca_df], axis=1)
y = df['标签']
# 连续变量
continuous_vars = ['PR', 'ki-67'] + pca_columns
# 从前627行中划分出20%作为测试集
X_front = X.iloc[:627]
y_front = y.iloc[:627]
X_front_train, X_test, y_front_train, y_test = train_test_split(
    X_front, y_front, test_size=0.2, stratify=y_front, random_state=42
)

# 剩下的数据（从第628行开始）作为训练集的一部分
X_rest = X.iloc[627:]
y_rest = y.iloc[627:]

# 拼接训练数据：前627行的80% + 剩下所有行
X_train = pd.concat([X_front_train, X_rest], axis=0)
y_train = pd.concat([y_front_train, y_rest], axis=0)

# 2. 连续变量：进行标准化
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])
# 计算类别权重
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

In [4]:
sample_weights_train = np.array([class_weight_dict[label] for label in y_train])

In [5]:
# 定义梯度提升树分类器
gbc = GradientBoostingClassifier(random_state=42)

In [6]:
# 定义超参数网格
param_grid = {
    'n_estimators': [50, 100, 200],  # 树的数量
    'learning_rate': [0.01, 0.1, 0.2],  # 学习率
    'max_depth': [3, 5, 7],  # 树的最大深度
    'min_samples_split': [2, 5, 10],  # 内部节点再划分所需最小样本数
    'min_samples_leaf': [1, 2, 4],  # 叶子节点所需最小样本数
    'subsample': [0.7, 0.8, 1.0]  # 每次迭代时随机选择的样本比例
}

In [7]:
# 使用网格搜索和交叉验证来寻找最佳参数
grid_search = GridSearchCV(estimator=gbc, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='roc_auc')
grid_search.fit(X_train, y_train, sample_weight=sample_weights_train)

Fitting 5 folds for each of 729 candidates, totalling 3645 fits


In [8]:
# 输出最佳参数和对应得分
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

Best parameters found:  {'learning_rate': 0.2, 'max_depth': 7, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100, 'subsample': 1.0}
Best cross-validation score:  0.9143449539903681


In [9]:
best_gbc = grid_search.best_estimator_

In [10]:
# 使用最佳模型进行预测
y_pred = best_gbc.predict(X_test)

In [11]:
y_test_proba = best_gbc.predict_proba(X_test)[:, 1]
# 评估模型性能
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, pos_label=1)
roc_auc = roc_auc_score(y_test, y_test_proba)
print(f"Accuracy (Adjusted Threshold): {accuracy}")
print(f"F1 Score: {f1}")
print(f"ROC AUC: {roc_auc}")

Accuracy (Adjusted Threshold): 0.7142857142857143
F1 Score: 0.4
ROC AUC: 0.7308006535947713


In [12]:
# 评估模型
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[78 24]
 [12 12]]

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.76      0.81       102
           1       0.33      0.50      0.40        24

    accuracy                           0.71       126
   macro avg       0.60      0.63      0.61       126
weighted avg       0.77      0.71      0.73       126



In [13]:
# 计算 FPR, TPR, Thresholds
from sklearn.metrics import roc_curve
import pandas as pd

fpr, tpr, thresholds = roc_curve(y_test, y_test_proba)

# 保存为CSV，供画图使用
roc_df = pd.DataFrame({
    'FPR': fpr,
    'TPR': tpr,
    'Threshold': thresholds
})
roc_df.to_csv("gbt_yy_roc_curve.csv", index=False)
print("ROC 曲线数据已保存至 gbt_yy_roc_curve.csv")

ROC 曲线数据已保存至 gbt_yy_roc_curve.csv


In [14]:
import joblib

# 保存最优模型和标准化器
joblib.dump(best_gbc, 'best_gbc.pkl')
joblib.dump(scaler, 'scaler_gbc.pkl')

print("✅ 模型和标准化器已保存。")

✅ 模型和标准化器已保存。


# 神经网络

In [35]:
from sklearn.cluster import KMeans
import random
import umap
import numpy as np
import pandas as pd
import xgboost as xgb
import seaborn as sns
import lightgbm as lgb
import tensorflow as tf
import statsmodels.api as sm
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, classification_report, accuracy_score, roc_auc_score, confusion_matrix, roc_curve, auc, silhouette_score, precision_recall_curve, f1_score, precision_score, recall_score
from scipy.stats import pearsonr, pointbiserialr, spearmanr
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Embedding, Flatten, Concatenate
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

In [36]:
# 设置随机种子以确保结果的可重复性
random.seed(42)
np.random.seed(42)
tf.keras.utils.set_random_seed(42)

In [37]:
# 加载数据
file_path = "预测.xlsx"
df = pd.read_excel(file_path)
# 加载文本特征
text_embeddings = np.load('PubMedBERT.npy')
# scaler = StandardScaler()
# text_embeddings_scaled = scaler.fit_transform(text_embeddings)

# pca = PCA(n_components=2)
# text_features_pca = pca.fit_transform(text_embeddings_scaled)
text_features_pca = umap.UMAP(n_components=2, random_state=42).fit_transform(MinMaxScaler().fit_transform(text_embeddings))
# 动态生成列名
n_pca_features = text_features_pca.shape[1]
pca_columns = [f'Text_Embedding_Combined{i+1}' for i in range(n_pca_features)]

# 转换为DataFrame
text_feature_pca_df = pd.DataFrame(text_features_pca, columns=pca_columns)
X = df[['LN转移个数','腋窝淋巴结状态','PR', 'HER2+FISH', 'ki-67', '手术前怀孕', '治疗后怀孕', '治疗后生产','目前月经情况','手术方式','放疗', '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','化疗方案', '内分泌治疗方案']]
X = pd.concat([X, text_feature_pca_df], axis=1)
y = df['标签']
# 连续变量
continuous_vars = ['PR', 'ki-67'] + pca_columns
# 使用分层划分以保持类别比例
# 从前627行中划分出20%作为测试集
X_front = X.iloc[:627]
y_front = y.iloc[:627]
X_front_train, X_test, y_front_train, y_test = train_test_split(
    X_front, y_front, test_size=0.2, stratify=y_front, random_state=42
)

# 剩下的数据（从第628行开始）作为训练集的一部分
X_rest = X.iloc[627:]
y_rest = y.iloc[627:]

# 拼接训练数据：前627行的80% + 剩下所有行
X_train = pd.concat([X_front_train, X_rest], axis=0)
y_train = pd.concat([y_front_train, y_rest], axis=0)
# 2. 连续变量：进行标准化
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])
# 计算类别权重
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

In [38]:
# 定义神经网络模型
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=X_train.shape[1]))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

In [39]:
# 编译模型
adam = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

In [40]:
# 定义早停回调
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [41]:
# 训练模型
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, 
                    class_weight=class_weight_dict, callbacks=[early_stopping],shuffle=False,verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100


Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100


In [42]:
# 评估模型
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')

Test Loss: 0.5682604908943176
Test Accuracy: 0.6984127163887024


In [43]:
# 进行预测
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int).flatten()



In [44]:
# 导入必要的评估指标库
from sklearn.metrics import f1_score, roc_auc_score

# 计算F1值（加权平均适用于二分类）
f1 = f1_score(y_test, y_pred, average='binary')

# 计算AUC值
auc = roc_auc_score(y_test, y_pred_prob)

# 输出结果
print(f'Test F1 Score: {f1:.4f}')
print(f'Test AUC: {auc:.4f}')

Test F1 Score: 0.4062
Test AUC: 0.7092


In [45]:
# 输出分类报告和混淆矩阵
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[75 27]
 [11 13]]

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.74      0.80       102
           1       0.33      0.54      0.41        24

    accuracy                           0.70       126
   macro avg       0.60      0.64      0.60       126
weighted avg       0.77      0.70      0.72       126



In [12]:
# 计算FPR、TPR和阈值
from sklearn.metrics import roc_curve
import pandas as pd

# 注意：y_pred_prob 是模型对 X_test 的概率预测结果
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)


# 保存为CSV文件供后续绘图使用
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("bp_yy_roc_curve.csv", index=False)
print("ROC 曲线数据已保存至 bp_yy_roc_curve.csv")

ROC 曲线数据已保存至 bp_yy_roc_curve.csv


# 图神经网络

In [47]:
from sklearn.cluster import KMeans
import random
import numpy as np
import pandas as pd
import xgboost as xgb
import seaborn as sns
import lightgbm as lgb
import tensorflow as tf
import statsmodels.api as sm
import tensorflow_gnn as tfgnn
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, classification_report, accuracy_score, roc_auc_score, confusion_matrix, roc_curve, auc, silhouette_score, precision_recall_curve, f1_score, precision_score, recall_score
from scipy.stats import pearsonr, pointbiserialr, spearmanr
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Embedding, Flatten, Concatenate
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

In [47]:
random.seed(42)
np.random.seed(42)
tf.keras.utils.set_random_seed(42)

In [48]:
# 加载数据
file_path = "预测.xlsx"
df = pd.read_excel(file_path)
# 加载文本特征
text_embeddings = np.load('bert-base-uncased.npy')
scaler = StandardScaler()
text_embeddings_scaled = scaler.fit_transform(text_embeddings)

pca = PCA(n_components=2)
text_features_pca = pca.fit_transform(text_embeddings_scaled)

# 动态生成列名
n_pca_features = text_features_pca.shape[1]
pca_columns = [f'Text_Embedding_Combined{i+1}' for i in range(n_pca_features)]

# 转换为DataFrame
text_feature_pca_df = pd.DataFrame(text_features_pca, columns=pca_columns)
X = df[['LN转移个数','腋窝淋巴结状态','PR', 'HER2+FISH', 'ki-67', '手术前怀孕', '治疗后怀孕', '治疗后生产','目前月经情况','手术方式','放疗', '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','化疗方案', '内分泌治疗方案']]
X = pd.concat([X, text_feature_pca_df], axis=1)
y = df['标签']
# 连续变量
continuous_vars = ['PR', 'ki-67'] + pca_columns
# 使用分层划分以保持类别比例
# 从前627行中划分出20%作为测试集
X_front = X.iloc[:627]
y_front = y.iloc[:627]
X_front_train, X_test, y_front_train, y_test = train_test_split(
    X_front, y_front, test_size=0.2, stratify=y_front, random_state=42
)

# 剩下的数据（从第628行开始）作为训练集的一部分
X_rest = X.iloc[627:]
y_rest = y.iloc[627:]

# 拼接训练数据：前627行的80% + 剩下所有行
X_train = pd.concat([X_front_train, X_rest], axis=0)
y_train = pd.concat([y_front_train, y_rest], axis=0)
# 2. 连续变量：进行标准化
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])
# 计算类别权重
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

In [49]:
# 构建图结构数据
# 这里简化处理，假设每个样本作为一个节点，构建一个完全连接的图
def build_graph_data(X, y):
    num_samples = X.shape[0]
    # 构建边，假设每个节点连接到所有其他节点
    edges = []
    for i in range(num_samples):
        for j in range(num_samples):
            if i != j:
                edges.append((i, j))
    # 节点特征
    node_features = X.values.astype(np.float32)
    # 节点标签
    node_labels = y.values.astype(np.int32)
    return node_features, node_labels, edges

In [50]:
node_features, node_labels, edges = build_graph_data(X_train, y_train)

In [51]:
node_features_test, node_labels_test, _ = build_graph_data(X_test, y_test)

In [52]:
# 定义图神经网络模型
class GNN(Model):
    def __init__(self, num_classes):
        super(GNN, self).__init__()
        self.dense1 = Dense(64, activation='relu')
        self.dense2 = Dense(32, activation='relu')
        self.output_layer = Dense(num_classes, activation='sigmoid')

    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        return self.output_layer(x)

In [53]:
# 构建和编译模型
model = GNN(num_classes=1)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=["accuracy"]
)

In [54]:
from tensorflow_gnn.keras import layers as tfgnn_layers

In [55]:
# 训练模型
history = model.fit(
    node_features, 
    node_labels,
    epochs=10,
    batch_size=32,
    shuffle=False, 
    class_weight=class_weight_dict,
    validation_split=0.2,
    verbose=1
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [56]:
# 预测和评估
test_node_features, test_node_labels, _ = build_graph_data(X_test, y_test)
test_predictions = model.predict(test_node_features)
test_predictions = (test_predictions > 0.5).astype(int).flatten()



In [57]:
# 计算各项指标
test_probabilities = model.predict(test_node_features).flatten()
accuracy = accuracy_score(test_node_labels, test_predictions)
f1 = f1_score(test_node_labels, test_predictions)
auc = roc_auc_score(test_node_labels, test_probabilities)
print(f"\nAccuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC: {auc:.4f}")


Accuracy: 0.7774
F1 Score: 0.6806
AUC: 0.8351


In [58]:
print("Confusion Matrix:")
print(confusion_matrix(test_node_labels, test_predictions))
print("\nClassification Report:")
print(classification_report(test_node_labels, test_predictions))

Confusion Matrix:
[[148  41]
 [ 20  65]]

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.78      0.83       189
           1       0.61      0.76      0.68        85

    accuracy                           0.78       274
   macro avg       0.75      0.77      0.75       274
weighted avg       0.80      0.78      0.78       274



In [1]:
import numpy as np
import pandas as pd
import umap
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.neighbors import kneighbors_graph

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dropout, Dense

from spektral.data import Dataset, Graph
from spektral.data.loaders import SingleLoader
from spektral.layers import GCNConv

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#random.seed(42)
np.random.seed(42)
tf.keras.utils.set_random_seed(42)

In [3]:
# 加载数据
file_path = "预测.xlsx"
df = pd.read_excel(file_path)
# 加载文本特征
text_embeddings = np.load('PubMedBERT.npy')
# scaler = StandardScaler()
# text_embeddings_scaled = scaler.fit_transform(text_embeddings)

# pca = PCA(n_components=2)
# text_features_pca = pca.fit_transform(text_embeddings_scaled)
text_features_pca = umap.UMAP(n_components=2, random_state=42).fit_transform(MinMaxScaler().fit_transform(text_embeddings))
# 动态生成列名
n_pca_features = text_features_pca.shape[1]
pca_columns = [f'Text_Embedding_Combined{i+1}' for i in range(n_pca_features)]

# 转换为DataFrame
text_feature_pca_df = pd.DataFrame(text_features_pca, columns=pca_columns)
X = df[['LN转移个数','腋窝淋巴结状态','PR', 'HER2+FISH', 'ki-67', '手术前怀孕', '治疗后怀孕', '治疗后生产','目前月经情况','手术方式','放疗', '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','化疗方案', '内分泌治疗方案']]
X = pd.concat([X, text_feature_pca_df], axis=1)
y = df['标签']
# 连续变量
continuous_vars = ['PR', 'ki-67'] + pca_columns
# 使用分层划分以保持类别比例
# 从前627行中划分出20%作为测试集
X_front = X.iloc[:627]
y_front = y.iloc[:627]
X_front_train, X_test, y_front_train, y_test = train_test_split(
    X_front, y_front, test_size=0.2, stratify=y_front, random_state=42
)

# 剩下的数据（从第628行开始）作为训练集的一部分
X_rest = X.iloc[627:]
y_rest = y.iloc[627:]

# 拼接训练数据：前627行的80% + 剩下所有行
X_train = pd.concat([X_front_train, X_rest], axis=0)
y_train = pd.concat([y_front_train, y_rest], axis=0)
# 2. 连续变量：进行标准化
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])
# 计算类别权重
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

  warn(


In [4]:
# 将标签转换为 one-hot 编码（二分类）
y_train_oh = tf.keras.utils.to_categorical(y_train, num_classes=2)
y_test_oh = tf.keras.utils.to_categorical(y_test, num_classes=2)

In [5]:
k = 30

In [6]:
knn_train = kneighbors_graph(
    X_train.values, 
    n_neighbors=k, 
    mode='connectivity', 
    include_self=False
).toarray().astype(np.float32)

In [7]:
knn_train = np.maximum(knn_train, knn_train.T)

In [8]:
knn_test = kneighbors_graph(
    X_test.values, 
    n_neighbors=k, 
    mode='connectivity', 
    include_self=False
).toarray().astype(np.float32)
knn_test = np.maximum(knn_test, knn_test.T)

In [9]:
print("Train adjacency shape:", knn_train.shape)
print("Test adjacency shape:", knn_test.shape)

Train adjacency shape: (1092, 1092)
Test adjacency shape: (274, 274)


In [10]:
class CustomGraphDataset(Dataset):
    """
    接收 X(节点特征矩阵)、y(one-hot 标签矩阵)、adj(邻接矩阵)，
    返回一个只有单张图的 Dataset。
    """
    def __init__(self, X, y, adj, **kwargs):
        self.X = X.astype(np.float32)
        self.y = y.astype(np.float32)
        self.adj = adj.astype(np.float32)
        super().__init__(**kwargs)

    def read(self):
        # 只返回一张 Graph：节点数目 = 样本数
        graph = Graph(x=self.X, a=self.adj, y=self.y)
        return [graph]

In [11]:
train_dataset = CustomGraphDataset(
    X_train.values,   # (N_train, n_features)
    y_train_oh,       # (N_train, 2)
    knn_train         # (N_train, N_train)
)
test_dataset = CustomGraphDataset(
    X_test.values, 
    y_test_oh, 
    knn_test
)

In [12]:
train_loader = SingleLoader(train_dataset, epochs=30)
test_loader = SingleLoader(test_dataset)

In [13]:
class GCNModel(Model):
    def __init__(self):
        super().__init__()
        # 输入维度不需要在这里显式写，Spektral 会自动推断（从数据集中 Graph.x.shape）
        self.gcn1 = GCNConv(32, activation='relu')
        self.dropout = Dropout(0.5)
        self.gcn2 = GCNConv(16, activation='relu')
        self.dense = Dense(2, activation='softmax')

    def call(self, inputs, training=False):
        x, a = inputs  # x: (N, n_features)，a: (N, N)
        x = self.gcn1([x, a])
        x = self.dropout(x, training=training)
        x = self.gcn2([x, a])
        output = self.dense(x)  # 节点级别输出 (N, 2)
        return output

In [14]:
model = GCNModel()
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [15]:
model.fit(
    train_loader.load(), 
    steps_per_epoch=1, 
    epochs=30,
    class_weight=class_weight_dict,
    verbose=1
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x1b8042aed60>

In [16]:
loader_iter = iter(test_loader.load())  
(x_test_graph, a_test_graph), y_true_onehot = next(loader_iter)
y_prob = model([x_test_graph, a_test_graph], training=False).numpy()
y_pred = np.argmax(y_prob, axis=-1)
y_true_labels = np.argmax(y_true_onehot, axis=-1)
accuracy = accuracy_score(y_true_labels, y_pred)
f1 = f1_score(y_true_labels, y_pred)
auc = roc_auc_score(y_true_labels, y_prob[:, 1])

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC: {auc:.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_true_labels, y_pred))
print("\nClassification Report:")
print(classification_report(y_true_labels, y_pred))

Accuracy: 0.6715
F1 Score: 0.5588
AUC: 0.7157
Confusion Matrix:
[[127  62]
 [ 28  57]]

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.67      0.74       189
           1       0.48      0.67      0.56        85

    accuracy                           0.67       274
   macro avg       0.65      0.67      0.65       274
weighted avg       0.71      0.67      0.68       274



In [17]:
# 计算 FPR, TPR, Thresholds
from sklearn.metrics import roc_curve
import pandas as pd

fpr, tpr, thresholds = roc_curve(y_true_labels, y_prob[:, 1])


# 保存为CSV文件供绘图使用
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("gcn_yy_roc_curve.csv", index=False)
print("ROC 曲线数据已保存至 gcn_yy_roc_curve.csv")

ROC 曲线数据已保存至 gcn_yy_roc_curve.csv


# LightGBM

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import tensorflow as tf
import statsmodels.api as sm
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import pearsonr, pointbiserialr, spearmanr
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, classification_report, accuracy_score, roc_auc_score, confusion_matrix, roc_curve, auc, silhouette_score, precision_recall_curve, f1_score, precision_score, recall_score
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Embedding, Flatten, Concatenate
import warnings
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
import umap
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 设置随机种子以确保结果的可重复性
np.random.seed(42)
tf.random.set_seed(42)

In [3]:
# 加载数据
file_path = "预测.xlsx"
df = pd.read_excel(file_path)
# 加载文本特征
text_embeddings = np.load('PubMedBERT.npy')
# scaler = StandardScaler()
# text_embeddings_scaled = scaler.fit_transform(text_embeddings)

# pca = PCA(n_components=2)
# text_features_pca = pca.fit_transform(text_embeddings_scaled)
text_features_pca = umap.UMAP(n_components=2, random_state=42).fit_transform(MinMaxScaler().fit_transform(text_embeddings))
# 动态生成列名
n_pca_features = text_features_pca.shape[1]
pca_columns = [f'Text_Embedding_Combined{i+1}' for i in range(n_pca_features)]

# 转换为DataFrame
text_feature_pca_df = pd.DataFrame(text_features_pca, columns=pca_columns)
X = df[['LN转移个数','腋窝淋巴结状态','PR', 'HER2+FISH', 'ki-67', '手术前怀孕', '治疗后怀孕', '治疗后生产','目前月经情况','手术方式','放疗', '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','化疗方案', '内分泌治疗方案']]
X = pd.concat([X, text_feature_pca_df], axis=1)
y = df['标签']
# 连续变量
continuous_vars = ['PR', 'ki-67'] + pca_columns
# 从前627行中划分出20%作为测试集
X_front = X.iloc[:627]
y_front = y.iloc[:627]
X_front_train, X_test, y_front_train, y_test = train_test_split(
    X_front, y_front, test_size=0.2, stratify=y_front, random_state=42
)

# 剩下的数据（从第628行开始）作为训练集的一部分
X_rest = X.iloc[627:]
y_rest = y.iloc[627:]

# 拼接训练数据：前627行的80% + 剩下所有行
X_train = pd.concat([X_front_train, X_rest], axis=0)
y_train = pd.concat([y_front_train, y_rest], axis=0)

# 2. 连续变量：进行标准化
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

In [4]:
# 计算类别权重（LightGBM需要手动计算正样本权重）
positive_count = np.sum(y_train == 1)
negative_count = np.sum(y_train == 0)
scale_pos_weight = negative_count / positive_count  # 用于处理不平衡数据

In [5]:
# 定义LightGBM分类器
lgbm = lgb.LGBMClassifier(
    random_state=42,
    objective='binary',
    metric='auc',
    n_jobs=-1,  # 使用所有CPU核心
    verbose=-1,  # 不输出训练日志
    scale_pos_weight=scale_pos_weight  # 处理类别不平衡
)

In [6]:
# 定义LightGBM的超参数网格
param_grid = {
    'n_estimators': [50, 100, 150],  # 树的数量
    'num_leaves': [15, 30, 45],  # 最大叶子数
    'max_depth': [3, 5, 7],  # 树的最大深度（-1表示无限制）
    'learning_rate': [0.01, 0.05],  # 学习率
    'min_child_samples': [20, 30, 40],  # 叶子节点最小样本数
    'subsample': [0.7, 0.8],  # 样本采样比例
    'colsample_bytree': [0.6, 0.7],  # 特征采样比例
    'reg_alpha': [0.1, 0.5, 1.0],  # L1正则化
    'reg_lambda': [0.1, 0.5, 1.0],  # L2正则化
}

In [7]:
# 使用网格搜索和交叉验证来寻找最佳参数
grid_search = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
    scoring='roc_auc'
)

In [8]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 5832 candidates, totalling 29160 fits


In [9]:
# 使用最佳模型进行预测
best_lgbm = grid_search.best_estimator_
y_pred = best_lgbm.predict(X_test)
y_test_proba = best_lgbm.predict_proba(X_test)[:, 1]

In [10]:
# 评估模型性能
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, pos_label=1)
roc_auc = roc_auc_score(y_test, y_test_proba)

In [11]:
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
# 评估模型
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Best Parameters: {'colsample_bytree': 0.6, 'learning_rate': 0.05, 'max_depth': 5, 'min_child_samples': 20, 'n_estimators': 150, 'num_leaves': 30, 'reg_alpha': 1.0, 'reg_lambda': 1.0, 'subsample': 0.7}
Accuracy: 0.6984
F1 Score: 0.4062
ROC AUC: 0.7345

Confusion Matrix:
[[75 27]
 [11 13]]

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.74      0.80       102
           1       0.33      0.54      0.41        24

    accuracy                           0.70       126
   macro avg       0.60      0.64      0.60       126
weighted avg       0.77      0.70      0.72       126



In [12]:
# 计算 FPR, TPR, Thresholds
from sklearn.metrics import roc_curve
import pandas as pd

fpr, tpr, thresholds = roc_curve(y_test, y_test_proba)

# 保存为CSV，供画图使用
roc_df = pd.DataFrame({
    'FPR': fpr,
    'TPR': tpr,
    'Threshold': thresholds
})
roc_df.to_csv("lgbm_yy_roc_curve.csv", index=False)
print("ROC 曲线数据已保存至 lgbm_yy_roc_curve.csv")

ROC 曲线数据已保存至 lgbm_yy_roc_curve.csv


In [13]:
import joblib

# 保存最优模型和标准化器
joblib.dump(best_lgbm, 'best_lgbm.pkl')
joblib.dump(scaler, 'scaler_lgbm.pkl')

print("✅ 模型和标准化器已保存。")

✅ 模型和标准化器已保存。


# 朴素贝叶斯

In [25]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
import umap

In [26]:
# 设置随机种子以确保结果的可重复性
np.random.seed(42)

In [27]:
# 加载数据
file_path = "预测.xlsx"
df = pd.read_excel(file_path)
# 加载文本特征
text_embeddings = np.load('PubMedBERT.npy')
# scaler = StandardScaler()
# text_embeddings_scaled = scaler.fit_transform(text_embeddings)

# pca = PCA(n_components=2)
# text_features_pca = pca.fit_transform(text_embeddings_scaled)
text_features_pca = umap.UMAP(n_components=2, random_state=42).fit_transform(MinMaxScaler().fit_transform(text_embeddings))
# 动态生成列名
n_pca_features = text_features_pca.shape[1]
pca_columns = [f'Text_Embedding_Combined{i+1}' for i in range(n_pca_features)]

# 转换为DataFrame
text_feature_pca_df = pd.DataFrame(text_features_pca, columns=pca_columns)
X = df[['LN转移个数','腋窝淋巴结状态','PR', 'HER2+FISH', 'ki-67', '手术前怀孕', '治疗后怀孕', '治疗后生产','目前月经情况','手术方式','放疗', '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','化疗方案', '内分泌治疗方案']]
X = pd.concat([X, text_feature_pca_df], axis=1)
y = df['标签']
# 连续变量
continuous_vars = ['PR', 'ki-67'] + pca_columns
# 从前627行中划分出20%作为测试集
X_front = X.iloc[:627]
y_front = y.iloc[:627]
X_front_train, X_test, y_front_train, y_test = train_test_split(
    X_front, y_front, test_size=0.2, stratify=y_front, random_state=42
)

# 剩下的数据（从第628行开始）作为训练集的一部分
X_rest = X.iloc[627:]
y_rest = y.iloc[627:]

# 拼接训练数据：前627行的80% + 剩下所有行
X_train = pd.concat([X_front_train, X_rest], axis=0)
y_train = pd.concat([y_front_train, y_rest], axis=0)
# 朴素贝叶斯处理连续变量的关键优化：离散化
# 使用分箱将连续变量转换为离散变量
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
X_train[continuous_vars] = discretizer.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = discretizer.transform(X_test[continuous_vars])

  warn(


In [28]:
# 定义朴素贝叶斯分类器
nb = GaussianNB()

In [29]:
# 朴素贝叶斯的超参数网格（相对较少）
param_grid = {
    'var_smoothing': np.logspace(-10, -5, 20)  # 方差平滑参数
}

In [30]:
# 使用网格搜索和交叉验证
grid_search = GridSearchCV(
    estimator=nb,
    param_grid=param_grid,
    cv=5,  # 使用5折交叉验证
    n_jobs=-1,  # 使用所有CPU核心
    verbose=1,  # 适度输出日志
    scoring='roc_auc'
)

In [31]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [32]:
# 使用最佳模型进行预测
best_nb = grid_search.best_estimator_
y_pred = best_nb.predict(X_test)
y_test_proba = best_nb.predict_proba(X_test)[:, 1]

In [33]:
# 评估模型性能
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, pos_label=1)
roc_auc = roc_auc_score(y_test, y_test_proba)

In [34]:
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

# 评估模型
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Best Parameters: {'var_smoothing': 1e-05}
Accuracy: 0.6905
F1 Score: 0.4179
ROC AUC: 0.7128

Confusion Matrix:
[[73 29]
 [10 14]]

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.72      0.79       102
           1       0.33      0.58      0.42        24

    accuracy                           0.69       126
   macro avg       0.60      0.65      0.60       126
weighted avg       0.77      0.69      0.72       126



# AdaBoost

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (accuracy_score, f1_score, roc_auc_score, 
                             confusion_matrix, classification_report, 
                             roc_curve, auc, precision_recall_curve, average_precision_score)
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
import umap

In [13]:
# 设置随机种子以确保结果的可重复性
np.random.seed(42)

In [14]:
# 加载数据
file_path = "预测.xlsx"
df = pd.read_excel(file_path)
# 加载文本特征
text_embeddings = np.load('PubMedBERT.npy')
# scaler = StandardScaler()
# text_embeddings_scaled = scaler.fit_transform(text_embeddings)

# pca = PCA(n_components=2)
# text_features_pca = pca.fit_transform(text_embeddings_scaled)
text_features_pca = umap.UMAP(n_components=2, random_state=42).fit_transform(MinMaxScaler().fit_transform(text_embeddings))
# 动态生成列名
n_pca_features = text_features_pca.shape[1]
pca_columns = [f'Text_Embedding_Combined{i+1}' for i in range(n_pca_features)]

# 转换为DataFrame
text_feature_pca_df = pd.DataFrame(text_features_pca, columns=pca_columns)
X = df[['LN转移个数','腋窝淋巴结状态','PR', 'HER2+FISH', 'ki-67', '手术前怀孕', '治疗后怀孕', '治疗后生产','目前月经情况','手术方式','放疗', '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','化疗方案', '内分泌治疗方案']]
X = pd.concat([X, text_feature_pca_df], axis=1)
y = df['标签']
# 连续变量
continuous_vars = ['PR', 'ki-67'] + pca_columns
# 从前627行中划分出20%作为测试集
X_front = X.iloc[:627]
y_front = y.iloc[:627]
X_front_train, X_test, y_front_train, y_test = train_test_split(
    X_front, y_front, test_size=0.2, stratify=y_front, random_state=42
)

# 剩下的数据（从第628行开始）作为训练集的一部分
X_rest = X.iloc[627:]
y_rest = y.iloc[627:]

# 拼接训练数据：前627行的80% + 剩下所有行
X_train = pd.concat([X_front_train, X_rest], axis=0)
y_train = pd.concat([y_front_train, y_rest], axis=0)
# 连续变量：进行标准化
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

  warn(


In [15]:
# 计算样本权重以处理类别不平衡
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

In [16]:
# 定义基础分类器（决策树桩）
base_estimator = DecisionTreeClassifier(
    max_depth=1,  # 树桩：单层决策树
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)

In [17]:
# 定义AdaBoost分类器
ada = AdaBoostClassifier(
    estimator=base_estimator,
    algorithm='SAMME.R',  # 使用概率估计的改进算法
    random_state=42
)

In [18]:
# 针对小数据集优化的超参数网格
param_grid = {
    'n_estimators': [30, 50, 70, 100],  # 弱学习器数量
    'learning_rate': [0.05, 0.1, 0.2, 0.3, 0.5],  # 学习率
    'estimator__max_depth': [1, 2],  # 基础分类器深度
    'estimator__min_samples_split': [2, 5],  # 基础分类器分割最小样本数
}

In [19]:
# 使用分层K折交叉验证
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [20]:
# 使用网格搜索和交叉验证
grid_search = GridSearchCV(
    estimator=ada,
    param_grid=param_grid,
    cv=cv,
    n_jobs=-1,
    verbose=1,
    scoring='roc_auc'
)

In [21]:
grid_search.fit(X_train, y_train, sample_weight=sample_weights)

Fitting 5 folds for each of 80 candidates, totalling 400 fits




In [22]:
# 使用最佳模型进行预测
best_ada = grid_search.best_estimator_
y_pred = best_ada.predict(X_test)
y_test_proba = best_ada.predict_proba(X_test)[:, 1]

In [23]:
# 评估模型性能
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, pos_label=1)
roc_auc = roc_auc_score(y_test, y_test_proba)

In [24]:
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

# 评估模型
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Best Parameters: {'estimator__max_depth': 2, 'estimator__min_samples_split': 2, 'learning_rate': 0.05, 'n_estimators': 30}
Accuracy: 0.7302
F1 Score: 0.4138
ROC AUC: 0.6810

Confusion Matrix:
[[80 22]
 [12 12]]

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.78      0.82       102
           1       0.35      0.50      0.41        24

    accuracy                           0.73       126
   macro avg       0.61      0.64      0.62       126
weighted avg       0.77      0.73      0.75       126



# CatBoost

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, f1_score, roc_auc_score, 
                             confusion_matrix, classification_report)
from sklearn.utils.class_weight import compute_sample_weight
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
import umap

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 设置随机种子
np.random.seed(42)

In [3]:
# 加载数据
file_path = "预测.xlsx"
df = pd.read_excel(file_path)
# 加载文本特征
text_embeddings = np.load('PubMedBERT.npy')
# scaler = StandardScaler()
# text_embeddings_scaled = scaler.fit_transform(text_embeddings)

# pca = PCA(n_components=2)
# text_features_pca = pca.fit_transform(text_embeddings_scaled)
text_features_pca = umap.UMAP(n_components=2, random_state=42).fit_transform(MinMaxScaler().fit_transform(text_embeddings))
# 动态生成列名
n_pca_features = text_features_pca.shape[1]
pca_columns = [f'Text_Embedding_Combined{i+1}' for i in range(n_pca_features)]

# 转换为DataFrame
text_feature_pca_df = pd.DataFrame(text_features_pca, columns=pca_columns)
X = df[['LN转移个数','腋窝淋巴结状态','PR', 'HER2+FISH', 'ki-67', '手术前怀孕', '治疗后怀孕', '治疗后生产','目前月经情况','手术方式','放疗', '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','化疗方案', '内分泌治疗方案']]
X = pd.concat([X, text_feature_pca_df], axis=1)
y = df['标签']
# 连续变量
continuous_vars = ['PR', 'ki-67'] + pca_columns
# 从前627行中划分出20%作为测试集
X_front = X.iloc[:627]
y_front = y.iloc[:627]
X_front_train, X_test, y_front_train, y_test = train_test_split(
    X_front, y_front, test_size=0.2, stratify=y_front, random_state=42
)

# 剩下的数据（从第628行开始）作为训练集的一部分
X_rest = X.iloc[627:]
y_rest = y.iloc[627:]

# 拼接训练数据：前627行的80% + 剩下所有行
X_train = pd.concat([X_front_train, X_rest], axis=0)
y_train = pd.concat([y_front_train, y_rest], axis=0)
# 连续变量：进行标准化
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

  warn(


In [4]:
# 5. 类别样本权重
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

In [5]:
# 6. CatBoost 参数网格
param_grid = {
    'iterations': [100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'depth': [4, 6],
    'l2_leaf_reg': [1, 3, 5],
    'border_count': [32, 64],  # 用于数值特征的分箱
}

In [6]:
# 7. 定义模型
cat = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='AUC',
    verbose=0,
    random_seed=42
)

In [7]:
# 8. 网格搜索 + 分层K折交叉验证
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=cat,
    param_grid=param_grid,
    cv=cv,
    n_jobs=-1,
    verbose=1,
    scoring='roc_auc'
)

In [8]:
# 9. 拟合模型
grid_search.fit(X_train, y_train, sample_weight=sample_weights)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [9]:
# 10. 预测与评估
best_cat = grid_search.best_estimator_
y_pred = best_cat.predict(X_test)
y_test_proba = best_cat.predict_proba(X_test)[:, 1]

In [10]:
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, pos_label=1)
roc_auc = roc_auc_score(y_test, y_test_proba)

In [11]:
# 11. 输出结果
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Best Parameters: {'border_count': 64, 'depth': 6, 'iterations': 100, 'l2_leaf_reg': 3, 'learning_rate': 0.1}
Accuracy: 0.7063
F1 Score: 0.4127
ROC AUC: 0.7422

Confusion Matrix:
[[76 26]
 [11 13]]

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.75      0.80       102
           1       0.33      0.54      0.41        24

    accuracy                           0.71       126
   macro avg       0.60      0.64      0.61       126
weighted avg       0.77      0.71      0.73       126



In [12]:
# 计算 FPR, TPR, Thresholds
from sklearn.metrics import roc_curve
import pandas as pd

fpr, tpr, thresholds = roc_curve(y_test, y_test_proba)

# 保存为CSV，供画图使用
roc_df = pd.DataFrame({
    'FPR': fpr,
    'TPR': tpr,
    'Threshold': thresholds
})
roc_df.to_csv("catb_yy_roc_curve.csv", index=False)
print("ROC 曲线数据已保存至 catb_yy_roc_curve.csv")

ROC 曲线数据已保存至 catb_yy_roc_curve.csv


In [13]:
import joblib

# 保存最优模型和标准化器
joblib.dump(best_cat, 'best_cat.pkl')
joblib.dump(scaler, 'scaler_cat.pkl')

print("✅ 模型和标准化器已保存。")

✅ 模型和标准化器已保存。


# KNN

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (accuracy_score, f1_score, roc_auc_score, 
                             confusion_matrix, classification_report)
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
import umap

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 设置随机种子
np.random.seed(42)

In [3]:
# 加载数据
file_path = "预测.xlsx"
df = pd.read_excel(file_path)
# 加载文本特征
text_embeddings = np.load('PubMedBERT.npy')
# scaler = StandardScaler()
# text_embeddings_scaled = scaler.fit_transform(text_embeddings)

# pca = PCA(n_components=2)
# text_features_pca = pca.fit_transform(text_embeddings_scaled)
text_features_pca = umap.UMAP(n_components=2, random_state=42).fit_transform(MinMaxScaler().fit_transform(text_embeddings))
# 动态生成列名
n_pca_features = text_features_pca.shape[1]
pca_columns = [f'Text_Embedding_Combined{i+1}' for i in range(n_pca_features)]

# 转换为DataFrame
text_feature_pca_df = pd.DataFrame(text_features_pca, columns=pca_columns)
X = df[['LN转移个数','腋窝淋巴结状态','PR', 'HER2+FISH', 'ki-67', '手术前怀孕', '治疗后怀孕', '治疗后生产','目前月经情况','手术方式','放疗', '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','化疗方案', '内分泌治疗方案']]
X = pd.concat([X, text_feature_pca_df], axis=1)
y = df['标签']
# 连续变量
continuous_vars = ['PR', 'ki-67'] + pca_columns
# 使用分层划分以保持类别比例
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
# 连续变量：进行标准化
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

  warn(


In [4]:
# 5. 定义模型
knn = KNeighborsClassifier()

In [5]:
# 6. 定义超参数搜索范围
param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2],  # 1: 曼哈顿距离，2: 欧几里得距离
    'leaf_size': [20, 30, 40]
}

In [6]:
# 7. 定义交叉验证策略
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [7]:
# 8. 网格搜索（注：KNN 不支持 sample_weight，因此不传）
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

In [8]:
# 9. 训练模型
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [9]:
# 10. 预测
best_knn = grid_search.best_estimator_
y_pred = best_knn.predict(X_test)
y_test_proba = best_knn.predict_proba(X_test)[:, 1]

In [10]:
# 11. 评估模型
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_test_proba)

In [11]:
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Best Parameters: {'leaf_size': 20, 'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
Accuracy: 0.7883
F1 Score: 0.5915
ROC AUC: 0.8430

Confusion Matrix:
[[174  15]
 [ 43  42]]

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.92      0.86       189
           1       0.74      0.49      0.59        85

    accuracy                           0.79       274
   macro avg       0.77      0.71      0.72       274
weighted avg       0.78      0.79      0.77       274



In [12]:
from sklearn.metrics import roc_curve
import pandas as pd

# 计算 FPR, TPR, Thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_test_proba)

# 保存为 CSV 文件
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("knn_yy_roc_curve.csv", index=False)
print("ROC 曲线数据已保存为 knn_yy_roc_curve.csv")

ROC 曲线数据已保存为 knn_yy_roc_curve.csv


# 逻辑回归

In [22]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, f1_score, roc_auc_score, 
                             confusion_matrix, classification_report)
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
import umap

In [23]:
# 设置随机种子
np.random.seed(42)

In [24]:
# 加载数据
file_path = "预测.xlsx"
df = pd.read_excel(file_path)
# 加载文本特征
text_embeddings = np.load('PubMedBERT.npy')
# scaler = StandardScaler()
# text_embeddings_scaled = scaler.fit_transform(text_embeddings)

# pca = PCA(n_components=2)
# text_features_pca = pca.fit_transform(text_embeddings_scaled)
text_features_pca = umap.UMAP(n_components=2, random_state=42).fit_transform(MinMaxScaler().fit_transform(text_embeddings))
# 动态生成列名
n_pca_features = text_features_pca.shape[1]
pca_columns = [f'Text_Embedding_Combined{i+1}' for i in range(n_pca_features)]

# 转换为DataFrame
text_feature_pca_df = pd.DataFrame(text_features_pca, columns=pca_columns)
X = df[['LN转移个数','腋窝淋巴结状态','PR', 'HER2+FISH', 'ki-67', '手术前怀孕', '治疗后怀孕', '治疗后生产','目前月经情况','手术方式','放疗', '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','化疗方案', '内分泌治疗方案']]
X = pd.concat([X, text_feature_pca_df], axis=1)
y = df['标签']
# 连续变量
continuous_vars = ['PR', 'ki-67'] + pca_columns
# 从前627行中划分出20%作为测试集
X_front = X.iloc[:627]
y_front = y.iloc[:627]
X_front_train, X_test, y_front_train, y_test = train_test_split(
    X_front, y_front, test_size=0.2, stratify=y_front, random_state=42
)

# 剩下的数据（从第628行开始）作为训练集的一部分
X_rest = X.iloc[627:]
y_rest = y.iloc[627:]

# 拼接训练数据：前627行的80% + 剩下所有行
X_train = pd.concat([X_front_train, X_rest], axis=0)
y_train = pd.concat([y_front_train, y_rest], axis=0)
# 连续变量：进行标准化
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

  warn(


In [25]:
# 5. 定义逻辑回归模型
logreg = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)

In [26]:
# 只构建合法组合
param_grid = [
    # L1 正则只适用于 liblinear 和 saga
    {'penalty': ['l1'], 'C': [0.01, 0.1, 1, 10], 'solver': ['liblinear', 'saga']},
    
    # L2 正则适用于所有求解器
    {'penalty': ['l2'], 'C': [0.01, 0.1, 1, 10], 'solver': ['liblinear', 'saga']},

    # Elasticnet 仅支持 saga，且必须指定 l1_ratio
    {'penalty': ['elasticnet'], 'C': [0.01, 0.1, 1, 10], 'solver': ['saga'], 'l1_ratio': [0.5, 0.7, 1.0]}
]

In [27]:
# 7. 网格搜索 + 分层K折交叉验证
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    verbose=1
)

In [28]:
# 8. 拟合模型
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 28 candidates, totalling 140 fits


In [29]:
# 9. 预测
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_test_proba = best_model.predict_proba(X_test)[:, 1]

In [30]:
# 10. 评估
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_test_proba)

In [31]:
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Best Parameters: {'C': 1, 'penalty': 'l1', 'solver': 'saga'}
Accuracy: 0.7143
F1 Score: 0.4375
ROC AUC: 0.6573

Confusion Matrix:
[[76 26]
 [10 14]]

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.75      0.81       102
           1       0.35      0.58      0.44        24

    accuracy                           0.71       126
   macro avg       0.62      0.66      0.62       126
weighted avg       0.78      0.71      0.74       126



# MLP

In [12]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
import umap
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report

In [13]:
# 设置随机种子
np.random.seed(42)

In [14]:
# 加载数据
file_path = "预测.xlsx"
df = pd.read_excel(file_path)
# 加载文本特征
text_embeddings = np.load('PubMedBERT.npy')
# scaler = StandardScaler()
# text_embeddings_scaled = scaler.fit_transform(text_embeddings)

# pca = PCA(n_components=1)
# text_features_pca = pca.fit_transform(text_embeddings_scaled)
text_features_pca = umap.UMAP(n_components=2, random_state=42).fit_transform(MinMaxScaler().fit_transform(text_embeddings))
# 动态生成列名
n_pca_features = text_features_pca.shape[1]
pca_columns = [f'Text_Embedding_Combined{i+1}' for i in range(n_pca_features)]

# 转换为DataFrame
text_feature_pca_df = pd.DataFrame(text_features_pca, columns=pca_columns)
X = df[['LN转移个数','腋窝淋巴结状态','PR', 'HER2+FISH', 'ki-67', '手术前怀孕', '治疗后怀孕', '治疗后生产','目前月经情况','手术方式','放疗', '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','化疗方案', '内分泌治疗方案']]
X = pd.concat([X, text_feature_pca_df], axis=1)
y = df['标签']
# 连续变量
continuous_vars = ['PR', 'ki-67'] + pca_columns
# 使用分层划分以保持类别比例
# 从前627行中划分出20%作为测试集
X_front = X.iloc[:627]
y_front = y.iloc[:627]
X_front_train, X_test, y_front_train, y_test = train_test_split(
    X_front, y_front, test_size=0.2, stratify=y_front, random_state=42
)

# 剩下的数据（从第628行开始）作为训练集的一部分
X_rest = X.iloc[627:]
y_rest = y.iloc[627:]

# 拼接训练数据：前627行的80% + 剩下所有行
X_train = pd.concat([X_front_train, X_rest], axis=0)
y_train = pd.concat([y_front_train, y_rest], axis=0)
# 连续变量：进行标准化
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

  warn(


In [15]:
# 定义MLP模型
mlp = MLPClassifier(max_iter=500, random_state=42)

In [16]:
# 参数搜索空间（适合小样本）
param_grid = {
    'hidden_layer_sizes': [(32,), (64,), (32, 32), (64, 32)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam'],  # 'lbfgs' 也可尝试，但不支持 sample_weight
    'alpha': [0.0001, 0.001],  # L2正则项
    'learning_rate': ['constant', 'adaptive'],
}

In [17]:
# 分层K折交叉验证
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [18]:
# 网格搜索
grid_search = GridSearchCV(
    estimator=mlp,
    param_grid=param_grid,
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

In [19]:
# 拟合
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits




In [20]:
# 最佳模型
best_mlp = grid_search.best_estimator_
y_pred = best_mlp.predict(X_test)
y_proba = best_mlp.predict_proba(X_test)[:, 1]

In [21]:
# 评估
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Best Parameters: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (64,), 'learning_rate': 'constant', 'solver': 'adam'}
Accuracy: 0.7222
F1 Score: 0.3860
ROC AUC: 0.7030

Confusion Matrix:
[[80 22]
 [13 11]]

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.78      0.82       102
           1       0.33      0.46      0.39        24

    accuracy                           0.72       126
   macro avg       0.60      0.62      0.60       126
weighted avg       0.76      0.72      0.74       126



In [None]:
# 输出FPR、TPR和Thresholds
from sklearn.metrics import roc_curve
import pandas as pd

fpr, tpr, thresholds = roc_curve(y_test, y_proba)


# 保存为CSV文件
roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
roc_df.to_csv("mlp_yy_roc_curve.csv", index=False)
print("ROC 曲线数据已保存为 mlp_yy_roc_curve.csv")

# 决策树

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (accuracy_score, f1_score, roc_auc_score, 
                             confusion_matrix, classification_report)
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
import umap

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 设置随机种子
np.random.seed(42)

In [3]:
# 加载数据
file_path = "预测.xlsx"
df = pd.read_excel(file_path)
# 加载文本特征
text_embeddings = np.load('PubMedBERT.npy')
# scaler = StandardScaler()
# text_embeddings_scaled = scaler.fit_transform(text_embeddings)

# pca = PCA(n_components=2)
# text_features_pca = pca.fit_transform(text_embeddings_scaled)
text_features_pca = umap.UMAP(n_components=2, random_state=42).fit_transform(MinMaxScaler().fit_transform(text_embeddings))
# 动态生成列名
n_pca_features = text_features_pca.shape[1]
pca_columns = [f'Text_Embedding_Combined{i+1}' for i in range(n_pca_features)]

# 转换为DataFrame
text_feature_pca_df = pd.DataFrame(text_features_pca, columns=pca_columns)
X = df[['LN转移个数','腋窝淋巴结状态','PR', 'HER2+FISH', 'ki-67', '手术前怀孕', '治疗后怀孕', '治疗后生产','目前月经情况','手术方式','放疗', '化疗期间是否应用诺雷德', '靶向治疗（赫赛汀或赫赛汀+帕捷特）','化疗方案', '内分泌治疗方案']]
X = pd.concat([X, text_feature_pca_df], axis=1)
y = df['标签']
# 连续变量
continuous_vars = ['PR', 'ki-67'] + pca_columns
# 从前627行中划分出20%作为测试集
X_front = X.iloc[:627]
y_front = y.iloc[:627]
X_front_train, X_test, y_front_train, y_test = train_test_split(
    X_front, y_front, test_size=0.2, stratify=y_front, random_state=42
)

# 剩下的数据（从第628行开始）作为训练集的一部分
X_rest = X.iloc[627:]
y_rest = y.iloc[627:]

# 拼接训练数据：前627行的80% + 剩下所有行
X_train = pd.concat([X_front_train, X_rest], axis=0)
y_train = pd.concat([y_front_train, y_rest], axis=0)
# 连续变量：进行标准化
scaler = StandardScaler()
X_train[continuous_vars] = scaler.fit_transform(X_train[continuous_vars])
X_test[continuous_vars] = scaler.transform(X_test[continuous_vars])

  warn(


In [4]:
# 样本权重处理类别不平衡
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

In [5]:
# 决策树模型
tree = DecisionTreeClassifier(random_state=42)

In [6]:
# 参数网格（避免过拟合）
param_grid = {
    'max_depth': [3, 5, 7, 10],               # 控制树的最大深度
    'min_samples_split': [2, 5, 10],          # 内部节点再划分所需最小样本数
    'min_samples_leaf': [1, 2, 4],            # 叶子节点最少样本数
    'criterion': ['gini', 'entropy']          # 划分标准
}

In [7]:
# 分层K折交叉验证
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [8]:
# 网格搜索
grid_search = GridSearchCV(
    estimator=tree,
    param_grid=param_grid,
    cv=cv,
    n_jobs=-1,
    verbose=1,
    scoring='roc_auc'
)

In [9]:
# 拟合模型
grid_search.fit(X_train, y_train, sample_weight=sample_weights)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [10]:
# 最优模型预测
best_tree = grid_search.best_estimator_
y_pred = best_tree.predict(X_test)
y_proba = best_tree.predict_proba(X_test)[:, 1]

In [11]:
# 评估结果
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Best Parameters: {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2}
Accuracy: 0.6667
F1 Score: 0.3824
ROC AUC: 0.6495

Confusion Matrix:
[[71 31]
 [11 13]]

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.70      0.77       102
           1       0.30      0.54      0.38        24

    accuracy                           0.67       126
   macro avg       0.58      0.62      0.58       126
weighted avg       0.76      0.67      0.70       126

