In [2]:
import warnings
from rdkit import RDLogger

# Silence RDKit's own loggers (everything matching 'rdApp.*').
RDLogger.DisableLog('rdApp.*')

# Suppress every Python warning, regardless of category or originating module.
warnings.filterwarnings("ignore")
# Suppress UserWarning raised from the lightgbm module.
# NOTE(review): the blanket "ignore" filter above already covers this case, so
# this narrower filter only matters if the blanket one is removed.
warnings.filterwarnings("ignore", category=UserWarning, module="lightgbm")

In [3]:
import torch
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.metrics import precision_recall_curve, auc, f1_score, precision_score, recall_score, accuracy_score
from xgboost import XGBClassifier
import joblib
from tqdm import tqdm
from rdkit.Chem import Descriptors, AllChem
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import (
    precision_recall_curve,
    roc_curve,
    auc,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    ConfusionMatrixDisplay
)
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import os
from sklearn.metrics import roc_curve



# Data preprocessing: read the tabular dataset and pull out the target column
# and the SMILES strings.
df = pd.read_csv('../imputed_selected_features_Toxcity.csv')
labels = df['Toxicity'].values
# smiles_list is not referenced again in the visible part of this notebook.
smiles_list = df['SMILES'].tolist()


# y: target vector; X: feature matrix loaded from a precomputed .npy file.
# NOTE(review): presumably row i of X is the embedding of the SMILES in row i
# of the CSV - nothing here checks that the two files line up; confirm.
y = labels
X = np.load('./Toxicity_smiles_embeddings.npy')


def _evaluate_binary_predictions(y_true, y_prob, threshold=0.5):
    """Score positive-class probabilities against binary (0/1) labels.

    Args:
        y_true: 1-D array of ground-truth labels (0 or 1).
        y_prob: 1-D array of predicted probabilities for class 1.
        threshold: probabilities strictly greater than this are predicted as 1.

    Returns:
        A tuple ``(scores, fpr, tpr, conf_matrix)`` where ``scores`` maps the
        metric names 'PR-AUC', 'AU-ROC', 'ACC', 'PREC', 'REC', 'F1' to floats,
        ``fpr``/``tpr`` are the ROC curve points, and ``conf_matrix`` is a 2x2
        array laid out as [[TN, FP], [FN, TP]].
    """
    # Threshold once and reuse for every label-based metric.
    y_pred = (y_prob > threshold).astype(int)

    precision_curve, recall_curve, _ = precision_recall_curve(y_true, y_prob)
    fpr, tpr, _ = roc_curve(y_true, y_prob)

    scores = {
        'PR-AUC': auc(recall_curve, precision_curve),
        'AU-ROC': auc(fpr, tpr),
        'ACC': accuracy_score(y_true, y_pred),
        'PREC': precision_score(y_true, y_pred),
        'REC': recall_score(y_true, y_pred),
        'F1': f1_score(y_true, y_pred),
    }
    # Pin the label set so the matrix is always 2x2, even if a split happens to
    # contain (or predict) a single class; otherwise summing across folds and
    # the 2-column DataFrame below would break.
    conf_matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
    return scores, fpr, tpr, conf_matrix


def _average_roc_curve(fprs, tprs, n_points=100):
    """Interpolate per-fold ROC curves onto a common FPR grid and average them.

    Returns a DataFrame with columns 'FPR' (the grid on [0, 1]) and 'TPR'
    (the mean interpolated true-positive rate across folds).
    """
    fpr_grid = np.linspace(0, 1, n_points)
    interpolated = [np.interp(fpr_grid, fpr, tpr) for fpr, tpr in zip(fprs, tprs)]
    return pd.DataFrame({'FPR': fpr_grid, 'TPR': np.mean(interpolated, axis=0)})


def train_evaluate_with_best_params(model_name, model_class, best_params, X, y,
                                    n_splits=5, random_state=42, threshold=0.5,
                                    output_dir='./Matrix_ROC_Plot'):
    """Run stratified K-fold cross-validation for a binary classifier.

    For every fold a fresh ``model_class(**best_params)`` is fitted on the
    training part and scored on both the training part and the held-out part
    (reported as "Test"). Per-fold metrics are printed; fold-averaged metrics,
    the averaged ROC curves and the summed confusion matrices are written as
    CSV files into ``output_dir``.

    Args:
        model_name: label used in the progress bar and as the file-name prefix.
        model_class: estimator class exposing ``fit`` and ``predict_proba``.
        best_params: keyword arguments passed to ``model_class``.
        X: feature matrix supporting integer-array row indexing (``X[idx]``).
        y: binary labels (0/1); converted to a NumPy array so that indexing is
            positional even if a pandas Series is passed.
        n_splits: number of cross-validation folds (default 5).
        random_state: seed for the shuffled fold assignment (default 42).
        threshold: probability cut-off for the positive class (default 0.5).
        output_dir: directory receiving the CSV files; created if missing.

    Returns:
        None. Results are printed and saved to disk.

    Raises:
        OSError: if ``output_dir`` cannot be created or written to.
    """
    # Create the output directory up front: failing here is far cheaper than
    # failing on the first to_csv() after all folds have been trained.
    os.makedirs(output_dir, exist_ok=True)

    y = np.asarray(y)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    metric_keys = ('PR-AUC', 'AU-ROC', 'ACC', 'PREC', 'REC', 'F1')
    split_labels = {'train': 'Train', 'test': 'Test'}

    # Per-split accumulators, filled fold by fold.
    metrics = {split: {key: [] for key in metric_keys} for split in split_labels}
    roc_fprs = {split: [] for split in split_labels}
    roc_tprs = {split: [] for split in split_labels}
    conf_matrices = {split: [] for split in split_labels}

    folds = tqdm(skf.split(X, y), total=n_splits, desc=f"Training {model_name}")
    for fold, (train_idx, val_idx) in enumerate(folds):
        # A new estimator per fold guarantees no fitted state carries over
        # between folds (relevant for estimators that support warm starts).
        model = model_class(**best_params)
        model.fit(X[train_idx], y[train_idx])

        for split, idx in (('train', train_idx), ('test', val_idx)):
            y_prob = model.predict_proba(X[idx])[:, 1]
            scores, fpr, tpr, conf_matrix = _evaluate_binary_predictions(
                y[idx], y_prob, threshold
            )
            for key in metric_keys:
                metrics[split][key].append(scores[key])
            roc_fprs[split].append(fpr)
            roc_tprs[split].append(tpr)
            conf_matrices[split].append(conf_matrix)

        # Report this fold's train and test metrics.
        for split, label in split_labels.items():
            print(f"Fold {fold + 1} - {label} metrics:")
            for key in metric_keys:
                print(f"  {key}: {metrics[split][key][-1]:.4f}")
        print()

    for split in split_labels:
        # Fold-averaged ROC curve.
        _average_roc_curve(roc_fprs[split], roc_tprs[split]).to_csv(
            os.path.join(output_dir, f'{model_name}_avg_roc_{split}.csv'), index=False
        )

        # Confusion matrix summed over all folds.
        overall_conf_matrix = np.sum(conf_matrices[split], axis=0)
        pd.DataFrame(
            overall_conf_matrix,
            columns=['Predicted 0', 'Predicted 1'],
            index=['True 0', 'True 1'],
        ).to_csv(
            os.path.join(output_dir, f'{model_name}_overall_conf_matrix_{split}.csv'),
            index=True,
        )

    # Fold-averaged metrics table (row order kept from the original report).
    report_order = ['PR-AUC', 'AU-ROC', 'F1', 'REC', 'PREC', 'ACC']
    df_results = pd.DataFrame({
        'Metric': report_order,
        'Train': [np.mean(metrics['train'][key]) for key in report_order],
        'Test': [np.mean(metrics['test'][key]) for key in report_order],
    })
    df_results.to_csv(
        os.path.join(output_dir, f'{model_name}_evaluation_metrics.csv'), index=False
    )
    print(f"Metrics saved to {model_name}_evaluation_metrics.csv")

    # Print the train/test average of every metric.
    summary_labels = (
        ('PR AUC', 'PR-AUC'),
        ('AU-ROC', 'AU-ROC'),
        ('Accuracy', 'ACC'),
        ('Precision', 'PREC'),
        ('Recall', 'REC'),
        ('F1', 'F1'),
    )
    for display_name, key in summary_labels:
        print(f'Average Train {display_name}: {np.mean(metrics["train"][key]):.4f}')
        print(f'Average Test {display_name}: {np.mean(metrics["test"][key]):.4f}')

In [4]:
# Best hyperparameters for XGBoost.
# NOTE(review): presumably the result of an earlier hyperparameter search that
# is not part of this notebook - confirm. The recorded output below shows every
# training metric at 1.0000 in all folds, so max_depth=43 may be overfitting.
best_params_xgb = {'n_estimators': 813, 'max_depth': 43, 'learning_rate': 0.02436317902902763,'eval_metric': 'error' }

# Train and evaluate the XGBoost model with cross-validation.
train_evaluate_with_best_params("XGBoost", XGBClassifier, best_params_xgb, X, y)

Training XGBoost:  20%|██        | 1/5 [00:04<00:17,  4.41s/it]

Fold 1 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 1.0000
  PREC: 1.0000
  REC: 1.0000
  F1: 1.0000
Fold 1 - Test metrics:
  PR-AUC: 0.9978
  AU-ROC: 0.9849
  ACC: 0.9789
  PREC: 0.9841
  REC: 0.9920
  F1: 0.9880



Training XGBoost:  40%|████      | 2/5 [00:07<00:11,  3.92s/it]

Fold 2 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 1.0000
  PREC: 1.0000
  REC: 1.0000
  F1: 1.0000
Fold 2 - Test metrics:
  PR-AUC: 0.9934
  AU-ROC: 0.9619
  ACC: 0.9648
  PREC: 0.9839
  REC: 0.9760
  F1: 0.9799



Training XGBoost:  60%|██████    | 3/5 [00:12<00:08,  4.03s/it]

Fold 3 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 1.0000
  PREC: 1.0000
  REC: 1.0000
  F1: 1.0000
Fold 3 - Test metrics:
  PR-AUC: 0.9965
  AU-ROC: 0.9780
  ACC: 0.9648
  PREC: 0.9685
  REC: 0.9919
  F1: 0.9801



Training XGBoost:  80%|████████  | 4/5 [00:15<00:03,  3.85s/it]

Fold 4 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 1.0000
  PREC: 1.0000
  REC: 1.0000
  F1: 1.0000
Fold 4 - Test metrics:
  PR-AUC: 0.9848
  AU-ROC: 0.9336
  ACC: 0.9716
  PREC: 0.9688
  REC: 1.0000
  F1: 0.9841



Training XGBoost: 100%|██████████| 5/5 [00:20<00:00,  4.07s/it]

Fold 5 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 1.0000
  PREC: 1.0000
  REC: 1.0000
  F1: 1.0000
Fold 5 - Test metrics:
  PR-AUC: 0.9996
  AU-ROC: 0.9972
  ACC: 0.9787
  PREC: 0.9764
  REC: 1.0000
  F1: 0.9880

Metrics saved to XGBoost_evaluation_metrics.csv
Average Train PR AUC: 1.0000
Average Test PR AUC: 0.9944
Average Train AU-ROC: 1.0000
Average Test AU-ROC: 0.9711
Average Train Accuracy: 1.0000
Average Test Accuracy: 0.9718
Average Train Precision: 1.0000
Average Test Precision: 0.9763
Average Train Recall: 1.0000
Average Test Recall: 0.9920
Average Train F1: 1.0000
Average Test F1: 0.9840



