In [2]:
import warnings
from rdkit import RDLogger

# Silence RDKit log messages (all 'rdApp.*' loggers)
RDLogger.DisableLog('rdApp.*')

# Alternatively, silence every Python warning.
# NOTE(review): this blanket filter matches all warnings, so the
# LightGBM-specific filter below appears redundant — confirm which one is intended.
warnings.filterwarnings("ignore")
# Silence UserWarning messages raised from the lightgbm module
warnings.filterwarnings("ignore", category=UserWarning, module="lightgbm")

In [10]:
import torch
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.metrics import precision_recall_curve, auc, f1_score, precision_score, recall_score, accuracy_score
from xgboost import XGBClassifier
import joblib
from tqdm import tqdm
from rdkit.Chem import Descriptors, AllChem

from sklearn.metrics import roc_curve



# Data loading: read the CSV and pull out the target column and the SMILES strings.
# NOTE(review): assumes the file has 'Toxicity' and 'SMILES' columns and that
# 'Toxicity' is a binary 0/1 label — confirm against the data file.
df = pd.read_csv('imputed_selected_features_Toxcity.csv')
labels = df['Toxicity'].values
smiles_list = df['SMILES'].tolist()

# Turn one SMILES string into a descriptor + Morgan-fingerprint feature vector
def smiles_to_features(smiles):
    """Build the feature vector for a single molecule.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    numpy.ndarray or None
        Four descriptor values (molecular weight, LogP, H-bond donor count,
        H-bond acceptor count) followed by the 2048 bits of a radius-2 Morgan
        fingerprint, or ``None`` when RDKit cannot parse ``smiles``.
    """
    n_bits = 2048
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        # Physico-chemical descriptors
        physchem = [
            Descriptors.MolWt(mol),
            Descriptors.MolLogP(mol),
            Descriptors.NumHDonors(mol),
            Descriptors.NumHAcceptors(mol),
        ]
        # Morgan fingerprint, copied into a numpy buffer
        bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=n_bits)
        fp_values = np.zeros((n_bits,))
        Chem.DataStructs.ConvertToNumpyArray(bit_vect, fp_values)
        # Descriptors first, fingerprint bits after
        return np.concatenate([physchem, fp_values])
    return None

# Convert every SMILES string to a feature vector.
# Molecules that RDKit cannot parse yield None; remember which rows survived so
# the labels can be filtered with the same mask and X / y stay row-aligned.
parsed_features = [smiles_to_features(smiles) for smiles in smiles_list]
keep_mask = np.array([feature is not None for feature in parsed_features], dtype=bool)

n_skipped = int((~keep_mask).sum())
if n_skipped:
    # RDKit logging is disabled above, so report dropped rows explicitly.
    print(f"Skipped {n_skipped} unparsable SMILES; the matching labels were dropped too.")

# Convert to numpy arrays
features = np.array([feature for feature in parsed_features if feature is not None])

# Keep only the labels whose molecule produced a feature vector
y = labels[keep_mask]
X = np.array(features)

assert len(X) == len(y), "feature matrix and label vector must have the same number of rows"

def _fold_metrics(y_true, y_prob, threshold=0.5):
    """Compute the six reported metrics for one split.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_prob : array-like
        Predicted probability of the positive class.
    threshold : float, default 0.5
        Probabilities strictly greater than this are predicted as class 1.

    Returns
    -------
    dict
        Keys 'PR-AUC', 'AU-ROC', 'ACC', 'PREC', 'REC', 'F1' (in that order).
    """
    precision_curve, recall_curve, _ = precision_recall_curve(y_true, y_prob)
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    # Threshold once and reuse for every label-based metric
    y_pred = (y_prob > threshold).astype(int)
    return {
        'PR-AUC': auc(recall_curve, precision_curve),
        'AU-ROC': auc(fpr, tpr),
        'ACC': accuracy_score(y_true, y_pred),
        'PREC': precision_score(y_true, y_pred),
        'REC': recall_score(y_true, y_pred),
        'F1': f1_score(y_true, y_pred),
    }


# Five-fold cross-validation using already-tuned hyperparameters
def train_evaluate_with_best_params(model_name, model_class, best_params, X, y):
    """Run stratified 5-fold cross-validation and report train/held-out metrics.

    For every fold a fresh ``model_class(**best_params)`` is fitted on the
    training part and scored on both the training part and the held-out part
    (labelled "Test" in the printed output and in the CSV). Per-fold metrics
    are printed, the fold averages are written to
    ``{model_name}_evaluation_metrics.csv`` in the working directory and then
    printed.

    Parameters
    ----------
    model_name : str
        Label used in the progress bar and in the CSV file name.
    model_class : type
        Estimator class; instances must provide ``fit`` and ``predict_proba``.
    best_params : dict
        Keyword arguments passed to ``model_class``.
    X : numpy.ndarray
        Feature matrix, indexable by an integer index array.
    y : numpy.ndarray
        Binary labels, same number of rows as ``X``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Metric', 'Train', 'Test' holding the fold-averaged metrics
        (the same table that is written to the CSV file).
    """
    n_splits = 5
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Per-fold metric values, keyed by metric name
    metric_names = ['PR-AUC', 'AU-ROC', 'ACC', 'PREC', 'REC', 'F1']
    metrics_train = {name: [] for name in metric_names}
    metrics_test = {name: [] for name in metric_names}

    # tqdm wraps the fold iterator to show progress
    for fold, (train_idx, val_idx) in enumerate(
        tqdm(skf.split(X, y), total=n_splits, desc=f"Training {model_name}")
    ):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Fresh estimator per fold so no state can carry over between folds
        model = model_class(**best_params)
        model.fit(X_train, y_train)

        fold_train = _fold_metrics(y_train, model.predict_proba(X_train)[:, 1])
        fold_test = _fold_metrics(y_val, model.predict_proba(X_val)[:, 1])
        for name in metric_names:
            metrics_train[name].append(fold_train[name])
            metrics_test[name].append(fold_test[name])

        # Report the metrics of the current fold
        print(f"Fold {fold + 1} - Train metrics:")
        for metric, values in metrics_train.items():
            print(f"  {metric}: {values[-1]:.4f}")

        print(f"Fold {fold + 1} - Test metrics:")
        for metric, values in metrics_test.items():
            print(f"  {metric}: {values[-1]:.4f}")
        print()

    # Fold-averaged results; row order is kept as in the existing CSV layout
    csv_order = ['PR-AUC', 'AU-ROC', 'F1', 'REC', 'PREC', 'ACC']
    df_results = pd.DataFrame({
        'Metric': csv_order,
        'Train': [np.mean(metrics_train[name]) for name in csv_order],
        'Test': [np.mean(metrics_test[name]) for name in csv_order],
    })

    # Save the averaged metrics to a CSV file
    df_results.to_csv(f'{model_name}_evaluation_metrics.csv', index=False)
    print(f"Metrics saved to {model_name}_evaluation_metrics.csv")

    # Print train / held-out averages for every metric
    summary_labels = [
        ('PR AUC', 'PR-AUC'),
        ('AU-ROC', 'AU-ROC'),
        ('Accuracy', 'ACC'),
        ('Precision', 'PREC'),
        ('Recall', 'REC'),
        ('F1', 'F1'),
    ]
    for label, key in summary_labels:
        print(f'Average Train {label}: {np.mean(metrics_train[key]):.4f}')
        print(f'Average Test {label}: {np.mean(metrics_test[key]):.4f}')

    return df_results




In [8]:
# Tuned hyperparameters for XGBoost
best_params_xgb = dict(
    n_estimators=10,
    max_depth=27,
    learning_rate=0.0809714150972509,
    eval_metric='error',
)

# Train and evaluate the XGBoost model
train_evaluate_with_best_params(
    model_name="XGBoost",
    model_class=XGBClassifier,
    best_params=best_params_xgb,
    X=X,
    y=y,
)

Training XGBoost:  20%|██        | 1/5 [00:42<02:51, 42.90s/it]

Fold 1 - Train metrics:
  PR-AUC: 0.9953
  AU-ROC: 0.9674
  ACC: 0.9293
  PREC: 0.9336
  REC: 0.9899
  F1: 0.9609
Fold 1 - Test metrics:
  PR-AUC: 0.9614
  AU-ROC: 0.7539
  ACC: 0.8732
  PREC: 0.8849
  REC: 0.9840
  F1: 0.9318



Training XGBoost:  40%|████      | 2/5 [01:29<02:15, 45.31s/it]

Fold 2 - Train metrics:
  PR-AUC: 0.9939
  AU-ROC: 0.9627
  ACC: 0.9276
  PREC: 0.9254
  REC: 0.9980
  F1: 0.9603
Fold 2 - Test metrics:
  PR-AUC: 0.9558
  AU-ROC: 0.7569
  ACC: 0.8732
  PREC: 0.8849
  REC: 0.9840
  F1: 0.9318



Training XGBoost:  60%|██████    | 3/5 [02:08<01:24, 42.20s/it]

Fold 3 - Train metrics:
  PR-AUC: 0.9948
  AU-ROC: 0.9642
  ACC: 0.9382
  PREC: 0.9461
  REC: 0.9859
  F1: 0.9656
Fold 3 - Test metrics:
  PR-AUC: 0.9408
  AU-ROC: 0.6909
  ACC: 0.8732
  PREC: 0.8897
  REC: 0.9758
  F1: 0.9308



Training XGBoost:  80%|████████  | 4/5 [02:46<00:40, 40.54s/it]

Fold 4 - Train metrics:
  PR-AUC: 0.9964
  AU-ROC: 0.9740
  ACC: 0.9330
  PREC: 0.9373
  REC: 0.9900
  F1: 0.9629
Fold 4 - Test metrics:
  PR-AUC: 0.9456
  AU-ROC: 0.6978
  ACC: 0.8723
  PREC: 0.8841
  REC: 0.9839
  F1: 0.9313



Training XGBoost: 100%|██████████| 5/5 [03:26<00:00, 41.30s/it]

Fold 5 - Train metrics:
  PR-AUC: 0.9954
  AU-ROC: 0.9679
  ACC: 0.9295
  PREC: 0.9321
  REC: 0.9920
  F1: 0.9611
Fold 5 - Test metrics:
  PR-AUC: 0.9602
  AU-ROC: 0.7687
  ACC: 0.9078
  PREC: 0.9111
  REC: 0.9919
  F1: 0.9498

Average Train PR AUC: 0.9952
Average Test PR AUC: 0.9528
Average Train AU-ROC: 0.9672
Average Test AU-ROC: 0.7336
Average Train Accuracy: 0.9315
Average Test Accuracy: 0.8800
Average Train Precision: 0.9349
Average Test Precision: 0.8909
Average Train Recall: 0.9912
Average Test Recall: 0.9839
Average Train F1: 0.9622
Average Test F1: 0.9351





In [11]:

from sklearn.ensemble import RandomForestClassifier

# Tuned hyperparameters for the random forest
best_params_rf = {
    'n_estimators': 659,
    'max_depth': 32,
    # A float here is interpreted by scikit-learn as a fraction of the samples
    'min_samples_split': 0.021676009502668372,
    # Fixed seed: bootstrap sampling and feature selection are random, so
    # without it every run of this cell yields different metrics.
    'random_state': 42,
}

# Train and evaluate the RandomForest model
train_evaluate_with_best_params("RandomForest", RandomForestClassifier, best_params_rf, X, y)


Training RandomForest:  20%|██        | 1/5 [00:02<00:09,  2.43s/it]

Fold 1 - Train metrics:
  PR-AUC: 0.9993
  AU-ROC: 0.9948
  ACC: 0.9205
  PREC: 0.9185
  REC: 0.9980
  F1: 0.9566
Fold 1 - Test metrics:
  PR-AUC: 0.9555
  AU-ROC: 0.7581
  ACC: 0.8873
  PREC: 0.8865
  REC: 1.0000
  F1: 0.9398



Training RandomForest:  40%|████      | 2/5 [00:04<00:07,  2.47s/it]

Fold 2 - Train metrics:
  PR-AUC: 0.9995
  AU-ROC: 0.9960
  ACC: 0.9170
  PREC: 0.9136
  REC: 1.0000
  F1: 0.9549
Fold 2 - Test metrics:
  PR-AUC: 0.9497
  AU-ROC: 0.7539
  ACC: 0.8803
  PREC: 0.8857
  REC: 0.9920
  F1: 0.9358



Training RandomForest:  60%|██████    | 3/5 [00:07<00:04,  2.42s/it]

Fold 3 - Train metrics:
  PR-AUC: 0.9991
  AU-ROC: 0.9933
  ACC: 0.9170
  PREC: 0.9153
  REC: 0.9980
  F1: 0.9549
Fold 3 - Test metrics:
  PR-AUC: 0.9335
  AU-ROC: 0.6886
  ACC: 0.8732
  PREC: 0.8732
  REC: 1.0000
  F1: 0.9323



Training RandomForest:  80%|████████  | 4/5 [00:09<00:02,  2.41s/it]

Fold 4 - Train metrics:
  PR-AUC: 0.9992
  AU-ROC: 0.9944
  ACC: 0.9136
  PREC: 0.9104
  REC: 1.0000
  F1: 0.9531
Fold 4 - Test metrics:
  PR-AUC: 0.9430
  AU-ROC: 0.7633
  ACC: 0.8794
  PREC: 0.8794
  REC: 1.0000
  F1: 0.9358



Training RandomForest: 100%|██████████| 5/5 [00:12<00:00,  2.42s/it]

Fold 5 - Train metrics:
  PR-AUC: 0.9992
  AU-ROC: 0.9943
  ACC: 0.9101
  PREC: 0.9071
  REC: 1.0000
  F1: 0.9513
Fold 5 - Test metrics:
  PR-AUC: 0.9718
  AU-ROC: 0.8283
  ACC: 0.8794
  PREC: 0.8794
  REC: 1.0000
  F1: 0.9358

Metrics saved to RandomForest_evaluation_metrics.csv
Average Train PR AUC: 0.9992
Average Test PR AUC: 0.9507
Average Train AU-ROC: 0.9946
Average Test AU-ROC: 0.7584
Average Train Accuracy: 0.9156
Average Test Accuracy: 0.8799
Average Train Precision: 0.9130
Average Test Precision: 0.8809
Average Train Recall: 0.9992
Average Test Recall: 0.9984
Average Train F1: 0.9541
Average Test F1: 0.9359





In [14]:



from lightgbm import LGBMClassifier

# Tuned hyperparameters for LightGBM
best_params_lgbm = dict(
    n_estimators=847,
    max_depth=28,
    num_leaves=959,
    learning_rate=0.41408607247765644,
    verbose=-1,
)

# Train and evaluate the LightGBM model
train_evaluate_with_best_params(
    model_name="LightGBM",
    model_class=LGBMClassifier,
    best_params=best_params_lgbm,
    X=X,
    y=y,
)


Training LightGBM:  20%|██        | 1/5 [00:00<00:02,  1.48it/s]

Fold 1 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 1.0000
  PREC: 1.0000
  REC: 1.0000
  F1: 1.0000
Fold 1 - Test metrics:
  PR-AUC: 0.9486
  AU-ROC: 0.7534
  ACC: 0.8592
  PREC: 0.9008
  REC: 0.9440
  F1: 0.9219



Training LightGBM:  40%|████      | 2/5 [00:01<00:02,  1.19it/s]

Fold 2 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 0.9982
  PREC: 0.9980
  REC: 1.0000
  F1: 0.9990
Fold 2 - Test metrics:
  PR-AUC: 0.9539
  AU-ROC: 0.7713
  ACC: 0.8662
  PREC: 0.8897
  REC: 0.9680
  F1: 0.9272



Training LightGBM:  60%|██████    | 3/5 [00:02<00:01,  1.44it/s]

Fold 3 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 0.9982
  PREC: 0.9980
  REC: 1.0000
  F1: 0.9990
Fold 3 - Test metrics:
  PR-AUC: 0.9422
  AU-ROC: 0.7437
  ACC: 0.8662
  PREC: 0.8832
  REC: 0.9758
  F1: 0.9272



Training LightGBM:  80%|████████  | 4/5 [00:02<00:00,  1.51it/s]

Fold 4 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 1.0000
  PREC: 1.0000
  REC: 1.0000
  F1: 1.0000
Fold 4 - Test metrics:
  PR-AUC: 0.9366
  AU-ROC: 0.7614
  ACC: 0.8794
  PREC: 0.9084
  REC: 0.9597
  F1: 0.9333



Training LightGBM: 100%|██████████| 5/5 [00:03<00:00,  1.45it/s]

Fold 5 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 0.9982
  PREC: 0.9980
  REC: 1.0000
  F1: 0.9990
Fold 5 - Test metrics:
  PR-AUC: 0.9528
  AU-ROC: 0.7324
  ACC: 0.8794
  PREC: 0.8963
  REC: 0.9758
  F1: 0.9344

Metrics saved to LightGBM_evaluation_metrics.csv
Average Train PR AUC: 1.0000
Average Test PR AUC: 0.9468
Average Train AU-ROC: 1.0000
Average Test AU-ROC: 0.7525
Average Train Accuracy: 0.9989
Average Test Accuracy: 0.8701
Average Train Precision: 0.9988
Average Test Precision: 0.8957
Average Train Recall: 1.0000
Average Test Recall: 0.9647
Average Train F1: 0.9994
Average Test F1: 0.9288





In [16]:
from sklearn.svm import SVC

# Tuned hyperparameters for the SVM, with probability estimates enabled
best_params_svm = {
    'C': 0.01559252830927682,
    # NOTE(review): gamma is a coefficient for the rbf/poly/sigmoid kernels and
    # has no effect with kernel='linear'; kept only as a record of the search result.
    'gamma': 0.06808617250161814,
    'kernel': 'linear',
    'probability': True,  # required so that predict_proba is available
    # Probability calibration shuffles the data internally; fix the seed so
    # the reported metrics are reproducible.
    'random_state': 42,
}

# Train and evaluate the SVM model
train_evaluate_with_best_params("SVM", SVC, best_params_svm, X, y)



Training SVM:  20%|██        | 1/5 [00:02<00:09,  2.39s/it]

Fold 1 - Train metrics:
  PR-AUC: 0.9915
  AU-ROC: 0.9583
  ACC: 0.8975
  PREC: 0.8969
  REC: 0.9980
  F1: 0.9448
Fold 1 - Test metrics:
  PR-AUC: 0.9429
  AU-ROC: 0.7468
  ACC: 0.8803
  PREC: 0.8857
  REC: 0.9920
  F1: 0.9358



Training SVM:  40%|████      | 2/5 [00:05<00:08,  2.88s/it]

Fold 2 - Train metrics:
  PR-AUC: 0.9925
  AU-ROC: 0.9619
  ACC: 0.8816
  PREC: 0.8812
  REC: 1.0000
  F1: 0.9369
Fold 2 - Test metrics:
  PR-AUC: 0.9546
  AU-ROC: 0.7412
  ACC: 0.8803
  PREC: 0.8803
  REC: 1.0000
  F1: 0.9363



Training SVM:  60%|██████    | 3/5 [00:08<00:05,  2.87s/it]

Fold 3 - Train metrics:
  PR-AUC: 0.9917
  AU-ROC: 0.9536
  ACC: 0.9081
  PREC: 0.9099
  REC: 0.9940
  F1: 0.9501
Fold 3 - Test metrics:
  PR-AUC: 0.9233
  AU-ROC: 0.6823
  ACC: 0.8592
  PREC: 0.8768
  REC: 0.9758
  F1: 0.9237



Training SVM:  80%|████████  | 4/5 [00:11<00:02,  2.75s/it]

Fold 4 - Train metrics:
  PR-AUC: 0.9912
  AU-ROC: 0.9581
  ACC: 0.8818
  PREC: 0.8814
  REC: 1.0000
  F1: 0.9370
Fold 4 - Test metrics:
  PR-AUC: 0.9401
  AU-ROC: 0.7576
  ACC: 0.8794
  PREC: 0.8794
  REC: 1.0000
  F1: 0.9358



Training SVM: 100%|██████████| 5/5 [00:13<00:00,  2.67s/it]

Fold 5 - Train metrics:
  PR-AUC: 0.9867
  AU-ROC: 0.9510
  ACC: 0.8818
  PREC: 0.8814
  REC: 1.0000
  F1: 0.9370
Fold 5 - Test metrics:
  PR-AUC: 0.9549
  AU-ROC: 0.7386
  ACC: 0.8794
  PREC: 0.8794
  REC: 1.0000
  F1: 0.9358

Metrics saved to SVM_evaluation_metrics.csv
Average Train PR AUC: 0.9907
Average Test PR AUC: 0.9432
Average Train AU-ROC: 0.9566
Average Test AU-ROC: 0.7333
Average Train Accuracy: 0.8902
Average Test Accuracy: 0.8757
Average Train Precision: 0.8902
Average Test Precision: 0.8803
Average Train Recall: 0.9984
Average Test Recall: 0.9936
Average Train F1: 0.9411
Average Test F1: 0.9335





In [17]:
from sklearn.neighbors import KNeighborsClassifier

# Tuned hyperparameters for k-nearest neighbours
best_params_knn = dict(
    n_neighbors=7,
    weights='uniform',
    metric='manhattan',
)

# Train and evaluate the KNN model
train_evaluate_with_best_params(
    model_name="KNN",
    model_class=KNeighborsClassifier,
    best_params=best_params_knn,
    X=X,
    y=y,
)


Training KNN:  20%|██        | 1/5 [00:00<00:03,  1.27it/s]

Fold 1 - Train metrics:
  PR-AUC: 0.9857
  AU-ROC: 0.8970
  ACC: 0.8852
  PREC: 0.8913
  REC: 0.9899
  F1: 0.9380
Fold 1 - Test metrics:
  PR-AUC: 0.9371
  AU-ROC: 0.6193
  ACC: 0.8662
  PREC: 0.8786
  REC: 0.9840
  F1: 0.9283



Training KNN:  40%|████      | 2/5 [00:01<00:02,  1.30it/s]

Fold 2 - Train metrics:
  PR-AUC: 0.9813
  AU-ROC: 0.8646
  ACC: 0.8799
  PREC: 0.8824
  REC: 0.9960
  F1: 0.9357
Fold 2 - Test metrics:
  PR-AUC: 0.9574
  AU-ROC: 0.7292
  ACC: 0.8732
  PREC: 0.8794
  REC: 0.9920
  F1: 0.9323



Training KNN:  60%|██████    | 3/5 [00:02<00:01,  1.31it/s]

Fold 3 - Train metrics:
  PR-AUC: 0.9859
  AU-ROC: 0.8968
  ACC: 0.8905
  PREC: 0.8935
  REC: 0.9940
  F1: 0.9411
Fold 3 - Test metrics:
  PR-AUC: 0.9218
  AU-ROC: 0.5925
  ACC: 0.8451
  PREC: 0.8696
  REC: 0.9677
  F1: 0.9160



Training KNN:  80%|████████  | 4/5 [00:03<00:00,  1.31it/s]

Fold 4 - Train metrics:
  PR-AUC: 0.9809
  AU-ROC: 0.8630
  ACC: 0.8818
  PREC: 0.8883
  REC: 0.9900
  F1: 0.9364
Fold 4 - Test metrics:
  PR-AUC: 0.9420
  AU-ROC: 0.6694
  ACC: 0.8865
  PREC: 0.8857
  REC: 1.0000
  F1: 0.9394



Training KNN: 100%|██████████| 5/5 [00:03<00:00,  1.31it/s]

Fold 5 - Train metrics:
  PR-AUC: 0.9810
  AU-ROC: 0.8633
  ACC: 0.8783
  PREC: 0.8837
  REC: 0.9920
  F1: 0.9347
Fold 5 - Test metrics:
  PR-AUC: 0.9651
  AU-ROC: 0.7877
  ACC: 0.8794
  PREC: 0.8794
  REC: 1.0000
  F1: 0.9358

Metrics saved to KNN_evaluation_metrics.csv
Average Train PR AUC: 0.9830
Average Test PR AUC: 0.9447
Average Train AU-ROC: 0.8770
Average Test AU-ROC: 0.6796
Average Train Accuracy: 0.8831
Average Test Accuracy: 0.8701
Average Train Precision: 0.8878
Average Test Precision: 0.8785
Average Train Recall: 0.9924
Average Test Recall: 0.9887
Average Train F1: 0.9372
Average Test F1: 0.9304



