In [1]:
import warnings
from rdkit import RDLogger

# Silence RDKit's own logger for every channel matching 'rdApp.*'.
RDLogger.DisableLog('rdApp.*')

# Alternatively, suppress every Python warning raised from this point on.
warnings.filterwarnings("ignore")
# Suppress LightGBM UserWarnings.
# NOTE(review): this looks redundant given the blanket "ignore" filter above — confirm before removing.
warnings.filterwarnings("ignore", category=UserWarning, module="lightgbm")

In [2]:
import torch
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.metrics import precision_recall_curve, auc, f1_score, precision_score, recall_score, accuracy_score
from xgboost import XGBClassifier
import joblib
from tqdm import tqdm
from rdkit.Chem import Descriptors, AllChem

from sklearn.metrics import roc_curve



# Data loading: read the dataset, then pull out the SMILES strings
# (model inputs) and the flammability column (classification target).
df = pd.read_csv('imputed_selected_features_Flam.csv')
smiles_list = df['SMILES'].tolist()
labels = df['Flammability'].values

def smiles_to_features(smiles, radius=2, n_bits=2048):
    """Convert a SMILES string into a descriptor + Morgan-fingerprint vector.

    Args:
        smiles: SMILES string describing the molecule.
        radius: Morgan fingerprint radius. Defaults to 2, the value that was
            previously hard-coded.
        n_bits: Length of the Morgan fingerprint bit vector. Defaults to 2048,
            the value that was previously hard-coded.

    Returns:
        A 1-D numpy array of length ``4 + n_bits`` laid out as
        ``[MolWt, MolLogP, NumHDonors, NumHAcceptors, bit_0, ..., bit_{n_bits-1}]``,
        or ``None`` when ``smiles`` is not a string or RDKit cannot parse it.
    """
    # A missing cell read by pandas arrives as float NaN; passing a non-string
    # to MolFromSmiles raises instead of returning None, so treat it the same
    # way as an unparsable SMILES.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Scalar physico-chemical descriptors.
    descriptors = [
        Descriptors.MolWt(mol),          # molecular weight
        Descriptors.MolLogP(mol),        # LogP
        Descriptors.NumHDonors(mol),     # number of hydrogen-bond donors
        Descriptors.NumHAcceptors(mol),  # number of hydrogen-bond acceptors
    ]

    # Morgan fingerprint as a bit vector, copied into a numpy array.
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    fingerprint_array = np.zeros((n_bits,))
    Chem.DataStructs.ConvertToNumpyArray(fingerprint, fingerprint_array)

    # Descriptors first, fingerprint bits after.
    return np.concatenate([descriptors, fingerprint_array])

# Convert every SMILES string into a feature vector. Rows whose SMILES cannot
# be converted are dropped, so remember which row indices survived: the labels
# must be subset with the same indices or X and y would no longer line up.
features = []
valid_idx = []
for row_idx, smiles in enumerate(smiles_list):
    feature = smiles_to_features(smiles)
    if feature is not None:
        features.append(feature)
        valid_idx.append(row_idx)

n_dropped = len(smiles_list) - len(valid_idx)
if n_dropped:
    print(f"Dropped {n_dropped} row(s) whose SMILES could not be converted to features")

# Stack the per-molecule vectors into a 2-D array (one row per molecule).
features = np.array(features)

# Keep only the labels of the rows that produced a feature vector.
y = np.asarray(labels)[np.array(valid_idx, dtype=int)]
X = features

def _binary_classification_metrics(y_true, y_prob, threshold=0.5):
    """Compute the six reported metrics for one split from positive-class probabilities."""
    precision_curve, recall_curve, _ = precision_recall_curve(y_true, y_prob)
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    # Hard labels are derived once and shared by every threshold-based metric.
    y_pred = (y_prob > threshold).astype(int)
    return {
        'PR-AUC': auc(recall_curve, precision_curve),
        'AU-ROC': auc(fpr, tpr),
        'ACC': accuracy_score(y_true, y_pred),
        'PREC': precision_score(y_true, y_pred),
        'REC': recall_score(y_true, y_pred),
        'F1': f1_score(y_true, y_pred),
    }


def train_evaluate_with_best_params(model_name, model_class, best_params, X, y, n_splits=5):
    """Run stratified k-fold cross-validation with fixed hyperparameters.

    For every fold a fresh ``model_class(**best_params)`` is fitted on the
    training part, then PR-AUC, AU-ROC, accuracy, precision, recall and F1 are
    computed on both the training part and the held-out part (reported as
    "Test"). Per-fold values and their averages are printed, and the averages
    are written to ``{model_name}_evaluation_metrics.csv``.

    Args:
        model_name: Label used in the progress bar, the printed messages and
            the output CSV file name.
        model_class: Estimator class; instances must provide ``fit`` and
            ``predict_proba``.
        best_params: Keyword arguments passed to ``model_class``.
        X: Feature matrix; must support row selection with an integer index
            array (e.g. a numpy array).
        y: Binary labels, one per row of ``X``. Converted to a numpy array so
            that fold indices are always applied positionally.
        n_splits: Number of cross-validation folds (default 5).

    Returns:
        pandas.DataFrame with columns ``Metric``, ``Train`` and ``Test``
        holding the fold-averaged metrics (the same table that is saved to CSV).
    """
    y = np.asarray(y)

    # Stratified k-fold split with a fixed seed so the folds are reproducible.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Per-fold metric values; this order is also the per-fold print order.
    metric_names = ['PR-AUC', 'AU-ROC', 'ACC', 'PREC', 'REC', 'F1']
    metrics_train = {name: [] for name in metric_names}
    metrics_test = {name: [] for name in metric_names}

    # Wrap the fold iterator in tqdm to show progress.
    fold_iter = tqdm(skf.split(X, y), total=n_splits, desc=f"Training {model_name}")
    for fold, (train_idx, val_idx) in enumerate(fold_iter):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # A new estimator per fold guarantees no fitted state is carried over.
        model = model_class(**best_params)
        model.fit(X_train, y_train)

        fold_train = _binary_classification_metrics(y_train, model.predict_proba(X_train)[:, 1])
        fold_test = _binary_classification_metrics(y_val, model.predict_proba(X_val)[:, 1])
        for name in metric_names:
            metrics_train[name].append(fold_train[name])
            metrics_test[name].append(fold_test[name])

        # Report this fold's training and held-out metrics.
        print(f"Fold {fold + 1} - Train metrics:")
        for name in metric_names:
            print(f"  {name}: {fold_train[name]:.4f}")

        print(f"Fold {fold + 1} - Test metrics:")
        for name in metric_names:
            print(f"  {name}: {fold_test[name]:.4f}")
        print()

    # Fold-averaged summary table; row order matches the original CSV layout.
    report_order = ['PR-AUC', 'AU-ROC', 'F1', 'REC', 'PREC', 'ACC']
    df_results = pd.DataFrame({
        'Metric': report_order,
        'Train': [np.mean(metrics_train[name]) for name in report_order],
        'Test': [np.mean(metrics_test[name]) for name in report_order],
    })

    # Save the summary to CSV.
    df_results.to_csv(f'{model_name}_evaluation_metrics.csv', index=False)
    print(f"Metrics saved to {model_name}_evaluation_metrics.csv")

    # Print the averages; (metric key, label used in the printed message).
    printed_labels = [
        ('PR-AUC', 'PR AUC'),
        ('AU-ROC', 'AU-ROC'),
        ('ACC', 'Accuracy'),
        ('PREC', 'Precision'),
        ('REC', 'Recall'),
        ('F1', 'F1'),
    ]
    for key, label in printed_labels:
        print(f'Average Train {label}: {np.mean(metrics_train[key]):.4f}')
        print(f'Average Test {label}: {np.mean(metrics_test[key]):.4f}')

    return df_results




In [3]:
# Best hyperparameters (XGBoost)
best_params_xgb = dict(
    n_estimators=368,
    max_depth=1,
    learning_rate=0.5946860775199619,
    eval_metric='error',
)

# Train and evaluate the XGBoost model
train_evaluate_with_best_params(
    model_name="XGBoost",
    model_class=XGBClassifier,
    best_params=best_params_xgb,
    X=X,
    y=y,
)

Training XGBoost:  20%|██        | 1/5 [00:00<00:03,  1.14it/s]

Fold 1 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 0.9998
  ACC: 0.9965
  PREC: 0.9964
  REC: 1.0000
  F1: 0.9982
Fold 1 - Test metrics:
  PR-AUC: 0.9924
  AU-ROC: 0.7880
  ACC: 0.9577
  PREC: 0.9714
  REC: 0.9855
  F1: 0.9784



Training XGBoost:  40%|████      | 2/5 [00:01<00:02,  1.13it/s]

Fold 2 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 0.9947
  PREC: 0.9946
  REC: 1.0000
  F1: 0.9973
Fold 2 - Test metrics:
  PR-AUC: 0.9923
  AU-ROC: 0.7880
  ACC: 0.9718
  PREC: 0.9718
  REC: 1.0000
  F1: 0.9857



Training XGBoost:  60%|██████    | 3/5 [00:02<00:01,  1.14it/s]

Fold 3 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 0.9999
  ACC: 0.9965
  PREC: 0.9964
  REC: 1.0000
  F1: 0.9982
Fold 3 - Test metrics:
  PR-AUC: 0.9945
  AU-ROC: 0.8605
  ACC: 0.9718
  PREC: 0.9786
  REC: 0.9928
  F1: 0.9856



Training XGBoost:  80%|████████  | 4/5 [00:03<00:00,  1.14it/s]

Fold 4 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 0.9912
  PREC: 0.9910
  REC: 1.0000
  F1: 0.9955
Fold 4 - Test metrics:
  PR-AUC: 0.9995
  AU-ROC: 0.9836
  ACC: 0.9787
  PREC: 0.9855
  REC: 0.9927
  F1: 0.9891



Training XGBoost: 100%|██████████| 5/5 [00:04<00:00,  1.14it/s]

Fold 5 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 0.9965
  PREC: 0.9964
  REC: 1.0000
  F1: 0.9982
Fold 5 - Test metrics:
  PR-AUC: 0.9906
  AU-ROC: 0.7464
  ACC: 0.9504
  PREC: 0.9710
  REC: 0.9781
  F1: 0.9745

Metrics saved to XGBoost_evaluation_metrics.csv
Average Train PR AUC: 1.0000
Average Test PR AUC: 0.9939
Average Train AU-ROC: 0.9999
Average Test AU-ROC: 0.8333
Average Train Accuracy: 0.9951
Average Test Accuracy: 0.9661
Average Train Precision: 0.9949
Average Test Precision: 0.9757
Average Train Recall: 1.0000
Average Test Recall: 0.9898
Average Train F1: 0.9975
Average Test F1: 0.9827





In [4]:

from sklearn.ensemble import RandomForestClassifier

# Best hyperparameters (random forest).
# random_state is fixed so the forest is reproducible from run to run;
# min_samples_split is a float, i.e. a fraction of the training samples.
best_params_rf = {
    'n_estimators': 310,
    'max_depth': 40,
    'min_samples_split': 0.01747361296783917,
    'random_state': 42,
}

# Train and evaluate the RandomForest model
train_evaluate_with_best_params("RandomForest", RandomForestClassifier, best_params_rf, X, y)


Training RandomForest:  20%|██        | 1/5 [00:00<00:01,  2.14it/s]

Fold 1 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 0.9735
  PREC: 0.9735
  REC: 1.0000
  F1: 0.9865
Fold 1 - Test metrics:
  PR-AUC: 0.9934
  AU-ROC: 0.8080
  ACC: 0.9718
  PREC: 0.9718
  REC: 1.0000
  F1: 0.9857



Training RandomForest:  40%|████      | 2/5 [00:00<00:01,  2.17it/s]

Fold 2 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 0.9735
  PREC: 0.9735
  REC: 1.0000
  F1: 0.9865
Fold 2 - Test metrics:
  PR-AUC: 0.9764
  AU-ROC: 0.5960
  ACC: 0.9718
  PREC: 0.9718
  REC: 1.0000
  F1: 0.9857



Training RandomForest:  60%|██████    | 3/5 [00:01<00:00,  2.18it/s]

Fold 3 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 0.9753
  PREC: 0.9752
  REC: 1.0000
  F1: 0.9874
Fold 3 - Test metrics:
  PR-AUC: 0.9962
  AU-ROC: 0.8786
  ACC: 0.9718
  PREC: 0.9718
  REC: 1.0000
  F1: 0.9857



Training RandomForest:  80%|████████  | 4/5 [00:01<00:00,  2.18it/s]

Fold 4 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 0.9999
  ACC: 0.9718
  PREC: 0.9718
  REC: 1.0000
  F1: 0.9857
Fold 4 - Test metrics:
  PR-AUC: 0.9961
  AU-ROC: 0.8814
  ACC: 0.9716
  PREC: 0.9716
  REC: 1.0000
  F1: 0.9856



Training RandomForest: 100%|██████████| 5/5 [00:02<00:00,  2.15it/s]

Fold 5 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 0.9999
  ACC: 0.9735
  PREC: 0.9735
  REC: 1.0000
  F1: 0.9866
Fold 5 - Test metrics:
  PR-AUC: 0.9945
  AU-ROC: 0.8358
  ACC: 0.9716
  PREC: 0.9716
  REC: 1.0000
  F1: 0.9856

Metrics saved to RandomForest_evaluation_metrics.csv
Average Train PR AUC: 1.0000
Average Test PR AUC: 0.9913
Average Train AU-ROC: 1.0000
Average Test AU-ROC: 0.8000
Average Train Accuracy: 0.9735
Average Test Accuracy: 0.9718
Average Train Precision: 0.9735
Average Test Precision: 0.9718
Average Train Recall: 1.0000
Average Test Recall: 1.0000
Average Train F1: 0.9866
Average Test F1: 0.9857





In [5]:



from lightgbm import LGBMClassifier

# Best hyperparameters (LightGBM); verbose=-1 keeps the library quiet during fitting
best_params_lgbm = dict(
    n_estimators=529,
    max_depth=32,
    num_leaves=1337,
    learning_rate=0.16178297903549685,
    verbose=-1,
)

# Train and evaluate the LightGBM model
train_evaluate_with_best_params(
    model_name="LightGBM",
    model_class=LGBMClassifier,
    best_params=best_params_lgbm,
    X=X,
    y=y,
)


Training LightGBM:  20%|██        | 1/5 [00:00<00:01,  3.24it/s]

Fold 1 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 1.0000
  PREC: 1.0000
  REC: 1.0000
  F1: 1.0000
Fold 1 - Test metrics:
  PR-AUC: 0.9919
  AU-ROC: 0.7627
  ACC: 0.9648
  PREC: 0.9716
  REC: 0.9928
  F1: 0.9821



Training LightGBM:  40%|████      | 2/5 [00:00<00:00,  3.40it/s]

Fold 2 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 1.0000
  PREC: 1.0000
  REC: 1.0000
  F1: 1.0000
Fold 2 - Test metrics:
  PR-AUC: 0.9939
  AU-ROC: 0.8170
  ACC: 0.9718
  PREC: 0.9718
  REC: 1.0000
  F1: 0.9857



Training LightGBM:  60%|██████    | 3/5 [00:00<00:00,  3.52it/s]

Fold 3 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 1.0000
  PREC: 1.0000
  REC: 1.0000
  F1: 1.0000
Fold 3 - Test metrics:
  PR-AUC: 0.9902
  AU-ROC: 0.7862
  ACC: 0.9789
  PREC: 0.9787
  REC: 1.0000
  F1: 0.9892



Training LightGBM:  80%|████████  | 4/5 [00:01<00:00,  3.49it/s]

Fold 4 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 1.0000
  PREC: 1.0000
  REC: 1.0000
  F1: 1.0000
Fold 4 - Test metrics:
  PR-AUC: 0.9943
  AU-ROC: 0.8431
  ACC: 0.9645
  PREC: 0.9714
  REC: 0.9927
  F1: 0.9819



Training LightGBM: 100%|██████████| 5/5 [00:02<00:00,  2.42it/s]

Fold 5 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 1.0000
  PREC: 1.0000
  REC: 1.0000
  F1: 1.0000
Fold 5 - Test metrics:
  PR-AUC: 0.9938
  AU-ROC: 0.8102
  ACC: 0.9645
  PREC: 0.9714
  REC: 0.9927
  F1: 0.9819

Metrics saved to LightGBM_evaluation_metrics.csv
Average Train PR AUC: 1.0000
Average Test PR AUC: 0.9928
Average Train AU-ROC: 1.0000
Average Test AU-ROC: 0.8038
Average Train Accuracy: 1.0000
Average Test Accuracy: 0.9689
Average Train Precision: 1.0000
Average Test Precision: 0.9730
Average Train Recall: 1.0000
Average Test Recall: 0.9956
Average Train F1: 1.0000
Average Test F1: 0.9842





In [6]:
from sklearn.svm import SVC

# Best hyperparameters (SVM).
# probability=True enables predict_proba, which the evaluation routine calls.
# random_state fixes the data shuffling used for the probability estimates,
# so the reported probabilities are reproducible from run to run.
# NOTE: gamma has no effect with kernel='linear'; it is kept only as a record
# of the tuned value.
best_params_svm = {
    'C': 0.977399931193936,
    'gamma': 0.010961352543547414,
    'kernel': 'linear',
    'probability': True,
    'random_state': 42,
}

# Train and evaluate the SVM model
train_evaluate_with_best_params("SVM", SVC, best_params_svm, X, y)



Training SVM:  20%|██        | 1/5 [00:00<00:02,  1.48it/s]

Fold 1 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 0.9717
  PREC: 0.9717
  REC: 1.0000
  F1: 0.9857
Fold 1 - Test metrics:
  PR-AUC: 0.9945
  AU-ROC: 0.8261
  ACC: 0.9718
  PREC: 0.9718
  REC: 1.0000
  F1: 0.9857



Training SVM:  40%|████      | 2/5 [00:01<00:01,  1.81it/s]

Fold 2 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 0.9717
  PREC: 0.9717
  REC: 1.0000
  F1: 0.9857
Fold 2 - Test metrics:
  PR-AUC: 0.9714
  AU-ROC: 0.6612
  ACC: 0.9648
  PREC: 0.9716
  REC: 0.9928
  F1: 0.9821



Training SVM:  60%|██████    | 3/5 [00:01<00:01,  1.50it/s]

Fold 3 - Train metrics:
  PR-AUC: 0.9996
  AU-ROC: 0.9889
  ACC: 0.9717
  PREC: 0.9717
  REC: 1.0000
  F1: 0.9857
Fold 3 - Test metrics:
  PR-AUC: 0.9966
  AU-ROC: 0.9004
  ACC: 0.9718
  PREC: 0.9718
  REC: 1.0000
  F1: 0.9857



Training SVM:  80%|████████  | 4/5 [00:03<00:00,  1.08it/s]

Fold 4 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 0.9718
  PREC: 0.9718
  REC: 1.0000
  F1: 0.9857
Fold 4 - Test metrics:
  PR-AUC: 0.9984
  AU-ROC: 0.9489
  ACC: 0.9716
  PREC: 0.9716
  REC: 1.0000
  F1: 0.9856



Training SVM: 100%|██████████| 5/5 [00:04<00:00,  1.20it/s]

Fold 5 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 0.9735
  PREC: 0.9735
  REC: 1.0000
  F1: 0.9866
Fold 5 - Test metrics:
  PR-AUC: 0.9931
  AU-ROC: 0.7974
  ACC: 0.9645
  PREC: 0.9714
  REC: 0.9927
  F1: 0.9819

Metrics saved to SVM_evaluation_metrics.csv
Average Train PR AUC: 0.9999
Average Test PR AUC: 0.9908
Average Train AU-ROC: 0.9978
Average Test AU-ROC: 0.8268
Average Train Accuracy: 0.9721
Average Test Accuracy: 0.9689
Average Train Precision: 0.9721
Average Test Precision: 0.9717
Average Train Recall: 1.0000
Average Test Recall: 0.9971
Average Train F1: 0.9858
Average Test F1: 0.9842





In [7]:
from sklearn.neighbors import KNeighborsClassifier

# Best hyperparameters (KNN).
# NOTE(review): with weights='distance' each training sample is presumably its
# own zero-distance neighbour, which would explain perfect train-split metrics.
best_params_knn = dict(
    n_neighbors=5,
    weights='distance',
    metric='euclidean',
)

# Train and evaluate the KNN model
train_evaluate_with_best_params(
    model_name="KNN",
    model_class=KNeighborsClassifier,
    best_params=best_params_knn,
    X=X,
    y=y,
)


Training KNN:  20%|██        | 1/5 [00:00<00:01,  3.67it/s]

Fold 1 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 1.0000
  PREC: 1.0000
  REC: 1.0000
  F1: 1.0000
Fold 1 - Test metrics:
  PR-AUC: 0.9903
  AU-ROC: 0.6938
  ACC: 0.9718
  PREC: 0.9718
  REC: 1.0000
  F1: 0.9857



Training KNN: 100%|██████████| 5/5 [00:00<00:00,  7.12it/s]

Fold 2 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 1.0000
  PREC: 1.0000
  REC: 1.0000
  F1: 1.0000
Fold 2 - Test metrics:
  PR-AUC: 0.9865
  AU-ROC: 0.5697
  ACC: 0.9718
  PREC: 0.9718
  REC: 1.0000
  F1: 0.9857

Fold 3 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 1.0000
  PREC: 1.0000
  REC: 1.0000
  F1: 1.0000
Fold 3 - Test metrics:
  PR-AUC: 0.9903
  AU-ROC: 0.6920
  ACC: 0.9718
  PREC: 0.9718
  REC: 1.0000
  F1: 0.9857

Fold 4 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 1.0000
  PREC: 1.0000
  REC: 1.0000
  F1: 1.0000
Fold 4 - Test metrics:
  PR-AUC: 0.9903
  AU-ROC: 0.6898
  ACC: 0.9645
  PREC: 0.9714
  REC: 0.9927
  F1: 0.9819

Fold 5 - Train metrics:
  PR-AUC: 1.0000
  AU-ROC: 1.0000
  ACC: 1.0000
  PREC: 1.0000
  REC: 1.0000
  F1: 1.0000
Fold 5 - Test metrics:
  PR-AUC: 0.9863
  AU-ROC: 0.5739
  ACC: 0.9716
  PREC: 0.9716
  REC: 1.0000
  F1: 0.9856

Metrics saved to KNN_evaluation_metrics.csv
Average Train PR AUC: 1.0000
Average Test PR


