In [None]:
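# Top three performing models: XGB, RF, LGB (NOTE: the cells below evaluate XGBoost, SVM and LightGBM — confirm which list is intended)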

In [1]:
import warnings
from rdkit import RDLogger

# Silence RDKit's logger output for every channel matching 'rdApp.*'
RDLogger.DisableLog('rdApp.*')

# Alternatively, suppress all Python warnings
warnings.filterwarnings("ignore")
# Suppress LightGBM UserWarnings
# NOTE(review): the blanket "ignore" filter above presumably already covers
# these warnings — confirm whether this narrower filter is still needed.
warnings.filterwarnings("ignore", category=UserWarning, module="lightgbm")

In [2]:
import torch
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.metrics import precision_recall_curve, auc, f1_score, precision_score, recall_score, accuracy_score
from xgboost import XGBClassifier
import joblib
from tqdm import tqdm
from rdkit.Chem import Descriptors, AllChem

from sklearn.metrics import roc_curve



# Data loading: read the imputed flammability table, then pull out the
# SMILES strings (model inputs) and the 'Flammability' column (targets).
df = pd.read_csv('../imputed_selected_features_Flam.csv')
smiles_list = list(df['SMILES'])
labels = df['Flammability'].to_numpy()

# Convert a SMILES string into molecular descriptors plus a Morgan fingerprint
def smiles_to_features(smiles, radius=2, n_bits=2048):
    """Build a numeric feature vector for a single molecule.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.
    radius : int, optional
        Morgan fingerprint radius. Defaults to 2, the value that was
        previously hard-coded.
    n_bits : int, optional
        Length of the fingerprint bit vector. Defaults to 2048, the value
        that was previously hard-coded.

    Returns
    -------
    numpy.ndarray or None
        1-D array of length ``4 + n_bits``: molecular weight, LogP, number of
        H-bond donors and number of H-bond acceptors, followed by the
        fingerprint bits. ``None`` when RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Unparsable SMILES: callers are expected to skip this molecule.
        return None

    # Scalar descriptors
    descriptors = [
        Descriptors.MolWt(mol),         # molecular weight
        Descriptors.MolLogP(mol),       # LogP
        Descriptors.NumHDonors(mol),    # number of H-bond donors
        Descriptors.NumHAcceptors(mol), # number of H-bond acceptors
    ]

    # Morgan fingerprint, copied from the RDKit bit vector into a numpy array
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    fingerprint_array = np.zeros((n_bits,))
    Chem.DataStructs.ConvertToNumpyArray(fingerprint, fingerprint_array)

    # Descriptors first, fingerprint bits after
    return np.concatenate([descriptors, fingerprint_array])

# Convert every SMILES string to a feature vector, recording which rows parsed.
# Molecules that fail to parse are skipped, so the same rows must also be
# removed from the labels; otherwise X and y end up with different lengths
# and every later row is paired with the wrong label.
features = []
valid_mask = []
for smiles in smiles_list:
    feature = smiles_to_features(smiles)
    valid_mask.append(feature is not None)
    if feature is not None:
        features.append(feature)

# Stack the per-molecule vectors into one array
features = np.array(features)
valid_mask = np.array(valid_mask, dtype=bool)

# Keep only the labels whose molecule produced a feature vector.
# NOTE: if any row was dropped, positions in X/y no longer match row numbers
# in the original table.
y = labels[valid_mask]
X = features

assert len(X) == len(y), "feature matrix and label vector are misaligned"

import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_curve, auc, roc_curve, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

# Run five-fold cross-validation directly with pre-tuned hyperparameters
def train_evaluate_with_best_params(model_name, model_class, best_params, X, y):
    """Cross-validate ``model_class(**best_params)`` and record its mistakes.

    Uses a 5-fold stratified split (shuffled, ``random_state=42``). For each
    fold the model is fitted on the training part and scored on the held-out
    part; per-fold metrics are printed. Every misclassified held-out sample is
    written to ``{model_name}_misclassified_samples.csv`` in the current
    working directory.

    Parameters
    ----------
    model_name : str
        Label used in the progress bar and in the output CSV file name.
    model_class : type
        Estimator class; instances must provide ``fit`` and ``predict_proba``.
    best_params : dict
        Keyword arguments forwarded to ``model_class``.
    X : numpy.ndarray
        Feature matrix; rows are selected with integer index arrays.
    y : numpy.ndarray
        Labels aligned with the rows of ``X``. Predictions are 0/1 obtained by
        thresholding the second ``predict_proba`` column at 0.5.

    Returns
    -------
    pandas.DataFrame
        Columns ``Metric`` and ``Test``: the mean over folds of PR-AUC,
        AU-ROC, F1, REC, PREC and ACC. (Previously this summary was computed
        and then discarded; callers that ignore the return value are
        unaffected.)
    """
    n_splits = 5
    threshold = 0.5

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Per-fold metric values; insertion order is the order used when printing
    metrics_test = {'PR-AUC': [], 'AU-ROC': [], 'ACC': [], 'PREC': [], 'REC': [], 'F1': []}

    # Misclassified held-out samples, one list entry per sample
    misclassified = {'fold': [], 'index': [], 'X': [], 'y': [], 'y_prob': []}

    # tqdm wraps the fold iterator to show progress
    for fold, (train_idx, val_idx) in enumerate(
            tqdm(skf.split(X, y), total=n_splits, desc=f"Training {model_name}")):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Fresh estimator per fold so no fitted state can carry over
        model = model_class(**best_params)
        model.fit(X_train, y_train)

        # Positive-class probability and the thresholded hard prediction
        y_prob_test = model.predict_proba(X_val)[:, 1]
        y_pred_test = (y_prob_test > threshold).astype(int)

        precision_test, recall_test, _ = precision_recall_curve(y_val, y_prob_test)
        metrics_test['PR-AUC'].append(auc(recall_test, precision_test))

        fpr_test, tpr_test, _ = roc_curve(y_val, y_prob_test)
        metrics_test['AU-ROC'].append(auc(fpr_test, tpr_test))

        metrics_test['ACC'].append(accuracy_score(y_val, y_pred_test))
        metrics_test['PREC'].append(precision_score(y_val, y_pred_test))
        metrics_test['REC'].append(recall_score(y_val, y_pred_test))
        metrics_test['F1'].append(f1_score(y_val, y_pred_test))

        # Collect the held-out samples the model got wrong
        for i in np.where(y_val != y_pred_test)[0]:
            misclassified['fold'].append(fold + 1)
            misclassified['index'].append(val_idx[i])  # position in the full X / y
            misclassified['X'].append(X_val[i])
            misclassified['y'].append(y_val[i])
            misclassified['y_prob'].append(y_prob_test[i])

        print(f"Fold {fold + 1} - Test metrics:")
        for metric, values in metrics_test.items():
            print(f"  {metric}: {values[-1]:.4f}")
        print()

    # Average each metric over the folds
    summary_order = ['PR-AUC', 'AU-ROC', 'F1', 'REC', 'PREC', 'ACC']
    results = pd.DataFrame({
        'Metric': summary_order,
        'Test': [np.mean(metrics_test[name]) for name in summary_order],
    })

    # Persist the misclassified samples.
    # NOTE(review): the 'X' column holds whole feature vectors; long arrays may
    # be written in abbreviated text form — confirm the CSV is usable downstream.
    misclassified_df = pd.DataFrame(misclassified)
    misclassified_df.to_csv(f'{model_name}_misclassified_samples.csv', index=False)

    return results



In [3]:
# Tuned hyperparameters for XGBoost
best_params_xgb = dict(
    n_estimators=368,
    max_depth=1,
    learning_rate=0.5946860775199619,
    eval_metric='error',
)
train_evaluate_with_best_params("XGBoost", XGBClassifier, best_params_xgb, X, y)


Training XGBoost:  20%|██        | 1/5 [00:00<00:03,  1.15it/s]

Fold 1 - Test metrics:
  PR-AUC: 0.9924
  AU-ROC: 0.7880
  ACC: 0.9577
  PREC: 0.9714
  REC: 0.9855
  F1: 0.9784



Training XGBoost:  40%|████      | 2/5 [00:01<00:02,  1.16it/s]

Fold 2 - Test metrics:
  PR-AUC: 0.9923
  AU-ROC: 0.7880
  ACC: 0.9718
  PREC: 0.9718
  REC: 1.0000
  F1: 0.9857



Training XGBoost:  60%|██████    | 3/5 [00:02<00:01,  1.15it/s]

Fold 3 - Test metrics:
  PR-AUC: 0.9945
  AU-ROC: 0.8605
  ACC: 0.9718
  PREC: 0.9786
  REC: 0.9928
  F1: 0.9856



Training XGBoost:  80%|████████  | 4/5 [00:03<00:00,  1.14it/s]

Fold 4 - Test metrics:
  PR-AUC: 0.9995
  AU-ROC: 0.9836
  ACC: 0.9787
  PREC: 0.9855
  REC: 0.9927
  F1: 0.9891



Training XGBoost: 100%|██████████| 5/5 [00:04<00:00,  1.14it/s]

Fold 5 - Test metrics:
  PR-AUC: 0.9906
  AU-ROC: 0.7464
  ACC: 0.9504
  PREC: 0.9710
  REC: 0.9781
  F1: 0.9745






In [5]:


from sklearn.svm import SVC

# Tuned hyperparameters for the SVM; probability estimates are enabled
# because the evaluation routine calls predict_proba.
# NOTE(review): 'gamma' is presumably unused with kernel='linear' — confirm.
best_params_svm = dict(
    C=0.977399931193936,
    gamma=0.010961352543547414,
    kernel='linear',
    probability=True,
)

# Train and evaluate the SVM model
train_evaluate_with_best_params("SVM", SVC, best_params_svm, X, y)




Training SVM:  20%|██        | 1/5 [00:00<00:02,  1.55it/s]

Fold 1 - Test metrics:
  PR-AUC: 0.9945
  AU-ROC: 0.8261
  ACC: 0.9718
  PREC: 0.9718
  REC: 1.0000
  F1: 0.9857



Training SVM:  40%|████      | 2/5 [00:01<00:01,  1.96it/s]

Fold 2 - Test metrics:
  PR-AUC: 0.9714
  AU-ROC: 0.6612
  ACC: 0.9648
  PREC: 0.9716
  REC: 0.9928
  F1: 0.9821



Training SVM:  60%|██████    | 3/5 [00:01<00:01,  1.80it/s]

Fold 3 - Test metrics:
  PR-AUC: 0.9966
  AU-ROC: 0.9004
  ACC: 0.9718
  PREC: 0.9718
  REC: 1.0000
  F1: 0.9857



Training SVM:  80%|████████  | 4/5 [00:02<00:00,  1.42it/s]

Fold 4 - Test metrics:
  PR-AUC: 0.9984
  AU-ROC: 0.9489
  ACC: 0.9716
  PREC: 0.9716
  REC: 1.0000
  F1: 0.9856



Training SVM: 100%|██████████| 5/5 [00:03<00:00,  1.55it/s]

Fold 5 - Test metrics:
  PR-AUC: 0.9931
  AU-ROC: 0.7974
  ACC: 0.9716
  PREC: 0.9716
  REC: 1.0000
  F1: 0.9856






In [4]:
from lightgbm import LGBMClassifier

# Tuned hyperparameters for LightGBM
best_params_lgbm = dict(
    n_estimators=529,
    max_depth=32,
    num_leaves=1337,
    learning_rate=0.16178297903549685,
    verbose=-1,
)

# Train and evaluate the LightGBM model
train_evaluate_with_best_params("LightGBM", LGBMClassifier, best_params_lgbm, X, y)


Training LightGBM:  20%|██        | 1/5 [00:00<00:01,  3.89it/s]

Fold 1 - Test metrics:
  PR-AUC: 0.9919
  AU-ROC: 0.7627
  ACC: 0.9648
  PREC: 0.9716
  REC: 0.9928
  F1: 0.9821



Training LightGBM:  40%|████      | 2/5 [00:00<00:00,  4.10it/s]

Fold 2 - Test metrics:
  PR-AUC: 0.9939
  AU-ROC: 0.8170
  ACC: 0.9718
  PREC: 0.9718
  REC: 1.0000
  F1: 0.9857



Training LightGBM:  60%|██████    | 3/5 [00:00<00:00,  4.13it/s]

Fold 3 - Test metrics:
  PR-AUC: 0.9902
  AU-ROC: 0.7862
  ACC: 0.9789
  PREC: 0.9787
  REC: 1.0000
  F1: 0.9892



Training LightGBM:  80%|████████  | 4/5 [00:00<00:00,  4.23it/s]

Fold 4 - Test metrics:
  PR-AUC: 0.9943
  AU-ROC: 0.8431
  ACC: 0.9645
  PREC: 0.9714
  REC: 0.9927
  F1: 0.9819



Training LightGBM: 100%|██████████| 5/5 [00:01<00:00,  4.23it/s]

Fold 5 - Test metrics:
  PR-AUC: 0.9938
  AU-ROC: 0.8102
  ACC: 0.9645
  PREC: 0.9714
  REC: 0.9927
  F1: 0.9819




