In [1]:
import warnings
from rdkit import RDLogger

# Silence RDKit log output for the 'rdApp.*' loggers
RDLogger.DisableLog('rdApp.*')

# Alternatively, suppress every Python warning.
# NOTE(review): this blanket filter also hides deprecation warnings from
# optuna / xgboost / sklearn; consider narrowing it once the notebook is stable.
warnings.filterwarnings("ignore")
# Suppress LightGBM UserWarnings (already covered by the blanket filter above)
warnings.filterwarnings("ignore", category=UserWarning, module="lightgbm")

In [2]:
import torch
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.metrics import precision_recall_curve, auc
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import joblib
import optuna
from rdkit.Chem import Descriptors, AllChem
from tqdm import tqdm  # 导入tqdm

# Data preprocessing: load the CSV and pull out the target column and the
# SMILES strings. `labels` is a NumPy array and `smiles_list` a Python list,
# both in the row order of the CSV.
df = pd.read_csv('imputed_selected_features_Flam.csv')
labels = df['Flammability'].values
smiles_list = df['SMILES'].tolist()

# Helper: turn a SMILES string into descriptors + fingerprint
def smiles_to_features(smiles):
    """Build a numeric feature vector for one molecule.

    The vector is four RDKit descriptors (molecular weight, LogP, H-bond
    donor count, H-bond acceptor count) followed by a 2048-bit Morgan
    fingerprint of radius 2, i.e. 2052 values in total.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        A 1-D NumPy array of length 2052, or ``None`` when RDKit cannot
        parse ``smiles`` into a molecule.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None

    # Scalar descriptors, in the order they appear in the output vector.
    descriptor_fns = (
        Descriptors.MolWt,          # molecular weight
        Descriptors.MolLogP,        # LogP
        Descriptors.NumHDonors,     # hydrogen-bond donors
        Descriptors.NumHAcceptors,  # hydrogen-bond acceptors
    )
    descriptor_values = [fn(molecule) for fn in descriptor_fns]

    # Morgan fingerprint copied bit-by-bit into a float array.
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(molecule, 2, nBits=2048)
    bits = np.zeros((2048,))
    Chem.DataStructs.ConvertToNumpyArray(bit_vector, bits)

    # Descriptors first, fingerprint bits after.
    return np.concatenate([descriptor_values, bits])

# Convert each SMILES string to a feature vector, recording which rows parsed
# so the labels can be filtered with exactly the same mask. Without this, a
# single unparsable SMILES leaves X shorter than y and the two fall out of
# alignment.
features = []
valid_mask = []
for smiles in smiles_list:
    feature = smiles_to_features(smiles)
    valid_mask.append(feature is not None)
    if feature is not None:
        features.append(feature)

# Convert to NumPy arrays
features = np.array(features)
valid_mask = np.array(valid_mask, dtype=bool)

# Make silently dropped rows visible to the reader.
n_dropped = int((~valid_mask).sum())
if n_dropped:
    print(f"Dropped {n_dropped} row(s) whose SMILES could not be parsed")

# Keep only the labels whose molecule produced a feature vector.
y = labels[valid_mask]
X = np.array(features)


# Training and evaluation routine
def train_evaluate_model_with_optuna(model_name, model_class, param_func, X, y):
    """Tune a classifier with Optuna, refit it on all data, and save it.

    Each Optuna trial builds ``model_class(**param_func(trial))`` and scores
    it by the mean precision-recall AUC over a stratified 5-fold split
    (shuffled, ``random_state=42``). After 50 trials the best configuration
    is refit on the full ``X``/``y`` and dumped with joblib to
    ``f'{model_name}_best_model.joblib'``.

    Args:
        model_name: Label used in progress bars, log lines and the output
            file name.
        model_class: Estimator class exposing ``fit`` and ``predict_proba``.
        param_func: Callable taking an Optuna trial and returning the keyword
            arguments for ``model_class``. It may mix suggested values with
            fixed entries (e.g. ``'verbose': -1``).
        X: Feature matrix, indexable by integer index arrays.
        y: Label array, indexable by integer index arrays; same length as X.

    Returns:
        None. The fitted model is written to disk.
    """
    n_splits = 5  # single source of truth for the splitter and the progress bar

    def objective(trial):
        params = param_func(trial)
        if model_class is SVC:
            # predict_proba below requires probability estimates on SVC.
            params['probability'] = True

        model = model_class(**params)

        # Stratified k-fold cross-validation
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
        pr_auc_scores = []

        # Wrap the folds in tqdm so each fold advances the progress bar
        for train_idx, val_idx in tqdm(skf.split(X, y), total=n_splits, desc=f"Training {model_name}"):
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]

            model.fit(X_train, y_train)
            y_prob = model.predict_proba(X_val)[:, 1]
            precision, recall, _ = precision_recall_curve(y_val, y_prob)
            pr_auc = auc(recall, precision)
            pr_auc_scores.append(pr_auc)

        return np.mean(pr_auc_scores)

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=50)

    print(f'Best parameters for {model_name}: {study.best_params}')

    # study.best_params only holds the *suggested* values, so fixed kwargs
    # returned by param_func (eval_metric, verbose, ...) would be lost.
    # Replaying param_func on the finished best trial returns the stored
    # suggestions together with those fixed entries, so the saved model uses
    # the same configuration that was cross-validated.
    best_params = param_func(study.best_trial)
    if model_class is SVC:
        best_params['probability'] = True

    best_model = model_class(**best_params)
    best_model.fit(X, y)
    model_path = f'{model_name}_best_model.joblib'
    joblib.dump(best_model, model_path)
    print(f"Model saved to {model_path}")

# 搜索空间
#def svm_param_func(trial):
#    return {
#        'C': trial.suggest_loguniform('C', 1e-6, 1e+6),
#        'gamma': trial.suggest_loguniform('gamma', 1e-6, 1e+1),
#        'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf']),
#        'probability': True
#    }

def rf_param_func(trial):
    """Return RandomForestClassifier keyword arguments sampled from ``trial``.

    Search space:
        n_estimators: integer in [10, 1000].
        max_depth: integer in [1, 50].
        min_samples_split: float in [0.01, 1.0], sampled on a log scale
            (a float here is passed to the estimator as-is).

    Args:
        trial: Optuna trial (or any object exposing ``suggest_int`` and
            ``suggest_float``).

    Returns:
        dict of constructor keyword arguments.
    """
    return {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'max_depth': trial.suggest_int('max_depth', 1, 50),
        # suggest_float(log=True) replaces the deprecated suggest_loguniform.
        'min_samples_split': trial.suggest_float('min_samples_split', 0.01, 1.0, log=True)
    }

def xgb_param_func(trial):
    """Return XGBClassifier keyword arguments sampled from ``trial``.

    Search space:
        n_estimators: integer in [10, 1000].
        max_depth: integer in [1, 50].
        learning_rate: float in [0.01, 1.0], sampled on a log scale.

    ``use_label_encoder`` and ``eval_metric`` are fixed rather than tuned.

    Args:
        trial: Optuna trial (or any object exposing ``suggest_int`` and
            ``suggest_float``).

    Returns:
        dict of constructor keyword arguments.
    """
    return {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'max_depth': trial.suggest_int('max_depth', 1, 50),
        # suggest_float(log=True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        # NOTE(review): presumably only needed on older XGBoost releases;
        # confirm against the installed version before removing.
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }

def lgbm_param_func(trial):
    """Return LGBMClassifier keyword arguments sampled from ``trial``.

    Search space:
        n_estimators: integer in [10, 1000].
        max_depth: integer in [1, 50].
        num_leaves: integer in [2, 4096].
        learning_rate: float in [0.01, 1.0], sampled on a log scale.

    ``verbose`` is fixed to -1 rather than tuned.

    Args:
        trial: Optuna trial (or any object exposing ``suggest_int`` and
            ``suggest_float``).

    Returns:
        dict of constructor keyword arguments.
    """
    return {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'max_depth': trial.suggest_int('max_depth', 1, 50),
        'num_leaves': trial.suggest_int('num_leaves', 2, 4096),
        # suggest_float(log=True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'verbose': -1
    }

# 开始训练和评估
#print("Training SVC...")
#train_evaluate_model_with_optuna("SVM", SVC, svm_param_func, X, y)


In [4]:
# Run the Optuna search for RandomForest; the helper also refits the best
# configuration on all of X/y and saves it with joblib.
print("Training RandomForest...")
train_evaluate_model_with_optuna("RandomForest", RandomForestClassifier, rf_param_func, X, y)

[I 2025-02-19 17:14:29,191] A new study created in memory with name: no-name-78d5e923-67b9-4249-9d2d-316c4633c4ec


Training RandomForest...


Training RandomForest: 100%|██████████| 5/5 [00:09<00:00,  1.96s/it]
[I 2025-02-19 17:14:39,026] Trial 0 finished with value: 0.9925005254981253 and parameters: {'n_estimators': 956, 'max_depth': 15, 'min_samples_split': 0.08171530926464397}. Best is trial 0 with value: 0.9925005254981253.
Training RandomForest: 100%|██████████| 5/5 [00:04<00:00,  1.08it/s]
[I 2025-02-19 17:14:43,676] Trial 1 finished with value: 0.9922160663079478 and parameters: {'n_estimators': 435, 'max_depth': 29, 'min_samples_split': 0.03583502201858819}. Best is trial 0 with value: 0.9925005254981253.
Training RandomForest: 100%|██████████| 5/5 [00:03<00:00,  1.37it/s]
[I 2025-02-19 17:14:47,344] Trial 2 finished with value: 0.9858755369093997 and parameters: {'n_estimators': 551, 'max_depth': 1, 'min_samples_split': 0.7057991554015502}. Best is trial 0 with value: 0.9925005254981253.
Training RandomForest: 100%|██████████| 5/5 [00:01<00:00,  2.61it/s]
[I 2025-02-19 17:14:49,270] Trial 3 finished with value: 0.9

Best parameters for RandomForest: {'n_estimators': 310, 'max_depth': 40, 'min_samples_split': 0.01747361296783917}
Model saved to RandomForest_best_model.joblib


In [5]:
# Run the Optuna search for LightGBM; the helper also refits the best
# configuration on all of X/y and saves it with joblib.
print("Training LightGBM...")
train_evaluate_model_with_optuna("LightGBM", LGBMClassifier, lgbm_param_func, X, y)

[I 2025-02-19 17:17:54,711] A new study created in memory with name: no-name-d376268d-f2fb-4812-bbf2-df70c48c55d9


Training LightGBM...


Training LightGBM: 100%|██████████| 5/5 [00:01<00:00,  3.73it/s]
[I 2025-02-19 17:17:56,056] Trial 0 finished with value: 0.9879836348145596 and parameters: {'n_estimators': 306, 'max_depth': 49, 'num_leaves': 2098, 'learning_rate': 0.013042146109133096}. Best is trial 0 with value: 0.9879836348145596.
Training LightGBM: 100%|██████████| 5/5 [00:01<00:00,  4.34it/s]
[I 2025-02-19 17:17:57,211] Trial 1 finished with value: 0.9854202031990278 and parameters: {'n_estimators': 666, 'max_depth': 38, 'num_leaves': 3845, 'learning_rate': 0.9741349942304655}. Best is trial 0 with value: 0.9879836348145596.
Training LightGBM: 100%|██████████| 5/5 [00:01<00:00,  3.34it/s]
[I 2025-02-19 17:17:58,712] Trial 2 finished with value: 0.9915814440270504 and parameters: {'n_estimators': 764, 'max_depth': 23, 'num_leaves': 741, 'learning_rate': 0.08482939769394415}. Best is trial 2 with value: 0.9915814440270504.
Training LightGBM: 100%|██████████| 5/5 [00:00<00:00,  8.97it/s]
[I 2025-02-19 17:17:59,273]

Best parameters for LightGBM: {'n_estimators': 529, 'max_depth': 32, 'num_leaves': 1337, 'learning_rate': 0.16178297903549685}
Model saved to LightGBM_best_model.joblib


In [5]:
# Run the Optuna search for XGBoost; the helper also refits the best
# configuration on all of X/y and saves it with joblib.
print("Training XGBoost...")
train_evaluate_model_with_optuna("XGBoost", XGBClassifier, xgb_param_func, X, y)

[I 2025-02-19 20:04:38,540] A new study created in memory with name: no-name-a21bb52a-9511-48fa-a1fb-82668860295c


Training XGBoost...


Training XGBoost: 100%|██████████| 5/5 [10:34<00:00, 126.93s/it]
[I 2025-02-19 20:15:13,200] Trial 0 finished with value: 0.9889384033240521 and parameters: {'n_estimators': 276, 'max_depth': 2, 'learning_rate': 0.024790845161162187}. Best is trial 0 with value: 0.9889384033240521.
Training XGBoost: 100%|██████████| 5/5 [50:22<00:00, 604.46s/it]
[I 2025-02-19 21:05:35,480] Trial 1 finished with value: 0.9928449877997639 and parameters: {'n_estimators': 739, 'max_depth': 38, 'learning_rate': 0.04864919642363382}. Best is trial 1 with value: 0.9928449877997639.
Training XGBoost: 100%|██████████| 5/5 [1:12:34<00:00, 870.83s/it]
[I 2025-02-19 22:18:09,660] Trial 2 finished with value: 0.9916136762307677 and parameters: {'n_estimators': 801, 'max_depth': 15, 'learning_rate': 0.016037145439980353}. Best is trial 1 with value: 0.9928449877997639.
Training XGBoost: 100%|██████████| 5/5 [08:24<00:00, 100.93s/it]
[I 2025-02-19 22:26:34,306] Trial 3 finished with value: 0.9938533173091753 and par

Best parameters for XGBoost: {'n_estimators': 368, 'max_depth': 1, 'learning_rate': 0.5946860775199619}
Model saved to XGBoost_best_model.joblib


In [3]:
# Search space
def svm_param_func(trial):
    """Return SVC keyword arguments sampled from ``trial``.

    Search space:
        C: float in [1e-2, 1e+3], sampled on a log scale (narrowed range).
        gamma: float in [1e-2, 1e+1], sampled on a log scale (narrowed range).
        kernel: one of 'linear' or 'rbf'.

    ``probability`` is fixed to True rather than tuned.

    Args:
        trial: Optuna trial (or any object exposing ``suggest_float`` and
            ``suggest_categorical``).

    Returns:
        dict of constructor keyword arguments.
    """
    return {
        # suggest_float(log=True) replaces the deprecated suggest_loguniform.
        'C': trial.suggest_float('C', 1e-2, 1e+3, log=True),  # narrowed C range
        # NOTE(review): gamma is sampled even when kernel == 'linear';
        # presumably it has no effect there - confirm before pruning it.
        'gamma': trial.suggest_float('gamma', 1e-2, 1e+1, log=True),  # narrowed gamma range
        'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf']),  # linear and rbf kernels only
        'probability': True
    }


# Start training and evaluation: run the Optuna search for the SVM; the helper
# also refits the best configuration on all of X/y and saves it with joblib.
print("Training SVC...")
train_evaluate_model_with_optuna("SVM", SVC, svm_param_func, X, y)


[I 2025-02-19 19:46:50,542] A new study created in memory with name: no-name-70c22691-7005-4c38-a8c6-b52d4eda5bef


Training SVC...


Training SVM: 100%|██████████| 5/5 [00:04<00:00,  1.14it/s]
[I 2025-02-19 19:46:54,954] Trial 0 finished with value: 0.9895539883260295 and parameters: {'C': 0.013422994104944361, 'gamma': 0.8638573065530639, 'kernel': 'linear'}. Best is trial 0 with value: 0.9895539883260295.
Training SVM: 100%|██████████| 5/5 [00:09<00:00,  1.99s/it]
[I 2025-02-19 19:47:04,910] Trial 1 finished with value: 0.9831092471544091 and parameters: {'C': 493.5575251850943, 'gamma': 2.0944852180164184, 'kernel': 'linear'}. Best is trial 0 with value: 0.9895539883260295.
Training SVM: 100%|██████████| 5/5 [00:05<00:00,  1.05s/it]
[I 2025-02-19 19:47:10,142] Trial 2 finished with value: 0.9897426466089378 and parameters: {'C': 0.16656253176207178, 'gamma': 0.20314072229236227, 'kernel': 'linear'}. Best is trial 2 with value: 0.9897426466089378.
Training SVM: 100%|██████████| 5/5 [00:06<00:00,  1.26s/it]
[I 2025-02-19 19:47:16,442] Trial 3 finished with value: 0.9906863806143411 and parameters: {'C': 0.696408910

Best parameters for SVM: {'C': 0.977399931193936, 'gamma': 0.010961352543547414, 'kernel': 'linear'}
Model saved to SVM_best_model.joblib


In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import precision_recall_curve, auc
import numpy as np
import optuna



def knn_param_func(trial):
    """Return KNeighborsClassifier keyword arguments sampled from ``trial``.

    Search space:
        n_neighbors: integer in [1, 50].
        weights: 'uniform' or 'distance'.
        metric: 'euclidean', 'manhattan' or 'minkowski'.
        p: integer in [1, 5], sampled only when metric is 'minkowski';
            otherwise fixed to 2.

    Args:
        trial: Optuna trial (or any object exposing ``suggest_int`` and
            ``suggest_categorical``).

    Returns:
        dict with the keys 'n_neighbors', 'weights', 'metric' and 'p'.
    """
    n_neighbors = trial.suggest_int('n_neighbors', 1, 50)
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
    # Suggest 'metric' exactly once and reuse the value. Re-suggesting the
    # same name with a different choice list registers an inconsistent
    # distribution for that parameter.
    metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan', 'minkowski'])
    p = trial.suggest_int('p', 1, 5) if metric == 'minkowski' else 2
    return {
        'n_neighbors': n_neighbors,
        'weights': weights,
        'metric': metric,
        'p': p
    }

# Objective function for the KNN study
def objective(trial):
    """Score one KNN configuration by mean PR-AUC over stratified 5-fold CV.

    Samples ``n_neighbors``, ``weights`` and ``metric`` from ``trial``; ``p``
    is sampled (and passed to the classifier) only when the metric is
    'minkowski'. Reads the module-level ``X`` and ``y``.

    Args:
        trial: Optuna trial supplying the hyperparameter suggestions.

    Returns:
        Mean precision-recall AUC across the five validation folds.
    """
    # Fixed part of the search space; dict entries are evaluated in order.
    knn_kwargs = {
        'n_neighbors': trial.suggest_int('n_neighbors', 1, 50),
        'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
        'metric': trial.suggest_categorical('metric', ['euclidean', 'manhattan', 'minkowski']),
    }
    # The Minkowski exponent is only sampled for the 'minkowski' metric.
    if knn_kwargs['metric'] == 'minkowski':
        knn_kwargs['p'] = trial.suggest_int('p', 1, 5)

    classifier = KNeighborsClassifier(**knn_kwargs)

    # Stratified five-fold cross-validation
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []

    for fit_rows, holdout_rows in splitter.split(X, y):
        # Fit on the training rows of this fold
        classifier.fit(X[fit_rows], y[fit_rows])

        # Second probability column of the held-out rows -> PR curve -> area
        holdout_prob = classifier.predict_proba(X[holdout_rows])[:, 1]
        precision, recall, _ = precision_recall_curve(y[holdout_rows], holdout_prob)
        fold_scores.append(auc(recall, precision))

    return np.mean(fold_scores)

# Hyperparameter optimisation with Optuna: 50 trials, maximising the value
# returned by `objective`.
# NOTE(review): this cell neither refits nor saves the best KNN model; it only
# prints the best parameters.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# Print the best parameters
print("Best parameters:", study.best_params)




[I 2025-02-19 19:52:22,672] A new study created in memory with name: no-name-10b90b0e-aaff-4380-8171-42b81733e9e1
[I 2025-02-19 19:52:23,790] Trial 0 finished with value: 0.9869573507940574 and parameters: {'n_neighbors': 12, 'weights': 'uniform', 'metric': 'manhattan'}. Best is trial 0 with value: 0.9869573507940574.
[I 2025-02-19 19:52:24,356] Trial 1 finished with value: 0.9878122620422781 and parameters: {'n_neighbors': 4, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 1 with value: 0.9878122620422781.
[I 2025-02-19 19:52:24,914] Trial 2 finished with value: 0.9861708050449168 and parameters: {'n_neighbors': 31, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 1 with value: 0.9878122620422781.
[I 2025-02-19 19:52:26,011] Trial 3 finished with value: 0.9835926248976161 and parameters: {'n_neighbors': 37, 'weights': 'uniform', 'metric': 'minkowski', 'p': 1}. Best is trial 1 with value: 0.9878122620422781.
[I 2025-02-19 19:52:29,292] Trial 4 finished with val

Best parameters: {'n_neighbors': 5, 'weights': 'distance', 'metric': 'euclidean'}
