In [1]:
import warnings
from rdkit import RDLogger

# Silence RDKit log messages (all 'rdApp.*' log channels).
RDLogger.DisableLog('rdApp.*')

# Alternatively, silence every Python warning.
# NOTE(review): this blanket filter also hides DeprecationWarning/FutureWarning
# from the libraries used below — consider narrowing it.
warnings.filterwarnings("ignore")
# Silence LightGBM UserWarnings.
# NOTE(review): looks redundant while the blanket "ignore" filter above is active.
warnings.filterwarnings("ignore", category=UserWarning, module="lightgbm")

In [2]:
import torch
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.metrics import precision_recall_curve, auc
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import joblib
import optuna
from rdkit.Chem import Descriptors, AllChem
from tqdm import tqdm  # 导入tqdm

# Data loading: read the CSV, then pull out the label column as a NumPy array
# and the SMILES column as a plain Python list.
df = pd.read_csv('imputed_selected_features_Toxcity.csv')
labels = df['Toxicity'].values
smiles_list = df['SMILES'].tolist()

# 函数：将SMILES转换为分子描述符和指纹
def smiles_to_features(smiles, radius=2, n_bits=2048):
    """Convert one SMILES string into a flat numeric feature vector.

    The vector is four RDKit descriptors (molecular weight, LogP, H-bond
    donor count, H-bond acceptor count) followed by the bits of a Morgan
    fingerprint.

    Args:
        smiles: SMILES string describing the molecule.
        radius: Morgan fingerprint radius (default 2, the original setting).
        n_bits: Length of the Morgan bit vector (default 2048, the original
            setting).

    Returns:
        A 1-D NumPy array of length ``4 + n_bits``, or ``None`` when
        ``smiles`` is not a string or RDKit cannot parse it.
    """
    # Missing CSV cells arrive as float NaN (or None); treat them like any
    # other unparseable entry instead of handing a non-string to RDKit.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Scalar descriptors
    descriptors = [
        Descriptors.MolWt(mol),  # molecular weight
        Descriptors.MolLogP(mol),  # LogP
        Descriptors.NumHDonors(mol),  # number of H-bond donors
        Descriptors.NumHAcceptors(mol)  # number of H-bond acceptors
    ]

    # Morgan fingerprint, copied into a NumPy array of matching length
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    fingerprint_array = np.zeros((n_bits,))
    Chem.DataStructs.ConvertToNumpyArray(fingerprint, fingerprint_array)

    # Descriptors first, fingerprint bits after
    features = np.concatenate([descriptors, fingerprint_array])
    return features

# Convert every SMILES string into a feature vector. Rows whose SMILES cannot
# be converted are skipped, so remember which row indices survived: the label
# array must be filtered with the same indices or X and y fall out of step.
features = []
valid_rows = []
for row_idx, smiles in enumerate(smiles_list):
    feature = smiles_to_features(smiles)
    if feature is not None:
        features.append(feature)
        valid_rows.append(row_idx)

n_skipped = len(smiles_list) - len(valid_rows)
if n_skipped:
    print(f"Skipped {n_skipped} rows with unparseable SMILES")

# Convert to a NumPy array
features = np.array(features)

# Keep only the labels of rows that produced features (explicit int dtype so
# an empty selection is still a valid index array).
y = labels[np.asarray(valid_rows, dtype=int)]
X = np.array(features)

assert len(X) == len(y), "features and labels must have the same number of rows"


# 训练和评估函数
def train_evaluate_model_with_optuna(model_name, model_class, param_func, X, y,
                                     n_trials=50, n_splits=5, random_state=42):
    """Tune a classifier with Optuna, refit it on all data and save it.

    Each trial builds ``model_class(**param_func(trial))`` and scores it with
    stratified k-fold cross-validation; the objective is the mean area under
    the precision-recall curve across folds (maximised). The best
    configuration is then refit on the full ``X``/``y`` and dumped with
    joblib to ``'{model_name}_best_model.joblib'``.

    Args:
        model_name: Label used in progress bars, log lines and the output
            file name.
        model_class: Estimator class exposing ``fit`` and ``predict_proba``.
        param_func: Callable taking an Optuna trial and returning the keyword
            arguments for ``model_class``.
        X: Feature matrix, indexable by integer index arrays.
        y: Label vector, indexable by integer index arrays.
        n_trials: Number of Optuna trials (default 50).
        n_splits: Number of cross-validation folds (default 5).
        random_state: Seed for the fold shuffling (default 42).

    Returns:
        None. The result is the file written to disk and the printed summary.
    """
    def build_params(trial):
        # Copy so the caller's dict is never mutated.
        params = dict(param_func(trial))
        if model_class is SVC:
            # SVC only offers predict_proba when probability=True.
            params['probability'] = True
        return params

    def objective(trial):
        model = model_class(**build_params(trial))

        # Stratified k-fold cross-validation
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        pr_auc_scores = []

        # tqdm wraps the folds so each trial shows a progress bar
        for train_idx, val_idx in tqdm(skf.split(X, y), total=n_splits, desc=f"Training {model_name}"):
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]

            model.fit(X_train, y_train)
            y_prob = model.predict_proba(X_val)[:, 1]
            precision, recall, _ = precision_recall_curve(y_val, y_prob)
            pr_auc = auc(recall, precision)
            pr_auc_scores.append(pr_auc)

        return np.mean(pr_auc_scores)

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)

    print(f'Best parameters for {model_name}: {study.best_params}')

    # study.best_params contains only the values that were *suggested*; fixed
    # settings returned by param_func (e.g. 'eval_metric', 'verbose') are not
    # in it. Replaying param_func on the best trial returns the recorded
    # suggestions together with those fixed settings, so the final model is
    # built exactly like the one that was cross-validated.
    best_params = build_params(study.best_trial)

    best_model = model_class(**best_params)
    best_model.fit(X, y)
    model_path = f'{model_name}_best_model.joblib'
    joblib.dump(best_model, model_path)
    print(f"Model saved to {model_path}")

# 搜索空间
#def svm_param_func(trial):
#    return {
#        'C': trial.suggest_loguniform('C', 1e-6, 1e+6),
#        'gamma': trial.suggest_loguniform('gamma', 1e-6, 1e+1),
#        'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf']),
#        'probability': True
#    }

def rf_param_func(trial):
    """Sample RandomForest hyperparameters from an Optuna trial.

    Args:
        trial: Optuna trial (or any object with compatible ``suggest_int`` /
            ``suggest_float`` methods).

    Returns:
        dict with ``n_estimators`` (int in [10, 1000]), ``max_depth`` (int in
        [1, 50]) and ``min_samples_split`` (float in [0.01, 1.0], sampled on a
        log scale).
    """
    return {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'max_depth': trial.suggest_int('max_depth', 1, 50),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'min_samples_split': trial.suggest_float('min_samples_split', 0.01, 1.0, log=True)
    }

def xgb_param_func(trial):
    """Sample XGBoost hyperparameters from an Optuna trial.

    Args:
        trial: Optuna trial (or any object with compatible ``suggest_int`` /
            ``suggest_float`` methods).

    Returns:
        dict with the sampled ``n_estimators`` (int in [10, 1000]),
        ``max_depth`` (int in [1, 50]) and ``learning_rate`` (float in
        [0.01, 1.0], log scale), plus the fixed settings
        ``use_label_encoder=False`` and ``eval_metric='logloss'``.
    """
    return {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'max_depth': trial.suggest_int('max_depth', 1, 50),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }

def lgbm_param_func(trial):
    """Sample LightGBM hyperparameters from an Optuna trial.

    Args:
        trial: Optuna trial (or any object with compatible ``suggest_int`` /
            ``suggest_float`` methods).

    Returns:
        dict with the sampled ``n_estimators`` (int in [10, 1000]),
        ``max_depth`` (int in [1, 50]), ``num_leaves`` (int in [2, 4096]) and
        ``learning_rate`` (float in [0.01, 1.0], log scale), plus the fixed
        setting ``verbose=-1``.
    """
    return {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'max_depth': trial.suggest_int('max_depth', 1, 50),
        'num_leaves': trial.suggest_int('num_leaves', 2, 4096),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'verbose': -1
    }

# 开始训练和评估
#print("Training SVC...")
#train_evaluate_model_with_optuna("SVM", SVC, svm_param_func, X, y)


In [3]:
# Tune RandomForestClassifier with Optuna; the helper refits the best
# configuration on all of X/y and dumps it with joblib.
print("Training RandomForest...")
train_evaluate_model_with_optuna("RandomForest", RandomForestClassifier, rf_param_func, X, y)

[I 2025-02-16 23:22:21,575] A new study created in memory with name: no-name-a9ee6f77-3450-4aae-a2e8-3c06d46cb0b0


Training RandomForest...


Training RandomForest: 100%|██████████| 5/5 [00:07<00:00,  1.47s/it]
[I 2025-02-16 23:22:28,909] Trial 0 finished with value: 0.9476214262234393 and parameters: {'n_estimators': 790, 'max_depth': 50, 'min_samples_split': 0.026228264900137993}. Best is trial 0 with value: 0.9476214262234393.
Training RandomForest: 100%|██████████| 5/5 [00:02<00:00,  1.73it/s]
[I 2025-02-16 23:22:31,803] Trial 1 finished with value: 0.9483186147638685 and parameters: {'n_estimators': 585, 'max_depth': 3, 'min_samples_split': 0.12625898683792072}. Best is trial 1 with value: 0.9483186147638685.
Training RandomForest: 100%|██████████| 5/5 [00:02<00:00,  1.95it/s]
[I 2025-02-16 23:22:34,372] Trial 2 finished with value: 0.9518619966168445 and parameters: {'n_estimators': 518, 'max_depth': 3, 'min_samples_split': 0.14412044347207875}. Best is trial 2 with value: 0.9518619966168445.
Training RandomForest: 100%|██████████| 5/5 [00:02<00:00,  2.17it/s]
[I 2025-02-16 23:22:36,684] Trial 3 finished with value: 0.

Best parameters for RandomForest: {'n_estimators': 659, 'max_depth': 32, 'min_samples_split': 0.021676009502668372}
Model saved to RandomForest_best_model.joblib


In [4]:
# Tune LGBMClassifier with Optuna; the helper refits the best configuration
# on all of X/y and dumps it with joblib.
print("Training LightGBM...")
train_evaluate_model_with_optuna("LightGBM", LGBMClassifier, lgbm_param_func, X, y)

[I 2025-02-16 23:26:03,418] A new study created in memory with name: no-name-a8d3584b-138c-4e07-8ac2-85c21a828a8f


Training LightGBM...


Training LightGBM: 100%|██████████| 5/5 [00:03<00:00,  1.48it/s]
[I 2025-02-16 23:26:06,806] Trial 0 finished with value: 0.9416563087119252 and parameters: {'n_estimators': 646, 'max_depth': 24, 'num_leaves': 3426, 'learning_rate': 0.09608349254020758}. Best is trial 0 with value: 0.9416563087119252.
Training LightGBM: 100%|██████████| 5/5 [00:01<00:00,  2.71it/s]
[I 2025-02-16 23:26:08,656] Trial 1 finished with value: 0.9419046928419099 and parameters: {'n_estimators': 639, 'max_depth': 16, 'num_leaves': 617, 'learning_rate': 0.16166155791258988}. Best is trial 1 with value: 0.9419046928419099.
Training LightGBM: 100%|██████████| 5/5 [00:06<00:00,  1.34s/it]
[I 2025-02-16 23:26:15,384] Trial 2 finished with value: 0.9405159147777764 and parameters: {'n_estimators': 921, 'max_depth': 4, 'num_leaves': 2535, 'learning_rate': 0.17435095331795233}. Best is trial 1 with value: 0.9419046928419099.
Training LightGBM: 100%|██████████| 5/5 [00:02<00:00,  1.83it/s]
[I 2025-02-16 23:26:18,114] 

Best parameters for LightGBM: {'n_estimators': 847, 'max_depth': 28, 'num_leaves': 959, 'learning_rate': 0.41408607247765644}
Model saved to LightGBM_best_model.joblib


In [5]:
# Tune XGBClassifier with Optuna; the helper refits the best configuration
# on all of X/y and dumps it with joblib.
print("Training XGBoost...")
train_evaluate_model_with_optuna("XGBoost", XGBClassifier, xgb_param_func, X, y)

[I 2025-02-16 23:28:05,903] A new study created in memory with name: no-name-f4f8a272-fe3e-47b9-8053-2f39f6e3a407


Training XGBoost...


Training XGBoost: 100%|██████████| 5/5 [00:11<00:00,  2.23s/it]
[I 2025-02-16 23:28:17,035] Trial 0 finished with value: 0.9384876194419574 and parameters: {'n_estimators': 548, 'max_depth': 12, 'learning_rate': 0.12060269129057076}. Best is trial 0 with value: 0.9384876194419574.
Training XGBoost: 100%|██████████| 5/5 [00:13<00:00,  2.69s/it]
[I 2025-02-16 23:28:30,495] Trial 1 finished with value: 0.9394289750741466 and parameters: {'n_estimators': 510, 'max_depth': 31, 'learning_rate': 0.020093553020984086}. Best is trial 1 with value: 0.9394289750741466.
Training XGBoost: 100%|██████████| 5/5 [00:12<00:00,  2.49s/it]
[I 2025-02-16 23:28:42,945] Trial 2 finished with value: 0.936752052608745 and parameters: {'n_estimators': 580, 'max_depth': 35, 'learning_rate': 0.07252882149914718}. Best is trial 1 with value: 0.9394289750741466.
Training XGBoost: 100%|██████████| 5/5 [00:08<00:00,  1.79s/it]
[I 2025-02-16 23:28:51,874] Trial 3 finished with value: 0.9406535943529033 and parameters

Best parameters for XGBoost: {'n_estimators': 10, 'max_depth': 27, 'learning_rate': 0.0809714150972509}
Model saved to XGBoost_best_model.joblib


In [6]:
# 搜索空间
def svm_param_func(trial):
    """Sample SVC hyperparameters from an Optuna trial.

    The ranges are deliberately narrower than in the commented-out earlier
    version of this search space, and only the linear and RBF kernels are
    considered.

    Args:
        trial: Optuna trial (or any object with compatible ``suggest_float`` /
            ``suggest_categorical`` methods).

    Returns:
        dict with the sampled ``C`` (float in [1e-2, 1e3], log scale),
        ``gamma`` (float in [1e-2, 1e1], log scale) and ``kernel``
        ('linear' or 'rbf'), plus the fixed setting ``probability=True``.
    """
    return {
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'C': trial.suggest_float('C', 1e-2, 1e+3, log=True),  # narrowed C range
        'gamma': trial.suggest_float('gamma', 1e-2, 1e+1, log=True),  # narrowed gamma range
        'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf']),  # linear and rbf kernels only
        'probability': True
    }


# Start training and evaluation: tune SVC with Optuna; the helper refits the
# best configuration on all of X/y and dumps it with joblib.
print("Training SVC...")
train_evaluate_model_with_optuna("SVM", SVC, svm_param_func, X, y)


[I 2025-02-16 23:34:47,380] A new study created in memory with name: no-name-69b5ba8e-b46d-4839-aee1-b81510ef2c69


Training SVC...


Training SVM: 100%|██████████| 5/5 [07:28<00:00, 89.79s/it] 
[I 2025-02-16 23:42:16,331] Trial 0 finished with value: 0.9183540034616966 and parameters: {'C': 566.8727123769759, 'gamma': 0.11034683354429779, 'kernel': 'linear'}. Best is trial 0 with value: 0.9183540034616966.
Training SVM: 100%|██████████| 5/5 [00:04<00:00,  1.19it/s]
[I 2025-02-16 23:42:20,538] Trial 1 finished with value: 0.9401698241611646 and parameters: {'C': 15.63542871594253, 'gamma': 9.746328489035172, 'kernel': 'rbf'}. Best is trial 1 with value: 0.9401698241611646.
Training SVM: 100%|██████████| 5/5 [00:02<00:00,  1.67it/s]
[I 2025-02-16 23:42:23,527] Trial 2 finished with value: 0.9243263652742432 and parameters: {'C': 88.00407703680754, 'gamma': 0.010798289127085255, 'kernel': 'rbf'}. Best is trial 1 with value: 0.9401698241611646.
Training SVM: 100%|██████████| 5/5 [00:05<00:00,  1.19s/it]
[I 2025-02-16 23:42:29,482] Trial 3 finished with value: 0.9419724820999391 and parameters: {'C': 0.03908786552429021,

Best parameters for SVM: {'C': 0.01559252830927682, 'gamma': 0.06808617250161814, 'kernel': 'linear'}
Model saved to SVM_best_model.joblib


In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import precision_recall_curve, auc
import numpy as np
import optuna



def knn_param_func(trial):
    """Sample KNeighborsClassifier hyperparameters from an Optuna trial.

    ``metric`` is suggested exactly once. Suggesting the same parameter name
    a second time with a different choice list, as the previous version did,
    is rejected by Optuna, and its ``== 'minkowski'`` test could never be
    false. The Minkowski exponent ``p`` is searched only when the sampled
    metric is 'minkowski'; otherwise it is fixed to 2.

    Args:
        trial: Optuna trial (or any object with compatible ``suggest_int`` /
            ``suggest_categorical`` methods).

    Returns:
        dict with ``n_neighbors`` (int in [1, 50]), ``weights`` ('uniform' or
        'distance'), ``metric`` ('euclidean', 'manhattan' or 'minkowski') and
        ``p`` (int in [1, 5] for 'minkowski', else 2).
    """
    n_neighbors = trial.suggest_int('n_neighbors', 1, 50)
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
    metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan', 'minkowski'])
    # Only spend a search dimension on p when the metric actually uses it.
    p = trial.suggest_int('p', 1, 5) if metric == 'minkowski' else 2
    return {
        'n_neighbors': n_neighbors,
        'weights': weights,
        'metric': metric,
        'p': p
    }

# 定义目标函数
def objective(trial):
    """Optuna objective for KNN: mean PR-AUC over 5 stratified folds.

    Samples ``n_neighbors``, ``weights`` and ``metric`` from the trial, and
    additionally ``p`` when the metric is 'minkowski'. A single
    KNeighborsClassifier is fitted on each training fold of the module-level
    ``X``/``y`` and scored on the held-out fold by the area under the
    precision-recall curve.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The mean of the five per-fold PR-AUC values.
    """
    # Sample the search space in a fixed order: n_neighbors, weights, metric.
    knn_kwargs = {
        'n_neighbors': trial.suggest_int('n_neighbors', 1, 50),
        'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
        'metric': trial.suggest_categorical('metric', ['euclidean', 'manhattan', 'minkowski']),
    }
    # The exponent p is only searched (and only passed on) for 'minkowski'.
    if knn_kwargs['metric'] == 'minkowski':
        knn_kwargs['p'] = trial.suggest_int('p', 1, 5)

    knn = KNeighborsClassifier(**knn_kwargs)
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    fold_scores = []
    for fit_rows, holdout_rows in splitter.split(X, y):
        knn.fit(X[fit_rows], y[fit_rows])

        # Score the held-out fold using the probability of class index 1.
        positive_prob = knn.predict_proba(X[holdout_rows])[:, 1]
        precision, recall, _ = precision_recall_curve(y[holdout_rows], positive_prob)
        fold_scores.append(auc(recall, precision))

    return np.mean(fold_scores)

# Run the Optuna hyperparameter search for KNN (50 trials, maximising the
# value returned by `objective`).
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# Print the best parameters found.
# NOTE(review): unlike the other models, no final KNN model is refit or saved here.
print("Best parameters:", study.best_params)




[I 2025-02-17 00:14:19,084] A new study created in memory with name: no-name-c7d36949-a3d2-4bff-824e-f501f4cd4c7a
[I 2025-02-17 00:14:19,696] Trial 0 finished with value: 0.932914893642389 and parameters: {'n_neighbors': 47, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 0 with value: 0.932914893642389.
[I 2025-02-17 00:14:20,095] Trial 1 finished with value: 0.9233240968239158 and parameters: {'n_neighbors': 26, 'weights': 'distance', 'metric': 'euclidean'}. Best is trial 0 with value: 0.932914893642389.
[I 2025-02-17 00:14:20,727] Trial 2 finished with value: 0.9373015150838399 and parameters: {'n_neighbors': 49, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 2 with value: 0.9373015150838399.
[I 2025-02-17 00:14:22,941] Trial 3 finished with value: 0.932879999087452 and parameters: {'n_neighbors': 3, 'weights': 'distance', 'metric': 'minkowski', 'p': 4}. Best is trial 2 with value: 0.9373015150838399.
[I 2025-02-17 00:14:23,567] Trial 4 finished with value

Best parameters: {'n_neighbors': 7, 'weights': 'uniform', 'metric': 'manhattan'}
