In [1]:
import os
import deepchem as dc
import torch
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import copy
import itertools

Skipped loading some Jax models, missing a dependency. jax requires jaxlib to be installed. See https://github.com/google/jax#installation for installation instructions.


In [None]:
# GPU check: report the CUDA/cuDNN environment and select the compute device.
# Falls back to the CPU when CUDA is unavailable so the notebook still runs
# (slowly) on machines without a GPU instead of raising in this cell.
cuda_available = torch.cuda.is_available()
print(cuda_available)
device = torch.device('cuda:0' if cuda_available else 'cpu')
print(device)
if cuda_available:
    # Querying the device name raises when no CUDA device is present.
    print(torch.cuda.get_device_name(0))
print(torch.__version__)
print(torch.version.cuda)
print(torch.backends.cudnn.version())


# To monitor GPU usage while training, run in a shell: nvidia-smi -l

In [None]:
# 2. Nested cross-validation (outer loop)
# The data are split into three stratified folds. In each iteration two folds
# are handed to inner_cv for model selection and the remaining fold is used to
# score the selected model.
def nested_cv(dataset_X, dataset_y, model, epoch):
    """Estimate the AUROC of ``model`` with 3-fold stratified nested CV.

    Parameters
    ----------
    dataset_X : array-like
        Featurized samples; must support indexing with integer index arrays.
    dataset_y : array-like
        Class labels aligned with ``dataset_X``.
    model
        Untrained model exposing ``fit``/``predict`` as used by ``inner_cv``.
        It is treated as a template: every outer fold trains its own deep
        copy, so the object passed in is not modified by this function.
    epoch : int
        Number of training epochs forwarded to ``inner_cv``.

    Returns
    -------
    float
        Mean AUROC over the three outer test folds.
    """
    # Stratified splitting keeps the class ratio similar in every fold.
    outer_skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    auroc_scores = []

    for train_index, test_index in outer_skf.split(dataset_X, dataset_y):
        # Two folds for model selection, one held-out fold for scoring.
        inner_cv_dataset = dc.data.NumpyDataset(X=dataset_X[train_index], y=dataset_y[train_index])
        test_dataset = dc.data.NumpyDataset(X=dataset_X[test_index], y=dataset_y[test_index])

        # Start every outer fold from a fresh copy of the untrained template.
        # Re-using the model returned by the previous fold would carry over
        # weights fitted on samples that are now in this fold's test split.
        fold_model = copy.deepcopy(model)
        best_model = inner_cv(inner_cv_dataset, fold_model, epoch)

        # Score the selected model on the held-out fold. Column 1 of the
        # prediction is used as the ranking score (presumably the
        # positive-class probability -- confirm for the model in use).
        y_true = test_dataset.y
        y_pred = best_model.predict(test_dataset)[:, 1]

        auroc_scores.append(roc_auc_score(y_true, y_pred))

    # Average over the three outer folds.
    average_rocauc_score = np.mean(auroc_scores)

    return average_rocauc_score

# 2.1 Inner cross-validation loop
# This is where the models are actually trained.
def inner_cv(inner_cv_dataset, model, epoch):
    """Train one model per inner fold and return the best one by validation AUROC.

    Parameters
    ----------
    inner_cv_dataset
        Dataset exposing ``X`` and ``y`` arrays; it is split into two
        stratified folds.
    model
        Model exposing ``fit(dataset, nb_epoch=...)`` and ``predict(dataset)``.
        It is used as a template and should be untrained on entry: each fold
        trains an independent deep copy, so ``model`` itself is left untouched.
    epoch : int
        Number of epochs to train on each fold.

    Returns
    -------
    The trained fold model with the highest validation AUROC.
    """
    inner_skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

    X, y = inner_cv_dataset.X, inner_cv_dataset.y

    auroc_scores = []
    fold_models = []
    for train_index, val_index in inner_skf.split(X, y):
        train_dataset = dc.data.NumpyDataset(X=X[train_index], y=y[train_index])
        val_dataset = dc.data.NumpyDataset(X=X[val_index], y=y[val_index])

        # Train a private copy of the template for this fold. Training the
        # shared object and then re-initialising only Linear/Conv2d layers
        # would leave every other parameterised layer and the optimizer state
        # carried over from the previous fold, whose training data is this
        # fold's validation data.
        fold_model = copy.deepcopy(model)
        fold_model.fit(train_dataset, nb_epoch=epoch)

        # Validation AUROC; column 1 of the prediction is used as the score.
        y_true = val_dataset.y
        y_pred = fold_model.predict(val_dataset)[:, 1]
        auroc_scores.append(roc_auc_score(y_true, y_pred))

        fold_models.append(fold_model)

    # Select the fold model that performed best on its validation split.
    best_model_index = int(np.argmax(auroc_scores))
    return fold_models[best_model_index]

# 2.1.1 Weight and bias initialisation
def weights_init(m):
    """Re-initialise a ``Linear`` or ``Conv2d`` module in place.

    The weight is drawn from a Xavier-uniform distribution and the bias, when
    the module has one, is set to zero. Modules of any other type are left
    unchanged.
    """
    resettable_types = (torch.nn.Linear, torch.nn.Conv2d)
    if not isinstance(m, resettable_types):
        return

    torch.nn.init.xavier_uniform_(m.weight)
    bias = m.bias
    if bias is not None:
        torch.nn.init.zeros_(bias)

In [None]:
# 1. Data input and run configuration

# Name of the algorithm; used in the result file names.
algorithm = 'AttentiveFP'

# Training length and mini-batch size.
epoch, batch_size = 50, 32

# Fixed hyperparameters.
dense_layer_size_value = 128
dropout_value = 0.5

# Hyperparameters explored by the grid search.
graph_conv_layers_values = [[32, 32], [32, 64], [64, 64]]
attention_hidden_size_values = [64, 128]

# Every (graph_conv_layers, attention_hidden_size) pair, with the first
# list varying slowest.
all_combinations = [
    (conv_layers, attention_size)
    for conv_layers in graph_conv_layers_values
    for attention_size in attention_hidden_size_values
]

# File locations, all relative to the current working directory.
basePath = os.getcwd()
resultPath = f'{basePath}/results'
training_data_path = f'{basePath}/training_data'

# One entry per training-set folder.
training_list = os.listdir(training_data_path)


In [None]:
# AttentiveFPModel consumes graphs produced by MolGraphConvFeaturizer. The
# featurizer does not depend on the target, so it is built once up front.
featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)

for traindataset in training_list:

    pertarget_files = os.path.join(training_data_path, traindataset)
    # Skip stray files sitting next to the per-dataset folders; listing them
    # as directories below would raise.
    if not os.path.isdir(pertarget_files):
        continue

    tar_ids = []
    final_params = []
    final_scores = []

    files_list = os.listdir(pertarget_files)
    for tar_id in tqdm(files_list):
        # Parse each target file once and take both columns from the same frame.
        target_df = pd.read_csv(os.path.join(pertarget_files, tar_id), header=0, index_col=False)
        smiles = target_df['c_smiles'].tolist()
        labels = np.array(target_df['active_label'].tolist()).reshape(-1, 1)

        X = featurizer.featurize(smiles)
        dataset = dc.data.NumpyDataset(X=X, y=labels)

        dataset_X = dataset.X
        dataset_y = dataset.y

        rocauc_params = []
        rocauc_score = []

        for graph_conv_layers, attention_hidden_size in all_combinations:
            # NOTE(review): DeepChem documents AttentiveFPModel's architecture
            # arguments as num_layers, num_timesteps, graph_feat_size and
            # dropout. graph_conv_layers, attention_hidden_size and
            # dense_layer_size are not among them and may be silently absorbed
            # by **kwargs, in which case every combination below trains the
            # same network -- confirm against the installed DeepChem version.
            model = dc.models.AttentiveFPModel(
                batch_size=batch_size,
                learning_rate=0.001,
                n_tasks=1,
                mode='classification',
                graph_conv_layers=graph_conv_layers,
                attention_hidden_size=attention_hidden_size,
                dense_layer_size=dense_layer_size_value,
                dropout=dropout_value,
            )
            # Original author's note: the default optimizer is Adam (which
            # receives learning_rate) and, for mode='classification', the
            # default loss is cross entropy.

            model.model.to(device)

            average_rocauc_score = nested_cv(dataset_X, dataset_y, model, epoch)

            # Record the hyperparameters and score of this combination.
            params = {
                'graph_conv_layers': graph_conv_layers,
                'attention_hidden_size': attention_hidden_size,
                'dense_layer_size_value': dense_layer_size_value,
                'dropout_value': dropout_value
            }
            rocauc_params.append(params)
            rocauc_score.append(average_rocauc_score)

        # Keep the best-scoring combination for this target; one index is used
        # for both lists so parameters and score always belong together.
        best_index = int(np.argmax(rocauc_score))
        tar_ids.append(tar_id)
        final_params.append(rocauc_params[best_index])
        final_scores.append(rocauc_score[best_index])

    # Write one summary table per training-set folder. The result directory
    # is created on demand so a missing folder cannot discard a finished run.
    os.makedirs(resultPath, exist_ok=True)
    data = {'targets': tar_ids, 'best_params': final_params, 'rocauc_score': final_scores}
    roc_data = pd.DataFrame(data)
    roc_data.to_csv(os.path.join(resultPath, algorithm + '_' + traindataset + '_rocmean.csv'), index=False)