In [None]:
import os
import deepchem as dc
import torch
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from ast import literal_eval

# File-path setup: every directory is resolved relative to the current working directory.
basePath = os.getcwd()
resultPath, training_path, external_path = (
    f'{basePath}/{folder}/'
    for folder in ('results', 'training_data', 'external_data')
)

# Names of the training-set variants; each one is iterated by the evaluation loop.
training_list = [
    f'{prefix}_{suffix}t'
    for prefix in ('TCM2000', 'TCM')
    for suffix in (100, 30, 50, 80)
]
algorithm = 'DMPNN'
# Note: trained models are not saved.

In [None]:
# GPU check: report the compute environment and pick the device used by later cells.
cuda_available = torch.cuda.is_available()
print(cuda_available)

# Fall back to the CPU when CUDA is missing so the notebook still runs
# top-to-bottom on a machine without a GPU.
device = torch.device('cuda:0' if cuda_available else 'cpu')
print(device)

# get_device_name raises when no CUDA device is present, so only query it
# after the availability check.
if cuda_available:
    print(torch.cuda.get_device_name(0))

print(torch.__version__)
print(torch.version.cuda)
print(torch.backends.cudnn.version())

# To watch GPU utilisation while training, run `nvidia-smi -l` in a terminal.

In [None]:
def run_ext(dataset_X, dataset_y, model, epoch, test_X, test_y,traindataset,tar_id):
    """Fit ``model`` on the training arrays and score it on the external set.

    Args:
        dataset_X: Training inputs, wrapped in a ``dc.data.NumpyDataset``.
        dataset_y: Training labels for ``dataset_X``.
        model: Object exposing ``fit(dataset, nb_epoch=...)`` and
            ``predict(dataset)``.
        epoch: Number of epochs forwarded to ``model.fit``.
        test_X: External-validation inputs.
        test_y: External-validation labels.
        traindataset: Not used by the active code; only the disabled
            model-saving step referenced it.
        tar_id: Not used by the active code; only the disabled model-saving
            step referenced it.

    Returns:
        The value of ``roc_auc_score`` computed from the external labels and
        column 1 of the model's predictions.
    """
    fit_data = dc.data.NumpyDataset(X=dataset_X, y=dataset_y)
    eval_data = dc.data.NumpyDataset(X=test_X, y=test_y)

    model.fit(fit_data, nb_epoch=epoch)

    # Column 1 of the prediction array is used as the ranking score
    # (presumably the positive-class probability -- confirm for the model in use).
    scores = model.predict(eval_data)[:, 1]

    # Model persistence is intentionally disabled. To re-enable it:
    # torch.save(model, external_path+algorithm+'_'+traindataset+'_'+tar_id+".pt")

    return roc_auc_score(eval_data.y, scores)


In [None]:
# Training hyperparameters shared by every model built below:
# number of epochs and mini-batch size.
epoch, batch_size = 50, 32

In [None]:
def _load_xy(csv_path, featurizer):
    """Read one target CSV a single time and return ``(X, y)``.

    ``X`` is the featurizer output for the ``c_smiles`` column and ``y`` is the
    ``active_label`` column reshaped to a column vector of shape (n_rows, 1).
    """
    table = pd.read_csv(csv_path, header=0, index_col=False)
    features = featurizer.featurize(table['c_smiles'].tolist())
    labels = np.array(table['active_label'].tolist()).reshape((len(table), 1))
    return features, labels


# The featurizer is constructed with default arguments only, so a single
# instance is shared by every dataset and target instead of being rebuilt.
featurizer = dc.feat.DMPNNFeaturizer()

# One frame per training-set variant; concatenated once after the loop.
evaluated_frames = []

for traindataset in training_list:

    # Tuned hyperparameters for this training-set variant, one row per target.
    params_data = pd.read_csv(resultPath+'DMPNN_'+traindataset+'_rocmean.csv')
    params_data.insert(0, 'dataset', traindataset)

    # Only *.csv entries are targets. os.listdir returns entries in arbitrary
    # order, so sort them for a reproducible run.
    external_dir = external_path+'ex_'+traindataset
    target_list = sorted(
        os.path.splitext(file_name)[0]
        for file_name in os.listdir(external_dir)
        if file_name.endswith('.csv')
    )

    # Maps '<tar_id>.csv' -> ROC-AUC (or the error text if evaluation failed).
    external_scores = {}

    for tar_id in tqdm(target_list, desc=traindataset):
        target_file = tar_id+'.csv'

        # Look the hyperparameters up by target name, the same key used to
        # store the score, instead of by the file's position in the listing.
        best_params = params_data.loc[params_data['targets'] == target_file, 'best_params']
        if best_params.empty:
            print('no tuned parameters for '+target_file+' in '+traindataset+', skipped')
            continue
        params = literal_eval(best_params.iloc[0])

        # Training set
        dataset_X, dataset_y = _load_xy(training_path+traindataset+'/'+target_file, featurizer)

        # External validation set
        test_X, test_y = _load_xy(external_dir+'/'+target_file, featurizer)

        # NOTE(review): the run is labelled 'DMPNN' and the inputs come from
        # DMPNNFeaturizer, but the model built here is GCNModel, and
        # `attention_hidden_size` / `dense_layer_size` do not appear to be
        # documented GCNModel arguments -- confirm the intended model class
        # and parameter names against the tuning notebook.
        model = dc.models.GCNModel(
            batch_size=batch_size,
            learning_rate=0.001,
            n_tasks=1,
            mode='classification',
            graph_conv_layers=params['graph_conv_layers'],
            attention_hidden_size=params['attention_hidden_size'],
            dense_layer_size=params['dense_layer_size_value'],
            dropout=params['dropout_value'],
            )

        model.model.to(device)

        try:
            rocauc_score = run_ext(dataset_X, dataset_y, model, epoch, test_X, test_y, traindataset, tar_id)
        except Exception as e:
            # Keep going with the remaining targets, but record the failure as
            # text: a raw exception object in the frame is not a value the
            # Excel writer is expected to serialise.
            rocauc_score = repr(e)
            print(e)

        external_scores[target_file] = rocauc_score

    # Targets that were not evaluated get NaN and are dropped below.
    params_data['external_rocauc'] = params_data['targets'].map(external_scores)
    evaluated_frames.append(params_data)

data_all = pd.concat(evaluated_frames, ignore_index=True) if evaluated_frames else pd.DataFrame()

# Drop every row that has any missing value (e.g. targets without an external score).
data_all = data_all.dropna(axis=0, how='any')
data_all.to_excel(external_path+algorithm+'_ex_roc.xlsx',index=False)
     