In [None]:
# Python version check: dgl officially supports Python 3.8, 3.9, 3.10, 3.11 and 3.12,
# while DeepChem officially supports Python 3.8 through 3.10, so use a version in 3.8-3.10.
!python --version

In [None]:
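# One-time environment setup: uncomment and run these lines in a fresh environment.
# The dgl wheel index below is the one built for torch 2.4 / CUDA 12.4, matching the torch pin.
# !pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu124
# !pip install deepchem
# !pip install dgl -f https://data.dgl.ai/wheels/torch-2.4/cu124/repo.html
# !pip install dgllife
# !pip install openpyxl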

In [None]:
# If this import fails, the model training further down cannot run.
import dgl

In [None]:
import os
import deepchem as dc
import torch
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import copy
import itertools
from ast import literal_eval


In [None]:
# GPU check: report the CUDA / cuDNN setup and choose the device used for training.

cuda_available = torch.cuda.is_available()
print(cuda_available)
# Fall back to the CPU when no CUDA device is visible, so that this cell and every later
# `model.model.to(device)` call still work on a machine without a GPU.
device = torch.device('cuda:0' if cuda_available else 'cpu')
print(device)
if cuda_available:
    # get_device_name raises when CUDA is unavailable, so only query it on GPU machines.
    print(torch.cuda.get_device_name(0))
else:
    print('No CUDA device found; running on CPU')
print(torch.__version__)
print(torch.version.cuda)
print(torch.backends.cudnn.version())


# To watch GPU usage from a terminal: nvidia-smi -l

In [None]:
# 1. Data input and featurization
# Path setup. The directory strings keep their trailing '/' because later cells build
# file names by concatenating directly onto them.
basePath = os.getcwd()
resultPath = basePath+'/results/'
training_path = basePath+'/training_data/'
external_path = basePath+'/external_data/'
# os.listdir returns entries in arbitrary order and also returns stray files
# (for example .DS_Store). Every entry is later opened as a directory of per-target CSV
# files, so keep only sub-directories and sort them for a stable processing order.
training_list = sorted(
    entry for entry in os.listdir(training_path)
    if os.path.isdir(os.path.join(training_path, entry))
)

# Seed the NumPy and PyTorch global generators so repeated runs are more comparable
# (some GPU kernels may still be non-deterministic).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

algorithm = 'AttentiveFP'

# Hyperparameter search space; dense layer size and dropout are held fixed.
graph_conv_layers_values = [[32, 32], [32, 64], [64, 64]]
attention_hidden_size_values = [64, 128]
dense_layer_size_value = 128
dropout_value = 0.5

# Training epochs and batch size
epoch = 50
batch_size = 32

# Every (graph_conv_layers, attention_hidden_size) combination to evaluate
all_combinations = list(itertools.product(graph_conv_layers_values, attention_hidden_size_values))


In [None]:
# 2. Nested cross-validation
# The data are split into three stratified outer folds. For each outer fold, the other two
# folds are handed to inner_cv, which returns the model with the best inner-validation F1;
# that model is then scored on the held-out outer fold.
def nested_cv(dataset_X, dataset_y, model, epoch):
    """Return the mean held-out F1 score of `model` over 3 stratified outer folds.

    Args:
        dataset_X: array of featurized samples, indexable with an integer index array.
        dataset_y: array of labels aligned with `dataset_X`.
        model: model object exposing `fit`/`predict` (passed on to `inner_cv`). It is
            used as a template only: every outer fold trains its own deep copy.
        epoch: number of training epochs, passed on to `inner_cv`.

    Returns:
        The mean of the three outer-fold F1 scores.
    """
    outer_skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    f1_scores = []

    for train_index, test_index in outer_skf.split(dataset_X, dataset_y):
        inner_cv_dataset = dc.data.NumpyDataset(X=dataset_X[train_index], y=dataset_y[train_index])
        test_dataset = dc.data.NumpyDataset(X=dataset_X[test_index], y=dataset_y[test_index])

        # Start every outer fold from a copy of the model as it was passed in. Rebinding
        # `model` to the model returned for the previous fold would carry over weights
        # fitted on samples that belong to this fold's test split.
        fold_model = inner_cv(inner_cv_dataset, copy.deepcopy(model), epoch)

        # Column 1 of the prediction is thresholded at 0.5 to obtain binary labels.
        y_true = test_dataset.y
        y_pred = fold_model.predict(test_dataset)[:, 1]
        y_pred_binary = (y_pred > 0.5).astype(int)

        # zero_division=0 matches run_ext: a fold with no predicted/true positives scores 0.
        f1_scores.append(f1_score(y_true, y_pred_binary, zero_division=0))

    # Average over the three outer folds
    return np.mean(f1_scores)

# 2.1 Inner loop
# Run for every hyperparameter set; keeps the model that performs best on its validation fold.
def inner_cv(inner_cv_dataset, model, epoch):
    """Train on each of two stratified inner folds and return the best-scoring model.

    Args:
        inner_cv_dataset: dataset object exposing `.X` and `.y` arrays.
        model: model object exposing `fit`/`predict`. It is not trained itself; each
            fold trains its own deep copy, so every fold starts from the same state.
        epoch: number of epochs passed to `fit` as `nb_epoch`.

    Returns:
        The trained copy with the highest validation F1 (the first fold wins ties).
    """
    inner_skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

    X, y = inner_cv_dataset.X, inner_cv_dataset.y

    best_model = None
    best_f1 = None
    for train_index, val_index in inner_skf.split(X, y):
        train_dataset = dc.data.NumpyDataset(X=X[train_index], y=y[train_index])
        val_dataset = dc.data.NumpyDataset(X=X[val_index], y=y[val_index])

        # Train a fresh copy per fold. Re-initialising only the Linear/Conv2d modules of a
        # shared model between folds would leave any other parameters and the optimizer
        # state from the previous fold in place.
        fold_model = copy.deepcopy(model)
        fold_model.fit(train_dataset, nb_epoch=epoch)

        # Column 1 of the prediction is thresholded at 0.5 to obtain binary labels.
        y_pred = fold_model.predict(val_dataset)[:, 1]
        y_pred_binary = (y_pred > 0.5).astype(int)
        f1 = f1_score(val_dataset.y, y_pred_binary, zero_division=0)

        # Strict '>' keeps the earliest fold on ties, like np.argmax over the score list.
        if best_f1 is None or f1 > best_f1:
            best_model, best_f1 = fold_model, f1

    return best_model

# 2.1.1 Weight and bias initialisation
def weights_init(m):
    """Re-initialise a Linear or Conv2d module in place; leave any other module untouched.

    The weight is drawn with Xavier-uniform initialisation and the bias, when the
    module has one, is set to zero.
    """
    if not isinstance(m, (torch.nn.Linear, torch.nn.Conv2d)):
        return
    torch.nn.init.xavier_uniform_(m.weight)
    bias = m.bias
    if bias is None:
        return
    torch.nn.init.zeros_(bias)

In [None]:
# 3. Hyperparameter search: for every training dataset, score each hyperparameter
#    combination per target with nested_cv and save the best combination to a CSV file.

# DataFrame.to_csv does not create missing directories; make sure the results folder
# exists up front so the output is not lost after all the training has finished.
os.makedirs(resultPath, exist_ok=True)

# AttentiveFPModel needs features produced by MolGraphConvFeaturizer (with edge features).
# The featurizer is built once and reused for every target.
featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)

for traindataset in training_list:

    export_file_path = resultPath+'/'+algorithm+'_'+traindataset+'_f1mean.csv'
    if os.path.exists(export_file_path):
        # Results for this dataset already exist; skip the expensive search.
        continue

    tar_ids = []
    final_params = []
    final_scores = []

    pertarget_files = training_path+'/'+traindataset
    # Only the per-target CSV files are inputs; sort them for a stable row order.
    files_list = sorted(name for name in os.listdir(pertarget_files) if name.endswith('.csv'))
    for tar_id in tqdm(files_list):
        # Parse the file once and take both columns from the same frame.
        target_table = pd.read_csv(pertarget_files+'/'+tar_id, header=0, index_col=False)
        smiles = target_table['c_smiles'].tolist()
        labels = target_table['active_label'].tolist()
        dataset_y = np.array(labels).reshape((len(labels), 1))
        dataset_X = featurizer.featurize(smiles)

        f1_params = []
        f1_score1 = []

        for graph_conv_layers, attention_hidden_size in all_combinations:
            # Build an AttentiveFPModel for this hyperparameter combination
            model = dc.models.AttentiveFPModel(
                batch_size=batch_size,
                learning_rate=0.001,
                n_tasks=1,
                mode='classification',
                graph_conv_layers=graph_conv_layers,
                attention_hidden_size=attention_hidden_size,
                dense_layer_size=dense_layer_size_value,
                dropout=dropout_value,
                model_dir="./tmp"
            )
            # AttentiveFPModel's default optimizer is Adam (Adaptive Moment Estimation), and
            # learning_rate is passed to it. The default loss depends on the task type: for
            # classification (mode='classification') it is the cross-entropy loss.
            # These are DeepChem's defaults for AttentiveFPModel.

            model.model.to(device)

            average_f1_score = nested_cv(dataset_X, dataset_y, model, epoch)

            # Record the hyperparameters together with their score
            f1_params.append({
                'graph_conv_layers': graph_conv_layers,
                'attention_hidden_size': attention_hidden_size,
                'dense_layer_size_value': dense_layer_size_value,
                'dropout_value': dropout_value
            })
            f1_score1.append(average_f1_score)

        # Keep the best score and the hyperparameters that produced it
        tar_ids.append(tar_id)
        final_params.append(f1_params[np.argmax(f1_score1)])
        final_scores.append(max(f1_score1))

    # Write one row per target for this dataset
    data={'targets':tar_ids, 'best_params':final_params, 'f1_score':final_scores}
    f1_data = pd.DataFrame(data)
    f1_data.to_csv(export_file_path,index=False)

In [None]:
def run_ext(dataset_X, dataset_y, model, epoch, test_X, test_y,traindataset,tar_id):
    """Fit `model` on the training arrays and return its F1 score on the external arrays.

    Args:
        dataset_X, dataset_y: training features and labels.
        model: model object exposing `fit` and `predict`.
        epoch: number of epochs passed to `fit` as `nb_epoch`.
        test_X, test_y: external-validation features and labels.
        traindataset, tar_id: not used by the active code; only the commented-out
            model-saving line below refers to them.

    Returns:
        F1 score of the thresholded predictions (0 when the score is undefined).
    """
    fit_set = dc.data.NumpyDataset(X=dataset_X, y=dataset_y)
    external_set = dc.data.NumpyDataset(X=test_X, y=test_y)

    model.fit(fit_set, nb_epoch=epoch)

    # Column 1 of the prediction is thresholded at 0.5 to obtain binary labels.
    column_one_scores = model.predict(external_set)[:,1]
    predicted_labels = (column_one_scores > 0.5).astype(int)

    # torch.save(model, external_path+algorithm+'_'+traindataset+'_'+tar_id+".pt")

    return f1_score(external_set.y, predicted_labels, zero_division=0)


In [None]:
# Epochs and batch size for the external-validation run
# (the same values the hyperparameter-search configuration assigns).
epoch, batch_size = 50, 32

In [None]:
# 4. External validation: retrain every target's model on its full training set with the
#    best hyperparameters found earlier, then score it on the external set.

def _load_target_csv(csv_path, featurizer):
    """Read one target CSV once and return (featurized SMILES, labels shaped (n, 1))."""
    table = pd.read_csv(csv_path, header=0, index_col=False)
    labels = table['active_label'].tolist()
    return featurizer.featurize(table['c_smiles'].tolist()), np.array(labels).reshape((len(labels), 1))


featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)
# Collect the per-dataset frames and concatenate once at the end.
result_frames = []

for traindataset in training_list:

    params_data = pd.read_csv(resultPath+algorithm+'_'+traindataset+'_f1mean.csv')
    params_data.insert(0, 'dataset', traindataset)

    external_dir = external_path+'ex_'+traindataset
    # Target ids are the CSV file names without their '.csv' extension.
    target_list = [os.path.splitext(item)[0] for item in os.listdir(external_dir) if item.endswith('.csv')]

    for tar_id in tqdm(target_list):

        # Look the hyperparameters up by target name. The position of a file in the
        # external directory listing need not match the row order of the results CSV.
        is_target = params_data['targets'] == (tar_id+'.csv')
        if not is_target.any():
            print(f'{traindataset}/{tar_id}: no tuned hyperparameters found, skipping')
            continue
        params = literal_eval(params_data.loc[is_target, 'best_params'].iloc[0])

        # Training set
        dataset_X, dataset_y = _load_target_csv(training_path+traindataset+'/'+tar_id+'.csv', featurizer)

        # External validation set
        test_X, test_y = _load_target_csv(external_dir+'/'+tar_id+'.csv', featurizer)

        model = dc.models.AttentiveFPModel(
            batch_size=batch_size,
            learning_rate=0.001,
            n_tasks=1,
            mode='classification',
            graph_conv_layers=params['graph_conv_layers'],
            attention_hidden_size=params['attention_hidden_size'],
            dense_layer_size=params['dense_layer_size_value'],
            dropout=params['dropout_value'],
            )

        model.model.to(device)

        try:
            f1_external = run_ext(dataset_X, dataset_y, model, epoch, test_X, test_y,traindataset,tar_id)
        except Exception as e:
            # Best effort: report the failure and record a missing value, so the numeric
            # 'external_f1' column never holds an exception object. The dropna below then
            # removes the row from the exported table.
            f1_external = np.nan
            print(f'{traindataset}/{tar_id}: external validation failed: {e}')

        params_data.loc[is_target, 'external_f1'] = f1_external

    result_frames.append(params_data)

# pd.concat rejects an empty list, so fall back to an empty frame when nothing was processed.
data_all = pd.concat(result_frames, ignore_index=True) if result_frames else pd.DataFrame()

# Drop rows with any missing value (targets without an external score).
data_all = data_all.dropna(axis=0, how='any')
data_all.to_csv(external_path+algorithm+'_ex_f1.csv',index=False)
     