In [None]:
!pip install simpletransformers


In [None]:
# Clone the bert-loves-chemistry (ChemBERTa) repository into the current working directory
!git clone https://github.com/seyonechithrananda/bert-loves-chemistry.git

In [None]:
import os

import numpy as np
import pandas as pd

from typing import List

# scikit-learn (metrics, CV splitter) and the simpletransformers classification wrapper
import sklearn
from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import StratifiedKFold

In [None]:
# 1. Path configuration
# Every data/result folder is resolved relative to the current working
# directory; each path string keeps its trailing slash.
basePath = os.getcwd()
training_data_path = f'{basePath}/training_data/'
external_path = f'{basePath}/external_data/'
resultpath = f'{basePath}/results/'

# One entry per item found directly under training_data/
training_list = os.listdir(training_data_path)

# Algorithm tag embedded in the names of the result files
algorithm = 'Chemberta'

In [None]:
def nested_cv(df):
    """Cross-validate a ChemBERTa classifier on ``df`` and return a final fitted model.

    Each of the 3 stratified folds fine-tunes a *fresh* copy of the pretrained
    checkpoint. Re-using one model instance across folds would let later folds
    be scored on rows the model had already been trained on, which inflates
    the cross-validated F1.

    After cross-validation, one more fresh model is fine-tuned on all rows of
    ``df``; that model is the one returned (e.g. for external evaluation).
    Note that this costs one additional training run per call.

    Args:
        df: DataFrame with a ``smiles`` column (model input) and a ``labels``
            column (class labels, also used for stratification).

    Returns:
        tuple: ``(model, average_f1_score)`` where ``model`` is the
        ClassificationModel fine-tuned on all rows of ``df`` and
        ``average_f1_score`` is the mean of ``result['f1_score']`` over the
        3 validation folds.
    """
    model_args = {'evaluate_each_epoch': True,
                  'evaluate_during_training_verbose': True,
                  'no_save': True,  # do not save checkpoints; one epoch is close to 1 GB on disk
                  'num_train_epochs': 10,
                  'auto_weights': True}  # You can set class weights by using the optional weight argument
    output_dir = os.getcwd() + '/BPE_PubChem_10M_TCM_run'

    def _new_model():
        # Reload the pretrained checkpoint so no fine-tuned weights carry over.
        return ClassificationModel('roberta',
                                   'seyonec/PubChem10M_SMILES_BPE_396_250',
                                   args=dict(model_args))

    # Keep only the two columns the model consumes, in (text, label) order.
    data = df[['smiles', 'labels']]
    outer_skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    # Cross-validation
    fold_f1_scores = []
    for train_index, val_index in outer_skf.split(data.smiles, data.labels):
        # split() yields positional indices, so select rows with .iloc; this
        # stays correct even when df does not carry a default 0..n-1 index.
        train_df = data.iloc[train_index]
        val_df = data.iloc[val_index]

        # Train a fresh model on this fold only
        model = _new_model()
        model.train_model(train_df, eval_df=val_df,
                          output_dir=output_dir,
                          args={'overwrite_output_dir': True})

        # F1 score on the held-out fold
        result, model_outputs, wrong_predictions = model.eval_model(val_df,
                                                                    acc=sklearn.metrics.f1_score)

        print(result['f1_score'])
        fold_f1_scores.append(result['f1_score'])

    average_f1_score = np.mean(fold_f1_scores)

    # Final model: fine-tune a fresh checkpoint on every available row.
    model = _new_model()
    model.train_model(data,
                      output_dir=output_dir,
                      args={'overwrite_output_dir': True})
    return model, average_f1_score

In [None]:
# Training / evaluation loop: for every training dataset and every target
# file in it, cross-validate and fit a model, score it on the matching
# external file, and write the per-dataset result tables.

def _load_smiles_labels(csv_path):
    """Read one target CSV a single time and return its SMILES/label columns.

    The returned DataFrame has the columns ``smiles`` (from ``c_smiles``) and
    ``labels`` (from ``active_label``) and a default 0..n-1 index.
    """
    raw = pd.read_csv(csv_path, header=0, index_col=False)
    return pd.DataFrame({'smiles': raw['c_smiles'].tolist(),
                         'labels': raw['active_label'].tolist()})


# to_csv / to_excel do not create missing folders, so make sure it exists.
os.makedirs(resultpath, exist_ok=True)

for traindataset in training_list:

    targets = []
    training_f1_scores = []
    ex_target = []
    external_f1_scores = []
    # Per-target prediction tables; concatenated once after the inner loop
    # instead of re-copying a growing DataFrame on every iteration.
    ex_frames = []

    # The target list is taken from the external folder; a file with the
    # same name is expected in the training folder.
    pertarget_files = external_path+'/ex_'+traindataset
    files_list = os.listdir(pertarget_files)

    for tar_id in files_list:
        # Training set
        df = _load_smiles_labels(training_data_path + '/' +traindataset+'/'+tar_id)
        model, average_f1_score = nested_cv(df)

        targets.append(tar_id)
        training_f1_scores.append(average_f1_score)
        print('targets:',targets)
        print('trainscore:',training_f1_scores)

        # External test set
        ex_df = _load_smiles_labels(external_path +'/'+'ex_'+traindataset+'/'+tar_id)
        result, model_outputs, wrong_predictions = model.eval_model(ex_df,
                                                                acc=sklearn.metrics.f1_score)

        # Row-wise softmax over the raw model outputs. Subtracting the row
        # maximum first leaves the result unchanged mathematically but keeps
        # np.exp from overflowing to inf (and the ratio from becoming NaN).
        shifted = model_outputs - np.max(model_outputs, axis=1, keepdims=True)
        exp_scores = np.exp(shifted)
        probability_values = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

        # Probability assigned to class index 1
        ex_df['pred_proba'] = probability_values[:, 1]

        # Predicted label = class with the highest raw output
        # (the column name 'y_test' is kept for compatibility with existing result files)
        predicted_labels = np.argmax(model_outputs, axis=1)
        ex_df['y_test'] = predicted_labels
        # Target (file) name
        ex_df['target'] = tar_id

        ex_target.append(tar_id)
        external_f1_scores.append(result['f1_score'])
        print('extar:',ex_target)
        print('exscore:',external_f1_scores)

        ex_frames.append(ex_df)

    # pd.concat raises on an empty list, so fall back to an empty frame
    # when the dataset folder contained no target files.
    df_prob_all = pd.concat(ex_frames) if ex_frames else pd.DataFrame()

    # Cross-validation scores on the training data
    train_data = {'targets':targets, 'best_params':"", 'f1_score':training_f1_scores}
    train_f1_data = pd.DataFrame(train_data)
    train_f1_data.to_csv(resultpath+'/'+algorithm+'_'+traindataset+'_f1mean.csv',index=False)
    # Scores and per-compound predictions on the external test data
    ex_data={'ex_targets':ex_target, 'best_params':"", 'ex_f1_score':external_f1_scores}
    ex_f1_data = pd.DataFrame(ex_data)
    ex_f1_data.to_csv(resultpath+'/'+'ex_'+algorithm+'_'+traindataset+'_f1mean.csv',index=False)
    df_prob_all.to_excel(resultpath+'/'+'ex_'+algorithm+'_'+traindataset+'.xlsx',index=False)



In [None]:
# Merge the per-dataset external prediction workbooks into a single file

import pandas as pd

files = [
    'ex_Chemberta_TCM_30t',
    'ex_Chemberta_TCM_50t',
    'ex_Chemberta_TCM_80t',
    'ex_Chemberta_TCM_100t',
    'ex_Chemberta_TCM2000_30t',
    'ex_Chemberta_TCM2000_50t',
    'ex_Chemberta_TCM2000_80t',
    'ex_Chemberta_TCM2000_100t',
]

# Collect the frames and concatenate once; growing a DataFrame with
# pd.concat inside the loop re-copies all previous rows on every iteration.
frames = []
for file in files:
    df = pd.read_excel(resultpath + file + '.xlsx')
    # Dataset tag = 3rd and 4th '_'-separated parts, e.g. 'ex_Chemberta_TCM_30t' -> 'TCM_30t'
    name_parts = file.split('_')
    df['dataset'] = name_parts[2] + '_' + name_parts[3]
    frames.append(df)

df_all = pd.concat(frames)

# NOTE(review): hard-coded absolute Google Drive path; presumably this needs
# Drive to be mounted at /content/gdrive and the ex_results folder to exist — confirm.
df_all.to_excel('/content/gdrive/MyDrive/Colab Notebooks/ex_results/chemberta_pred_proba.xlsx',index=False)

