In [1]:
from tdc.single_pred import ADME
import pandas as pd


In [2]:
# Load dataset

from tdc.single_pred import Tox
# Instantiate the TDC hERG toxicity task and take the library's default
# partition; the result is indexed by "train", "valid" and "test" below.
data = Tox(name="hERG")
split = data.get_split()

Downloading...
100%|██████████| 50.2k/50.2k [00:00<00:00, 1.13MiB/s]
Loading...
Done!


In [3]:
# Prompt template filled with str.format using two positional fields:
# the first `{}` (after "SMILES:") receives the drug's SMILES string and the
# second `{}` (after "### Answer:") receives the answer text, which is left
# empty for prompts that the model is expected to complete.
template = """### Instructions: Answer the following question about drug properties. 
### Context: Human ether-à-go-go related gene (hERG) is crucial for the coordination of the heart's beating. Thus, 
if a drug blocks the hERG, it could lead to severe adverse effects. Therefore, reliable prediction of hERG liability 
in the early stages of drug design is quite important to reduce the risk of cardiotoxicity-related attritions in the 
later development stages.
### Question: Given a drug SMILES string, predict whether it 
(A) does not block hERG (B) does block hERG
SMILES: {} 
### Answer: {}"""


In [4]:
# Peek at the first five training rows (columns: Drug_ID, Drug, Y).
split["train"].head(n=5)

Unnamed: 0,Drug_ID,Drug,Y
0,DEMETHYLASTEMIZOLE,Oc1ccc(CCN2CCC(Nc3nc4ccccc4n3Cc3ccc(F)cc3)CC2)cc1,1.0
1,GBR-12909,Fc1ccc(C(OCC[NH+]2CC[NH+](CCCc3ccccc3)CC2)c2cc...,1.0
2,CLOFILIUM PHOSPHATE,CCCCCCC[N+](CC)(CC)CCCCc1ccc(Cl)cc1.CCCCCCC[N+...,1.0
3,FLUSPIRILENE,O=C1NCN(c2ccccc2)C12CC[NH+](CCCC(c1ccc(F)cc1)c...,1.0
4,VANOXERINE HYDROCHLORIDE,Fc1ccc(C(OCCN2CCN(CCCc3ccccc3)CC2)c2ccc(F)cc2)cc1,1.0


In [5]:
# process dataset

# Identifier stored alongside every processed split. This notebook loads
# Tox(name='hERG'); the previous hard-coded value "ADME_BBB_Martins" named a
# different dataset.
# NOTE(review): confirm downstream consumers expect the "Tox_hERG" spelling.
DS_IDENT = "Tox_hERG"


def _render_prompts(examples, answer_for, prompt_template, ds_ident):
    """Fill the prompt template once per (drug, label) pair and package the result.

    Args:
        examples: Mapping-like object with a "Drug" entry (SMILES strings) and
            a "Y" entry (labels), e.g. a DataFrame or a dict of lists.
        answer_for: Callable mapping one label to the text placed in the
            template's answer field.
        prompt_template: Format string with two positional `{}` fields, or
            None to use the notebook-level `template`.
        ds_ident: Dataset identifier stored under the "ds_ident" key.

    Returns:
        dict with keys "text" (list of rendered prompts), "drug" and
        "outputs" (the original "Drug" / "Y" objects, unchanged) and
        "ds_ident".
    """
    if prompt_template is None:
        # Fall back to the module-level template defined in an earlier cell.
        prompt_template = template
    drugs = examples["Drug"]
    outputs = examples["Y"]
    texts = [
        prompt_template.format(smiles, answer_for(label))
        for smiles, label in zip(drugs, outputs)
    ]
    return {"text": texts, "drug": drugs, "outputs": outputs, "ds_ident": ds_ident}


def formatting_prompts_func(examples, prompt_template=None, ds_ident=DS_IDENT):
    """Build training prompts whose answer field holds the gold label.

    A truthy label is rendered as "(B)" (does block hERG) and a falsy label
    as "(A)" (does not block hERG).

    No EOS token is appended here.
    NOTE(review): the earlier comment said an EOS token must be added to stop
    generation running on; confirm the training code appends it.

    Args:
        examples: Mapping-like object with "Drug" and "Y" entries.
        prompt_template: Optional override for the notebook-level `template`.
        ds_ident: Dataset identifier to store; defaults to DS_IDENT.

    Returns:
        dict with keys "text", "drug", "outputs" and "ds_ident".
    """
    return _render_prompts(
        examples,
        lambda label: "(B)" if label else "(A)",
        prompt_template,
        ds_ident,
    )


def formatting_prompts_func_test(examples, prompt_template=None, ds_ident=DS_IDENT):
    """Build evaluation prompts whose answer field is left empty.

    The labels are still returned under "outputs" so predictions can be
    scored against them.

    Args:
        examples: Mapping-like object with "Drug" and "Y" entries.
        prompt_template: Optional override for the notebook-level `template`.
        ds_ident: Dataset identifier to store; defaults to DS_IDENT.

    Returns:
        dict with keys "text", "drug", "outputs" and "ds_ident".
    """
    return _render_prompts(examples, lambda _label: "", prompt_template, ds_ident)

In [6]:
# Training prompts include the answer; validation and test prompts use the
# answer-less formatter.
train_dataset = formatting_prompts_func(split["train"])
valid_dataset, test_dataset = (
    formatting_prompts_func_test(split[part]) for part in ("valid", "test")
)

In [7]:
# Number of prompts in each split, in (train, valid, test) order.
tuple(len(ds["text"]) for ds in (train_dataset, valid_dataset, test_dataset))

(458, 66, 131)

In [8]:
import pickle
import os


def dump_file(obj, path, file_name):
    """Pickle `obj` to `<path>/<file_name>`, creating `path` if needed.

    Args:
        obj: Any picklable object.
        path: Directory to write into. Missing directories are created. An
            empty string means the current working directory.
        file_name: Name of the pickle file inside `path`.

    Returns:
        None.
    """
    # os.makedirs("") raises FileNotFoundError, so only create the directory
    # when one was actually given.
    if path:
        os.makedirs(path, exist_ok=True)
    # Use a separate name instead of rebinding the `file_name` parameter.
    target = os.path.join(path, file_name)
    with open(target, "wb") as handle:
        pickle.dump(obj, handle)

In [9]:
# Persist each processed split as its own pickle under "processed/".
for payload, pkl_name in (
    (train_dataset, "train.pkl"),
    (valid_dataset, "valid.pkl"),
    (test_dataset, "test.pkl"),
):
    dump_file(payload, "processed", pkl_name)