In [11]:
import pandas as pd


In [12]:
# Load the BBB_Martins dataset through TDC's ADME loader and split it.
# `split` is indexed by "train", "valid" and "test" in the cells below.
# NOTE(review): get_split() is called with no arguments, so the split method,
# seed and fractions are whatever the library defaults to — confirm these are
# the intended settings before comparing results with other work.

from tdc.single_pred import ADME
data = ADME(name = 'BBB_Martins')
split = data.get_split()

Found local copy...
Loading...
Done!


In [13]:
# Prompt template with two positional `{}` slots filled via `str.format`:
# the first slot receives the drug SMILES string, the second the answer text
# ("(A)" / "(B)" for training prompts, an empty string for evaluation prompts).
# The literal text, including its line breaks and trailing spaces, is emitted
# verbatim into every prompt, so it must not be reformatted.
template = """### Instructions: Answer the following question about drug properties. 
### Context: As a membrane separating circulating blood and brain extracellular fluid, the blood-brain barrier (BBB) is the protection 
layer that blocks most foreign drugs. Thus the ability of a drug to penetrate the barrier to deliver to the site of 
action forms a crucial challenge in development of drugs for central nervous system. 
### Question: Given a drug SMILES string, predict whether it 
(A) does not cross the BBB (B) crosses the BBB Drug 
SMILES: {} 
### Answer: {}"""


In [14]:
# Preview the first rows of the training split; the columns used later are
# "Drug" (SMILES string) and "Y" (label).
split["train"].head()

Unnamed: 0,Drug_ID,Drug,Y
0,Terbutylchlorambucil,CC(C)(C)OC(=O)CCCc1ccc(N(CCCl)CCCl)cc1,1
1,40730,CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23,1
2,cloxacillin,Cc1onc(-c2ccccc2Cl)c1C(=O)N[C@@H]1C(=O)N2[C@@H...,1
3,cefoperazone,CCN1CCN(C(=O)N[C@@H](C(=O)N[C@@H]2C(=O)N3C(C(=...,1
4,rolitetracycline,CN(C)[C@@H]1C(=O)/C(=C(/O)NCN2CCCC2)C(=O)[C@@]...,1


In [15]:
# process dataset

def formatting_prompts_func(examples):
    """Build training prompts that include the gold answer.

    Args:
        examples: Mapping-like object supporting ``examples["Drug"]`` and
            ``examples["Y"]`` (in this notebook, a split of the dataset).

    Returns:
        dict with keys:
            "text": list of filled-in prompts, answer "(B)" for a truthy
                label and "(A)" otherwise.
            "drug": the ``examples["Drug"]`` object, unchanged.
            "outputs": the ``examples["Y"]`` object, unchanged.
            "ds_ident": the constant dataset identifier string.
    """
    smiles_column = examples["Drug"]
    label_column = examples["Y"]
    # NOTE(review): no EOS token is appended to the prompt here; if the
    # downstream trainer needs one it has to be added elsewhere — confirm.
    prompts = [
        template.format(smiles, "(B)" if label else "(A)")
        for smiles, label in zip(smiles_column, label_column)
    ]
    return {
        "text": prompts,
        "drug": smiles_column,
        "outputs": label_column,
        "ds_ident": "ADME_BBB_Martins",
    }

def formatting_prompts_func_test(examples):
    """Build evaluation prompts whose answer slot is left empty.

    Args:
        examples: Mapping-like object supporting ``examples["Drug"]`` and
            ``examples["Y"]`` (in this notebook, a split of the dataset).

    Returns:
        dict with keys:
            "text": list of prompts ending in an empty answer slot.
            "drug": the ``examples["Drug"]`` object, unchanged.
            "outputs": the ``examples["Y"]`` object, unchanged (the labels
                are kept alongside the prompts but never written into them).
            "ds_ident": the constant dataset identifier string.
    """
    smiles_column = examples["Drug"]
    label_column = examples["Y"]
    # The labels are zipped in only so that the number of prompts is bounded
    # by both columns; the label value itself is deliberately not used.
    prompts = [
        template.format(smiles, "")
        for smiles, _label in zip(smiles_column, label_column)
    ]
    return {
        "text": prompts,
        "drug": smiles_column,
        "outputs": label_column,
        "ds_ident": "ADME_BBB_Martins",
    }

In [16]:
# Training prompts carry the gold answer; validation and test prompts leave
# the answer slot empty.
train_dataset = formatting_prompts_func(split["train"])
valid_dataset, test_dataset = (
    formatting_prompts_func_test(split[part]) for part in ("valid", "test")
)

In [17]:
# Number of prompts in the train / valid / test sets, in that order.
tuple(len(ds["text"]) for ds in (train_dataset, valid_dataset, test_dataset))

(1421, 203, 406)

In [18]:
import pickle
import os


def dump_file(obj, path, file_name):
    """Pickle ``obj`` to ``<path>/<file_name>``, creating ``path`` if needed.

    Args:
        obj: Any picklable object.
        path: Directory to write into; missing directories are created.
            An empty string means the current working directory.
        file_name: Name of the file to create (or overwrite) inside ``path``.

    Returns:
        None.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            opened for writing.
        pickle.PicklingError: If ``obj`` cannot be pickled.
    """
    # os.makedirs("") raises FileNotFoundError, so only create the directory
    # when one was actually given; "" then simply targets the current directory.
    if path:
        os.makedirs(path, exist_ok=True)
    target = os.path.join(path, file_name)
    with open(target, "wb") as handle:
        pickle.dump(obj, handle)

In [19]:
# Persist each prepared dataset as a pickle under ./processed.
for file_name, dataset in (
    ("train.pkl", train_dataset),
    ("valid.pkl", valid_dataset),
    ("test.pkl", test_dataset),
):
    dump_file(dataset, "processed", file_name)