In [1]:
import os
import pandas as pd
import json
import uuid
import requests
from rdkit import Chem
import selfies as sf
from sklearn.model_selection import train_test_split

# Folder holding the TTD COVID-19 input files that later cells open by appending
# a file or sub-folder name to this string (relative to the working directory).
ttd_data_folder = "../BioT5/data/ttd/"

In [2]:
join_data = dict()

drug_cids, proteins = set(), set() #to get pubchem drug descriptions, there are duplicates

# List the structure folder once; membership is then a set lookup instead of a
# directory scan for every drug record.
sdf_folder = ttd_data_folder + "2D-Structures-of-Small-Molecule-Based-COVID19-Drugs"
sdf_file_names = set(os.listdir(sdf_folder))

#get drug names, types, targets, and PubChem CIDs if present
drugs = []
smiles = 0  # number of drugs whose SMILES was read from an SDF file
with open(ttd_data_folder + "COVID19-Drug-Data.txt") as f, open(ttd_data_folder + "Sequence-of-COVID19-Protein-Drugs.txt") as protf:
    # The first 22 lines are skipped (presumably the file's header block — confirm).
    f_content = f.readlines()[22:]
    prot_content = protf.readlines()

# Map every ">name" header line to the line that follows it. The first header
# wins, which matches what list.index() returned before. An exact-header lookup
# replaces the old substring test, which could pass and then fail in .index().
protein_sequences = dict()
for header_line, sequence_line in zip(prot_content, prot_content[1:]):
    if header_line.startswith(">"):
        protein_sequences.setdefault(header_line[1:].rstrip("\n"), sequence_line)

# Records are separated by a line that holds only three tab characters.
texts = "".join(f_content)[:-1].split("\n\t\t\t\n")
for text in texts:
    entry = dict()
    for line in text.split("\n"):
        split_line = line.split("\t")
        if len(split_line) < 3:
            # Not an "<id>\t<field>\t<value>" row; nothing to extract.
            continue
        abbr, value = split_line[1], split_line[2]
        if abbr == "DRUGNAME":
            entry["drug_name"] = value
        elif abbr == "DRUGTYPE":
            entry["drug_type"] = value
        elif abbr == "TARGNAME":
            entry["target_name"] = value
        elif abbr == "PCHEMCID":
            # The first five characters are dropped before the number
            # (presumably a "CID: "-style prefix — confirm against the file).
            entry["CID"] = int(value[5:])
            drug_cids.add(value[5:])

    if "drug_name" not in entry:
        # Every later step keys on the drug name, so such a record is unusable.
        print(f"Skipping record without a DRUGNAME field: {text[:60]!r}")
        continue
    drug_name = entry["drug_name"]

    if drug_name + ".sdf" in sdf_file_names: #small molecule drugs
        mol = Chem.SDMolSupplier(sdf_folder + "/" + drug_name + ".sdf")[0]
        if mol is None:
            # RDKit yields None when it cannot parse/sanitize the structure.
            print(f"Could not read SDF structure for {drug_name}")
        else:
            if mol.HasProp('PUBCHEM_COMPOUND_CID'): #check for cid mismatches
                sdf_cid = mol.GetProp('PUBCHEM_COMPOUND_CID')
                try:
                    sdf_cid_number = int(sdf_cid)
                except ValueError:
                    print(f"Ignoring non-numeric CID {sdf_cid!r} in SDF file for {drug_name}")
                else:
                    if "CID" in entry:
                        if sdf_cid_number != entry["CID"]:
                            print(f"Mismatched CID: {sdf_cid} in SDF file vs {entry['CID']} in drug file")
                    else:
                        print(f"Found missing CID: {sdf_cid}")
                        entry["CID"] = sdf_cid_number
                        drug_cids.add(sdf_cid)
            if mol.HasProp('PUBCHEM_OPENEYE_CAN_SMILES'):
                entry["smiles"] = mol.GetProp('PUBCHEM_OPENEYE_CAN_SMILES')
                smiles += 1

    if "CID" not in entry and drug_name in protein_sequences: #protein/peptide drugs
        mol = Chem.MolFromSequence(protein_sequences[drug_name])
        if mol: #some proteins contain unknown amino acids
            entry["smiles"] = Chem.MolToSmiles(mol)
    drugs.append(entry)
print(len(texts), len(drugs), len(drug_cids))

drugs_df = pd.DataFrame(drugs)
# Filter duplicates, keeping the row with the fewest null values.
# drop_duplicates treats NaN keys as equal, so de-duplicating on "CID" alone
# collapses ALL drugs without a CID (e.g. sequence-derived protein drugs) into
# one row. Rows with a CID are therefore keyed by CID, the rest by drug name.
ranked_df = drugs_df.assign(counts=drugs_df.count(axis=1))
ranked_df["dedup_key"] = ranked_df["CID"].where(ranked_df["CID"].notna(), ranked_df["drug_name"])
drugs_df = (
    ranked_df
    .sort_values(['CID', 'counts'])
    .drop_duplicates('dedup_key', keep='last')
    .drop(['counts', 'dedup_key'], axis=1)
)

# drugs_df.to_csv("test_drug_df.csv")

Found missing CID: 219104
Found missing CID: 479503
Found missing CID: 3117


[18:56:51] Explicit valence for atom # 2 N, 4, is greater than permitted
[18:56:51] ERROR: Could not sanitize molecule ending on line 44
[18:56:51] ERROR: Explicit valence for atom # 2 N, 4, is greater than permitted


357 357 154


In [3]:
# PubChem PUG REST takes the CIDs as one comma-separated list in the URL path.
cids_string = ",".join(drug_cids)
pubchem_timeout_seconds = 60  # fail instead of hanging forever if PubChem does not answer

#Use pubchem to get some molecule descriptions
pubchem_desc_request_string = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/" + cids_string + "/description/JSON"
pubchem_desc_http = requests.get(pubchem_desc_request_string, timeout=pubchem_timeout_seconds)
pubchem_desc_http.raise_for_status()  # report HTTP failures here, not as a KeyError below
pubchem_desc_response = pubchem_desc_http.json()
# Entries carrying a "Title" go to the name table, the others that carry a
# "Description" go to the description table.
name_df, desc_df = [], []
for entry in pubchem_desc_response["InformationList"]["Information"]:
    if "Title" in entry:
        name_df.append(entry)
    elif "Description" in entry:
        desc_df.append(entry)
name_df, desc_df = pd.DataFrame(name_df), pd.DataFrame(desc_df)
# drug_desc_df = pd.merge(name_df,desc_df, how = "outer")
drug_desc_df = pd.merge(name_df,desc_df)
# Left merge keeps drugs without any PubChem description. If drug_desc_df holds
# several rows for one CID, that drug is repeated once per description.
drugs_df = pd.merge(drugs_df, drug_desc_df, how="left")


#Use pubchem to get missing SMILES

pubchem_smiles_request_string = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/" + cids_string + "/property/CanonicalSMILES/JSON"
pubchem_smiles_http = requests.get(pubchem_smiles_request_string, timeout=pubchem_timeout_seconds)
pubchem_smiles_http.raise_for_status()
pubchem_smiles_response = pubchem_smiles_http.json()

if "smiles" not in drugs_df.columns:
    # No local structure was found for any drug; create the column so the
    # back-fill below has somewhere to write.
    drugs_df["smiles"] = None

for entry in pubchem_smiles_response["PropertyTable"]["Properties"]:
    # NOTE(review): PubChem is reported to return this property under the key
    # "ConnectivitySMILES" on newer API versions — confirm; both are accepted.
    pubchem_smiles = entry.get("CanonicalSMILES") or entry.get("ConnectivitySMILES")
    if not pubchem_smiles:
        continue
    entry_cid = entry["CID"]
    cid_rows = drugs_df["CID"] == entry_cid
    if not cid_rows.any():
        raise IndexError(f"PubChem returned CID {entry_cid}, which is not present in drugs_df")
    known_smiles = drugs_df.loc[cid_rows, "smiles"].iloc[0]
    if not pd.isna(known_smiles):
        if known_smiles != pubchem_smiles:
            print(f"mismatch: {known_smiles} in file vs {pubchem_smiles} in pubchem")
    else:
        # Fill every row of this CID, not just the first: the description merge
        # above may have repeated the drug, and rows left empty here would be
        # dropped by the dropna below.
        drugs_df.loc[cid_rows & drugs_df["smiles"].isna(), "smiles"] = pubchem_smiles

# Drugs that still have no structure cannot be encoded, so they are removed.
drugs_df = drugs_df.dropna(subset=["smiles"])

# drugs_df.to_csv("pubchem_test.csv")
# print(drugs_df.head())

In [4]:
def get_selfies(row):
    """Encode the SMILES string stored under ``row["smiles"]`` as SELFIES.

    Args:
        row: Any mapping-like object (e.g. a DataFrame row) with a "smiles" entry.

    Returns:
        Whatever ``sf.encoder`` returns for that SMILES string.
    """
    smiles_string = row["smiles"]
    selfies_string = sf.encoder(smiles_string)
    return selfies_string

# Run get_selfies over every row (axis=1) and store the results in a new
# "selfies" column next to the SMILES they were derived from.
selfies_column = drugs_df.apply(get_selfies, axis=1)
drugs_df["selfies"] = selfies_column
# drugs_df.to_csv(ttd_data_folder + "processed_dataset.csv", index=False)

In [5]:
# # Get target descriptions
# targets = []
# with open(ttd_data_folder + "COVID19-Target-Data.txt") as f:
#     content = f.readlines()[25:]
#     texts = "".join(content).split("\n\n") 
#     for text in texts:
#         lines = text.split("\n")
#         entry = dict()
#         for line in lines:
#             split_line = line.split("\t")
#             abbr = split_line[1]
#             if abbr == "TARGNAME":
#                 entry["target_name"] = split_line[2]
#             if abbr == "FUNCTION":
#                 entry["target_function"] = split_line[2]
#         if "target_function" not in entry:
#             entry["target_function"] = "helps with COVID-19"

In [6]:
# process descriptions

def _generalize_description(desc, drug_name):
    """Replace the drug's name in a description with the phrase "The molecule".

    The text before the first " is " is treated as the sentence subject (it may
    be a synonym of the drug name). Both the subject and ``drug_name`` are
    replaced everywhere in ``desc``. Descriptions without " is " are returned
    unchanged.

    Args:
        desc: Description text.
        drug_name: Name of the drug; ignored unless it is a non-empty string.

    Returns:
        The description with the name and subject replaced.
    """
    subject_end = desc.find(" is ")
    if subject_end == -1:
        return desc
    subject = desc[:subject_end]
    # Skip empty or non-string names: str.replace("") would insert the phrase
    # between every character, and a NaN name would raise TypeError.
    names = {name for name in (drug_name, subject) if isinstance(name, str) and name}
    # Longest first, so a name that contains the other one is replaced as a
    # whole ("X triphosphate is" must not become "The molecule triphosphate is").
    for name in sorted(names, key=lambda text: (-len(text), text)):
        desc = desc.replace(name, "The molecule")
    return desc


for index, row in drugs_df.iterrows():
    # Sentence describing the drug's COVID-19 role, built from the TTD fields.
    if pd.isna(row["drug_type"]):
        covid_desc = "The molecule is a COVID-19 drug candidate."
    else:
        covid_desc = f"The molecule is a COVID-19 {row['drug_type']} candidate."
    if not pd.isna(row["target_name"]):
        covid_desc += f" This molecule affects the target {row['target_name']}."

    if pd.isna(row["Description"]):
        # No description from the merge: the COVID-19 sentence is the description.
        drugs_df.loc[index, "Description"] = covid_desc
    else:
        desc = _generalize_description(row["Description"], row["drug_name"])
        desc += (" " + covid_desc)
        drugs_df.loc[index, "Description"] = desc

# drugs_df.to_csv(ttd_data_folder + "processed_dataset.csv", index=False)

In [7]:
#prepare splits
# BioT5 datasets are formatted using https://github.com/allenai/natural-instructions as a template.

os.makedirs("../BioT5/data/splits/covid/mol2text/", exist_ok=True)
os.makedirs("../BioT5/data/splits/covid/text2mol/", exist_ok=True)
# The task JSON files are written here as well, so make sure the folder exists.
os.makedirs("../BioT5/data/tasks/", exist_ok=True)

mol2text_task_id = 201 #arbitrary id number, biot5's task ids go up to 180. 
text2mol_task_id = 202


def _covid_task_template(source, definition, input_language, output_language):
    """Build the task metadata dict with an empty "Instances" list.

    Only the four arguments differ between the two translation directions;
    all other fields are shared.
    """
    return {
        "Contributors": ["Nguyen Duy Minh"],
        "Categories": ["Translation"],
        "Reasoning": [],
        "URL": ["https://db.idrblab.net/ttd/"],
        "Instruction_language": ["English"],
        "Domains": ["Chemistry"],
        "Positive Examples": [],
        "Negative Examples": [],
        "Instances": [],
        "Source": [source],
        "Definition": [definition],
        "Input_language": [input_language],
        "Output_language": [output_language],
        "Instance License": ["Unknown"],
    }


# train_df, val_test_df = train_test_split(drugs_df, test_size=0.2, random_state=42, stratify=drugs_df[["drug_type"]])
# val_df, test_df = train_test_split(val_test_df, test_size=0.1, random_state=42, stratify=drugs_df[["drug_type"]])

# 80% train; the remaining 20% is halved into validation and test.
train_df, val_test_df = train_test_split(drugs_df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(val_test_df, test_size=0.5, random_state=42)

for split_name, df in zip(["train", "validation", "test"], [train_df, val_df, test_df]):
    mol2text = _covid_task_template(
        "Translation from molecule SELFIES to natural language",
        "You are given a molecule SELFIES. Your job is to generate the molecule description in English that fits the molecule SELFIES.",
        "SELFIES",
        "English",
    )
    # text2mol reads English and produces SELFIES; the languages were
    # previously copied unchanged from the mol2text metadata.
    text2mol = _covid_task_template(
        "Translation from natural language to molecule SELFIES",
        "You are given a molecule description in English. Your job is to generate the molecule SELFIES that fits the description.",
        "English",
        "SELFIES",
    )
    for i, row in df.iterrows():
        selfies = row["selfies"]
        desc = row["Description"]
        # <bom>/<eom> wrap the SELFIES string on whichever side it appears.
        mol2text["Instances"].append({
            "id": f"task{mol2text_task_id}-{uuid.uuid4().hex}",
            "input": f"<bom>{selfies}<eom>",
            "output": [desc]
        })
        text2mol["Instances"].append({
            "id": f"task{text2mol_task_id}-{uuid.uuid4().hex}",
            "input": desc,
            "output": [f"<bom>{selfies}<eom>"]
        })

    # Each direction gets a task JSON plus a one-line split file naming that task.
    mol2text_task_name = f'task{mol2text_task_id}_COVID_drug_generation_{split_name}'
    text2mol_task_name = f'task{text2mol_task_id}_COVID_drug_generation_{split_name}'
    with open(f'../BioT5/data/tasks/{mol2text_task_name}.json', "w") as out:
        json.dump(mol2text, out, indent=4)
    with open(f'../BioT5/data/splits/covid/mol2text/{split_name}_tasks.txt', "w") as out:
        out.write(mol2text_task_name)
    with open(f'../BioT5/data/tasks/{text2mol_task_name}.json', "w") as out:
        json.dump(text2mol, out, indent=4)
    with open(f'../BioT5/data/splits/covid/text2mol/{split_name}_tasks.txt', "w") as out:
        out.write(text2mol_task_name)