# Goals
- Download the data needed for the generative pretraining step

In [1]:
import os
import json
import requests
import pandas as pd
from tqdm import tqdm

In [2]:
def get_smile_from_drugbank(db_id, timeout=30):
    """Fetch the SMILES text DrugBank serves for a small-molecule drug.

    Args:
        db_id: DrugBank identifier, e.g. "DB00357".
        timeout: Seconds to wait on the server before `requests` raises a
            timeout error. Without a timeout a stalled connection would block
            the notebook indefinitely.

    Returns:
        The response body decoded as UTF-8. The HTTP status is not checked, so
        for a failed lookup this is whatever body the server sent back rather
        than a SMILES string.
    """
    response = requests.get(
        f"https://go.drugbank.com/structures/small_molecule_drugs/{db_id}.smiles",
        timeout=timeout,
    )
    return response.content.decode()

def get_gene_sequence_from_uniprot(ut_id, timeout=30):
    """Fetch a UniProtKB entry as FASTA and return only its sequence.

    Args:
        ut_id: UniProt accession, e.g. "P23219".
        timeout: Seconds to wait on the server before `requests` raises a
            timeout error. Without a timeout a stalled connection would block
            the notebook indefinitely.

    Returns:
        Every line of the response after the first, concatenated without
        newlines. The HTTP status is not checked, so a failed lookup yields
        whatever follows the first line of the server's response body.
    """
    response = requests.get(
        f"https://rest.uniprot.org/uniprotkb/{ut_id}.fasta",
        timeout=timeout,
    )
    fasta = response.content.decode()
    # Drop the first line (the FASTA header) and join the wrapped sequence
    # lines into one string.
    return "".join(fasta.split("\n")[1:])

In [3]:
# Load the tab-separated table; later cells read its "#Drug" and "Gene" columns.
df = pd.read_csv(
    "../data/ChG-Miner_miner-chem-gene.tsv",
    sep="\t",
)

In [4]:
# Spot-check the DrugBank helper on a single id before the bulk download.
get_smile_from_drugbank("DB00357")

'CCC1(CCC(=O)NC1=O)C1=CC=C(N)C=C1'

In [5]:
# Spot-check the UniProt helper on a single accession before the bulk download.
get_gene_sequence_from_uniprot("P23219")

'MSRSLLLWFLLFLLLLPPLPVLLADPGAPTPVNPCCYYPCQHQGICVRFGLDRYQCDCTRTGYSGPNCTIPGLWTWLRNSLRPSPSFTHFLLTHGRWFWEFVNATFIREMLMRLVLTVRSNLIPSPPTYNSAHDYISWESFSNVSYYTRILPSVPKDCPTPMGTKGKKQLPDAQLLARRFLLRRKFIPDPQGTNLMFAFFAQHFTHQFFKTSGKMGPGFTKALGHGVDLGHIYGDNLERQYQLRLFKDGKLKYQVLDGEMYPPSVEEAPVLMHYPRGIPPQSQMAVGQEVFGLLPGLMLYATLWLREHNRVCDLLKAEHPTWGDEQLFQTTRLILIGETIKIVIEEYVQQLSGYFLQLKFDPELLFGVQFQYRNRIAMEFNHLYHWHPLMPDSFKVGSQEYSYEQFLFNTSMLVDYGVEALVDAFSRQIAGRIGGGRNMDHHILHVAVDVIRESREMRLQPFNEYRKRFGMKPYTSFQELVGEKEMAAELEELYGDIDALEFYPGLLLEKCHPNSIFGESMIEIGAPFSLKGLLGNPICSPEYWKPSTFGGEVGFNIVKTATLKKLVCLNTKTCPYVSFRVPDASQDDGPAVERPSTEL'

In [6]:
# Distinct ids found in the "#Drug" and "Gene" columns of the table.
drugs, genes = (df[column].unique() for column in ("#Drug", "Gene"))

In [7]:
# Report how many distinct drugs and targets there are, one line each.
print(
    f"There are {len(drugs)} drugs",
    f"There are {len(genes)} targets",
    sep="\n",
)

There are 5018 drugs
There are 2325 targets


In [8]:
# Convert the unique ids to plain Python lists. They are derived from `df`
# rather than reassigned in place so this cell can be re-run: a list has no
# `.tolist()`, so the in-place form fails on its second execution.
drugs = df["#Drug"].unique().tolist()
genes = df["Gene"].unique().tolist()

In [None]:
# Lookups filled by the download cell.
db_id_to_smile = dict()  # drug id -> text returned by get_smile_from_drugbank
ut_id_to_seq = dict()  # gene id -> text returned by get_gene_sequence_from_uniprot

In [None]:
# Bulk download. Ids already present in the lookups are skipped, so if a
# request fails part-way through, re-running this cell resumes where it
# stopped instead of repeating every earlier request.
print("Downloading SMILEs for drug id's in dataset")

for drug_id in tqdm(drugs):
    if drug_id in db_id_to_smile:
        continue
    db_id_to_smile[drug_id] = get_smile_from_drugbank(drug_id)

print("Downloading Gene Sequence for protein id's in dataset")

for uniprot_id in tqdm(genes):
    if uniprot_id in ut_id_to_seq:
        continue
    ut_id_to_seq[uniprot_id] = get_gene_sequence_from_uniprot(uniprot_id)

In [None]:
# Collect every distinct character used across the downloaded SMILES strings,
# printing the drug id of any entry that contains "!" or a newline
# (presumably a sign that the download returned something other than a
# SMILES string -- confirm against the flagged entries).
tokens = []
for key, smile in db_id_to_smile.items():
    for t in set(smile):
        if t in ("!", "\n"):
            print(key)
        if t not in tokens:
            tokens.append(t)
print(len(tokens))

In [None]:
# Collect every distinct character used across the downloaded sequences,
# printing the id of any entry that contains "!" or a newline
# (presumably a sign of a bad download -- confirm against the flagged entries).
tokens = []
for seq_id, sequence in ut_id_to_seq.items():
    for ch in set(sequence):
        if ch == "!" or ch == "\n":
            print(seq_id)
        if ch in tokens:
            continue
        tokens.append(ch)
print(len(tokens))

In [None]:
# Display the collected characters in sorted order; `tokens` holds the result
# of whichever token-collection cell ran last.
sorted(tokens)

In [None]:
# Persist both lookups as JSON. The context managers close (and therefore
# flush) each file as soon as it is written, instead of leaving open handles
# to be cleaned up by garbage collection.
with open("../data/databankid_to_smile.json", "w") as smile_file:
    json.dump(db_id_to_smile, smile_file)
with open("../data/uniprotid_to_seq.json", "w") as seq_file:
    json.dump(ut_id_to_seq, seq_file)