## Get All Synonyms

Este notebook abre la tabla de compuestos (de Food, Drug o cualquier otra categoría) y, para cada compuesto, obtiene sus sinónimos de PubChem, sin volver a consultar un ID que ya tiene sinónimos.

In [1]:
# import modules

import csv
import os
import pickle
import traceback
from urllib.parse import quote

import pandas as pd
import requests
from tqdm import tqdm

In [2]:
# Open Table Database

file_name = os.path.join(".", "TaggerData", "HMDB", "diterpenoids", "hmdb_diterpenoids.tsv")

# newline='' is what the csv module expects from the file object, and the
# encoding is explicit because compound names contain non-ASCII characters
# (e.g. Greek letters) that must not depend on the platform's default codec.
with open(file_name, 'r', newline='', encoding='utf-8') as f:

    file = csv.reader(f, delimiter='\t')
    next(file, None)  # skip the header row; tolerate an empty file

    # Keep only the first two columns of each row. Blank or short rows are
    # skipped because later cells unpack every entry as a 2-item pair.
    row_list = [row[:2] for row in file if len(row) >= 2]

In [4]:
# Define getSynonymFunction

def getSynonyms(name, timeout=30):
    '''
    Fetch the synonyms of a compound name from the PubChem PUG REST service.

    Parameters
    ----------
    name : str
        Compound name to look up. It is percent-encoded before being placed
        in the URL path, so characters such as "/", "#", "?" or spaces do
        not alter the request.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list
        One string per non-empty line of the TXT response, or the sentinel
        ``[-1]`` when the request raises or the response status is not OK.
    '''

    url_prolog = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
    # safe='' so that "/" inside a name is escaped too instead of being
    # read as a path separator.
    url_input = "/compound/name/" + quote(name, safe='')
    operation = "/synonyms"
    output = "/TXT"

    url = url_prolog + url_input + operation + output

    try:
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(url, timeout=timeout)

    except Exception:
        # Best effort: report the problem and let the caller skip this name.
        traceback.print_exc()
        return [-1]

    if not response.ok:
        return [-1]

    # Do not assume a trailing newline: drop empty lines instead of blindly
    # discarding the last element, and tolerate CRLF line endings.
    lines = (line.rstrip('\r') for line in response.text.split('\n'))
    return [line for line in lines if line]

In [5]:
# Loop over compounds to get synonyms

reviewed_ID = []  # database IDs for which synonyms were already retrieved
all_syns = []     # [[[syn1, syn2, ...], db_id], [[syn1, syn2, ...], db_id], ...]

for name, db_id in tqdm(row_list):

    # Query PubChem only for IDs that have not produced synonyms yet.
    if db_id not in reviewed_ID:
        req = getSynonyms(name)

        # [-1] is the failure sentinel of getSynonyms; on failure the ID stays
        # unreviewed, so a later name with the same ID is still tried.
        if req != [-1]:
            reviewed_ID.append(db_id)
            all_syns.append([req, db_id])

# Persist the results next to the input table, using its path without extension.
pickle_path = os.path.splitext(file_name)[0]
with open(pickle_path, 'wb') as f:
    pickle.dump(all_syns, f)


100%|██████████| 921/921 [02:43<00:00,  5.63it/s]


In [6]:
# Reload the pickled synonyms from the input table's path without its extension.
pickle_file = os.path.splitext(file_name)[0]
with open(pickle_file, 'rb') as f:
    all_syns_open = pickle.load(f)

In [7]:
# Flatten [[synonyms, db_id], ...] into one [name, db_id] row per synonym.
rows_pubchem = [
    [synonym, compound_id]
    for synonyms, compound_id in all_syns_open
    for synonym in synonyms
]

In [8]:
# Combined table: PubChem synonym rows first, then the rows of the original table.
all_rows = [*rows_pubchem, *row_list]

In [9]:
# Row counts, one per line: original table, PubChem synonyms, combined.
print(len(row_list), len(rows_pubchem), len(all_rows), sep='\n')

921
2885
3806


In [10]:
# Build the Name / HMDB_ID table and drop exact duplicate rows.
df = pd.DataFrame(all_rows, columns=['Name', 'HMDB_ID']).drop_duplicates()

# NOTE(review): file_name is the same path the input table was read from, so
# this overwrites the input; a re-run of the notebook starts from this output.
df.to_csv(file_name, sep='\t', index=False)

In [11]:
# Display the de-duplicated Name / HMDB_ID table.
df

Unnamed: 0,Name,HMDB_ID
0,"11-[(2R)-3-[2-amino-3-methyl-4-(2-methyl-1,3-t...",HMDB0127547
1,"11-[(2R)-3-[2-amino-3-methyl-4-(2-methyl-1,3-t...",HMDB0127548
2,"11-[(2R,3S)-3-[2-amino-3-methyl-4-(2-methyl-1,...",HMDB0127524
3,Gibberellin A80,HMDB0036892
4,11b-Hydroxygibberellin A7,HMDB0036892
...,...,...
3801,α-bixin,HMDB0035317
3802,α-crocin,HMDB0002398
3803,α-hydroxyphytanate,HMDB0061666
3804,α-hydroxyphytanic acid,HMDB0061666
