* Es para obtener más información de relaciones entre los conceptos
    * ``https://uts-ws.nlm.nih.gov/rest/content/current/CUI/C0042904/atoms?apiKey=a6f141b2-6c07-4d21-868e-6d316346dfbd``
* Retorna jsons con MUCHOS links y vueltas.
* Se pueden encontrar relaciones indirectas.
* Se entra por los CUI, con lo que hay que obtener los CUI de los ICD.

In [None]:
import pandas as pd
import json
import os

In [None]:
# Directory prefix used for every pickle this file reads or writes
# (the ICD input pickle and the crawl checkpoint).
path_dir = './'
# Key appended to each request URL as the `apiKey` query parameter.
# NOTE(review): 'a' looks like a placeholder — confirm a real key is supplied before running.
umls_api_key = 'a'

In [None]:
# Load the records stored in icd9_umls.pickle and build the initial crawl
# queue: one atoms-endpoint URL for every record that carries a 'cui' entry.
icds = pd.read_pickle(path_dir + 'icd9_umls.pickle')
url_atoms = 'https://uts-ws.nlm.nih.gov/rest/content/current/CUI/__ID__/atoms'

all_urls = {
    url_atoms.replace('__ID__', record['cui'])
    for _, record in icds.items()
    if 'cui' in record
}

# Number of distinct seed URLs.
len(all_urls)

In [None]:
import urllib.request, urllib.error, urllib.parse
import json
import os
import requests
import time

# def get_json(url): 
#     opener = urllib.request.build_opener()
#     return json.loads(opener.open(url + '&apiKey='+umls_api_key if '?' in url else url + '?apiKey='+umls_api_key).read())


def get_json(url):
    """Fetch ``url`` with the API key attached and return the decoded JSON body.

    The key is appended as an ``apiKey`` query parameter, using ``&`` when the
    URL already contains a ``?`` and ``?`` otherwise. The request is attempted
    up to five times with a 10 second timeout; after every failed attempt a
    message is printed and the function sleeps for three seconds.

    Returns:
        The object produced by ``json.loads`` on the response text, or ``None``
        when every attempt raised a ``requests`` exception.

    Raises:
        json.JSONDecodeError: if the response body is not valid JSON.
    """
    separator = '&' if '?' in url else '?'
    full_url = url + separator + 'apiKey=' + umls_api_key

    response = None
    for _ in range(5):
        try:
            response = requests.get(full_url, timeout=10)
        except requests.exceptions.RequestException:
            # Any request-level failure is reported with the same message.
            print('Timeout...')
            time.sleep(3)
        else:
            break

    if response is None:
        return None

    return json.loads(response.text)

In [None]:
def process_hierarchy(results_, desc=False, sources=None):
    """Collect identifiers and atom URLs of the entries from accepted vocabularies.

    Args:
        results_: parsed response; its ``'result'`` entry is iterated and every
            item must provide ``'rootSource'``, ``'ui'`` and ``'atoms'``.
        desc: when true, the identifier list is returned in reverse order.
        sources: collection of accepted ``'rootSource'`` values. When omitted,
            the module-level ``vocabulary_sources`` is used.

    Returns:
        A ``(ids, urls)`` pair: the list of ``'ui'`` values of the accepted
        entries (in response order, or reversed when ``desc`` is true) and the
        set of their ``'atoms'`` values.
    """
    if sources is None:
        sources = vocabulary_sources

    accepted = [entry for entry in results_['result'] if entry['rootSource'] in sources]

    ids = [entry['ui'] for entry in accepted]
    urls = {entry['atoms'] for entry in accepted}

    if desc:
        ids.reverse()

    return ids, urls

def process_relations(rels, ui=None, sources=None):
    """Split relation records into incoming and outgoing groups for one identifier.

    Args:
        rels: iterable of relation dicts providing ``'rootSource'``,
            ``'additionalRelationLabel'``, ``'relatedId'`` and ``'relatedFromId'``.
        ui: identifier looked for inside ``'relatedId'`` / ``'relatedFromId'``.
            When omitted, the module-level ``cui_save['ui']`` is used, which is
            what the crawl loop relies on.
        sources: collection of accepted ``'rootSource'`` values. When omitted,
            the module-level ``vocabulary_sources`` is used.

    Returns:
        ``(in_, out_, urls)`` where ``in_`` and ``out_`` are
        ``defaultdict(set)`` objects mapping a relation label to the last path
        segment of the related URLs, and ``urls`` is the set of those full URLs.
        A record lands in ``in_`` when ``ui`` occurs in its ``'relatedId'``,
        otherwise in ``out_`` when ``ui`` occurs in its ``'relatedFromId'``.
    """
    if sources is None:
        sources = vocabulary_sources
    if ui is None:
        ui = cui_save['ui']

    in_ = defaultdict(set)
    out_ = defaultdict(set)
    urls = set()

    for rel in rels:
        if rel['rootSource'] not in sources:
            continue

        label = rel['additionalRelationLabel']
        if not label:
            # Unlabelled relations (empty or None label) are ignored.
            continue

        # NOTE(review): this is a substring test on the whole URL, so an
        # identifier that is a prefix of another one would also match — confirm
        # whether an exact match on the last path segment is wanted.
        if ui in rel['relatedId']:
            bucket, other_url = in_, rel['relatedFromId']
        elif ui in rel['relatedFromId']:
            bucket, other_url = out_, rel['relatedId']
        else:
            continue

        bucket[label].add(other_url.split('/')[-1])
        urls.add(other_url)

    return in_, out_, urls

In [None]:
from collections import defaultdict
import pickle

# Crawl loop: pops URLs from `all_urls`, stores one record per accepted atom in
# `all_cuis` and queues the URLs discovered through ancestors, descendants and
# relations. State is checkpointed to disk so an interrupted run can resume.

vocabulary_sources = {'ICD9CM'}

# Checkpoint layout: [all_cuis, pending urls, processed urls].
checkpoint_path = path_dir + 'uts_entities.pickle'
save_every = 20  # number of fully handled URLs between two checkpoints


def _fetch_result(url):
    """Return the parsed response for ``url`` if it carries a 'result' entry, else None."""
    try:
        payload = get_json(url)
    except json.JSONDecodeError:
        return None
    if payload is None or 'result' not in payload:
        return None
    return payload


def _enqueue(urls):
    """Queue the URLs that are neither processed nor already marked as failed."""
    all_urls.update(urls - processed_urls - failed_urls)


def _save_checkpoint():
    """Write the crawl state; failed URLs are stored as pending so a later run retries them."""
    print('Saving....')
    with open(checkpoint_path, 'wb') as file:
        pickle.dump([all_cuis, all_urls | failed_urls, processed_urls], file)


if os.path.exists(checkpoint_path):
    # NOTE(review): unpickling can execute arbitrary code — only load
    # checkpoints written by this notebook.
    saved_state = pd.read_pickle(checkpoint_path)
    all_cuis = saved_state[0]
    all_urls = saved_state[1]
    processed_urls = saved_state[2]
else:
    all_cuis = defaultdict(dict)
    processed_urls = set()
    # `all_urls` deliberately keeps the seed URLs built earlier in this file;
    # resetting it here would leave a fresh run with nothing to crawl.

# URLs whose download failed during this run. They are set aside instead of
# being re-queued immediately, so a permanently failing URL cannot keep the
# loop spinning forever.
failed_urls = set()
handled_since_save = 0

print(len(all_urls), len(processed_urls), len(all_cuis))
while all_urls:

    uu = all_urls.pop()
    try:
        cui_data = get_json(uu)
    except json.JSONDecodeError:
        cui_data = None

    if cui_data is None:
        failed_urls.add(uu)
        print('Error in url...')
        continue

    print(len(all_urls), '---', uu)
    processed_urls.add(uu)

    if 'result' not in cui_data:
        # The server answered, but without a payload to process.
        print('No result in url...')
        continue
    cui_data = cui_data['result']

    if isinstance(cui_data, dict):  # a single-item response comes back as a dict
        cui_data = [cui_data]

    for x in cui_data:
        if x['language'] != 'ENG':
            continue
        if x['rootSource'] not in vocabulary_sources:
            continue

        # `cui_save` must stay a module-level name: process_relations reads it.
        cui_save = {}
        cui_save['ui'] = x['ui']
        cui_save['obsolete'] = x['obsolete'] != 'false'
        cui_save['rootSource'] = x['rootSource']

        if x['sourceConcept'] != 'NONE':
            cui_save['sourceConcept'] = x['sourceConcept']

        cui_save['code'] = x['code']

        code_payload = _fetch_result(cui_save['code'])
        if code_payload is None:
            # Without the code record there is no id to store the atom under.
            continue
        x_code = code_payload['result']

        cui_save['id'] = x_code['ui']

        if x_code['ancestors'] != 'NONE':
            jj = _fetch_result(x_code['ancestors'])
            if jj is not None:
                ancestors_ = process_hierarchy(jj, False)
                cui_save['ancestors'] = ancestors_[0]
                _enqueue(ancestors_[1])

        if x_code['descendants'] != 'NONE':
            jj = _fetch_result(x_code['descendants'])
            if jj is not None:
                # NOTE(review): `desc` is False here as well — confirm whether
                # the reversed order was meant for descendants.
                descendants_ = process_hierarchy(jj, False)
                cui_save['descendants'] = descendants_[0]
                _enqueue(descendants_[1])

        if x_code['relations'] != 'NONE':
            jj = _fetch_result(x_code['relations'])
            if jj is not None:
                rels = process_relations(jj['result'])
                cui_save['relations_in'] = rels[0]
                cui_save['relations_out'] = rels[1]  # { rel_type : {ids} }
                _enqueue(rels[2])

        if x_code['attributes'] != 'NONE':
            cui_save['attributes'] = {}
            att = _fetch_result(x_code['attributes'])
            if att is not None:
                for a in att['result']:
                    cui_save['attributes'][a['name']] = a['value']

        all_cuis[cui_save['ui']][cui_save['id']] = cui_save

    # Checkpoint only between URLs: a URL is marked as processed before its
    # atoms are handled, so saving half-way through would lose the remaining
    # atoms on resume. The pending queue is stored so a restart picks up where
    # it left off; processed URLs are stored to skip repeats.
    handled_since_save += 1
    if handled_since_save >= save_every:
        _save_checkpoint()
        handled_since_save = 0

# Final checkpoint: hand the failed URLs back to the queue for the next run and
# persist whatever was handled since the last periodic save.
all_urls.update(failed_urls)
_save_checkpoint()
                

In [None]:
# Check that every URL recorded as processed has a downloaded entry; the ones
# that do not are collected in `missing` so their download can be repeated.
# NOTE: relies on `tqdm` and `elements`, which are created in cells further
# down in this file, so those cells have to run first.

# Build the lookup once: testing against `elements.values()` directly is a
# linear scan for every URL.
known_ids = set(elements) | set(elements.values())

missing = set()
for purl in tqdm(processed_urls):
    # The second-to-last path segment is the identifier the URL refers to.
    if purl.split('/')[-2] not in known_ids:
        missing.add(purl)
len(missing)

In [None]:
# hay 57k urls que no fueron analizadas... o que se perdieron o que les pasó algo...
# es bastante más que la cantidad de cosas bajadas
# ver de poner de nuevo a bajar

In [None]:
# Number of processed URLs that are not in `missing`.
len(processed_urls - missing)

In [None]:
from tqdm.notebook import tqdm

# Flatten `all_cuis` into a single lookup: record 'id' -> record 'ui', falling
# back to the outer key when a record has no 'ui'. Records without an 'id' are
# printed and left out.
elements = {}
for outer_key, records in tqdm(all_cuis.items()):
    for record in records.values():
        if 'id' not in record:
            print('------', record)
            continue
        elements[record['id']] = record.get('ui', outer_key)

In [None]:
# Display the id -> ui lookup.
elements