Once we have all the entities we are interested in, we build the triples.

Two options:
1. Build the triples as simply as possible. Attributes are also expressed as relations.
2. Build a Neo4j graph, where nodes can have attributes and can be used for embeddings.

In [None]:
import pandas as pd
import os
from tqdm.notebook import tqdm

In [None]:
# Directory that holds every pickle read or written by this notebook.
path_dir = './'

# Previously collected entities, loaded as an indexable container.
aa = pd.read_pickle(f'{path_dir}uts_entities.pickle')

# Sizes of: aa[0] dict of entities, aa[1] set of URLs, aa[2] processed URLs.
tuple(len(aa[position]) for position in range(3))

In [None]:
# Base directory of the notebook's data files.
path_dir = './'
# Sub-directory containing the PDD node and edge pickles.
path_nodes = f'{path_dir}pdd_nt/'

#### Simple triples

Everything is a triple: node types, node attributes and edges are all stored as (subject, predicate, object) tuples.

In [None]:
from collections import deque
import pickle

# Ids of the nodes already converted; a node seen again is printed and skipped.
added_nodes = set()

# Each node type / node attribute becomes one (subject, predicate, object) tuple.
triples = deque()

for filename in tqdm(os.listdir(path_nodes)):
    # Only files whose name contains '__PDD_nodes' are node dumps.
    if '__PDD_nodes' not in filename:
        continue
    print(filename)
    nodes = pd.read_pickle(path_nodes + filename)

    for node_id, attrs in tqdm(nodes.items()):
        if node_id in added_nodes:
            print(node_id)
            continue
        added_nodes.add(node_id)

        kind = attrs['type_']

        if kind == 'vocabulary:Prescription':
            # Prescription attributes are read unconditionally (KeyError if absent).
            triples.append((node_id, 'is a', 'prescription'))
            triples.append((node_id, 'has a duration of', attrs['duration_days']))
            triples.append((node_id, 'has a dose of', attrs['dose']))
            triples.append((node_id, 'has a drug of', attrs['drug_type']))
        elif kind == 'drug':
            triples.append((node_id, 'is a', 'drug'))
        elif kind == 'vocabulary:Admission':
            triples.append((node_id, 'is a', 'admission'))
            # Admission attributes are optional: emit only the ones present.
            for key, predicate in (
                ('age', 'has age'),
                ('gender', 'has gender'),
                ('bmi_first', 'has bmi'),
            ):
                if key in attrs:
                    triples.append((node_id, predicate, attrs[key]))
        elif kind == 'vocabulary:Patient':
            triples.append((node_id, 'is a', 'patient'))
        elif kind == 'ICD_diagnose':
            # Only the text after the last '/' of the node id is used as subject.
            triples.append((node_id.rsplit('/', 1)[-1], 'is a', 'diagnose'))
        else:
            # Unknown node type: show it so it can be handled later.
            print(node_id, attrs)

    # Drop the loaded file before reading the next one.
    del nodes

with open(f'{path_dir}__triples_nodes_pdd.pickle', 'wb') as out_file:
    pickle.dump(triples, out_file)

len(triples)

In [None]:
# Edge dictionary; later iterated with .items() as (edge key, edge data).
edges = pd.read_pickle(f'{path_nodes}__PDD_edges.pickle')

In [None]:
from collections import deque
import pickle

# Predicate emitted for every known edge type.
edge_predicates = {
    'has_prescription': 'has prescription',
    'hospital_admission_id': 'was admitted in',
    'take_drug_id': 'takes drug',
    'take_drugbank_id': 'takes drug',
    'diagnoses_icd9': 'was diagnosed',
    'interact': 'interacts with',
}

# Edge types with no predicate above; one example of each is printed.
unknown_edge_types = set()

triples = deque()
for e, data in tqdm(edges.items()):
    edge_type = data['type_']
    predicate = edge_predicates.get(edge_type)

    if predicate is None:
        # Report the unknown type and keep going. The previous `break` stopped
        # at the first unknown edge, and the truncated deque was still pickled.
        if edge_type not in unknown_edge_types:
            unknown_edge_types.add(edge_type)
            print(e, data)
        continue

    # The edge key is '<source>___<target>'.
    ss = e.split('___')
    triples.append((ss[0], predicate, ss[1]))

with open(path_dir + '__triples_edges_pdd.pickle', 'wb') as file:
    pickle.dump(triples, file)

len(triples)

In [None]:
# Drug records, later looked up by drug node id (`drugs[d]`).
drugs = pd.read_pickle(f'{path_dir}dict_all_drugs.pickle')

In [None]:
# Reload the node triples written by the "Simple triples" cell.
triples_nodes = pd.read_pickle(f'{path_dir}__triples_nodes_pdd.pickle')

In [None]:
# Number of node triples that were loaded.
len(triples_nodes)

In [None]:
# Subjects of every node triple whose object is 'drug'.
drugs_nodes = {triple[0] for triple in tqdm(triples_nodes) if triple[2] == 'drug'}

len(drugs_nodes)

In [None]:
drugs_triples = set()

# Drugs whose record has already been expanded into triples.
processed = set()

# `drugs_nodes` is used as the work queue and is consumed by this loop;
# interaction partners discovered on the way are pushed back into it.
while drugs_nodes:
    d = drugs_nodes.pop()
    print(len(drugs_nodes), d)

    # Skip ids without a record, and ids that were already expanded.
    if d not in drugs or d in processed:
        continue
    processed.add(d)

    data = drugs[d]

    if 'group' in data:
        drugs_triples.update((d, 'is in group', g) for g in data['group'])

    # Single-valued descriptive fields, each optional.
    for key, predicate in (
        ('name', 'is known as'),
        ('description', 'can be described as'),
        ('mechanism_of_action', 'acts as'),
    ):
        if key in data:
            drugs_triples.add((d, predicate, data[key]))

    if 'drug_interactions' in data:
        partners = data['drug_interactions'].keys()
        drugs_triples.update((d, 'interacts with', partner) for partner in partners)
        # Queue the interacting drugs so they get expanded too.
        drugs_nodes.update(partners)

    if 'indication' in data:
        drugs_triples.add((d, 'is indicated for', data['indication']))

with open(f'{path_dir}__triples_drugs_pdd', 'wb') as out_file:
    pickle.dump(drugs_triples, out_file)

len(drugs_triples)

In [None]:
# No relations between diseases in this file... those are in the other one.
icds = pd.read_pickle(f'{path_dir}icd9_umls.pickle')

cuis_info = aa[0]

# Reverse index: inner key of `cuis_info` -> outer key that contains it.
# When an inner key appears under several outer keys, the last one wins.
# NOTE(review): presumably outer keys are CUIs and inner keys ICD codes - confirm.
icd_index_cui = {
    inner_key: outer_key
    for outer_key, inner in cuis_info.items()
    for inner_key in inner
}
icd_index_cui

In [None]:
# Display the loaded ICD data for inspection.
icds

In [None]:
# Subjects of every node triple whose object is 'diagnose'.
icds_nodes = {triple[0] for triple in tqdm(triples_nodes) if triple[2] == 'diagnose'}

len(icds_nodes)

In [None]:
icds_triples = set()

# Work queue: a copy, so `icds_nodes` itself is left untouched.
icds_nodes_ = set(icds_nodes)

# Codes already taken from the queue (whether or not they exist in `icds`).
processed_icds = set()

while icds_nodes_:
    code = icds_nodes_.pop()
    if code in processed_icds:
        continue
    processed_icds.add(code)

    if code not in icds:
        continue

    print(len(icds_nodes_), code)

    entry = icds[code]
    icds_triples.add((code, 'is known as', entry['label']))
    # The type is emitted only when the 'STY' value is itself an entry of `icds`.
    if 'STY' in entry and entry['STY'] in icds:
        icds_triples.add((code, 'has type', icds[entry['STY']]['label']))

    # Everything below needs the extra info indexed through `icd_index_cui`.
    if code not in icd_index_cui:
        continue
    details = cuis_info[icd_index_cui[code]][code]

    if 'ancestors' in details and len(details['ancestors']) > 0:
        # Chain: code -> ancestors[0] -> ancestors[1] -> ...
        # Every ancestor is also queued for expansion.
        child = code
        for parent in details['ancestors']:
            icds_triples.add((child, 'is child of', parent))
            icds_nodes_.add(parent)
            child = parent

    if 'relations_out' in details:
        for relation, targets in details['relations_out'].items():
            for target in targets:
                if target not in cuis_info:
                    continue
                # Replace the target by the first key stored under it in `cuis_info`.
                target_code = list(cuis_info[target])[0]
                if target_code != code:  # no self-loops
                    icds_triples.add((code, relation.replace('_', ' '), target_code))

    if 'relations_in' in details:
        for relation, sources in details['relations_in'].items():
            for source in sources:
                if source not in cuis_info:
                    continue
                # NOTE(review): unlike 'relations_out', the related id is used as-is
                # (not mapped through `cuis_info`) and the triple still has `code`
                # as subject - confirm this is intended.
                if source != code:
                    icds_triples.add((code, relation.replace('_', ' '), source))

len(icds_triples)

In [None]:
import pickle

# Persist the ICD triples built in the previous cell.
with open(f'{path_dir}__triples_icds_pdd', 'wb') as out_file:
    pickle.dump(icds_triples, out_file)