* The original plan was to parse the file with BeautifulSoup, but it 1) took ages to read the full XML, and 2) did not build a hierarchical structure.
* Also tried ``bigxml`` and ``from xml.etree.ElementTree import iterparse``.
* Considered partitioning the document by drug, but in the end there was no need for that.
* ``xmltodict`` worked well, even with the full document. It generates the appropriate hierarchical structure.

In [8]:
import pandas as pd
from tqdm.notebook import tqdm
import xmltodict
import pickle
import os

In [5]:
import re
# Matches any run of whitespace; used to collapse free-text fields onto one line.
# Raw string so that `\s` reaches the regex engine instead of being treated
# as an (invalid) string escape sequence.
re_spaces = re.compile(r'\s+')

In [6]:
# Base directory for every input/output file of this notebook. It is joined
# to file names by plain string concatenation, so it must end with a path
# separator.
path_dir = './'

In [None]:
# Parse the full DrugBank export into nested dicts/lists.
# The file is opened in binary mode and the handle itself is given to
# xmltodict, so the parser consumes it in chunks instead of first building
# one huge Python string that then has to be re-encoded to bytes.
with open(path_dir + 'full database.xml', 'rb') as xml_file:
    data_dict = xmltodict.parse(xml_file)

In [None]:
# For each of these drug dicts, collect the information and relations that
# have to be added to the graph nodes.


def _as_list(value):
    """Wrap a lone dict in a list so single and repeated XML children look alike.

    xmltodict yields a dict for an element that occurs once and a list when it
    occurs several times; everything that is not a dict is returned unchanged.
    """
    return [value] if isinstance(value, dict) else value


# Free-text fields copied with whitespace collapsed: (XML tag, output key).
_TEXT_FIELDS = (
    ('description', 'description'),
    ('indication', 'indication'),
    ('mechanism-of-action', 'mechanism_of_action'),
    ('toxicity', 'toxicity'),
)

# Parsed drugs keyed by their primary DrugBank id.
all_drugs = {}
for drug in tqdm(data_dict['drugbank']['drug']):
    dd = {}

    dd['type'] = drug['@type']

    # A single id is parsed as a dict; with several ids, the one parsed as a
    # dict is stored as the main id and the plain strings as alternates.
    drug_ids = drug['drugbank-id']
    if isinstance(drug_ids, dict):
        dd['drugbank-id'] = drug_ids['#text']
    else:
        for did in drug_ids:
            if isinstance(did, dict):
                dd['drugbank-id'] = did['#text']
            else:
                dd.setdefault('alternate_ids', []).append(did)

    dd['groups'] = drug['groups']['group']
    dd['name'] = drug['name']

    # Missing and empty tags are both skipped (an absent tag no longer raises).
    for tag, key in _TEXT_FIELDS:
        text = drug.get(tag)
        if text is not None:
            dd[key] = re_spaces.sub(' ', text)

    products = drug.get('products')
    if products is not None:
        dd['product'] = {x['name'] for x in _as_list(products['product'])}

    food_interactions = drug.get('food-interactions')
    if food_interactions is not None:
        dd['food_interactions'] = food_interactions['food-interaction']

    drug_interactions = drug.get('drug-interactions')
    if drug_interactions is not None:
        di = _as_list(drug_interactions['drug-interaction'])
        dd['drug_interactions'] = {x['drugbank-id']: x['description'] for x in di}

    categories = drug.get('categories')
    if categories is not None:
        cc = _as_list(categories['category'])
        dd['categories'] = {x['category']: x['mesh-id'] for x in cc}

    # Guard against an empty <classification/> element (parsed as None),
    # the same way the sections above are guarded.
    classification = drug.get('classification')
    if classification is not None:
        dd['classification'] = {
            'description': classification['description'],
            'direct_parent': classification['direct-parent'],
            'kingdom': classification['kingdom'],
            'class': classification['class'],
            'subclass': classification['subclass'],
        }
        # NOTE(review): synonyms are looked up inside `classification`;
        # confirm that this is where the export actually stores them.
        synonyms = classification.get('synonyms')
        if synonyms is not None:
            # A single <synonym> is parsed as a dict, so normalise it to a
            # list before filtering by language.
            dd['classification']['synonyms'] = [
                x['#text']
                for x in _as_list(synonyms['synonym'])
                if x['@language'] == 'english'
            ]

    all_drugs[dd['drugbank-id']] = dd

In [None]:
import pickle

# Persist the parsed drugs so later cells can reload them without re-parsing the XML.
all_drugs_path = path_dir + 'dict_all_drugs.pickle'
with open(all_drugs_path, 'wb') as out_file:
    pickle.dump(all_drugs, out_file)

At this point, the drugs have been parsed and their information has been stored.

Next, we need to check whether any drug in the knowledge graph (KG) is missing from the parsed data, and fix both the nodes and the edges accordingly.

In [None]:
# Reload the parsed drugs from the pickle written by the previous cell
# (pandas' read_pickle is used here simply as a pickle loader).
all_drugs = pd.read_pickle(path_dir + 'dict_all_drugs.pickle')
len(all_drugs)  # number of parsed drug entries

In [None]:
# Check whether any drug is missing: gather the ids of all drug nodes stored
# in the graph's node files.
import os

path_nodes = path_dir + 'pdd_nt/'
graph_drugs = set()
for file_name in tqdm(os.listdir(path_nodes)):
    if '__PDD_nodes' in file_name:
        print(file_name)
        nodes = pd.read_pickle(path_nodes + file_name)
        # Entries without a 'type_' key are prescriptions that were stored
        # incorrectly, so they are ignored here.
        graph_drugs.update(
            node_id
            for node_id, attrs in nodes.items()
            if 'type_' in attrs and 'drug' in attrs['type_']
        )
        del nodes
len(graph_drugs)

In [None]:
# Graph drug ids with no entry in the parsed data (compared directly by id, 12 drugs are missing!).
graph_drugs.difference(all_drugs)

In [None]:
# Also checking the alternate ids, 6 drugs are still missing.
# Prints "<graph id> <main id>" when the graph id is an alternate id of a
# parsed drug, and just "<graph id>" when it cannot be found at all.
for graph_id in graph_drugs:
    if graph_id in all_drugs:
        continue
    for main_id, info in all_drugs.items():
        if graph_id in info.get('alternate_ids', ()):
            print(graph_id, main_id)
            break
    else:
        # No parsed drug lists this id among its alternates.
        print(graph_id)

In [None]:
# Replace the nodes and the edges: each graph drug id on the left is
# rewritten to the id on the right in the node files and in the edge keys.
# (The original literal listed 'DB09396' twice; the duplicate is dropped.)
mapping_missing_drugs = {
    'DB05278': 'DB00030',
    'DB11280': 'DB01914',
    'DB08914': 'DB00030',
    'DB09159': 'DB09154',
    'DB01398': 'DB00936',
    'DB00021': 'DB09532',
    'DB09396': 'DB00647',
    'DB11245': 'DB03088',
    'DB09162': 'DB14520',
    'DB11122': 'DB09255',
    'DB09323': 'DB01053',
}
# Node ids that are deleted from the node files instead of being remapped.
to_remove = ['DB05813']

In [None]:
# Update the node files so that mapped drug ids replace the old ones.
#
# Membership is checked against the live `nodes` dict rather than a snapshot
# taken before the loop, so that:
#   * an old id is always removed, even when its replacement already exists
#     in the same file (the existing replacement node is kept, matching how
#     the edge update drops an old key whose rewritten key already exists);
#   * when two old ids map to the same replacement, the first one renamed is
#     kept instead of being silently overwritten by the second.
for ff in tqdm(os.listdir(path_nodes)):
    if '__PDD_nodes' not in ff:
        continue
    print(ff)
    nodes = pd.read_pickle(path_nodes + ff)

    # Only rewrite the file when something actually changed.
    changed = False
    for old_id, new_id in mapping_missing_drugs.items():
        if old_id not in nodes:
            continue
        old_node = nodes.pop(old_id)
        changed = True
        if new_id not in nodes:
            nodes[new_id] = dict(old_node)
            print(nodes[new_id])

    for n in to_remove:
        if n in nodes:
            del nodes[n]
            changed = True

    if changed:
        print('Updating...')
        with open(path_nodes + ff, 'wb') as file:
            pickle.dump(nodes, file)

In [None]:
# Update the edges so that they point at the mapped drug ids.
# Some edge keys contain two ids that both need replacing, so every mapping
# entry is checked against the key before it is rewritten.
edges_path = path_nodes + '__PDD_edges.pickle'
all_edges = pd.read_pickle(edges_path)

deleted = {}

# Iterate over a snapshot of the keys because all_edges is mutated below.
for old_key in tqdm(set(all_edges.keys())):
    hits = [
        (old_id, new_id)
        for old_id, new_id in mapping_missing_drugs.items()
        if old_id in old_key
    ]
    if not hits:
        continue

    renamed = old_key
    for old_id, new_id in hits:
        renamed = renamed.replace(old_id, new_id)

    # Drop the old edge when rewriting would produce a self-loop or an edge
    # key that already exists; otherwise move its data to the new key.
    endpoints = renamed.split('__')
    if endpoints[0] == endpoints[1] or renamed in all_edges:
        del all_edges[old_key]
        continue

    all_edges[renamed] = dict(all_edges.pop(old_key))

with open(edges_path, 'wb') as out_file:
    pickle.dump(all_edges, out_file)