### Notebook for extracting relevant information (name, synonyms, related entities, etc.) about cardiovascular drugs from the DrugBank XML and subsequently cleansing the data

In [113]:
import json
import os
import _pickle
import xml.etree.ElementTree as ET
from lxml import etree
from nltk import flatten
from importlib import reload

In [2]:
# Switch to the project directory so that the relative file names used in
# later cells ('fulldb21.xml', 'cv_drugs_dbid', 'cvdrug_ent_pw.json') resolve.
# Set the COVIDII_KG_DIR environment variable to run the notebook on another
# machine; the original author's location is kept as the default.
os.chdir(os.environ.get(
    'COVIDII_KG_DIR',
    r'C:\Users\ttran\OneDrive\Desktop\COVID-CDV-DATA\covidii_KG'))

In [3]:
import parse_xml
from parse_xml import *
import cleanse_data
from cleanse_data import *
import parse_web
from parse_web import *

Reload modules and re-import classes when needed

In [30]:
# Re-execute parse_xml so edits to the module are picked up, then repeat the
# star import so the names bound in this notebook point at the reloaded code.
# The reload must come first; the order of these two lines matters.
reload(parse_xml)
from parse_xml import *

<module 'parse_xml' from 'C:\\Users\\ttran\\OneDrive\\Desktop\\COVID-CDV-DATA\\covidii_graph_codes\\parse_xml.py'>

In [76]:
# Re-execute cleanse_data so edits to the module are picked up, then repeat
# the star import so the names bound in this notebook are refreshed.
# The reload must come first; the order of these two lines matters.
reload(cleanse_data)
from cleanse_data import *

In [56]:
# Re-execute parse_web so edits to the module are picked up, then repeat the
# star import so the names bound in this notebook are refreshed.
# The reload must come first; the order of these two lines matters.
reload(parse_web)
from parse_web import *

In [4]:
# Parse the DrugBank XML file; the path is relative to the working directory.
tree = ET.parse('fulldb21.xml')
# Root element of the document; the cell that builds DATA iterates over its
# direct children.
root = tree.getroot()

Previously, we extracted a list of CV drugs from the XML file. We saved the corresponding DrugBank accession numbers/IDs of those drugs as a pickle file called 'cv_drugs_dbid'. We will use these IDs to extract all the relevant info. about each drug using the ParseXML class.

In [77]:
# Load the previously saved DrugBank IDs of the cardiovascular drugs.
# NOTE(review): unpickling can execute arbitrary code embedded in the file,
# so only load a 'cv_drugs_dbid' file that was produced by a trusted run.
# NOTE(review): `_pickle` is the private C implementation; the public
# `pickle` module provides the same load() and is the supported entry point.
with open("cv_drugs_dbid", 'rb') as f:
    cv_drugs_dbid = _pickle.load(f)

In [78]:
# Build one record (dict) per cardiovascular drug found in the DrugBank XML.
# Each record holds the drug's identifiers and descriptive fields, the
# entities it interacts with (carriers, targets, transporters, enzymes) and
# its pathways.
DATA = []

for ele in root:
    # Look the ID up once per element and skip anything that is not a CV drug.
    ID = ParseXML.getID(ele)
    if ID not in cv_drugs_dbid:
        continue

    # Keys are reproduced exactly: later cells index the records by
    # 'targets', 'carriers', 'transporters' and 'enzymes', and every key ends
    # up in the exported JSON.
    DATA.append({
        "drugbank_id": ID,
        "name": ParseXML.getName(ele),
        "synonyms": ParseXML.getSynonyms(ele),
        "descriptions": ParseXML.getDescription(ele),
        "categories": ParseXML.getCategory(ele),
        "ATC code": ParseXML.getATCCode(ele),
        "indication": ParseXML.getIndication(ele),
        "carriers": ParseXML.getEntities(ele, 'carriers'),
        "targets": ParseXML.getEntities(ele, 'targets'),
        "transporters": ParseXML.getEntities(ele, 'transporters'),
        "enzymes": ParseXML.getEntities(ele, 'enzymes'),
        "pathways": ParseXML.getPathways(ele),
    })

In [79]:
# Display the first record to sanity-check the extracted fields.
DATA[0]

{'drugbank_id': 'DB00009',
 'name': 'Alteplase',
 'synonyms': ['Alteplasa',
  'Alteplase (genetical recombination)',
  'Alteplase, recombinant',
  'Alteplase,recombinant',
  'Plasminogen activator (human tissue-type protein moiety)',
  'rt-PA',
  't-PA',
  't-plasminogen activator',
  'Tissue plasminogen activator',
  'Tissue plasminogen activator alteplase',
  'Tissue plasminogen activator, recombinant',
  'tPA'],
 'descriptions': 'Human tissue plasminogen activator, purified, glycosylated, 527 residues purified from CHO cells',
 'categories': ['Agents causing angioedema',
  'Amino Acids, Peptides, and Proteins',
  'Anticoagulants',
  'Biological Factors',
  'Blood and Blood Forming Organs',
  'Blood Proteins',
  'Cardiovascular Agents',
  'Endopeptidases',
  'Enzymes',
  'Enzymes and Coenzymes',
  'Fibrin Modulating Agents',
  'Fibrinolytic Agents',
  'Hematologic Agents',
  'Hydrolases',
  'Ophthalmologicals',
  'Peptide Hydrolases',
  'Plasminogen Activators',
  'Proteins',
  'Sens

Get the list of indices in DATA of drugs that do not interact with any entity (i.e. the values of 'targets', 'carriers', 'enzymes', and 'transporters' are all empty lists). These drugs will be removed.

In [80]:
# Wrap the extracted records in the cleansing helper from cleanse_data.
oCleanseData = CleanseData(DATA)
# Presumably the positions in DATA of drugs with no interacting entities, as
# described in the preceding markdown cell; verify against CleanseData.
index_list = oCleanseData.getDrugIndexList()
print(index_list)

[221, 233, 235, 238, 249, 254, 267, 273, 274, 276, 279, 289, 301, 305, 311, 314, 317, 319, 320, 323, 324, 326, 328, 329, 331, 332, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 376, 377, 378, 379, 381, 382, 384, 385, 386, 387, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 421, 424, 425, 426, 428, 429, 430, 431, 432, 433, 434, 435, 439, 440, 441, 442, 447, 448, 449]


Get the indices in DATA of drugs that interact with at least one entity with a 'Null' UniProt ID. These entities will be removed, and drugs interacting exclusively with such entities will also be removed. The value in each dictionary lists the names of ALL the entities (not just those with a null UniProt ID) for the drug at that index.

In [82]:
# The displayed result is a list of single-key dicts: DATA index -> names of
# the entities of the drug at that index.
nulluid_index_info = oCleanseData.getNullUIDIndexInfo()
nulluid_index_info

[{134: ['Bile acids']},
 {234: ['mRNA of ApoB-100']},
 {45: ['Bile acids']},
 {237: ['Hemoglobin subunit alpha',
   'Hemoglobin subunit beta',
   'Free radicals',
   'Guanylate cyclase soluble subunit alpha-2',
   'Nitric oxide synthase, endothelial',
   'Aldehyde dehydrogenase, mitochondrial']},
 {208: ["cGMP-inhibited 3',5'-cyclic phosphodiesterase A",
   'Tumor necrosis factor',
   "cGMP-inhibited 3',5'-cyclic phosphodiesterase B",
   "cAMP-specific 3',5'-cyclic phosphodiesterase 4",
   "cAMP-specific 3',5'-cyclic phosphodiesterase 3"]},
 {210: ['Bile acids']},
 {118: ['Adenosine receptor A1',
   'Adenosine receptor A2a',
   'Phosphodiesterase enzymes',
   "5'-nucleotidase",
   'Cytochrome P450 1A2']},
 {119: ['Solute carrier family 12 member 3',
   'Cytochrome P450 3A4',
   'Epoxide hydrolase 1',
   'alpha1-acid glycoprotein',
   'Serum albumin',
   'Erythrocyte']},
 {250: ['Calcium ions', 'Coagulation factor XII']}]

Remove entities with 'Null' UniProt ID

In [83]:
# Rebind DATA to the records with 'Null'-UniProt-ID entities removed.
# NOTE(review): DATA is overwritten here, so re-running earlier inspection
# cells will show the cleansed state, not the raw extraction.
DATA = oCleanseData.removeNullUIDEntity()

In [None]:
DATA[134]
# Spot check: index 134 previously listed the entity 'Bile acids', which has
# no UniProt ID; it should no longer appear in this record.

Remove drugs that interact exclusively with entities with 'Null' UniProt ID

In [85]:
# re-initialize object with updated data
oCleanseData = CleanseData(DATA)

# remove drugs
DATA = oCleanseData.removeDrugs()

In [86]:
# Sanity check: DATA now has 322 drugs instead of the 450 originally extracted.
len(DATA)

322

Make a list (uid_list) of unique UniProt IDs from DATA

In [87]:
# Collect every distinct UniProt ID referenced in DATA, in first-seen order.
# A companion set provides constant-time membership tests, so the growing
# list is not rescanned for every entity; uid_list itself is unchanged in
# content and order.
uid_list = []
_seen_uids = set()
for drug in DATA:
    for entity in ['targets', 'enzymes', 'transporters', 'carriers']:
        for ent in drug[entity]:
            uid = ent['uniprot_id']
            if uid not in _seen_uids:
                _seen_uids.add(uid)
                uid_list.append(uid)

Get a dictionary (dup_uids) mapping each UniProt ID that corresponds to multiple entities (i.e. duplicate UniProt IDs) in DATA to the DrugBank IDs of those entities

In [88]:
# re-initialize object with updated data
oCleanseData = CleanseData(DATA)

dup_uids = oCleanseData.getDupUIDs()

In [89]:
# Display the mapping: UniProt ID -> DrugBank IDs of the entities sharing it.
dup_uids

{'P08588': ['BE0000172', 'BE0004872'],
 'P08684': ['BE0002638', 'BE0004866'],
 'Q12809': ['BE0000090', 'BE0009629'],
 'Q14500': ['BE0001131', 'BE0009631'],
 'P35348': ['BE0000501', 'BE0004863', 'BE0004888'],
 'P08913': ['BE0004864', 'BE0000289'],
 'P00915': ['BE0000267', 'BE0009965'],
 'P35498': ['BE0004901', 'BE0000141', 'BE0009738'],
 'Q13936': ['BE0009739', 'BE0008715', 'BE0000430'],
 'P02763': ['BE0000925', 'BE0004879'],
 'P27815': ['BE0001133', 'BE0009963'],
 'Q06432': ['BE0000679', 'BE0004902'],
 'Q14654': ['BE0000708', 'BE0004923'],
 'P21397': ['BE0002198', 'BE0004909'],
 'Q12791': ['BE0000553', 'BE0004906'],
 'P08908': ['BE0000291', 'BE0004862'],
 'P54750': ['BE0003565', 'BE0004922'],
 'O43497': ['BE0000483', 'BE0008716'],
 'Q09428': ['BE0000207', 'BE0008670'],
 'P21728': ['BE0000020', 'BE0004889', 'BE0009376'],
 'P28482': ['BE0009668', 'BE0000923']}

Make dictionaries with the names, DrugBank IDs, and UniProt IDs as keys. The dictionary with the UniProt IDs as the keys will only contain non-duplicate UniProt IDs. These dictionaries will make extracting info. much easier downstream.

In [90]:
# Lookup tables for the entities in DATA, keyed by entity name, DrugBank ID
# and UniProt ID. The first occurrence of each key wins. uid_dict
# deliberately skips UniProt IDs listed in dup_uids, because those IDs map to
# more than one entity.
#
# ent_name / ent_dbid / ent_uid record the keys in first-seen order. They are
# always appended to together with the matching dict, so membership is tested
# against the dict (hash lookup) instead of scanning the list, and dup_uids
# is queried directly instead of rebuilding list(dup_uids.keys()) for every
# entity.
ent_name = []
ent_dbid = []
ent_uid = []
name_dict = {}
dbid_dict = {}
uid_dict = {}

for drug in DATA:
    for entity in ['targets', 'carriers', 'transporters', 'enzymes']:
        for ent in drug[entity]:
            if ent['name'] not in name_dict:
                ent_name.append(ent['name'])
                name_dict[ent['name']] = {'drugbank_id': ent['drugbank_id'],
                                          'uniprot_id': ent['uniprot_id']}
            if ent['drugbank_id'] not in dbid_dict:
                ent_dbid.append(ent['drugbank_id'])
                dbid_dict[ent['drugbank_id']] = {'name': ent['name'],
                                                 'uniprot_id': ent['uniprot_id']}
            if (ent['uniprot_id'] not in uid_dict
                    and ent['uniprot_id'] not in dup_uids):
                ent_uid.append(ent['uniprot_id'])
                uid_dict[ent['uniprot_id']] = {'name': ent['name'],
                                               'drugbank_id': ent['drugbank_id']}

Make a list (ent2check) containing the DrugBank IDs of all the entities that share a duplicate UniProt ID. Using the ParseWeb class (contains methods for web scraping), we will check whether each ID in the list corresponds to a protein or a protein group. If the latter, then we will find the protein members of the group on the DrugBank webpage corresponding to that group. The entries corresponding to the protein group in DATA will be replaced with the members.

In [91]:
# DrugBank IDs of every entity that shares its UniProt ID with another entity,
# de-duplicated. Sorted so that ent2check, and the lists later derived from
# it, come out in the same order on every kernel restart; the iteration order
# of a set of strings is not stable across interpreter runs.
ent2check = sorted(set(flatten(list(dup_uids.values()))))

From the ent2check list, make a sub-list (protein_dbids) containing only the DrugBank IDs corresponding to proteins, not protein groups. Then, create a list (ent_list) containing dictionaries, each corresponding to a protein group in the ent2check list and holding info. about its name, DrugBank ID, and members.

In [92]:
# Classify every entity in ent2check via its DrugBank bio-entity page.
#   protein_dbids: IDs whose page kind is 'protein'
#   ent_list:      one dict per entity for which getMembers returns a result
ent_list = []
protein_dbids = []
_pages = {}  # DrugBank ID -> (ParseWeb instance, fetched page)

# Pass 1: fetch each page once and record which IDs are plain proteins.
for ent in ent2check:
    oParseWeb = ParseWeb('https://go.drugbank.com/bio_entities/' + ent, ent)
    html = oParseWeb.getHTML()
    # Keep the page so pass 2 does not have to download it a second time.
    # NOTE(review): assumes getKind() does not modify `html`; confirm in
    # parse_web.py.
    _pages[ent] = (oParseWeb, html)
    kind = oParseWeb.getKind(html)
    if kind == 'protein':
        protein_dbids.append(ent)

# Pass 2: getMembers receives the complete protein_dbids list, so it runs
# only after pass 1 has finished for every entity.
for ent in ent2check:
    oParseWeb, html = _pages[ent]
    memberDict = oParseWeb.getMembers(html, dup_uids, uid_list, protein_dbids,
                                      dbid_dict, uid_dict)
    if memberDict is not None:
        ent_list.append(memberDict)

In [93]:
# Display the collected protein groups and their member proteins.
ent_list

[{'name': 'Alpha adrenergic receptor',
  'drugbank_id': 'BE0004888',
  'members': [{'name': 'Alpha-1A adrenergic receptor',
    'drugbank_id': 'BE0000501',
    'uniprot_id': 'P35348'},
   {'name': 'Alpha-1B adrenergic receptor',
    'drugbank_id': 'BE0000575',
    'uniprot_id': 'P35368'},
   {'name': 'Alpha-1D adrenergic receptor',
    'drugbank_id': 'BE0000715',
    'uniprot_id': 'P25100'},
   {'name': 'Alpha-2A adrenergic receptor',
    'drugbank_id': 'BE0000289',
    'uniprot_id': 'P08913'},
   {'name': 'Alpha-2B adrenergic receptor',
    'drugbank_id': 'BE0000572',
    'uniprot_id': 'P18089'},
   {'name': 'Alpha-2C adrenergic receptor',
    'drugbank_id': 'BE0000342',
    'uniprot_id': 'P18825'}]},
 {'name': 'Beta adrenergic receptor',
  'drugbank_id': 'BE0004872',
  'members': [{'name': 'Beta-1 adrenergic receptor',
    'drugbank_id': 'BE0000172',
    'uniprot_id': 'P08588'},
   {'name': 'Beta-2 adrenergic receptor',
    'drugbank_id': 'BE0000694',
    'uniprot_id': 'P07550'},
   

Make a list (ent_dbid) containing the DrugBank IDs of the protein groups.

In [94]:
# DrugBank IDs of the protein groups gathered in ent_list, in the same order.
# Note: this rebinds ent_dbid, which an earlier cell used for the DrugBank IDs
# of all entities.
ent_dbid = [group['drugbank_id'] for group in ent_list]

Merge entities with the same UniProt ID

In [96]:
# before merge
DATA[21]['targets']

[{'name': 'Alpha-2 adrenergic receptors',
  'drugbank_id': 'BE0004864',
  'actions': ['agonist'],
  'uniprot_id': 'P08913'},
 {'name': 'ATP-sensitive inward rectifier potassium channel 1',
  'drugbank_id': 'BE0000644',
  'actions': ['inhibitor'],
  'uniprot_id': 'P48048'},
 {'name': 'Beta adrenergic receptor',
  'drugbank_id': 'BE0004872',
  'actions': ['antagonist'],
  'uniprot_id': 'P08588'}]

In [107]:
# Rebind DATA to the merged records. Judging by the before/after displays,
# protein-group entries are replaced by their member proteins, which carry
# 'group_name' and 'actions_of_group'.
# NOTE(review): oCleanseData must have been built from the current DATA;
# re-run the re-initialization cell first if DATA changed since.
DATA = oCleanseData.mergeDuplicateUIDs(ent_dbid,ent_list)

In [108]:
# after merge
DATA[21]['targets']

[{'name': 'ATP-sensitive inward rectifier potassium channel 1',
  'drugbank_id': 'BE0000644',
  'actions': ['inhibitor'],
  'uniprot_id': 'P48048'},
 {'name': 'Alpha-2A adrenergic receptor',
  'drugbank_id': 'BE0000289',
  'uniprot_id': 'P08913',
  'group_name': 'Alpha-2 adrenergic receptors',
  'actions_of_group': ['agonist']},
 {'name': 'Alpha-2B adrenergic receptor',
  'drugbank_id': 'BE0000572',
  'uniprot_id': 'P18089',
  'group_name': 'Alpha-2 adrenergic receptors',
  'actions_of_group': ['agonist']},
 {'name': 'Alpha-2C adrenergic receptor',
  'drugbank_id': 'BE0000342',
  'uniprot_id': 'P18825',
  'group_name': 'Alpha-2 adrenergic receptors',
  'actions_of_group': ['agonist']},
 {'name': 'Beta-1 adrenergic receptor',
  'drugbank_id': 'BE0000172',
  'uniprot_id': 'P08588',
  'group_name': 'Beta adrenergic receptor',
  'actions_of_group': ['antagonist']},
 {'name': 'Beta-2 adrenergic receptor',
  'drugbank_id': 'BE0000694',
  'uniprot_id': 'P07550',
  'group_name': 'Beta adrenerg

In [110]:
# re-initialize object with updated data
oCleanseData = CleanseData(DATA)

# call getDupUIDs method again to see if the duplicate Uniprot IDs are gone
dup_uids = oCleanseData.getDupUIDs()

In [111]:
# no more duplicate UniProt IDs!
dup_uids

{}

Export cleansed data as json file

In [115]:
# Write the cleansed drug records to a JSON file in the working directory.
with open("cvdrug_ent_pw.json", mode='w') as json_out:
    json.dump(DATA, fp=json_out)