### Notebook for extracting cardiovascular drugs from DrugBank XML

In [1]:
import json
import os
import pickle
import _pickle

import pandas as pd
from lxml import etree

In [2]:
import xml.etree.ElementTree as ET
# Parse the DrugBank XML file into memory. `root` is the top-level element that
# the later cells loop over (`for ele in root`).
# NOTE(review): the file name is relative and this cell runs before the os.chdir()
# call further down, so it presumably resolves against the directory the kernel
# was started in — confirm the file lives there.
tree = ET.parse('fulldb21.xml')
root = tree.getroot()

In [None]:
import parse_xml
from parse_xml import *

#### Identify DrugBank IDs and names of the ~160 cardiovascular drugs in the 'drug-category-target.csv' file

In [12]:
# Switch the working directory to the data folder so the relative file names used
# from here on (the two reads below and any later relative read/write) resolve there.
# NOTE(review): machine-specific absolute path — must be edited to run on another machine.
os.chdir(r'C:\Users\ttran\OneDrive\Desktop\COVID-CDV-DATA')
# Drug table with name / category / target columns (no DrugBank ID column).
df = pd.read_csv('drug-category-target.csv')
# Drug table with name / ID / category / desc / syn columns.
df2 = pd.read_excel('cvd_drug_syn.xlsx')

In [173]:
# Preview the first two rows of the CSV table.
df.head(2)

Unnamed: 0.1,Unnamed: 0,name,category,target
0,0,heparin,Anticoagulants,"['P01008', 'P00742', 'P16109', 'P22455', 'P086..."
1,1,warfarin,Anticoagulants,"['Q9BQB6', 'O75469']"


In [174]:
# Preview the first two rows of the spreadsheet table (this one carries the DrugBank ID).
df2.head(2)

Unnamed: 0,name,ID,category,desc,syn
0,heparin,DB01109,Anticoagulants,Unfractionated heparin (UH) is a heterogenous ...,"Calciparine,Eparina,heparina,Heparinate,Hepari..."
1,warfarin,DB00682,Anticoagulants,Warfarin is an anticoagulant drug normally use...,"4-Hydroxy-3-(3-oxo-1-phenylbutyl)coumarin,Coum..."


In [175]:
# (rows, columns) of each table, CSV first and spreadsheet second.
print(df.shape, df2.shape)

(161, 4) (142, 5)


In [13]:
# check for duplicates: count the distinct drug names in each table.
# dropna=False keeps a missing name counted as one value, matching len(.unique()).
n_unique_df = df['name'].nunique(dropna=False)
n_unique_df2 = df2['name'].nunique(dropna=False)
print(n_unique_df, n_unique_df2)

# out of the 161 drugs, only 156 are unique for df.

156 142


The 'drug-category-target.csv' file contains more drugs than the 'cvd_drug_syn.xlsx' file. However, the csv file (df) does not have ID info.

In [14]:
# drugs in df that are not in df2 (names compared case-insensitively):
names_in_df = {name.lower() for name in df['name'].tolist()}
diff = list(names_in_df.difference(name.lower() for name in df2['name'].tolist()))
print(diff)

['insulin aspart (novolog)', 'glyburide (glynase)', 'glipizide (glucotrol)', 'tolbutamide (orinase)', 'pioglitazone (actos)', 'insulin lispro (humalog)', 'insulin detemir (levemir)', 'insulin glulisine (apidra)', 'insulin regular (humulin r, novolin r)', 'rosiglitazone (avandia)', 'chlorpropamide (diabenese)', 'nph, neutral protamine hagedorn', 'insulin glargine (lantus/toujeo)', 'insulin']


These are the drugs we still need to identify in DrugBank because they do not have an ID. Their names are also likely to differ from those used in DrugBank. Note that 'insulin regular (humulin r, novolin r)', 'insulin', and 'nph, neutral protamine hagedorn' are all annotated as human insulin ('Insulin human', DB00030) in DrugBank, with the latter (NPH) being a product of human insulin. Therefore, we only need to keep one of these entries.

In [15]:
# Strip the alternative/brand name in parentheses, e.g.
# 'glyburide (glynase)' -> 'glyburide', so the remaining name can be compared
# with the names and synonyms stored in the DrugBank XML.
df_drug_list = [drug.split(" (", 1)[0] for drug in diff]
# Set copy of the same names: O(1) membership tests inside the loop below.
unmatched_names = set(df_drug_list)

# One {"name", "ID"} record per DrugBank entry that matches an unmatched name.
cv_drug_list = []
for ele in root:
    name = ParseXML.getName(ele)
    synonyms = {syn.lower() for syn in ParseXML.getSynonyms(ele)}
    # Keep the entry when its lower-cased name, or any lower-cased synonym,
    # equals one of the unmatched names. (The original tested the synonym
    # overlap twice with two equivalent `any(...)` expressions.)
    if name.lower() in unmatched_names or not unmatched_names.isdisjoint(synonyms):
        cv_drug_list.append({"name": name, "ID": ParseXML.getID(ele)})

In [16]:
# Show the DrugBank entries matched for the names that were missing from df2.
cv_drug_list

[{'name': 'Insulin human', 'ID': 'DB00030'},
 {'name': 'Insulin lispro', 'ID': 'DB00046'},
 {'name': 'Insulin glargine', 'ID': 'DB00047'},
 {'name': 'Rosiglitazone', 'ID': 'DB00412'},
 {'name': 'Chlorpropamide', 'ID': 'DB00672'},
 {'name': 'Glyburide', 'ID': 'DB01016'},
 {'name': 'Glipizide', 'ID': 'DB01067'},
 {'name': 'Tolbutamide', 'ID': 'DB01124'},
 {'name': 'Pioglitazone', 'ID': 'DB01132'},
 {'name': 'Insulin aspart', 'ID': 'DB01306'},
 {'name': 'Insulin detemir', 'ID': 'DB01307'},
 {'name': 'Insulin glulisine', 'ID': 'DB01309'}]

In [17]:
# Add every DrugBank entry whose ID is listed in df2 to cv_drug_list.
# The ID collection is built once here; the original rebuilt df2['ID'].tolist()
# for every element of the XML tree.
df2_ids = set(df2['ID'].tolist())
# IDs already collected, so re-running this cell (or an entry matched in the
# previous step) cannot put the same drug in the list twice.
collected_ids = {drug['ID'] for drug in cv_drug_list}

for ele in root:
    ID = ParseXML.getID(ele)
    if ID in df2_ids and ID not in collected_ids:
        cv_drug_list.append({"name": ParseXML.getName(ele), "ID": ID})
        collected_ids.add(ID)

In [18]:
# Number of drugs collected so far (name-matched entries plus entries listed in df2).
len(cv_drug_list)

154

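This is the expected number of drugs in the list: 142 from df2 + 12 from df = 154. (The 14 names missing from df2 yield only 12 entries because, as noted above, three of them correspond to the single 'Insulin human' entry.)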

#### Get DrugBank IDs and names of all drugs with ATC classification "C"

In [19]:
# XML namespace prefix used by the DrugBank element tags.
DRUGBANK_NS = "{http://www.drugbank.ca}"

# One {"name", "ID"} record per entry that has at least one ATC code starting with "C".
atc_drug_list = []

for ele in root:
    atc_codes_node = ele.find(DRUGBANK_NS + "atc-codes")
    if atc_codes_node is None:
        # No <atc-codes> child: nothing to classify. This is the case the
        # original bare `except:` was relying on (find() returns None).
        continue

    codes = (
        atc_code.get('code')
        for atc_code in atc_codes_node.findall(DRUGBANK_NS + "atc-code")
    )
    # `code or ""` tolerates a missing or empty 'code' attribute instead of
    # raising; the remaining codes of the same entry are still checked.
    if any((code or "").startswith("C") for code in codes):
        # Each entry is appended at most once, however many "C" codes it has.
        atc_drug_list.append({"name": ParseXML.getName(ele),
                              "ID": ParseXML.getID(ele)})

In [20]:
# Number of entries with at least one ATC code starting with "C".
len(atc_drug_list)

399

There are ~400 drugs that have an ATC code beginning with "C", which indicates drugs related to the cardiovascular system.

In [21]:
# DrugBank IDs of the ATC-derived drugs and of the drugs collected so far,
# kept as two plain lists so they can be compared in the next step.
tmp_ID = [entry['ID'] for entry in atc_drug_list]
cv_drug_list_ID = [entry['ID'] for entry in cv_drug_list]

In [22]:
# find number of drugs extracted using ATC code that are not already in
# dfs we started out with.
# NOTE: this rebinds `diff`, which earlier held drug *names*; from here on it
# holds DrugBank *IDs* (in arbitrary set order).
diff = list(set(tmp_ID).difference(cv_drug_list_ID))
len(diff)

296

In [23]:
# Append the ATC-derived drugs that are new (their ID is in `diff`) to cv_drug_list.
# `diff` is a list, so take a set copy for O(1) membership tests.
new_ids = set(diff)
# IDs currently in cv_drug_list: guards against appending the same drugs again
# when this cell is re-run without rebuilding the list first.
present_ids = {drug['ID'] for drug in cv_drug_list}

for drug in atc_drug_list:
    drug_id = drug['ID']
    if drug_id in new_ids and drug_id not in present_ids:
        cv_drug_list.append(drug)
        present_ids.add(drug_id)

In [187]:
# Total number of CV drugs after merging in the ATC-derived entries.
len(cv_drug_list)
# 154 drugs from dfs + 296 from ATC code = 450

450

In [24]:
# make list containing only IDs of CV drugs (same order as cv_drug_list)
cv_drug_list_ID = [entry['ID'] for entry in cv_drug_list]
len(cv_drug_list_ID)

450

#### Export list containing CV drug DrugBank IDs. This list will be used as input to generate a list containing CV drugs + relevant info. about each drug (related entities, name, syn, etc.)

In [26]:
# Write the list of CV-drug DrugBank IDs to disk as a pickle file.
# The file name is relative, so it is created in the current working directory.
# Uses the public `pickle` module instead of the private `_pickle` accelerator;
# `pickle` delegates to that same C implementation when it is available.
with open("cv_drugs_dbid", 'wb') as f:
    pickle.dump(cv_drug_list_ID, f)

#### Get all elements (run if needed)

In [None]:
# Tag of every element yielded by root.iter(), collected into one flat list
# (can be large for the full XML file).
allelements = [elem.tag for elem in root.iter()]

In [None]:
# Positions in `allelements` whose tag is the namespaced DrugBank 'enzyme' tag.
[i for i, x in enumerate(allelements) if x == "{http://www.drugbank.ca}enzyme"]