# Convert DrugCentral relationships to Rephetio identifiers

In [1]:
import urllib
import json

import pandas

## Read DrugBank Slim

In [2]:
import os
#import csv
import gzip
import collections
#import re
#import io
import json
import xml.etree.ElementTree as ET

#import requests
import pandas

In [None]:
#download https://go.drugbank.com/releases/latest 
download_dir = '../data/DrugCentral'
xml_path = os.path.join(download_dir, 'full database.xml')  
tree = ET.parse(xml_path)
root = tree.getroot()

"""
download_dir = '../data/DrugCentral/dhimmel_drugbank'
xml_path = os.path.join(download_dir, 'drugbank.xml.gz')  
with gzip.open(xml_path) as xml_file:
    tree = ET.parse(xml_file)
root = tree.getroot()
"""

In [4]:
ns = '{http://www.drugbank.ca}'
inchikey_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChIKey']/{ns}value"
inchi_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChI']/{ns}value"

rows = list()
for i, drug in enumerate(root):
    row = collections.OrderedDict()
    assert drug.tag == ns + 'drug'
    row['type'] = drug.get('type')
    row['drugbank_id'] = drug.findtext(ns + "drugbank-id[@primary='true']")
    row['name'] = drug.findtext(ns + "name")
    row['description'] = drug.findtext(ns + "description")
    row['groups'] = [group.text for group in
        drug.findall("{ns}groups/{ns}group".format(ns = ns))]
    row['atc_codes'] = [code.get('code') for code in
        drug.findall("{ns}atc-codes/{ns}atc-code".format(ns = ns))]
    row['categories'] = [x.findtext(ns + 'category') for x in
        drug.findall("{ns}categories/{ns}category".format(ns = ns))]
    row['inchi'] = drug.findtext(inchi_template.format(ns = ns))
    row['inchikey'] = drug.findtext(inchikey_template.format(ns = ns))
    
    # Add drug aliases
    aliases = {
        elem.text for elem in 
        drug.findall("{ns}international-brands/{ns}international-brand".format(ns = ns)) +
        drug.findall("{ns}synonyms/{ns}synonym[@language='English']".format(ns = ns)) +
        drug.findall("{ns}international-brands/{ns}international-brand".format(ns = ns)) +
        drug.findall("{ns}products/{ns}product/{ns}name".format(ns = ns))

    }
    aliases.add(row['name'])
    row['aliases'] = sorted(aliases)

    rows.append(row)

In [5]:
alias_dict = {row['drugbank_id']: row['aliases'] for row in rows}
with open(os.path.join(download_dir, 'aliases.json'), 'w') as fp:
    json.dump(alias_dict, fp, indent=2, sort_keys=True)

In [6]:
def collapse_list_values(row):
    for key, value in row.items():
        if isinstance(value, list):
            row[key] = '|'.join(value)
    return row

rows = list(map(collapse_list_values, rows))

In [7]:
columns = ['drugbank_id', 'name', 'type', 'groups', 'atc_codes', 'categories', 'inchikey', 'inchi', 'description']
drugbank_df = pandas.DataFrame.from_dict(rows)[columns]
drugbank_df.shape

(16581, 9)

In [8]:
drugbank_slim_df = drugbank_df[
    drugbank_df.groups.map(lambda x: 'approved' in x) &
    drugbank_df.inchi.map(lambda x: x is not None) &
    drugbank_df.type.map(lambda x: x == 'small molecule')
]
drugbank_slim_df.shape

(2778, 9)

In [9]:
# write drugbank tsv
path = os.path.join(download_dir, 'drugbank.tsv')
drugbank_df.to_csv(path, sep='\t', index=False)

# write slim drugbank tsv
path = os.path.join(download_dir, 'drugbank-slim.tsv')
drugbank_slim_df.to_csv(path, sep='\t', index=False)

In [10]:
#url = 'https://github.com/dhimmel/drugbank/raw/55587651ee9417e4621707dac559d84c984cf5fa/data/drugbank-slim.tsv'
drugbank_df = drugbank_slim_df #pandas.read_table(url)
drugbank_df = drugbank_df[['drugbank_id', 'name']]
drugbank_df = drugbank_df.rename(columns={'name': 'drugbank_name'})
drugbank_df.head(2)

Unnamed: 0,drugbank_id,drugbank_name
5,DB00006,Bivalirudin
6,DB00007,Leuprolide


In [11]:
len(drugbank_df)

2778

## Read identifiers

In [12]:
path = os.path.join(download_dir, 'identifiers.tsv') #'https://github.com/olegursu/drugtarget/raw/9a6d84bed8650c6c507a2d3d786814c774568610/identifiers.tsv'
id_df = pandas.read_table(path,sep=',')
id_df = id_df.query("id_type == 'DRUGBANK_ID'")[['struct_id', 'identifier']]
id_df = id_df.rename(columns={'identifier': 'drugbank_id'})
drugbank_df = id_df.merge(drugbank_df)
drugbank_df.head(2)

Unnamed: 0,struct_id,drugbank_id,drugbank_name
0,5392,DB11791,Capmatinib
1,5393,DB15685,Selpercatinib


In [13]:
len(drugbank_df)

2674

## Pharmacologic class

In [14]:
path = os.path.join(download_dir, 'pharm_class.tsv') #'https://github.com/olegursu/drugtarget/raw/9a6d84bed8650c6c507a2d3d786814c774568610/pharm_class.tsv'
class_df = pandas.read_table(path,sep=',')
class_df = drugbank_df.merge(class_df)

In [15]:
classes_df = class_df[['type', 'class_code', 'name', 'source']].drop_duplicates()
class_df = class_df[['drugbank_id', 'drugbank_name', 'class_code', 'name']]
class_df = class_df.rename(columns={'class_code': 'class_id', 'name': 'class_name'})
class_df = class_df.drop_duplicates()
class_df.head(2)

Unnamed: 0,drugbank_id,drugbank_name,class_id,class_name
0,DB11791,Capmatinib,N0000175605,Kinase Inhibitor
1,DB11791,Capmatinib,N0000182138,Cytochrome P450 1A2 Inhibitors


In [16]:
# Pharmacologic mappings
len(classes_df)

1967

In [17]:
# Class to Drug mappings
len(class_df)

18995

In [18]:
classes_df['type'].unique()

array(['EPC', 'MoA', 'CS', 'PA', 'has role', 'PE', 'EXT',
       'Chemical/Ingredient'], dtype=object)

In [19]:
class_type_map = {
    'MoA': 'Mechanism of Action',
    'PE': 'Physiologic Effect',
    'CS': 'Chemical Structure',
    'EPC': 'FDA Established Pharmacologic Class',
    'PA': 'Pharmacological Action',
    'has role': 'Application',
    'Chemical/Ingredient': 'Chemical/Ingredient',
}

def get_class_url(class_source, class_id):
    """Create URLs for pharmacological classes based on their source"""
    class_id = urllib.parse.quote(class_id)
    if class_source == 'CHEBI':
        return 'http://identifiers.org/chebi/{}'.format(class_id)
    if class_source == 'MeSH':
        return 'http://identifiers.org/mesh/{}'.format(class_id)
    if class_source == 'FDA':
        #return 'https://rxnav.nlm.nih.gov/REST/Ndfrt/id?idType=NUI&idString={}'.format(class_id)
        # Use bioportal link until something better arises
        return 'http://purl.bioontology.org/ontology/NDFRT/{}'.format(class_id)

classes_df['class_type'] = classes_df.type.map(class_type_map)
del classes_df['type']
classes_df = classes_df.sort_values(['class_type', 'class_code'])
classes_df = classes_df.rename(columns={'class_code': 'class_id', 'name': 'class_name', 'source': 'class_source'})
classes_df['url'] = classes_df.apply(lambda x: get_class_url(x.class_source, x.class_id), axis='columns')
classes_df.head(2)

Unnamed: 0,class_id,class_name,class_source,class_type,url
19873,CHEBI:102248,EC 1.14.11.29 inhibitors,CHEBI,Application,http://identifiers.org/chebi/CHEBI%3A102248
10946,CHEBI:110725,histamine 1-methyltransferase inhibitor,CHEBI,Application,http://identifiers.org/chebi/CHEBI%3A110725


In [20]:
class_df.shape

(18995, 4)

In [21]:
classes_df.shape

(1967, 5)

In [22]:
class_df.to_csv(os.path.join(download_dir, 'drug-to-class.tsv'), sep='\t', index=False)
classes_df.to_csv(os.path.join(download_dir, 'classes.tsv'), sep='\t', index=False)

In [1]:
import pandas
import os
download_dir = '../data/DrugCentral'
classes_df = pandas.read_table(os.path.join(download_dir, 'classes.tsv'))
classes_df.shape

(1967, 5)

In [3]:
class_types = {'Physiologic Effect', 'Mechanism of Action', 'Chemical/Ingredient', 'Chemical Structure'}
classes_df = classes_df.query("class_type in @class_types")
classes_df.shape

(478, 5)

In [4]:
classes_df['class_type'].unique()

array(['Chemical Structure', 'Chemical/Ingredient', 'Mechanism of Action',
       'Physiologic Effect'], dtype=object)

In [23]:
#compare with hetionet version  345 drugclass
DrugClass_hetionet = pandas.read_table('https://raw.githubusercontent.com/dhimmel/drugcentral/master/rephetio/classes.tsv') 
DrugClass_hetionet['class_type'].unique()

array(['Application', 'Chemical/Ingredient',
       'FDA Established Pharmacologic Class', 'Mechanism of Action',
       'Pharmacological Action', 'Physiologic Effect'], dtype=object)

In [24]:
class_types = {'Physiologic Effect', 'Mechanism of Action', 'Chemical/Ingredient'}
DrugClass_hetionet = DrugClass_hetionet.query("class_type in @class_types")
DrugClass_hetionet.shape

(345, 5)

In [29]:
df_merge = classes_df[['class_name']].merge(DrugClass_hetionet[['class_name']], how='outer', indicator=True )
df_merge

Unnamed: 0,class_name,_merge
0,Adrenocorticotropic Hormone,left_only
1,Alkaloids,left_only
2,Allergens,both
3,Allylamine,both
4,Amides,both
...,...,...
486,Methylated Sulfonamides,right_only
487,Nonsteroidal Anti-inflammatory Compounds,right_only
488,Antidiuretic Hormone Antagonists,right_only
489,Bile-acid Binding Activity,right_only


In [19]:
df_merge = classes_df[['class_name','class_source']].merge(DrugClass_hetionet[['class_name','class_source']], how='outer', indicator=True )
df_merge

Unnamed: 0,class_name,class_source,_merge
0,Adrenocorticotropic Hormone,FDA,left_only
1,Alkaloids,FDA,left_only
2,Allergens,FDA,both
3,Allylamine,FDA,both
4,Amides,FDA,both
...,...,...,...
486,Methylated Sulfonamides,FDA,right_only
487,Nonsteroidal Anti-inflammatory Compounds,FDA,right_only
488,Antidiuretic Hormone Antagonists,FDA,right_only
489,Bile-acid Binding Activity,FDA,right_only


In [30]:
df_merge[df_merge['_merge']=='both']

Unnamed: 0,class_name,_merge
2,Allergens,both
3,Allylamine,both
4,Amides,both
5,Amino Acids,both
6,Aminoglycosides,both
...,...,...
471,Reversed Anticoagulation Activity,both
472,Increased Hematopoietic Stem Cell Mobilization,both
473,Decreased Blood Pressure,both
474,Increased Blood Pressure,both


In [31]:
df_merge[df_merge['_merge']=='left_only']

Unnamed: 0,class_name,_merge
0,Adrenocorticotropic Hormone,left_only
1,Alkaloids,left_only
10,Anthelmintics,left_only
11,"Anti-Inflammatory Agents, Non-Steroidal",left_only
12,Ascorbic Acid,left_only
...,...,...
439,Skin Barrier Activity,left_only
442,Decreased Diuresis,left_only
456,Neuromuscular Blockade,left_only
476,Increased IgG Production,left_only


In [32]:
check = df_merge[df_merge['_merge']=='right_only']
check.shape

(13, 2)

In [33]:
# check if the above 13 classes have drug connected to it in current alzkb
check.merge(DrugClass_hetionet,how='left')#['class_id'].to_list()

Unnamed: 0,class_name,_merge,class_id,class_source,class_type,url
0,"Heparin, Low-Molecular-Weight",right_only,N0000007961,FDA,Chemical/Ingredient,http://purl.bioontology.org/ontology/NDFRT/N00...
1,"Estrogens, Conjugated (USP)",right_only,N0000167510,FDA,Chemical/Ingredient,http://purl.bioontology.org/ontology/NDFRT/N00...
2,Platinum-containing Compounds,right_only,N0000175073,FDA,Chemical/Ingredient,http://purl.bioontology.org/ontology/NDFRT/N00...
3,pleuromutilin,right_only,N0000175433,FDA,Chemical/Ingredient,http://purl.bioontology.org/ontology/NDFRT/N00...
4,Nucleoside Analog,right_only,N0000175459,FDA,Chemical/Ingredient,http://purl.bioontology.org/ontology/NDFRT/N00...
5,Non-Nucleoside Analog,right_only,N0000175460,FDA,Chemical/Ingredient,http://purl.bioontology.org/ontology/NDFRT/N00...
6,Pyrophosphate Analog,right_only,N0000175469,FDA,Chemical/Ingredient,http://purl.bioontology.org/ontology/NDFRT/N00...
7,Amphenicols,right_only,N0000175479,FDA,Chemical/Ingredient,http://purl.bioontology.org/ontology/NDFRT/N00...
8,Methylated Sulfonamides,right_only,N0000175512,FDA,Chemical/Ingredient,http://purl.bioontology.org/ontology/NDFRT/N00...
9,Nonsteroidal Anti-inflammatory Compounds,right_only,N0000175721,FDA,Chemical/Ingredient,http://purl.bioontology.org/ontology/NDFRT/N00...


In [48]:
DrugClass_hetionet['class_type'].unique()

array(['Chemical/Ingredient', 'Mechanism of Action', 'Physiologic Effect'],
      dtype=object)

In [34]:
path = os.path.join(download_dir, 'pharm_class.tsv') #'https://github.com/olegursu/drugtarget/raw/9a6d84bed8650c6c507a2d3d786814c774568610/pharm_class.tsv'
check = pandas.read_table(path,sep=',')
check[check['class_code']=='N0000007961']

Unnamed: 0,id,struct_id,type,name,class_code,source
