In [1]:
from __future__ import print_function
import pandas as pd 
import xmltodict
from collections import OrderedDict

In [2]:
with open('../Drugs/full_database.xml') as fd:
    doc = xmltodict.parse(fd.read())


In [3]:
def parse_targets(targets):
    # only polipeptides considered, e.g. not DNA, asparagine 
    if type(targets) == OrderedDict:
        targets = [targets]
    targets_dict = {}
    for target in targets:
        if 'polypeptide' in target.keys():
            if type(target['polypeptide']) == OrderedDict:
                target['polypeptide'] = [target['polypeptide']]
            for pept in target['polypeptide']:
                gene = pept['gene-name']
                targets_dict[gene] = {}
                targets_dict[gene]["gene"] = gene
                for ext_id in pept['external-identifiers']['external-identifier']:
                    if ext_id["resource"] == 'HUGO Gene Nomenclature Committee (HGNC)':
                        targets_dict[gene]['HGNC'] =  ext_id['identifier']
                        break
                targets_dict[gene]['taxid'] = pept['organism']['@ncbi-taxonomy-id']
                targets_dict[gene]['gene_name'] = pept['name']
                if target['actions'] is None:
                    targets_dict[gene]["action"] = target['actions']
                else:
                    targets_dict[gene]["action"] = target['actions']['action']
                targets_dict[gene]['known_action'] = target['known-action']
    return pd.DataFrame.from_dict(targets_dict).T

def drug2target(drug):
    if drug['targets'] is not None:
        targets = parse_targets(drug['targets']['target'])
        if targets.shape[0] > 0:
            targets['drug_name']= [drug['name']]*targets.shape[0]
            targets['drug_type'] = [drug['@type']]*targets.shape[0]
            ids = drug['drugbank-id']
            if type(ids)  == OrderedDict:
                ids = [ids]
            old_ids = []
            for i in ids:
                if type(i) == OrderedDict:
                    if i['@primary'] == "true":
                        targets['drugbank_id'] =  [i['#text']]*targets.shape[0]
                        break
                    else:
                        targets['drugbank_id_non_primary'] =  [i['#text']]*targets.shape[0]
                else: 
                    old_ids.append(i)
            #targets["old_ids"] = ",".join(old_ids)*targets.shape[0]
            #targets["subclass"] = [drug['classification']['subclass']]*targets.shape[0]
            #targets["mesh"] = [",".join([x  for x in map(lambda x : x["mesh-id"],drug['categories']['category']) if x is not None])]*targets.shape[0]
            #targets["synonyms"] = [",".join(map(lambda x : x["#text"],drug['synonyms']['synonym']))]*targets.shape[0]
            if 'drugbank_id' not in targets.keys():
                print(ids)
            targets.set_index(['drugbank_id','gene'],drop=False,inplace =True)
            return targets
d = drug2target(doc[u'drugbank']['drug'][1])
d

Unnamed: 0_level_0,Unnamed: 1_level_0,HGNC,action,gene,gene_name,known_action,taxid,drug_name,drug_type,drugbank_id
drugbank_id,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
DB00002,C1QA,HGNC:1241,,C1QA,Complement C1q subcomponent subunit A,unknown,9606,Cetuximab,biotech,DB00002
DB00002,C1QB,HGNC:1242,,C1QB,Complement C1q subcomponent subunit B,unknown,9606,Cetuximab,biotech,DB00002
DB00002,C1QC,HGNC:1245,,C1QC,Complement C1q subcomponent subunit C,unknown,9606,Cetuximab,biotech,DB00002
DB00002,C1R,HGNC:1246,,C1R,Complement C1r subcomponent,unknown,9606,Cetuximab,biotech,DB00002
DB00002,C1S,HGNC:1247,,C1S,Complement C1s subcomponent,unknown,9606,Cetuximab,biotech,DB00002
DB00002,EGFR,HGNC:3236,antagonist,EGFR,Epidermal growth factor receptor,yes,9606,Cetuximab,biotech,DB00002
DB00002,FCGR1A,HGNC:3613,,FCGR1A,High affinity immunoglobulin gamma Fc receptor I,unknown,9606,Cetuximab,biotech,DB00002
DB00002,FCGR2A,HGNC:3616,,FCGR2A,Low affinity immunoglobulin gamma Fc region re...,unknown,9606,Cetuximab,biotech,DB00002
DB00002,FCGR2B,HGNC:3618,,FCGR2B,Low affinity immunoglobulin gamma Fc region re...,unknown,9606,Cetuximab,biotech,DB00002
DB00002,FCGR2C,HGNC:15626,,FCGR2C,Low affinity immunoglobulin gamma Fc region re...,unknown,9606,Cetuximab,biotech,DB00002


In [10]:
drugbank_targets = []
for drug in doc[u'drugbank']['drug']:
    drug_record = drug2target(drug)
    if drug_record is not None:
        drugbank_targets.append(drug_record)
drugbank_targets  = pd.concat(drugbank_targets)

In [11]:
print("unique drug-target pairs",drugbank_targets[["drugbank_id","gene"]].drop_duplicates().shape[0])
print("unique drugs",len(set(drugbank_targets["drugbank_id"].values)),"genes", len(set(drugbank_targets["gene"].values)))
drugbank_targets = drugbank_targets[drugbank_targets["taxid"]=="9606"]
print("drug-target pairs for human targets only",drugbank_targets[["drugbank_id","gene"]].drop_duplicates().shape[0])

print("unique drugs",len(set(drugbank_targets["drugbank_id"].values)))
print("unique genes", len(set(drugbank_targets["gene"].values)))

unique drug-target pairs 17240
unique drugs 6890 genes 3757
drug-target pairs for human targets only 13467
unique drugs 5138
unique genes 2573


In [12]:
drugbank_targets = drugbank_targets[["drugbank_id","gene","gene_name","HGNC","drug_name","drug_type","action","known_action","taxid"]]
#drugbank_targets.to_csv("../Drugs/Drugbank.509.drugs2target.human_genes.tsv",sep="\t",index=False,encoding="utf-8")

In [52]:
positive = ['agonist',
 'agonist|activator',
 'agonist|modulator',
 'agonist|partial agonist',
 'agonist|positive allosteric modulator',
 'agonist|stimulator',
 'inducer',
 'activator',
 'positive allosteric modulator',
 'positive modulator',
 'partial agonist'
  'stimulator']
negative = ['antagonist',
 'antagonist|antibody','antagonist|binder','antagonist|blocker','antagonist|inhibitor',
 'antagonist|inhibitory allosteric modulator','antagonist|inhibitory immune response','antagonist|multitarget', 'antagonist|other/unknown',
 'antibody',
 'antisense oligonucleotide',
 'blocker',
 'inactivator',
 'inhibitor',
 'inverse agonist',
 'negative modulator',
 'partial antagonist',
 'suppressor',
 'weak inhibitor']

drugbank_targets["action"] = drugbank_targets["action"].apply(lambda x: "|".join(x)  if type(x)==list else x)
other  = set([x for x in drugbank_targets["action"].values ])
other  = list(other.difference(set(positive)).difference(set(negative)))
print("other:",other)
actions_dict = dict(zip(positive,["positive"]*len(positive))+zip(negative,["negaive"]*len(negative))+zip(other,["other"]*len(other)))
drugbank_targets["simplified_action"] = drugbank_targets["action"].apply(lambda x : actions_dict[x])
drugbank_targets = drugbank_targets[["drugbank_id","gene","gene_name","HGNC","drug_name","drug_type","action","simplified_action","known_action","taxid"]]
drugbank_targets.to_csv("../Drugs/Drugbank.509.drugs2target.human_genes.tsv",sep="\t",index=False,encoding="utf-8")
drugbank_targets.head()

other: [u'adduct', u'stimulator', u'aggregation inhibitor', u'product of', u'binding', u'agonist|inhibitor', u'desensitize the target', u'nucleotide exchange blocker', u'neutralizer', u'cofactor', u'potentiator', u'allosteric modulator', u'ligand', u'other/unknown', u'unknown', u'other', None, u'oxidizer', u'component of', u'binder', u'substrate', u'antagonist|agonist|negative modulator', u'multitarget', u'antagonist|partial agonist', u'modulator', u'acetylation', u'chaperone', u'partial agonist', u'antagonist|agonist', u'cleavage']


Unnamed: 0_level_0,Unnamed: 1_level_0,drugbank_id,gene,gene_name,HGNC,drug_name,drug_type,action,simplified_action,known_action,taxid
drugbank_id,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
DB00001,F2,DB00001,F2,Prothrombin,HGNC:3535,Lepirudin,biotech,inhibitor,negaive,yes,9606
DB00002,C1QA,DB00002,C1QA,Complement C1q subcomponent subunit A,HGNC:1241,Cetuximab,biotech,,other,unknown,9606
DB00002,C1QB,DB00002,C1QB,Complement C1q subcomponent subunit B,HGNC:1242,Cetuximab,biotech,,other,unknown,9606
DB00002,C1QC,DB00002,C1QC,Complement C1q subcomponent subunit C,HGNC:1245,Cetuximab,biotech,,other,unknown,9606
DB00002,C1R,DB00002,C1R,Complement C1r subcomponent,HGNC:1246,Cetuximab,biotech,,other,unknown,9606
