# Realizando Imports

In [1]:
import os
import csv
import gzip
import collections
import re
import io
import json
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
#from pandas_profiling import ProfileReport

# Remove the column limit so wide DataFrames are displayed without truncation.
pd.set_option('display.max_columns', None)

# Abrindo o XML (DataBase)

In [2]:
# Location of the XML database file to load.
# NOTE(review): this is an absolute, machine-specific path — adjust it before
# running the notebook on another machine.
xml_path = r"/home/gabriel/Documents/Gabriel/lcdia/drugbankdb/full_db.xml"

# Parse the whole document while the file is open and keep its top-level
# element; the file is closed as soon as the `with` block exits.
with open(xml_path, encoding="utf8") as xml_file:
    tree = ET.parse(xml_file)
    root = tree.getroot()

# Selecionando os tipos de dado com os quais queremos trabalhar

In [3]:
# Namespace prefix (in ElementTree's "{uri}" form) prepended to tag names in
# the lookups below.
ns = '{http://www.drugbank.ca}'


def _property_template(section, kind):
    """Return the XPath template for the <value> of a property of one kind.

    `section` is the parent element name and `kind` is the text the
    property's <kind> child must equal. The returned string still contains
    literal ``{ns}`` placeholders; they are filled in later with
    ``template.format(ns=ns)``.
    """
    return "{ns}" + section + "/{ns}property[{ns}kind='" + kind + "']/{ns}value"


_CALCULATED = "calculated-properties"
_EXPERIMENTAL = "experimental-properties"

# Templates for values found under <calculated-properties>.
mw_template = _property_template(_CALCULATED, "Molecular Weight")
ws_template = _property_template(_CALCULATED, "Water Solubility")
lp_template = _property_template(_CALCULATED, "logP")
ls_template = _property_template(_CALCULATED, "logS")
psa_template = _property_template(_CALCULATED, "Polar Surface Area (PSA)")
hac_template = _property_template(_CALCULATED, "H Bond Acceptor Count")
hdc_template = _property_template(_CALCULATED, "H Bond Donor Count")
rbc_template = _property_template(_CALCULATED, "Rotatable Bond Count")
ref_template = _property_template(_CALCULATED, "Refractivity")
pol_template = _property_template(_CALCULATED, "Polarizability")
pkaa_template = _property_template(_CALCULATED, "pKa (strongest acidic)")
pkab_template = _property_template(_CALCULATED, "pKa (strongest basic)")
pc_template = _property_template(_CALCULATED, "Physiological Charge")
nor_template = _property_template(_CALCULATED, "Number of Rings")
bio_template = _property_template(_CALCULATED, "Bioavailability")

# Templates for values found under <experimental-properties>.
mp_template = _property_template(_EXPERIMENTAL, "Melting Point")
bp_template = _property_template(_EXPERIMENTAL, "Boiling Point")



# Build one flat record (dict) per child element of the XML root.
#
# The "{ns}" placeholders are resolved once here rather than on every loop
# iteration, since the resulting paths do not depend on the element.
groups_xpath = "{ns}groups/{ns}group".format(ns=ns)
categories_xpath = "{ns}categories/{ns}category".format(ns=ns)

# (record key, XPath) pairs for the fields read with findtext(). The listing
# order is the order in which the keys are inserted into each record.
text_xpaths = [
    (key, template.format(ns=ns))
    for key, template in [
        ('molecular_weight', mw_template),
        ('state', "{ns}state"),
        ('water_solubility', ws_template),
        ('melt_point', mp_template),
        ('boil_point', bp_template),
        ('logP', lp_template),
        ('logS', ls_template),
        ('pKa_Acid', pkaa_template),
        ('pKa_Basic', pkab_template),
        # Fix: this field used to be read with the logS template, so the
        # column duplicated logS and pc_template was never used.
        ('physiological_charge', pc_template),
        ('polar_surface_area', psa_template),
        ('hydrogen_acceptor_count', hac_template),
        ('hydrogen_donor_count', hdc_template),
        ('rotable_bond_count', rbc_template),
        ('refractivity', ref_template),
        ('polarizability', pol_template),
        ('number_of_rings', nor_template),
        ('bioavailability', bio_template),
    ]
]

rows = []
for drug in root:
    # Every direct child of the root is expected to be a <drug> element.
    assert drug.tag == ns + 'drug'
    row = {
        'type': drug.get('type'),
        'drugbank_id': drug.findtext(ns + "drugbank-id[@primary='true']"),
        'name': drug.findtext(ns + "name"),
        'groups': [group.text for group in drug.findall(groups_xpath)],
        # Each matched <category> entry is searched for its own nested
        # <category> child, whose text is what gets stored.
        'categories': [entry.findtext(ns + 'category')
                       for entry in drug.findall(categories_xpath)],
    }
    # findtext() returns None when the path matches nothing, so properties
    # that are absent for a drug are recorded as None.
    for key, xpath in text_xpaths:
        row[key] = drug.findtext(xpath)
    rows.append(row)

# Criando o DataFrame

In [4]:
# Column order of the final table; each name is a key of the records in `rows`.
columns = ['drugbank_id', 'name', 'type', 'groups', 'categories', 'molecular_weight',
           'state', 'water_solubility', 'melt_point', 'boil_point', 'logP', 'logS',
           'pKa_Acid', 'pKa_Basic', 'physiological_charge',
           'polar_surface_area', 'hydrogen_acceptor_count', 'hydrogen_donor_count',
           'rotable_bond_count', 'refractivity', 'polarizability', 'number_of_rings',
           'bioavailability']
drugbank_df = pd.DataFrame(rows, columns=columns)

# fillna() returns a new frame instead of modifying the existing one, so the
# result must be kept; otherwise the None -> NaN replacement is silently lost
# and never reaches the pickled file.
drugbank_df = drugbank_df.fillna(value=np.nan)

# Persist the table so it can be reloaded without re-parsing the XML.
drugbank_df.to_pickle('db_df')

# Only the last expression of a cell is rendered, so the preview goes last.
drugbank_df.head()