In [3]:
# General py packages
import pandas as pd
import xml.etree.ElementTree as ET

In [1]:
def findLeafNode(root: "ET.Element", terms: list):
    """
    Recursively collect the ontology terms of all leaf nodes
    below an HMDB Ontology xml node

    A node is treated as a leaf when it has no ``descendants`` child
    element. A leaf only contributes its term when it carries ``level``,
    ``term`` and ``definition`` child elements.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Ontology node (``root`` or ``descendant`` element) to search
    terms : List
        Ontology terms collected so far; this list is not modified

    Returns
    -------
    terms : List
        New list holding the unique terms from the input ``terms``
        followed by the newly found leaf terms, in order of first
        appearance in the document
    """
    # dict keys behave as an insertion-ordered set, which keeps the
    # output order reproducible between runs (a plain set does not)
    unique_terms = dict.fromkeys(terms)
    _collect_leaf_terms(root, unique_terms)
    return list(unique_terms)


def _collect_leaf_terms(node, unique_terms: dict) -> None:
    """Add the leaf terms found below ``node`` to ``unique_terms`` in place."""
    ns = "{http://www.hmdb.ca}"
    descendants = node.find(ns + "descendants")
    if descendants is not None:
        # inner node: only its descendants can contribute terms
        for child in descendants.findall(ns + "descendant"):
            _collect_leaf_terms(child, unique_terms)
        return

    level = node.find(ns + "level")
    ont_term = node.find(ns + "term")
    definition = node.find(ns + "definition")
    if level is not None and ont_term is not None and definition is not None:
        unique_terms.setdefault(ont_term.text)

def createPathwayDatabaseHMDB(filename: str, met_start=0, met_end=100):
    """
    Create a pathway database from HMDB
    See section 'Metabolite and Protein Data (in XML format)'
    from https://hmdb.ca/downloads

    The file is streamed one top-level element at a time, so only
    the metabolite currently being read is held in memory and parsing
    stops as soon as the requested range has been read.

    Parameters
    ----------
    filename : str
        location and filename for the HMDB database
    met_start : int
        zero-based position of the first metabolite to record
    met_end : int
        zero-based position of the last metabolite to record (inclusive)

    Returns
    -------
    pathway_df: pandas.DataFrame
        dataframe indexed from 1 with columns Name, Formula, Accession,
        Kingdom, Super_class, Class, Sub_class, Molecular_framework,
        Process, pro_name, uniprot_id, gene_name, and protein_type.
        Missing scalar fields are stored as None; Process and the
        protein columns hold lists (possibly empty).
    """
    pathway_dict = {}
    with open(filename, encoding="utf-8") as file:
        root = None
        depth = 0  # number of currently open XML elements
        met_iter = 0  # position of the current direct child of the root
        for event, elem in ET.iterparse(file, events=("start", "end")):
            if event == "start":
                if root is None:
                    root = elem
                depth += 1
                continue

            depth -= 1
            if depth != 1:
                # not a completed direct child of the root element
                continue

            if met_start <= met_iter <= met_end:
                # row labels keep counting from 1 for the first recorded entry
                pathway_dict[len(pathway_dict) + 1] = _hmdb_metabolite_record(elem)
            met_iter += 1

            # drop the children parsed so far to keep memory usage flat
            root.clear()
            if met_iter > met_end:
                break
    pathway_df = pd.DataFrame.from_dict(pathway_dict, orient="index")
    return pathway_df


def _hmdb_text(parent, tag: str):
    """Return the text of child ``tag`` of ``parent``, or None when either is missing."""
    if parent is None:
        return None
    child = parent.find("{http://www.hmdb.ca}" + tag)
    return None if child is None else child.text


def _hmdb_metabolite_record(metabolite) -> dict:
    """Build the output row (column name -> value) for one metabolite element."""
    ns = "{http://www.hmdb.ca}"

    # taxonomy (may be absent; _hmdb_text then yields None for every field)
    taxonomy = metabolite.find(ns + "taxonomy")

    # ontology
    ont_terms = []
    ontology = metabolite.find(ns + "ontology")
    if ontology is not None:
        for node in ontology.findall(ns + "root"):
            # we assume there is only 1 Process; if several exist the last wins
            if _hmdb_text(node, "term") == "Process":
                ont_terms = findLeafNode(node, [])

    # NOTE: biological_properties/pathways is intentionally not extracted
    # (very verbose and redundant with Ontology Process)

    # protein associations
    proteins = []
    protein_associations = metabolite.find(ns + "protein_associations")
    if protein_associations is not None:
        proteins = protein_associations.findall(ns + "protein")

    return {
        "Name": _hmdb_text(metabolite, "name"),
        "Formula": _hmdb_text(metabolite, "chemical_formula"),
        "Accession": _hmdb_text(metabolite, "accession"),
        "Kingdom": _hmdb_text(taxonomy, "kingdom"),
        "Super_class": _hmdb_text(taxonomy, "super_class"),
        "Class": _hmdb_text(taxonomy, "class"),
        "Sub_class": _hmdb_text(taxonomy, "sub_class"),
        "Molecular_framework": _hmdb_text(taxonomy, "molecular_framework"),
        "Process": ont_terms,
        "pro_name": [_hmdb_text(p, "name") for p in proteins],
        "uniprot_id": [_hmdb_text(p, "uniprot_id") for p in proteins],
        "gene_name": [_hmdb_text(p, "gene_name") for p in proteins],
        "protein_type": [_hmdb_text(p, "protein_type") for p in proteins],
    }

In [6]:
# Location of the HMDB metabolite XML export on the local machine
filename = "C:/Users/dmccl/Downloads/hmdb_metabolites/hmdb_metabolites.xml"
# Parse only the leading entries of the file (positions 0 through 10)
pathway_df = createPathwayDatabaseHMDB(filename, met_start=0, met_end=10)

In [7]:
# Write the parsed table to a CSV file alongside the source XML
pathway_df.to_csv(
    path_or_buf="C:/Users/dmccl/Downloads/hmdb_metabolites/hmdb_metabolites.csv"
)