In [1]:
from datetime import datetime
import pandas as pd
import itertools
import requests
import zipfile
import json
import sys
import io
import re
import os

SUBSTRUCTURE_DATA = "Substructures-DB-latest.txt"
CHEMONTID_DICTIONARY = "CHEMONTID-mapper.json"

ONT_URL = 'http://classyfire.wishartlab.com/system/downloads/1_0/chemont/ChemOnt_2_1.obo.zip'
ONT_FILE = 'ChemOnt_2_1.obo'
ONT_DELIMITER = '[Term]'

SDF_URL = 'https://hmdb.ca/system/downloads/current/structures.zip'
SDF_FILE = 'structures.sdf'
SDF_DELIMITER = '$$$$'
ENCODING = 'utf-8'
CLASSYFIRE_HMDB_URL = 'http://classyfire.wishartlab.com/system/downloads/1_0/datasets/HMDB_36_classyfire_21_annotations.csv.zip'
CLASSYFIRE_HMDB_FILE = 'HMDB_36_classyfire_21_annotations.csv'
CLASSYFIRE_CHEBI_URL = 'http://classyfire.wishartlab.com/system/downloads/1_0/datasets/ChEBI_126_classyfire_21_annotations.csv.zip'
CLASSYFIRE_CHEBI_FILE = 'ChEBI_126_classyfire_21_annotations.csv'

HMDB_KEY = 'HMDB_ID'
ID_KEY = 'DATABASE_ID'
DATABASE_KEY = 'DATABASE_NAME'
SMILES_KEY = 'SMILES'
FORMULA_KEY = 'FORMULA'
IUPAC_KEY = 'JCHEM_IUPAC'
NAME_KEY = 'GENERIC_NAME'
SYNONYMS_KEY = 'SYNONYMS'
SYNONYM_DELIMITER = ';'

CLASSYFIER_ID_COLUMN = 0
CLASSYFIER_ONT_COLUMN = 1
CLASSYFIER_CLASS_COLUMN = 2

## Classes

In [36]:
class Metabolite:
    def __init__(
            self,
            record):
        self.sdf_record = record
        self.hmdb_id = self.lookup_sdf_info(HMDB_KEY)
        self.iupac_id = self.lookup_sdf_info(IUPAC_KEY)
        self.common_name = self.lookup_sdf_info(NAME_KEY)
        self.smiles = self.lookup_sdf_info(SMILES_KEY)
        self.synonyms = self.lookup_sdf_info(SYNONYMS_KEY)
        self.taxonomy_ids = []
        self.taxonomy_terms = []

        del self.sdf_record
            
    def lookup_sdf_info(
            self,
            _key,
            _delimiter=SYNONYM_DELIMITER):
        """
        """

        for index in range(len(self.sdf_record)):
            if _key in self.sdf_record[index]:
                contents = self.sdf_record[index + 1] \
                    .replace('\n', '') \
                    .replace(' ', '')
                if _delimiter in contents:
                    contents = contents.split(_delimiter)
                return contents

## Functions

In [3]:
def download_zip_archive(
        output,
        zip_url,
        file_name):
    """ Download HMDB SDF file containing all database SDF records
    """
    output_file = os.path.join(output, file_name)
    this_url = requests.get(zip_url)
    if this_url.ok:
        print("Unzipping HMDB metabolite SDF reference...")
        this_zip = zipfile.ZipFile(io.BytesIO(this_url.content))
        this_zip.extractall(output)
        this_zip = None
    else:
        raise Exception("Unable to download file at: " + zip_url)
    return output_file


def parse_records_as_list(
        sdf_file,
        _delimiter=SDF_DELIMITER,
        _encoding=ENCODING):
    """
    """
    output_records = []
    with open(sdf_file, "r", encoding=_encoding) as _f:
        for key, group in itertools.groupby(_f, lambda _l: _delimiter in _l):
            if not key:
                group = [x for x in list(group) if x != '\n']
                output_records.append(group)
    return output_records


def parse_records_as_datatable(
        input_file,
        _delimiter=','):
    """
    """
    output_records = pd.read_csv(
        input_file,
        sep=_delimiter,
        error_bad_lines=False,
        warn_bad_lines=True)
    return output_records


def dict_from_datatable(
        input_source,
        id_col=CLASSYFIER_ID_COLUMN,
        ont_col=CLASSYFIER_ONT_COLUMN):
    """
    """
    table_dict = {}
    for index, row in input_source.iterrows():
        if row[id_col] in table_dict:
            table_dict[row[id_col]].append(row[CLASSYFIER_ONT_COLUMN])
        else:
            table_dict[row[id_col]] = [row[CLASSYFIER_ONT_COLUMN]]
    return table_dict


def make_ontology_dictionary(
        hmdb_source,
        chebi_source,
        ont_col=CLASSYFIER_ONT_COLUMN,
        name_col=CLASSYFIER_CLASS_COLUMN):
    """
    """

    def add_ontology(
            ont_dict,
            data_source,
            ont_col,
            name_col):
        for index, row in data_source.iterrows():
            if row[ont_col] in ontology_dictionary:
                if ont_dict[row[ont_col]] != row[name_col]:
                    print(
                        'Mismatched ID & Name for: '
                        + str(row[ont_col])
                        + '\n- ' + str(ont_dict[row[ont_col]])
                        + '\n- ' + str(row[name_col])
                        + '\n')
            else:
                ont_dict[row[ont_col]] = row[name_col]
        return ont_dict

    ontology_dictionary = {}
    ontology_dictionary = add_ontology(
        ontology_dictionary,
        chebi_source,
        ont_col,
        name_col)
    ontology_dictionary = add_ontology(
        ontology_dictionary,
        hmdb_source,
        ont_col,
        name_col)
    print('Found ' + str(len(ontology_dictionary.keys())) + ' terms')
    return ontology_dictionary


def parse_metadata_from_records(
        records):
    """
    """
    output_metadata = []
    for r in records:
        thisSDF = Metabolite(record=r)
        output_metadata.append(thisSDF)
    return output_metadata


def retrieve_substructures(
        records,
        hmdb_dictionary,
        chebi_dictionary,
        ontology_dictionary):
    """
    """
    for r in range(len(records)):
        base_id = records[r].hmdb_id \
            .replace('HMDB', '') \
            .lstrip('0')
        for x in range(12):
            this_id = ('HMDB' + base_id.zfill(x + 1))
            if this_id in hmdb_dictionary:
                records[r].taxonomy_ids = hmdb_dictionary[this_id]
                for t in records[r].taxonomy_ids:
                    if t in ontology_dictionary:
                        records[r].taxonomy_terms \
                            .append(ontology_dictionary[t])
                break
    return records


def write_output(
        output_database,
        output_location,
        synonym_key='synonyms',
        tax_id_key='taxonomy_ids',
        tax_term_key='taxonomy_terms'):
    """
    """
    def try_join(
            l,
            sep=';'):
        try:
            return str(sep).join(map(str, l))
        except TypeError:
            return ''

    mod_output = [d.__dict__ for d in output_database]
    output_table = pd.DataFrame(mod_output)
    output_table[synonym_key] = [
        try_join(l) for l in output_table[synonym_key]]
    output_table[tax_id_key] = [
        try_join(l) for l in output_table[tax_id_key]]
    output_table[tax_term_key] = [
        try_join(l) for l in output_table[tax_term_key]]
    output_table.to_csv(
        os.path.join(
            str(output_location), SUBSTRUCTURE_DATA),
        sep='\t')


def write_dictionary(
        output_dictionary,
        output_location,
        output_name=CHEMONTID_DICTIONARY):
    """
    """
    with open(os.path.join(output_location, output_name), 'w') as fp:
        json.dump(output_dictionary, fp)


def clean_downloads(
        output_files):
    """
    """
    for o in output_files:
        try:
            os.remove(o)
        except:
            print('Unable to remove file ' + o)


def import_table(
        _path,
        _file,
        index_col=None):
    """Import URL as tab-delimited table
    """

    return pd.read_csv(
        os.path.join(_path, _file),
        sep='\t',
        index_col=index_col,
        low_memory=False
    )


def import_json(
        _path,
        _file):
    """Import URL as JSON-formatted dictionary
    """
    with open(os.path.join(_path, _file)) as json_file:
        data = json.load(json_file)

    return data


def parse_ont_as_list(
        ont_file,
        _delimiter=ONT_DELIMITER,
        _encoding=ENCODING):
    """
    """
    output_records = []
    with open(ont_file, "r", encoding=_encoding) as _f:
        for key, group in itertools.groupby(_f, lambda _l: _delimiter in _l):
            if not key:
                group = [x for x in list(group) if x != '\n']
                output_records.append(group)
    return output_records

def parse_dict_from_records(
        ont_data):

    ont_dict = {}
    for x in ont_data:
        id = ""
        name = ""
        is_a = ""
        
        _x = [i for i in x if "is_a"]
        if len(_x) > 0:
            for i in x:
                if "id: " in i:
                    id = i.split("id: ")[1].split("\n")[0]
                if "name: " in i:
                    name = i.split("name: ")[1].split("\n")[0]
                if "is_a: " in i:
                    is_a = i.split("is_a: ")[1].split(" ! ")[0]
        
        ont_dict[id] = {
            "id": id,
            "name": name,
            "is_a": is_a
        }
    
    return ont_dict

In [4]:
args_dict = {
    "output": os.getcwd()
}

In [5]:
# Parse all HMDB SDF records
sdf_file = download_zip_archive(
    output=args_dict["output"],
    zip_url=SDF_URL,
    file_name=SDF_FILE)
record_list = parse_records_as_list(
    sdf_file=sdf_file)
output_metadata = parse_metadata_from_records(
    records=record_list)

Unzipping HMDB metabolite SDF reference...


In [6]:
# Parse ClassyFire HMDB records
cf_hmdb_file = download_zip_archive(
    output=args_dict["output"],
    zip_url=CLASSYFIRE_HMDB_URL,
    file_name=CLASSYFIRE_HMDB_FILE)
hmdb_records = parse_records_as_datatable(
    input_file=cf_hmdb_file)
hmdb_dictionary = dict_from_datatable(
    input_source=hmdb_records)

Unzipping HMDB metabolite SDF reference...


In [14]:
hmdb_dictionary["HMDB01166"]

['CHEMONTID:0003224',
 'CHEMONTID:0002960',
 'CHEMONTID:0002987',
 'CHEMONTID:0001090',
 'CHEMONTID:0000129',
 'CHEMONTID:0003457',
 'CHEMONTID:0002449',
 'CHEMONTID:0000347',
 'CHEMONTID:0000013',
 'CHEMONTID:0001262',
 'CHEMONTID:0004139',
 'CHEMONTID:0000436',
 'CHEMONTID:0001878',
 'CHEMONTID:0000011',
 'CHEMONTID:0001831',
 'CHEMONTID:0003872',
 'CHEMONTID:0000475',
 'CHEMONTID:0001093',
 'CHEMONTID:0000265',
 'CHEMONTID:9999999',
 'CHEMONTID:0001143',
 'CHEMONTID:0001346',
 'CHEMONTID:0003909',
 'CHEMONTID:0003327',
 'CHEMONTID:0000331',
 'CHEMONTID:0002105',
 'CHEMONTID:0002203',
 'CHEMONTID:0004144',
 'CHEMONTID:0004150',
 'CHEMONTID:0000078',
 'CHEMONTID:0001797',
 'CHEMONTID:0003886',
 'CHEMONTID:0000012',
 'CHEMONTID:0003458',
 'CHEMONTID:0001541',
 'CHEMONTID:0001540',
 'CHEMONTID:0001096',
 'CHEMONTID:0002311',
 'CHEMONTID:0000289',
 'CHEMONTID:0000264',
 'CHEMONTID:0000000',
 'CHEMONTID:0004707',
 'CHEMONTID:0003940',
 'CHEMONTID:0000463',
 'CHEMONTID:0004603',
 'CHEMONTI

In [7]:
# Parse ClassyFire CHEBI records
cf_chebi_file = download_zip_archive(
    output=args_dict["output"],
    zip_url=CLASSYFIRE_CHEBI_URL,
    file_name=CLASSYFIRE_CHEBI_FILE)
chebi_records = parse_records_as_datatable(
    input_file=cf_chebi_file)
chebi_dictionary = dict_from_datatable(
    input_source=chebi_records)

Unzipping HMDB metabolite SDF reference...


b'Skipping line 775780: expected 4 fields, saw 6\n'
b'Skipping line 1705535: expected 4 fields, saw 5\n'


In [8]:
# Make ontology reference
ontology_dictionary = make_ontology_dictionary(
    hmdb_source=hmdb_records,
    chebi_source=chebi_records)

write_dictionary(
    output_dictionary=ontology_dictionary,
    output_location=args_dict["output"])

Mismatched ID & Name for: nan
- nan
- nan

Mismatched ID & Name for: nan
- nan
- nan

Mismatched ID & Name for: nan
- nan
- nan

Mismatched ID & Name for: nan
- nan
- nan

Mismatched ID & Name for: nan
- nan
- nan

Mismatched ID & Name for: nan
- nan
- nan

Mismatched ID & Name for: nan
- nan
- nan

Mismatched ID & Name for: nan
- nan
- nan

Mismatched ID & Name for: nan
- nan
- nan

Mismatched ID & Name for: nan
- nan
- nan

Mismatched ID & Name for: nan
- nan
- nan

Mismatched ID & Name for: nan
- nan
- nan

Mismatched ID & Name for: nan
- nan
- nan

Mismatched ID & Name for: nan
- nan
- nan

Mismatched ID & Name for: nan
- nan
- nan

Mismatched ID & Name for: nan
- nan
- nan

Mismatched ID & Name for: nan
- nan
- nan

Mismatched ID & Name for: nan
- nan
- nan

Mismatched ID & Name for: nan
- nan
- nan

Found 3250 terms


In [9]:
# Add ontology IDs and terms to each SDF record
output_substructures = retrieve_substructures(
    records=output_metadata,
    hmdb_dictionary=hmdb_dictionary,
    chebi_dictionary=chebi_dictionary,
    ontology_dictionary=ontology_dictionary)

# Output and clean
write_output(
    output_database=output_substructures,
    output_location=args_dict["output"])

In [28]:
for r in range(len(output_metadata)):
    base_id = output_metadata[r].hmdb_id \
        .replace('HMDB', '') \
        .lstrip('0')
    if base_id == "1166":
        print(output_metadata[r].common_name)
        print(output_metadata[r].hmdb_id)
    for x in range(12):
        this_id = ('HMDB' + base_id.zfill(x + 1))
        if this_id in hmdb_dictionary:
            output_metadata[r].taxonomy_ids = hmdb_dictionary[this_id]
            for t in output_metadata[r].taxonomy_ids:
                if t in ontology_dictionary:
                    output_metadata[r].taxonomy_terms \
                        .append(ontology_dictionary[t])
            
            break
    
    if base_id == "1166":
        print(output_metadata[r].taxonomy_terms)

3-Hydroxybutyryl-CoA
HMDB0001166
['(R)-3-hydroxyacyl CoAs', '3-hydroxyacyl CoAs', '6-aminopurines', 'Acyl CoAs', 'Alcohols and polyols', 'Alkyl phosphates', 'Amines', 'Amino acids and derivatives', 'Amino acids, peptides, and analogues', 'Aminopyrimidines and derivatives', 'Azacyclic compounds', 'Azoles', 'Beta amino acids and derivatives', 'Carbohydrates and carbohydrate conjugates', 'Carbonyl compounds', 'Carbothioic S-esters', 'Carboxylic acid amides', 'Carboxylic acid derivatives', 'Carboxylic acids and derivatives', 'Chemical entities', 'Coenzyme A and derivatives', 'Diazines', 'Fatty Acyls', 'Fatty acyl thioesters', 'Fatty amides', 'Glycosyl compounds', 'Glycosylamines', 'Heteroaromatic compounds', 'Hydrocarbon derivatives', 'Imidazoles', 'Imidazopyrimidines', 'Imidolactams', 'Lipids and lipid-like molecules', 'Monoalkyl phosphates', 'Monosaccharide phosphates', 'Monosaccharides', 'N-acyl amines', 'N-substituted imidazoles', 'Nucleosides, nucleotides, and analogues', 'Organic aci

In [35]:
for x in output_substructures:
    if x.hmdb_id == "HMDB0001166":
        print(vars(x))

{'hmdb_id': 'HMDB0001166', 'iupac_id': '{[(2R,3S,4R,5R)-5-(6-amino-9H-purin-9-yl)-4-hydroxy-2-({[hydroxy({hydroxy[3-hydroxy-3-({2-[(2-{[(3R)-3-hydroxybutanoyl]sulfanyl}ethyl)carbamoyl]ethyl}carbamoyl)-2,2-dimethylpropoxy]phosphoryl}oxy)phosphoryl]oxy}methyl)oxolan-3-yl]oxy}phosphonicacid', 'common_name': '3-Hydroxybutyryl-CoA', 'smiles': 'C[C@@H](O)CC(=O)SCCNC(=O)CCNC(=O)C(O)C(C)(C)COP(O)(=O)OP(O)(=O)OC[C@H]1O[C@H]([C@H](O)[C@@H]1OP(O)(O)=O)N1C=NC2=C1N=CN=C2N', 'synonyms': ['(3R)-3-Hydroxybutanoyl-CoA', '(3R)-3-Hydroxybutanoyl-CoenzymeA', '(R)-3-Hydroxybutanoyl-CoA', '(R)-3-Hydroxybutanoyl-CoenzymeA', '(S)-3-hydroxybutanoyl-CoA', '(S)-3-hydroxybutanoyl-CoenzymeA', '3-Hydroxybutanoyl-CoA', '3-Hydroxybutanoyl-coenzymea', '3-Hydroxybutyryl-CoA', '3-Hydroxybutyryl-coenzymeA', '3-OH-butyryl-CoA', '3-OH-butyryl-CoenzymeA', 'beta-Hydroxybutyryl-CoA', 'beta-Hydroxybutyryl-CoenzymeA', 'beta-Hydroxybutyryl-S-CoA', 'beta-Hydroxybutyryl-S-CoenzymeA', 'Hydroxy-butyryl-CoA', 'Hydroxy-butyryl-Coenzym

In [10]:
# Import parsed ClassyFire sub-structure database
substructure_table = import_table(
    _path=args_dict["output"],
    _file=SUBSTRUCTURE_DATA,
    index_col=0)
substructure_table.index = substructure_table['hmdb_id']
substructure_dictionary = substructure_table.T.to_dict()

write_dictionary(
    output_dictionary=substructure_dictionary,
    output_location=args_dict["output"], 
    output_name="CHEMONTID-substructure-dictionary.json")

In [38]:
substructure_dictionary

{'HMDB0000001': {'hmdb_id': 'HMDB0000001',
  'iupac_id': '(2S)-2-amino-3-(1-methyl-1H-imidazol-4-yl)propanoicacid',
  'common_name': '1-Methylhistidine',
  'smiles': 'CN1C=NC(C[C@H](N)C(O)=O)=C1',
  'synonyms': '1Methylhistidine;1-Methylhistidine;1-Methyl-Histidine;1-Methyl-L-histidine;1-MHis;1-N-Methyl-L-histidine;L-1-Methylhistidine;N1-Methyl-L-histidine;Pi-methylhistidine',
  'taxonomy_ids': 'CHEMONTID:0002404;CHEMONTID:0000060;CHEMONTID:0002449;CHEMONTID:0004176;CHEMONTID:0000347;CHEMONTID:0000013;CHEMONTID:0003899;CHEMONTID:0004139;CHEMONTID:0000436;CHEMONTID:0001831;CHEMONTID:0001205;CHEMONTID:0000265;CHEMONTID:9999999;CHEMONTID:0004144;CHEMONTID:0004311;CHEMONTID:0004150;CHEMONTID:0000078;CHEMONTID:0001227;CHEMONTID:0004146;CHEMONTID:0000469;CHEMONTID:0001137;CHEMONTID:0002311;CHEMONTID:0000264;CHEMONTID:0000000;CHEMONTID:0004707;CHEMONTID:0003940;CHEMONTID:0004603;CHEMONTID:0000002;CHEMONTID:0000278;CHEMONTID:0000323;CHEMONTID:0004557;CHEMONTID:0002450;CHEMONTID:0002310',
  'ta

In [11]:
# Generate CHEMONTID hierarchal structure reference 
ont_file = download_zip_archive(
    output=args_dict["output"],
    zip_url=ONT_URL,
    file_name=ONT_FILE)
ont_data = parse_ont_as_list(
    ont_file=ont_file)
ont_dict = parse_dict_from_records(
    ont_data=ont_data)
write_dictionary(
    output_dictionary=ont_dict,
    output_location=args_dict["output"],
    output_name='CHEMONTID-hierarchy-dictionary.json')

Unzipping HMDB metabolite SDF reference...


In [None]:
clean_downloads(
    output_files=[
        sdf_file,
        cf_hmdb_file,
        cf_chebi_file,
        ont_file])