### HMDB

In [None]:
# Download Link: https://www.hmdb.ca/downloads#
# Download Date: 2025-03-21
# Download Version: 2021-11-17

import xml.etree.ElementTree as ET
import pandas as pd

context = ET.iterparse('hmdb_metabolites.xml', events=('start', 'end'))
context = iter(context)

event, root = next(context)

namespace = {'hmdb': 'http://www.hmdb.ca'}

metabolites = []

for event, elem in context:
    if event == 'end' and elem.tag == '{http://www.hmdb.ca}metabolite':
        accession = elem.find('hmdb:accession', namespace).text
        
        for disease in elem.findall('hmdb:diseases/hmdb:disease', namespace):
            omim_id = disease.find('hmdb:omim_id', namespace).text if disease.find('hmdb:omim_id', namespace) is not None else None
            
            metabolites.append({
                'accession': accession,
                'omim_id': omim_id
            })

        elem.clear()

hmdb_disease = pd.DataFrame(metabolites)

hmdb_disease.to_csv('hmdb_disease.csv', index=False)

print("Data has been saved to hmdb_disease.csv")

In [1]:
import pandas as pd

hmdb_disease = pd.read_csv('hmdb_disease.csv')
hmdb_disease.dropna(subset=['omim_id'], inplace=True)
hmdb_disease.dropna(subset=['accession'], inplace=True)
hmdb_disease['omim_id'] = hmdb_disease['omim_id'].astype(int).astype(str)
hmdb_disease

Unnamed: 0,accession,omim_id
4,HMDB0000001,104300
5,HMDB0000001,601665
6,HMDB0000001,125853
7,HMDB0000001,606054
8,HMDB0000001,248600
...,...,...
27665,HMDB0240219,114500
27666,HMDB0240252,114500
27667,HMDB0240261,114500
27668,HMDB0240262,114500


### BioMedGraphica ID

In [2]:
import pandas as pd
import os
from pathlib import Path

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir_metabolite = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Metabolite' / 'BioMedGraphica_Metabolite.csv'
target_dir_disease = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Disease' / 'BioMedGraphica_Disease.csv'
biomedgraphica_metabolite = pd.read_csv(target_dir_metabolite, dtype=str)
biomedgraphica_disease = pd.read_csv(target_dir_disease, dtype=str)

### HMDB Mapping

HMDB ID

In [3]:
hmdb_biomedgraphica = biomedgraphica_metabolite[['BioMedGraphica_ID', 'HMDB_ID']]
hmdb_biomedgraphica.dropna(subset=['HMDB_ID'], inplace=True)
hmdb_biomedgraphica = hmdb_biomedgraphica.assign(HMDB_ID=hmdb_biomedgraphica['HMDB_ID'].str.split(';')).explode('HMDB_ID')

hmdb_to_individualID = hmdb_biomedgraphica.groupby('HMDB_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()
hmdb_disease['From_ID'] = hmdb_disease['accession'].map(hmdb_to_individualID)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hmdb_biomedgraphica.dropna(subset=['HMDB_ID'], inplace=True)


OMIM ID

In [4]:
omim_biomedgraphica = biomedgraphica_disease[['BioMedGraphica_ID', 'OMIM_ID']]
omim_biomedgraphica.dropna(subset=['OMIM_ID'], inplace=True)
omim_biomedgraphica = omim_biomedgraphica.assign(OMIM_ID=omim_biomedgraphica['OMIM_ID'].str.split(';')).explode('OMIM_ID')

omim_to_individualID = omim_biomedgraphica.groupby('OMIM_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()
hmdb_disease['To_ID'] = hmdb_disease['omim_id'].map(omim_to_individualID)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  omim_biomedgraphica.dropna(subset=['OMIM_ID'], inplace=True)


In [5]:
metabolite_disease = hmdb_disease[['From_ID', 'To_ID']]
metabolite_disease.dropna(subset=['From_ID'], inplace=True)
metabolite_disease.dropna(subset=['To_ID'], inplace=True)
metabolite_disease

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metabolite_disease.dropna(subset=['From_ID'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metabolite_disease.dropna(subset=['To_ID'], inplace=True)


Unnamed: 0,From_ID,To_ID
4,BMG_MT000001,BMG_DS040780
5,BMG_MT000001,BMG_DS059381
6,BMG_MT000001,BMG_DS000821
7,BMG_MT000001,BMG_DS010232
8,BMG_MT000001,BMG_DS039850
...,...,...
27665,BMG_MT156643,BMG_DS018120
27666,BMG_MT156676,BMG_DS018120
27667,BMG_MT156683,BMG_DS018120
27668,BMG_MT156684,BMG_DS018120


In [6]:
metabolite_disease['From_ID'] = metabolite_disease['From_ID'].str.split(';')
metabolite_disease['To_ID'] = metabolite_disease['To_ID'].str.split(';')

metabolite_disease = metabolite_disease.explode('From_ID')
metabolite_disease = metabolite_disease.explode('To_ID')

metabolite_disease.drop_duplicates(inplace=True)
metabolite_disease

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metabolite_disease['From_ID'] = metabolite_disease['From_ID'].str.split(';')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metabolite_disease['To_ID'] = metabolite_disease['To_ID'].str.split(';')


Unnamed: 0,From_ID,To_ID
4,BMG_MT000001,BMG_DS040780
5,BMG_MT000001,BMG_DS059381
6,BMG_MT000001,BMG_DS000821
7,BMG_MT000001,BMG_DS010232
8,BMG_MT000001,BMG_DS039850
...,...,...
27666,BMG_MT156676,BMG_DS018120
27667,BMG_MT156683,BMG_DS018120
27668,BMG_MT156684,BMG_DS018120
27669,BMG_MT156715,BMG_DS009904


## Metabolite-Disease Relationship

In [7]:
metabolite_disease['Source'] = 'HMDB'
metabolite_disease['Type'] = 'Metabolite-Disease'
max_length = len(str(len(metabolite_disease)))
metabolite_disease['BioMedGraphica_ID'] = ['BMG_ED_MTDS' + str(i).zfill(max_length) for i in range(1, len(metabolite_disease) + 1)]
columns = ['BioMedGraphica_ID'] + [col for col in metabolite_disease.columns if col != 'BioMedGraphica_ID']  # re-order columns
metabolite_disease = metabolite_disease[columns]
metabolite_disease

Unnamed: 0,BioMedGraphica_ID,From_ID,To_ID,Source,Type
4,BMG_ED_MTDS00001,BMG_MT000001,BMG_DS040780,HMDB,Metabolite-Disease
5,BMG_ED_MTDS00002,BMG_MT000001,BMG_DS059381,HMDB,Metabolite-Disease
6,BMG_ED_MTDS00003,BMG_MT000001,BMG_DS000821,HMDB,Metabolite-Disease
7,BMG_ED_MTDS00004,BMG_MT000001,BMG_DS010232,HMDB,Metabolite-Disease
8,BMG_ED_MTDS00005,BMG_MT000001,BMG_DS039850,HMDB,Metabolite-Disease
...,...,...,...,...,...
27666,BMG_ED_MTDS24966,BMG_MT156676,BMG_DS018120,HMDB,Metabolite-Disease
27667,BMG_ED_MTDS24967,BMG_MT156683,BMG_DS018120,HMDB,Metabolite-Disease
27668,BMG_ED_MTDS24968,BMG_MT156684,BMG_DS018120,HMDB,Metabolite-Disease
27669,BMG_ED_MTDS24969,BMG_MT156715,BMG_DS009904,HMDB,Metabolite-Disease


In [8]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Relation' / 'Metabolite-Disease'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Metabolite_Disease.csv'
metabolite_disease.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Relation\Metabolite-Disease\BioMedGraphica_Metabolite_Disease.csv
