### HMDB

In [None]:
# Download Link: https://www.hmdb.ca/downloads#
# Download Date: 2025-03-21
# Download Version: 2021-11-17

import xml.etree.ElementTree as ET
import pandas as pd

context = ET.iterparse('hmdb_metabolites.xml', events=('start', 'end'))
context = iter(context)

event, root = next(context)

namespace = {'hmdb': 'http://www.hmdb.ca'}

metabolites = []

for event, elem in context:
    if event == 'end' and elem.tag == '{http://www.hmdb.ca}metabolite':
        accession = elem.find('hmdb:accession', namespace).text
        
        drugbank_id = elem.find('hmdb:drugbank_id', namespace).text if elem.find('hmdb:drugbank_id', namespace) is not None else None
        
        metabolites.append({
            'accession': accession,
            'drugbank_id': drugbank_id
        })

        elem.clear()

hmdb_drug = pd.DataFrame(metabolites)

hmdb_drug.to_csv('hmdb_drug.csv', index=False)

print("Data has been saved to hmdb_drug.csv")

In [3]:
import pandas as pd

hmdb_drug = pd.read_csv('hmdb_drug.csv')
hmdb_drug.dropna(subset=['drugbank_id'], inplace=True)
hmdb_drug.dropna(subset=['accession'], inplace=True)
hmdb_drug

Unnamed: 0,accession,drugbank_id
0,HMDB0000001,DB04151
2,HMDB0000005,DB04553
6,HMDB0000012,DB02256
7,HMDB0000014,DB02594
11,HMDB0000019,DB04074
...,...,...
217869,HMDB0304897,DB11855
217870,HMDB0304898,DB13985
217872,HMDB0304900,DB15873
217873,HMDB0304901,DB14914


### BioMedGraphica ID

In [4]:
import pandas as pd
import os
from pathlib import Path

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir_metabolite = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Metabolite' / 'BioMedGraphica_Metabolite.csv'
target_dir_drug = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Drug' / 'BioMedGraphica_Drug.csv'
biomedgraphica_metabolite = pd.read_csv(target_dir_metabolite, dtype=str)
biomedgraphica_drug = pd.read_csv(target_dir_drug, dtype=str)

### HMDB Mapping

In [5]:
hmdb_biomedgraphica = biomedgraphica_metabolite[['BioMedGraphica_ID', 'HMDB_ID']]
hmdb_biomedgraphica.dropna(subset=['HMDB_ID'], inplace=True)
hmdb_biomedgraphica = hmdb_biomedgraphica.assign(HMDB_ID=hmdb_biomedgraphica['HMDB_ID'].str.split(';')).explode('HMDB_ID')

hmdb_to_individualID = hmdb_biomedgraphica.groupby('HMDB_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

hmdb_drug['To_ID'] = hmdb_drug['accession'].map(hmdb_to_individualID)
hmdb_drug.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hmdb_biomedgraphica.dropna(subset=['HMDB_ID'], inplace=True)


Unnamed: 0,accession,drugbank_id,To_ID
0,HMDB0000001,DB04151,BMG_MT000001
2,HMDB0000005,DB04553,BMG_MT000003
6,HMDB0000012,DB02256,BMG_MT000007
7,HMDB0000014,DB02594,BMG_MT000008
11,HMDB0000019,DB04074,BMG_MT000012


In [6]:
drugbank_biomedgraphica = biomedgraphica_drug[['DrugBank_ID', 'BioMedGraphica_ID']]
drugbank_biomedgraphica.dropna(subset=['DrugBank_ID'], inplace=True)
drugbank_biomedgraphica = drugbank_biomedgraphica.assign(DrugBank_ID=drugbank_biomedgraphica['DrugBank_ID'].str.split(';')).explode('DrugBank_ID')

drugbank_to_individualID = drugbank_biomedgraphica.groupby('DrugBank_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()
hmdb_drug['From_ID'] = hmdb_drug['drugbank_id'].map(drugbank_to_individualID)
hmdb_drug.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drugbank_biomedgraphica.dropna(subset=['DrugBank_ID'], inplace=True)


Unnamed: 0,accession,drugbank_id,To_ID,From_ID
0,HMDB0000001,DB04151,BMG_MT000001,BMG_DG211518
2,HMDB0000005,DB04553,BMG_MT000003,BMG_DG169584
6,HMDB0000012,DB02256,BMG_MT000007,BMG_DG085197
7,HMDB0000014,DB02594,BMG_MT000008,BMG_DG085194
11,HMDB0000019,DB04074,BMG_MT000012,BMG_DG150709


In [7]:
hmdb_drug = hmdb_drug[['From_ID', 'To_ID']]
hmdb_drug.dropna(subset=['From_ID'], inplace=True)
hmdb_drug.dropna(subset=['To_ID'], inplace=True)
hmdb_drug

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hmdb_drug.dropna(subset=['From_ID'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hmdb_drug.dropna(subset=['To_ID'], inplace=True)


Unnamed: 0,From_ID,To_ID
0,BMG_DG211518,BMG_MT000001
2,BMG_DG169584,BMG_MT000003
6,BMG_DG085197,BMG_MT000007
7,BMG_DG085194,BMG_MT000008
11,BMG_DG150709,BMG_MT000012
...,...,...
217869,BMG_DG013669,BMG_MT217892
217870,BMG_DG197343;BMG_DG263754,BMG_MT217893
217872,BMG_DG265585,BMG_MT217895
217873,BMG_DG186007;BMG_DG264642,BMG_MT217896


In [8]:
hmdb_drug['From_ID'] = hmdb_drug['From_ID'].str.split(';')
hmdb_drug['To_ID'] = hmdb_drug['To_ID'].str.split(';')

hmdb_drug = hmdb_drug.explode('From_ID')
hmdb_drug = hmdb_drug.explode('To_ID')

hmdb_drug['From_ID'] = hmdb_drug['From_ID'].str.strip()
hmdb_drug['To_ID'] = hmdb_drug['To_ID'].str.strip()
hmdb_drug.drop_duplicates(inplace=True)
hmdb_drug

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hmdb_drug['From_ID'] = hmdb_drug['From_ID'].str.split(';')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hmdb_drug['To_ID'] = hmdb_drug['To_ID'].str.split(';')


Unnamed: 0,From_ID,To_ID
0,BMG_DG211518,BMG_MT000001
2,BMG_DG169584,BMG_MT000003
6,BMG_DG085197,BMG_MT000007
7,BMG_DG085194,BMG_MT000008
11,BMG_DG150709,BMG_MT000012
...,...,...
217870,BMG_DG263754,BMG_MT217893
217872,BMG_DG265585,BMG_MT217895
217873,BMG_DG186007,BMG_MT217896
217873,BMG_DG264642,BMG_MT217896


### Drug-Metabolite Relation

In [9]:
hmdb_drug['Source'] = 'HMDB'
hmdb_drug['Type'] = 'Drug-Metabolite'

max_length = len(str(len(hmdb_drug)))
hmdb_drug['BioMedGraphica_ID'] = ['BMG_ED_DGMT' + str(i).zfill(max_length) for i in range(1, len(hmdb_drug) + 1)]
columns = ['BioMedGraphica_ID'] + [col for col in hmdb_drug.columns if col != 'BioMedGraphica_ID']  # re-order columns
hmdb_drug = hmdb_drug[columns]
hmdb_drug

Unnamed: 0,BioMedGraphica_ID,From_ID,To_ID,Source,Type
0,BMG_ED_DGMT0001,BMG_DG211518,BMG_MT000001,HMDB,Drug-Metabolite
2,BMG_ED_DGMT0002,BMG_DG169584,BMG_MT000003,HMDB,Drug-Metabolite
6,BMG_ED_DGMT0003,BMG_DG085197,BMG_MT000007,HMDB,Drug-Metabolite
7,BMG_ED_DGMT0004,BMG_DG085194,BMG_MT000008,HMDB,Drug-Metabolite
11,BMG_ED_DGMT0005,BMG_DG150709,BMG_MT000012,HMDB,Drug-Metabolite
...,...,...,...,...,...
217870,BMG_ED_DGMT3585,BMG_DG263754,BMG_MT217893,HMDB,Drug-Metabolite
217872,BMG_ED_DGMT3586,BMG_DG265585,BMG_MT217895,HMDB,Drug-Metabolite
217873,BMG_ED_DGMT3587,BMG_DG186007,BMG_MT217896,HMDB,Drug-Metabolite
217873,BMG_ED_DGMT3588,BMG_DG264642,BMG_MT217896,HMDB,Drug-Metabolite


In [10]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Relation' / 'Drug-Metabolite'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Drug_Metabolite.csv'
hmdb_drug.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Relation\Drug-Metabolite\BioMedGraphica_Drug_Metabolite.csv
