# BioMedGraphica Metabolite

## 1. Data Access  
### Direct Download Links  
**HMDB**: Can be downloaded directly via the link without the need for registration. [Link](https://hmdb.ca/downloads)  
**ChEBI**: Can be downloaded directly via the link without the need for registration. [Link1](https://www.ebi.ac.uk/chebi/chebiOntology.do?chebiId=77746); [Link2](https://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/database_accession.tsv)

### HMDB Pre-Process

In [None]:
import pandas as pd
import xml.etree.ElementTree as ET
import csv

tree = ET.parse('hmdb_metabolites.xml')
root = tree.getroot()

# Extract the relevant fields
namespace = {'hmdb': 'http://www.hmdb.ca'}
metabolites = []
for metabolite in root.findall('hmdb:metabolite', namespace):
    data = {
        'accession': metabolite.find('hmdb:accession', namespace).text,
        'name': metabolite.find('hmdb:name', namespace).text,
        'iupac_name': metabolite.find('hmdb:iupac_name', namespace).text if metabolite.find('hmdb:iupac_name', namespace) is not None else None,
        'cas_registry_number': metabolite.find('hmdb:cas_registry_number', namespace).text if metabolite.find('hmdb:cas_registry_number', namespace) is not None else None,
        'smiles': metabolite.find('hmdb:smiles', namespace).text if metabolite.find('hmdb:smiles', namespace) is not None else None,
        'inchi': metabolite.find('hmdb:inchi', namespace).text if metabolite.find('hmdb:inchi', namespace) is not None else None,
        'inchikey': metabolite.find('hmdb:inchikey', namespace).text if metabolite.find('hmdb:inchikey', namespace) is not None else None,
        'chemspider_id': metabolite.find('hmdb:chemspider_id', namespace).text if metabolite.find('hmdb:chemspider_id', namespace) is not None else None,
        'drugbank_id': metabolite.find('hmdb:drugbank_id', namespace).text if metabolite.find('hmdb:drugbank_id', namespace) is not None else None,
        'pubchem_compound_id': metabolite.find('hmdb:pubchem_compound_id', namespace).text if metabolite.find('hmdb:pubchem_compound_id', namespace) is not None else None,
        'pdb_id': metabolite.find('hmdb:pdb_id', namespace).text if metabolite.find('hmdb:pdb_id', namespace) is not None else None,
        'chebi_id': metabolite.find('hmdb:chebi_id', namespace).text if metabolite.find('hmdb:chebi_id', namespace) is not None else None,
        'kegg_id': metabolite.find('hmdb:kegg_id', namespace).text if metabolite.find('hmdb:kegg_id', namespace) is not None else None
    }
    metabolites.append(data)

# Convert the data to a DataFrame
hmdb = pd.DataFrame(metabolites)  # Correctly create DataFrame from metabolites
hmdb.to_csv('hmdb.csv', index=False)  # Save DataFrame to CSV file

## 2. Load Data

### 2.1 HMDB

In [1]:
import pandas as pd

hmdb = pd.read_csv('hmdb.csv')  # Load the CSV file into a DataFrame
hmdb

Unnamed: 0,accession,name,iupac_name,cas_registry_number,smiles,inchi,inchikey,chemspider_id,drugbank_id,pubchem_compound_id,pdb_id,chebi_id,kegg_id
0,HMDB0000001,1-Methylhistidine,(2S)-2-amino-3-(1-methyl-1H-imidazol-4-yl)prop...,332-80-9,CN1C=NC(C[C@H](N)C(O)=O)=C1,InChI=1S/C7H11N3O2/c1-10-3-5(9-4-10)2-6(8)7(11...,BRMWTNUJHUMWMS-LURJTMIESA-N,83153.0,DB04151,92105.0,,50599.0,C01152
1,HMDB0000002,"1,3-Diaminopropane","propane-1,3-diamine",109-76-2,NCCCN,InChI=1S/C3H10N2/c4-2-1-3-5/h1-5H2,XFNJVJPLKCPIBV-UHFFFAOYSA-N,415.0,,428.0,,15725.0,C00986
2,HMDB0000005,2-Ketobutyric acid,2-oxobutanoic acid,600-18-0,CCC(=O)C(O)=O,"InChI=1S/C4H6O3/c1-2-3(5)4(6)7/h2H2,1H3,(H,6,7)",TYEYBOSBBBHJIV-UHFFFAOYSA-N,57.0,DB04553,58.0,,30831.0,C00109
3,HMDB0000008,2-Hydroxybutyric acid,(2S)-2-hydroxybutanoic acid,3347-90-8,CC[C@H](O)C(O)=O,"InChI=1S/C4H8O3/c1-2-3(5)4(6)7/h3,5H,2H2,1H3,(...",AFENDNXGAFYKQO-VKHMYHEASA-N,389701.0,,440864.0,,50613.0,C05984
4,HMDB0000010,2-Methoxyestrone,"(1S,10R,11S,15S)-5-hydroxy-4-methoxy-15-methyl...",362-08-3,[H][C@@]12CCC(=O)[C@@]1(C)CC[C@]1([H])C3=C(CC[...,InChI=1S/C19H24O3/c1-19-8-7-12-13(15(19)5-6-18...,WHEUWNKSCXYKBU-QPWUGHHJSA-N,389515.0,,440624.0,,1189.0,C05299
...,...,...,...,...,...,...,...,...,...,...,...,...,...
217915,HMDB0304947,Nordeoxycholic acid,"3-{5,16-dihydroxy-2,15-dimethyltetracyclo[8.7....",,CC(CC(O)=O)C1CCC2C3CCC4CC(O)CCC4(C)C3CC(O)C12C,InChI=1S/C23H38O4/c1-13(10-21(26)27)17-6-7-18-...,PLRQOCVIINWCFA-UHFFFAOYSA-N,278117.0,,314374.0,,,
217916,HMDB0304950,3-Oxo-5beta-cholanoic acid,"4-{2,15-dimethyl-5-oxotetracyclo[8.7.0.0^{2,7}...",,CC(CCC(O)=O)C1CCC2C3CCC4CC(=O)CCC4(C)C3CCC12C,InChI=1S/C24H38O3/c1-15(4-9-22(26)27)19-7-8-20...,KIQFUORWRVZTHT-UHFFFAOYSA-N,473110.0,,543448.0,,,
217917,HMDB0304951,Glycerol 1-myristate,"2,3-dihydroxypropyl tetradecanoate",,CCCCCCCCCCCCCC(=O)OCC(O)CO,InChI=1S/C17H34O4/c1-2-3-4-5-6-7-8-9-10-11-12-...,DCBSHORRWZKAKO-UHFFFAOYSA-N,,,79050.0,,75562.0,
217918,HMDB0304953,O-Phenolsulfonic acid,2-hydroxybenzene-1-sulfonic acid,,OC1=CC=CC=C1S(O)(=O)=O,"InChI=1S/C6H6O4S/c7-5-3-1-2-4-6(5)11(8,9)10/h1...",IULJSGIJJZZUMF-UHFFFAOYSA-N,,,11867.0,,71049.0,


In [3]:
hmdb['chemspider_id'] = hmdb['chemspider_id'].apply(lambda x: str(int(x)) if pd.notnull(x) else '')  # Convert the 'chemspider_id' column to a string
hmdb['pubchem_compound_id'] = hmdb['pubchem_compound_id'].apply(lambda x: str(int(x)) if pd.notnull(x) else '')  # Convert the 'pubchem_compound_id' column to a string
hmdb['chebi_id'] = hmdb['chebi_id'].apply(lambda x: str(int(x)) if pd.notnull(x) else '')  # Convert the 'pdb_id' column to a string
hmdb.replace('', pd.NA, inplace=True)  # Replace empty strings with missing values
hmdb

Unnamed: 0,accession,name,iupac_name,cas_registry_number,smiles,inchi,inchikey,chemspider_id,drugbank_id,pubchem_compound_id,pdb_id,chebi_id,kegg_id
0,HMDB0000001,1-Methylhistidine,(2S)-2-amino-3-(1-methyl-1H-imidazol-4-yl)prop...,332-80-9,CN1C=NC(C[C@H](N)C(O)=O)=C1,InChI=1S/C7H11N3O2/c1-10-3-5(9-4-10)2-6(8)7(11...,BRMWTNUJHUMWMS-LURJTMIESA-N,83153,DB04151,92105,,50599,C01152
1,HMDB0000002,"1,3-Diaminopropane","propane-1,3-diamine",109-76-2,NCCCN,InChI=1S/C3H10N2/c4-2-1-3-5/h1-5H2,XFNJVJPLKCPIBV-UHFFFAOYSA-N,415,,428,,15725,C00986
2,HMDB0000005,2-Ketobutyric acid,2-oxobutanoic acid,600-18-0,CCC(=O)C(O)=O,"InChI=1S/C4H6O3/c1-2-3(5)4(6)7/h2H2,1H3,(H,6,7)",TYEYBOSBBBHJIV-UHFFFAOYSA-N,57,DB04553,58,,30831,C00109
3,HMDB0000008,2-Hydroxybutyric acid,(2S)-2-hydroxybutanoic acid,3347-90-8,CC[C@H](O)C(O)=O,"InChI=1S/C4H8O3/c1-2-3(5)4(6)7/h3,5H,2H2,1H3,(...",AFENDNXGAFYKQO-VKHMYHEASA-N,389701,,440864,,50613,C05984
4,HMDB0000010,2-Methoxyestrone,"(1S,10R,11S,15S)-5-hydroxy-4-methoxy-15-methyl...",362-08-3,[H][C@@]12CCC(=O)[C@@]1(C)CC[C@]1([H])C3=C(CC[...,InChI=1S/C19H24O3/c1-19-8-7-12-13(15(19)5-6-18...,WHEUWNKSCXYKBU-QPWUGHHJSA-N,389515,,440624,,1189,C05299
...,...,...,...,...,...,...,...,...,...,...,...,...,...
217915,HMDB0304947,Nordeoxycholic acid,"3-{5,16-dihydroxy-2,15-dimethyltetracyclo[8.7....",,CC(CC(O)=O)C1CCC2C3CCC4CC(O)CCC4(C)C3CC(O)C12C,InChI=1S/C23H38O4/c1-13(10-21(26)27)17-6-7-18-...,PLRQOCVIINWCFA-UHFFFAOYSA-N,278117,,314374,,,
217916,HMDB0304950,3-Oxo-5beta-cholanoic acid,"4-{2,15-dimethyl-5-oxotetracyclo[8.7.0.0^{2,7}...",,CC(CCC(O)=O)C1CCC2C3CCC4CC(=O)CCC4(C)C3CCC12C,InChI=1S/C24H38O3/c1-15(4-9-22(26)27)19-7-8-20...,KIQFUORWRVZTHT-UHFFFAOYSA-N,473110,,543448,,,
217917,HMDB0304951,Glycerol 1-myristate,"2,3-dihydroxypropyl tetradecanoate",,CCCCCCCCCCCCCC(=O)OCC(O)CO,InChI=1S/C17H34O4/c1-2-3-4-5-6-7-8-9-10-11-12-...,DCBSHORRWZKAKO-UHFFFAOYSA-N,,,79050,,75562,
217918,HMDB0304953,O-Phenolsulfonic acid,2-hydroxybenzene-1-sulfonic acid,,OC1=CC=CC=C1S(O)(=O)=O,"InChI=1S/C6H6O4S/c7-5-3-1-2-4-6(5)11(8,9)10/h1...",IULJSGIJJZZUMF-UHFFFAOYSA-N,,,11867,,71049,


### 2.2 ChEBI

Metabolome ChEBI

In [4]:
# Since there seems to be misalignment in the data, I'll read the file line by line, and split manually
with open('ChEBI_Results.tsv', 'r') as file:
    lines = file.readlines()

# Split each line by tab and manually build the dataframe
data = [line.strip().split('\t') for line in lines]

# Create a DataFrame from the split data
chebi = pd.DataFrame(data[1:], columns=["ID", "NAME"])
chebi['ID'] = chebi['ID'].apply(lambda x: x.split(':')[-1])  # Extract the ChEBI ID from the 'ID' column
chebi = chebi.rename(columns={'ID': 'chebi_id', 'NAME': 'chebi_name'})  # Rename the columns
chebi

Unnamed: 0,chebi_id,chebi_name
0,10983,(R)-3-hydroxybutyrate
1,17771,trans-urocanate
2,18703,D-pipecolate
3,45479,13-cis-retinol
4,45487,13-cis-retinal
...,...,...
5752,192078,L-citrulline-d2
5753,192080,L-methionine-d3
5754,192086,L-valine-d8
5755,192088,L-proline-d7


Xref

In [5]:
chebi_xref = pd.read_csv('database_accession.tsv', sep='\t')  # Load the 'chebi_xrefs.tsv' file into a DataFrame
chebi_xref['COMPOUND_ID'] = chebi_xref['COMPOUND_ID'].apply(lambda x: str(int(x)))  # Convert the 'COMPOUND_ID' column to a string
chebi_xref = chebi_xref[['COMPOUND_ID', 'TYPE', 'ACCESSION_NUMBER']]  # Select only the relevant columns
chebi_xref = chebi_xref.rename(columns={'COMPOUND_ID': 'chebi_id'})  # Rename the columns

chebi_xref_pivot = chebi_xref.pivot_table(index='chebi_id', columns='TYPE', values='ACCESSION_NUMBER', aggfunc=lambda x: ';'.join(map(str, x)))  # Pivot the DataFrame
chebi_xref_pivot.reset_index(inplace=True)  # Reset the index
chebi_xref_pivot

TYPE,chebi_id,Agricola citation,BPDB accession,Beilstein Registry Number,CAS Registry Number,COMe accession,ChemIDplus accession,Chemspider accession,Chinese Abstracts citation,CiteXplore citation,...,PubMed citation,Pubchem accession,RESID accession,Reaxys Registry Number,SMID accession,UM-BBD compID,VSDB accession,WebElements accession,Wikipedia accession,YMDB accession
0,10,,,,21008-67-3,,,,,,...,,,,,,,,,,
1,100,,,1257009,32383-76-9;32383-76-9,,,,,,...,,,,,,,,,,
2,10000,,,,87605-72-9,,,,,,...,,,,,,,,,,
3,100000,,,,,,,,,,...,,,,,,,,,,
4,100001,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161978,99995,,,,,,,,,,...,,,,,,,,,,
161979,99996,,,,,,,,,,...,,,,,,,,,,
161980,99997,,,,,,,,,,...,,,,,,,,,,
161981,99998,,,,,,,,,,...,,,,,,,,,,


In [6]:
chebi_final = pd.merge(chebi, chebi_xref, on='chebi_id', how='left')  # Merge the 'chebi' and 'chebi_xref' DataFrames
chebi_filter = chebi_final[(chebi_final['TYPE'] == 'CAS Registry Number') | (chebi_final['TYPE'] == 'HMDB accession')]  # Filter the 'chebi_final' DataFrame for rows where the 'TYPE' column is 'CAS Registry Number' or 'HMDB accession'

# create a pivot table from the 'chebi_filter' DataFrame
chebi_pivot = chebi_filter.pivot_table(index=['chebi_id', 'chebi_name'], columns='TYPE', values='ACCESSION_NUMBER', aggfunc=lambda x: ';'.join(pd.Series(x).unique()))
chebi_pivot = chebi_pivot.reset_index()  # Reset the index of the DataFrame
chebi_pivot

TYPE,chebi_id,chebi_name,CAS Registry Number,HMDB accession
0,10216,cedr-8-ene,469-61-4,HMDB0059695
1,10224,alpha-cubebene,17699-14-8,HMDB0036413
2,102485,5-carboxy-2'-deoxyuridine,14599-46-3,HMDB0060774
3,10295,alpha-L-sorbopyranose,470-15-5,
4,10319,1-naphthol,90-15-3,HMDB0012138
...,...,...,...,...
2929,9410,taurodeoxycholic acid,516-50-7,HMDB0000896
2930,9440,tenuazonic acid,27778-66-1,
2931,9533,thiamine(1+) monophosphate,10023-48-0,HMDB0002666
2932,9534,thiamine(1+) triphosphate,50851-39-3,HMDB0001512


## 3. Merge Data

In [7]:
# check duplicates inside the dataframe
def merge_column(df, column1, column2, new_column):
    df[column1] = df[column1].fillna('')
    df[column2] = df[column2].fillna('')
    df[new_column] = df.apply(lambda row: f"{row[column1]} {row[column2]}".strip(), axis=1)

    expanded_rows = df[new_column].str.split(expand=True).stack().reset_index(level=1, drop=True)
    expanded_rows.name = new_column

    df = df.drop(columns=[new_column]).join(expanded_rows)
    df.drop(columns=[column1, column2], inplace=True)
    df.drop_duplicates(inplace=True)
    
    return df

def merge_string_columns(df, columns, merge_name, separator=';'):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

### 3.1 HMDB + ChEBI

In [8]:
hmdb_chebi = pd.merge(hmdb, chebi_pivot, on='chebi_id', how='outer')  # Merge the 'hmdb' and 'chebi' DataFrames
hmdb_chebi_v1 = merge_column(hmdb_chebi, 'accession', 'HMDB accession', 'HMDB_ID')
hmdb_chebi_v1 = merge_column(hmdb_chebi_v1, 'cas_registry_number', 'CAS Registry Number', 'CAS_ID')
hmdb_chebi_v1 = hmdb_chebi_v1.replace('', pd.NA)
hmdb_chebi_v1

Unnamed: 0,name,iupac_name,smiles,inchi,inchikey,chemspider_id,drugbank_id,pubchem_compound_id,pdb_id,chebi_id,kegg_id,chebi_name,HMDB_ID,CAS_ID
0,Visnagin,"4-methoxy-7-methyl-5H-furo[3,2-g]chromen-5-one",COC1=C2C=COC2=CC2=C1C(=O)C=C(C)O2,InChI=1S/C13H10O4/c1-7-5-9(14)12-11(17-7)6-10-...,NZVQLVGOZRELTG-UHFFFAOYSA-N,6460,,,,10002,C09049,,HMDB0259840,
1,"13,14-Dihydro PGE1","7-[(1R,2R,3R)-3-hydroxy-2-[(3S)-3-hydroxyoctyl...",CCCCC[C@H](O)CC[C@H]1[C@H](O)CC(=O)[C@@H]1CCCC...,InChI=1S/C20H36O5/c1-2-3-6-9-15(21)12-13-17-16...,DPOINJQWXDTOSF-DODZYUBVSA-N,141668,,161273,,1000694,,,HMDB0002689,19313-28-1
2,Dihydrocubebin,"2,3-bis(2H-1,3-benzodioxol-5-ylmethyl)butane-1...",OCC(CC1=CC2=C(OCO2)C=C1)C(CO)CC1=CC2=C(OCO2)C=C1,InChI=1S/C20H22O6/c21-9-15(5-13-1-3-17-19(7-13...,JKCVMTYNARDGET-UHFFFAOYSA-N,3683161,,4485343,,1001275,C10558,,HMDB0030709,24563-03-9
3,Nalidixic Acid,"1-ethyl-7-methyl-4-oxo-1,4-dihydro-1,8-naphthy...",CCN1C=C(C(O)=O)C(=O)C2=C1N=C(C)C=C2,InChI=1S/C12H12N2O3/c1-3-14-6-9(12(16)17)10(15...,MHWLWQUZZRMNGJ-UHFFFAOYSA-N,4268,DB00779,4421,,100147,C05079,,HMDB0014917,389-08-2
4,Girinimbine,"3,3,5-trimethyl-3H,11H-pyrano[3,2-a]carbazole",CC1=C2OC(C)(C)C=CC2=C2NC3=CC=CC=C3C2=C1,InChI=1S/C18H17NO/c1-11-10-14-12-6-4-5-7-15(12...,GAEQWKVGMHUUKO-UHFFFAOYSA-N,87534,,96943,,1001644,,,HMDB0030241,23095-44-5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219110,Glycohyodeoxycholic acid,"2-(4-{5,8-dihydroxy-2,15-dimethyltetracyclo[8....",CC(CCC(=O)NCC(O)=O)C1CCC2C3CC(O)C4CC(O)CCC4(C)...,InChI=1S/C26H43NO5/c1-15(4-7-23(30)27-14-24(31...,SPOIYSFQOFYOFZ-UHFFFAOYSA-N,,,13955650,,,,,HMDB0304944,
219111,Indole-3-acetic acid ethyl ester,ethyl 2-(1H-indol-3-yl)acetate,CCOC(=O)CC1=CNC2=CC=CC=C12,InChI=1S/C12H13NO2/c1-2-15-12(14)7-9-8-13-11-6...,HUDBDWIQSIGUDI-UHFFFAOYSA-N,,,13067,,,,,HMDB0304946,
219112,Nordeoxycholic acid,"3-{5,16-dihydroxy-2,15-dimethyltetracyclo[8.7....",CC(CC(O)=O)C1CCC2C3CCC4CC(O)CCC4(C)C3CC(O)C12C,InChI=1S/C23H38O4/c1-13(10-21(26)27)17-6-7-18-...,PLRQOCVIINWCFA-UHFFFAOYSA-N,278117,,314374,,,,,HMDB0304947,
219113,3-Oxo-5beta-cholanoic acid,"4-{2,15-dimethyl-5-oxotetracyclo[8.7.0.0^{2,7}...",CC(CCC(O)=O)C1CCC2C3CCC4CC(=O)CCC4(C)C3CCC12C,InChI=1S/C24H38O3/c1-15(4-9-22(26)27)19-7-8-20...,KIQFUORWRVZTHT-UHFFFAOYSA-N,473110,,543448,,,,,HMDB0304950,


### 3.2 Make HMDB Unique

In [9]:
def data_cleaning(df, column):
    # Step 1: Separate the DataFrame into unique and duplicated entries based on the column
    df_no_duplicates = df[~(df[column].duplicated(keep=False) & df[column].notna()) | df[column].isna()]
    df_duplicates = df[df[column].notna() & df.duplicated(column, keep=False)].sort_values(column)
    
    # Step 2: Define the merge_entries function to handle merging of duplicated entries
    def merge_entries(series):
        filtered = series.dropna().astype(str)
        unique_filtered = ';'.join(filtered.unique())
        return unique_filtered
    
    # Step 3: Group the duplicated entries and merge them using the merge_entries function
    df_duplicates_merged = df_duplicates.groupby(column).agg(merge_entries).reset_index()
    
    return df_no_duplicates, df_duplicates_merged

no_duplicated_hmdb, duplicated_hmdb = data_cleaning(hmdb_chebi_v1, 'HMDB_ID')
hmdb_chebi_v2 = pd.concat([no_duplicated_hmdb, duplicated_hmdb], ignore_index=True)
hmdb_chebi_v2.replace('', pd.NA, inplace=True)
hmdb_chebi_v2

Unnamed: 0,name,iupac_name,smiles,inchi,inchikey,chemspider_id,drugbank_id,pubchem_compound_id,pdb_id,chebi_id,kegg_id,chebi_name,HMDB_ID,CAS_ID
0,Visnagin,"4-methoxy-7-methyl-5H-furo[3,2-g]chromen-5-one",COC1=C2C=COC2=CC2=C1C(=O)C=C(C)O2,InChI=1S/C13H10O4/c1-7-5-9(14)12-11(17-7)6-10-...,NZVQLVGOZRELTG-UHFFFAOYSA-N,6460,,,,10002,C09049,,HMDB0259840,
1,"13,14-Dihydro PGE1","7-[(1R,2R,3R)-3-hydroxy-2-[(3S)-3-hydroxyoctyl...",CCCCC[C@H](O)CC[C@H]1[C@H](O)CC(=O)[C@@H]1CCCC...,InChI=1S/C20H36O5/c1-2-3-6-9-15(21)12-13-17-16...,DPOINJQWXDTOSF-DODZYUBVSA-N,141668,,161273,,1000694,,,HMDB0002689,19313-28-1
2,Dihydrocubebin,"2,3-bis(2H-1,3-benzodioxol-5-ylmethyl)butane-1...",OCC(CC1=CC2=C(OCO2)C=C1)C(CO)CC1=CC2=C(OCO2)C=C1,InChI=1S/C20H22O6/c21-9-15(5-13-1-3-17-19(7-13...,JKCVMTYNARDGET-UHFFFAOYSA-N,3683161,,4485343,,1001275,C10558,,HMDB0030709,24563-03-9
3,Nalidixic Acid,"1-ethyl-7-methyl-4-oxo-1,4-dihydro-1,8-naphthy...",CCN1C=C(C(O)=O)C(=O)C2=C1N=C(C)C=C2,InChI=1S/C12H12N2O3/c1-3-14-6-9(12(16)17)10(15...,MHWLWQUZZRMNGJ-UHFFFAOYSA-N,4268,DB00779,4421,,100147,C05079,,HMDB0014917,389-08-2
4,Girinimbine,"3,3,5-trimethyl-3H,11H-pyrano[3,2-a]carbazole",CC1=C2OC(C)(C)C=CC2=C2NC3=CC=CC=C3C2=C1,InChI=1S/C18H17NO/c1-11-10-14-12-6-4-5-7-15(12...,GAEQWKVGMHUUKO-UHFFFAOYSA-N,87534,,96943,,1001644,,,HMDB0030241,23095-44-5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218330,"PS(22:2(13Z,16Z)/20:4(6Z,8E,10E,14Z)-2OH(5S,12R))","(2S)-2-amino-3-({[(2R)-2-{[(5R,6Z,8E,10E,12S,1...",[H][C@@](COC(=O)CCCCCCCCCCC\C=C/C\C=C/CCCCC)(C...,InChI=1S/C48H82NO12P/c1-3-5-7-9-11-12-13-14-15...,KCLZSFVFYNHPLH-YKPLNWMYSA-N,,,,,230137,,"PS(22:2(13Z,16Z)/20:4(6Z,8E,10E,14Z)-2OH(5S,12R))",HMDB0283120,
218331,"DG(12:0/18:1(12Z)-O(9S,10R)/0:0)",(2S)-3-hydroxy-2-[(8-{3-[(2Z)-oct-2-en-1-yl]ox...,CCCCCCCCCCCC(=O)OC[C@H](CO)OC(=O)CCCCCCCC1OC1C...,InChI=1S/C33H60O6/c1-3-5-7-9-11-12-13-17-21-25...,LITIJTZLGLUHRG-FVGXFSABSA-N,,,,,231192,,"DG(12:0/18:1(12Z)-O(9S,10R)/0:0)",HMDB0294725,
218332,"DG(15:0/0:0/20:3(5Z,8Z,11Z)-O(14R,15S))","(2R)-2-hydroxy-3-{[(5Z,8Z,11Z)-13-(3-pentyloxi...",CCCCCCCCCCCCCCC(=O)OC[C@@H](O)COC(=O)CCC\C=C/C...,InChI=1S/C38H66O6/c1-3-5-7-8-9-10-11-12-16-19-...,QZRFNZSUPCFJCE-PSMWKCNTSA-N,,,,,230153,,"DG(15:0/0:0/20:3(5Z,8Z,11Z)-O(14R,15S))",HMDB0295185,
218333,"DG(8:0/18:1(12Z)-2OH(9,10)/0:0)","(2S)-1-hydroxy-3-(octanoyloxy)propan-2-yl (9S,...",CCCCCCCC(=O)OC[C@H](CO)OC(=O)CCCCCCC[C@H](O)[C...,InChI=1S/C29H54O7/c1-3-5-7-9-12-15-19-26(31)27...,VVIFSARANRPVQE-PTUZYTQWSA-N,,,,,231035,,"DG(8:0/18:1(12Z)-2OH(9,10)/0:0)",HMDB0297171,


In [10]:
hmdb_chebi_v2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 218335 entries, 0 to 218334
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   name                 217934 non-null  object
 1   iupac_name           216866 non-null  object
 2   smiles               217910 non-null  object
 3   inchi                217909 non-null  object
 4   inchikey             217913 non-null  object
 5   chemspider_id        31291 non-null   object
 6   drugbank_id          3269 non-null    object
 7   pubchem_compound_id  104246 non-null  object
 8   pdb_id               524 non-null     object
 9   chebi_id             14822 non-null   object
 10  kegg_id              6826 non-null    object
 11  chebi_name           2959 non-null    object
 12  HMDB_ID              217942 non-null  object
 13  CAS_ID               16312 non-null   object
dtypes: object(14)
memory usage: 23.3+ MB


## 4. BioMedGraphica ID

In [11]:
biomedgraphica_metabolite = hmdb_chebi_v2.sort_values(by=['HMDB_ID'], na_position='last')
biomedgraphica_metabolite.reset_index(drop=True, inplace=True)

max_length = len(str(len(biomedgraphica_metabolite)))
biomedgraphica_metabolite['BioMedGraphica_ID'] = ['BMG_MT' + str(i).zfill(max_length) for i in range(1, len(biomedgraphica_metabolite) + 1)]
biomedgraphica_metabolite.rename(columns={'iupac_name': 'IUPAC_Name', 'smiles': 'SMILES', 'inchi': 'InChI', 'inchikey': 'InChIKey', 'chemspider_id': 'ChemSpider_ID', 'drugbank_id': 'DrugBank_ID', 'pubchem_compound_id': 'PubChem_CID', 'pdb_id': 'PDB_ID', 'chebi_id': 'ChEBI_ID', 'kegg_id': 'KEGG_ID', 'name': 'HMDB_Name', 'chebi_name':'ChEBI_Name', 'CAS_ID':'CAS_RN'}, inplace=True)
column_order = ['BioMedGraphica_ID', 'HMDB_ID', 'PubChem_CID', 'CAS_RN', 'ChemSpider_ID', 'PDB_ID', 'ChEBI_ID', 'KEGG_ID', 'HMDB_Name', 'ChEBI_Name', 'IUPAC_Name', 'SMILES', 'InChI', 'InChIKey']
biomedgraphica_metabolite = biomedgraphica_metabolite[column_order]
biomedgraphica_metabolite

Unnamed: 0,BioMedGraphica_ID,HMDB_ID,PubChem_CID,CAS_RN,ChemSpider_ID,PDB_ID,ChEBI_ID,KEGG_ID,HMDB_Name,ChEBI_Name,IUPAC_Name,SMILES,InChI,InChIKey
0,BMG_MT000001,HMDB0000001,92105,332-80-9,83153,,50599,C01152,1-Methylhistidine,N(tele)-methyl-L-histidine,(2S)-2-amino-3-(1-methyl-1H-imidazol-4-yl)prop...,CN1C=NC(C[C@H](N)C(O)=O)=C1,InChI=1S/C7H11N3O2/c1-10-3-5(9-4-10)2-6(8)7(11...,BRMWTNUJHUMWMS-LURJTMIESA-N
1,BMG_MT000002,HMDB0000002,428,109-76-2,415,,15725,C00986,"1,3-Diaminopropane",trimethylenediamine,"propane-1,3-diamine",NCCCN,InChI=1S/C3H10N2/c4-2-1-3-5/h1-5H2,XFNJVJPLKCPIBV-UHFFFAOYSA-N
2,BMG_MT000003,HMDB0000005,58,600-18-0,57,,30831;16763,C00109,2-Ketobutyric acid,2-oxobutanoate,2-oxobutanoic acid,CCC(=O)C(O)=O,"InChI=1S/C4H6O3/c1-2-3(5)4(6)7/h2H2,1H3,(H,6,7)",TYEYBOSBBBHJIV-UHFFFAOYSA-N
3,BMG_MT000004,HMDB0000008,440864,600-15-7;3347-90-8,389701,,1148;50613,C05984,2-Hydroxybutyric acid,2-hydroxybutyric acid,(2S)-2-hydroxybutanoic acid,CC[C@H](O)C(O)=O,"InChI=1S/C4H8O3/c1-2-3(5)4(6)7/h3,5H,2H2,1H3,(...",AFENDNXGAFYKQO-VKHMYHEASA-N
4,BMG_MT000005,HMDB0000010,440624,362-08-3,389515,,1189,C05299,2-Methoxyestrone,2-methoxyestrone,"(1S,10R,11S,15S)-5-hydroxy-4-methoxy-15-methyl...",[H][C@@]12CCC(=O)[C@@]1(C)CC[C@]1([H])C3=C(CC[...,InChI=1S/C19H24O3/c1-19-8-7-12-13(15(19)5-6-18...,WHEUWNKSCXYKBU-QPWUGHHJSA-N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218330,BMG_MT218331,,,52-89-1,,,91247,,,L-cysteine hydrochloride,,,,
218331,BMG_MT218332,,,7048-04-6,,,91248,,,L-cysteine hydrochloride hydrate,,,,
218332,BMG_MT218333,,,2706-75-4,,,91251,,,sodium glyoxylate,,,,
218333,BMG_MT218334,,,23017-93-8,,,91272,,,13-HPODE,,,,


In [12]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Metabolite'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Metabolite.csv'
biomedgraphica_metabolite.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Metabolite\BioMedGraphica_Metabolite.csv


## 5. Description

In [2]:
import pandas as pd
import os
from pathlib import Path

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Metabolite' / 'BioMedGraphica_Metabolite.csv'
biomedgraphica_metabolite = pd.read_csv(target_dir, dtype=str)

### 5.1 From HMDB

In [None]:
import xml.etree.ElementTree as ET
import csv

tree = ET.parse('hmdb_metabolites.xml')
root = tree.getroot()

# Extract the relevant fields
namespace = {'hmdb': 'http://www.hmdb.ca'}
metabolites = []
for metabolite in root.findall('hmdb:metabolite', namespace):
    data = {
        'accession': metabolite.find('hmdb:accession', namespace).text,
        'hmdb_description': metabolite.find('hmdb:description', namespace).text,
    }
    metabolites.append(data)

import pandas as pd
# Convert the data to a DataFrame
hmdb = pd.DataFrame(metabolites)  # Correctly create DataFrame from metabolites
hmdb.to_csv('hmdb_description.csv', index=False)  # Save DataFrame to CSV file

In [3]:
hmdb_description = pd.read_csv('hmdb_description.csv')  # Load the CSV file into a DataFrame
hmdb_description.head()  # Display the first few rows of the DataFrame

Unnamed: 0,accession,hmdb_description
0,HMDB0000001,"1-Methylhistidine, also known as 1-MHis or 1MH..."
1,HMDB0000002,"1,3-Diaminopropane, also known as DAP or trime..."
2,HMDB0000005,"2-Ketobutyric acid, also known as alpha-ketobu..."
3,HMDB0000008,"2-Hydroxybutyric acid (CAS: 600-15-7), also kn..."
4,HMDB0000010,2-Methoxyestrone (or 2-ME1) belongs to the cla...


In [4]:
bmg_hmdb = biomedgraphica_metabolite[['BioMedGraphica_ID', 'HMDB_ID']]
bmg_hmdb['HMDB_ID'] = bmg_hmdb['HMDB_ID'].str.split(';')
bmg_hmdb = bmg_hmdb.explode('HMDB_ID')

metabolite_description_hmdb = pd.merge(bmg_hmdb, hmdb_description, left_on='HMDB_ID', right_on='accession', how='left')
metabolite_description_hmdb.drop(columns=['accession', 'HMDB_ID'], inplace=True)
metabolite_description_hmdb.rename(columns={'hmdb_description':'HMDB'}, inplace=True)
metabolite_description_hmdb = metabolite_description_hmdb.groupby('BioMedGraphica_ID').agg({
    'HMDB': lambda x: ';'.join(x.dropna().unique())
}).reset_index()
metabolite_description_hmdb.replace('', pd.NA, inplace=True)
metabolite_description_hmdb

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bmg_hmdb['HMDB_ID'] = bmg_hmdb['HMDB_ID'].str.split(';')


Unnamed: 0,BioMedGraphica_ID,HMDB
0,BMG_MT000001,"1-Methylhistidine, also known as 1-MHis or 1MH..."
1,BMG_MT000002,"1,3-Diaminopropane, also known as DAP or trime..."
2,BMG_MT000003,"2-Ketobutyric acid, also known as alpha-ketobu..."
3,BMG_MT000004,"2-Hydroxybutyric acid (CAS: 600-15-7), also kn..."
4,BMG_MT000005,2-Methoxyestrone (or 2-ME1) belongs to the cla...
...,...,...
218330,BMG_MT218331,
218331,BMG_MT218332,
218332,BMG_MT218333,
218333,BMG_MT218334,


### 5.2 From ChEBI

In [4]:
import csv

def parse_obo_to_csv(obo_file_path, csv_file_path):
    with open(obo_file_path, 'r') as obo_file:
        lines = obo_file.readlines()

    terms = []
    current_term = {}
    is_in_term_block = False

    for line in lines:
        line = line.strip()

        if line == "[Term]":
            # Save the previous term if it exists
            if current_term:
                terms.append(current_term)
            # Start a new term
            current_term = {}
            is_in_term_block = True
        elif is_in_term_block and line == "":
            # End of the current term block
            if current_term:
                terms.append(current_term)
            current_term = {}
            is_in_term_block = False
        elif is_in_term_block:
            # Parse lines within a term block
            if line.startswith("id: "):
                current_term['id'] = line.split("id: ")[1]
            elif line.startswith("def: "):
                current_term['def'] = line.split("def: ")[1]

    # Add the last term if it exists
    if current_term:
        terms.append(current_term)

    # Write to CSV
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=["id", "def"])
        writer.writeheader()
        for term in terms:
            writer.writerow(term)

obo_file = "chebi_lite.obo"
csv_file = "chebi_def.csv"

parse_obo_to_csv(obo_file, csv_file)

print(f"Finished: {csv_file}")

Finished: chebi_def.csv


In [5]:
chebi_def = pd.read_csv('chebi_def.csv', dtype=str)  # Load the CSV file into a DataFrame
chebi_def["def"] = chebi_def["def"].str.replace('"', '', regex=False).str.replace(r'\[.*?\]', '', regex=True).str.strip()
chebi_def['id'] = chebi_def['id'].str.replace('CHEBI:', '')
chebi_def

Unnamed: 0,id,def
0,24431,A chemical entity is a physical entity of inte...
1,23367,Any constitutionally or isotopically distinct ...
2,24870,A molecular entity having a net electric charge.
3,24867,
4,23905,
...,...,...
202204,196964,
202205,191404,
202206,189822,
202207,187876,


In [6]:
bmg_chebi = biomedgraphica_metabolite[['BioMedGraphica_ID', 'ChEBI_ID']]
bmg_chebi['ChEBI_ID'] = bmg_chebi['ChEBI_ID'].str.split(';')
bmg_chebi = bmg_chebi.explode('ChEBI_ID')
bmg_chebi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bmg_chebi['ChEBI_ID'] = bmg_chebi['ChEBI_ID'].str.split(';')


Unnamed: 0,BioMedGraphica_ID,ChEBI_ID
0,BMG_MT000001,50599
1,BMG_MT000002,15725
2,BMG_MT000003,30831
2,BMG_MT000003,16763
3,BMG_MT000004,1148
...,...,...
218330,BMG_MT218331,91247
218331,BMG_MT218332,91248
218332,BMG_MT218333,91251
218333,BMG_MT218334,91272


In [8]:
metabolite_description_chebi = pd.merge(bmg_chebi, chebi_def, left_on='ChEBI_ID', right_on='id', how='left')
metabolite_description_chebi.drop(columns=['id', 'ChEBI_ID'], inplace=True)
metabolite_description_chebi.rename(columns={'def':'ChEBI'}, inplace=True)
metabolite_description_chebi = metabolite_description_chebi.groupby('BioMedGraphica_ID').agg({
    'ChEBI': lambda x: ' | '.join(x.dropna().unique())
}).reset_index()
metabolite_description_chebi.replace('', pd.NA, inplace=True)
metabolite_description_chebi

Unnamed: 0,BioMedGraphica_ID,ChEBI
0,BMG_MT000001,A L-histidine derivative in which the methyl g...
1,BMG_MT000002,"An alkane-alpha,omega-diamine comprising a pro..."
2,BMG_MT000003,A 2-oxo monocarboxylic acid that is the 2-oxo ...
3,BMG_MT000004,A hydroxybutyric acid having a single hydroxyl...
4,BMG_MT000005,A 17-oxo steroid that is estrone in which the ...
...,...,...
218330,BMG_MT218331,A hydrochloride obtained by combining L-cystei...
218331,BMG_MT218332,A hydrate that is the monohydrate form of L-cy...
218332,BMG_MT218333,An organic sodium salt that is the monosodium ...
218333,BMG_MT218334,An HPODE (hydroperoxyoctadecadienoic acid) in ...


### 5.3 Final Description

In [9]:
metabolite_description = pd.merge(metabolite_description_hmdb, metabolite_description_chebi, on='BioMedGraphica_ID', how='outer')
metabolite_description

Unnamed: 0,BioMedGraphica_ID,HMDB,ChEBI
0,BMG_MT000001,"1-Methylhistidine, also known as 1-MHis or 1MH...",A L-histidine derivative in which the methyl g...
1,BMG_MT000002,"1,3-Diaminopropane, also known as DAP or trime...","An alkane-alpha,omega-diamine comprising a pro..."
2,BMG_MT000003,"2-Ketobutyric acid, also known as alpha-ketobu...",A 2-oxo monocarboxylic acid that is the 2-oxo ...
3,BMG_MT000004,"2-Hydroxybutyric acid (CAS: 600-15-7), also kn...",A hydroxybutyric acid having a single hydroxyl...
4,BMG_MT000005,2-Methoxyestrone (or 2-ME1) belongs to the cla...,A 17-oxo steroid that is estrone in which the ...
...,...,...,...
218330,BMG_MT218331,,A hydrochloride obtained by combining L-cystei...
218331,BMG_MT218332,,A hydrate that is the monohydrate form of L-cy...
218332,BMG_MT218333,,An organic sodium salt that is the monosodium ...
218333,BMG_MT218334,,An HPODE (hydroperoxyoctadecadienoic acid) in ...


In [10]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Metabolite'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Metabolite_Description.csv'
metabolite_description.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Metabolite\BioMedGraphica_Metabolite_Description.csv


### 5.4 Combined Description

In [11]:
comb_description = metabolite_description.copy()

# add the column name at the beginning of the string
# first, we need to get the column names
column_names = comb_description.columns.tolist()
column_names = [col for col in column_names if col != 'BioMedGraphica_ID']
# then we can apply the function to each column
for col in column_names:
    comb_description[col] = comb_description[col].apply(lambda x: ' | '.join([f"{col}: {i}" for i in x.split(' | ')]) if pd.notna(x) else x)

# now we can merge the columns into one
comb_description['Description'] = comb_description[column_names].apply(lambda x: ' | '.join(x.dropna()), axis=1)
comb_description = comb_description[['BioMedGraphica_ID', 'Description']]
comb_description

Unnamed: 0,BioMedGraphica_ID,Description
0,BMG_MT000001,"HMDB: 1-Methylhistidine, also known as 1-MHis ..."
1,BMG_MT000002,"HMDB: 1,3-Diaminopropane, also known as DAP or..."
2,BMG_MT000003,"HMDB: 2-Ketobutyric acid, also known as alpha-..."
3,BMG_MT000004,"HMDB: 2-Hydroxybutyric acid (CAS: 600-15-7), a..."
4,BMG_MT000005,HMDB: 2-Methoxyestrone (or 2-ME1) belongs to t...
...,...,...
218330,BMG_MT218331,ChEBI: A hydrochloride obtained by combining L...
218331,BMG_MT218332,ChEBI: A hydrate that is the monohydrate form ...
218332,BMG_MT218333,ChEBI: An organic sodium salt that is the mono...
218333,BMG_MT218334,ChEBI: An HPODE (hydroperoxyoctadecadienoic ac...


In [12]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Metabolite'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Metabolite_Description_Combined.csv'
comb_description.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Metabolite\BioMedGraphica_Metabolite_Description_Combined.csv


## 6. File Generation

In [2]:
import pandas as pd
import os
from pathlib import Path

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Metabolite' / 'BioMedGraphica_Metabolite.csv'
biomedgraphica_metabolite = pd.read_csv(target_dir, dtype=str)

### 6.1 BioChem

In [13]:
biochem = biomedgraphica_metabolite[['BioMedGraphica_ID','SMILES','InChI','InChIKey']]
biochem

Unnamed: 0,BioMedGraphica_ID,SMILES,InChI,InChIKey
0,BMG_MT000001,CN1C=NC(C[C@H](N)C(O)=O)=C1,InChI=1S/C7H11N3O2/c1-10-3-5(9-4-10)2-6(8)7(11...,BRMWTNUJHUMWMS-LURJTMIESA-N
1,BMG_MT000002,NCCCN,InChI=1S/C3H10N2/c4-2-1-3-5/h1-5H2,XFNJVJPLKCPIBV-UHFFFAOYSA-N
2,BMG_MT000003,CCC(=O)C(O)=O,"InChI=1S/C4H6O3/c1-2-3(5)4(6)7/h2H2,1H3,(H,6,7)",TYEYBOSBBBHJIV-UHFFFAOYSA-N
3,BMG_MT000004,CC[C@H](O)C(O)=O,"InChI=1S/C4H8O3/c1-2-3(5)4(6)7/h3,5H,2H2,1H3,(...",AFENDNXGAFYKQO-VKHMYHEASA-N
4,BMG_MT000005,[H][C@@]12CCC(=O)[C@@]1(C)CC[C@]1([H])C3=C(CC[...,InChI=1S/C19H24O3/c1-19-8-7-12-13(15(19)5-6-18...,WHEUWNKSCXYKBU-QPWUGHHJSA-N
...,...,...,...,...
218330,BMG_MT218331,,,
218331,BMG_MT218332,,,
218332,BMG_MT218333,,,
218333,BMG_MT218334,,,


In [14]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Metabolite'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Metabolite_BioChem.csv'
biochem.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Metabolite\BioMedGraphica_Metabolite_BioChem.csv


### 6.2 Name and ID

GUI Name

In [15]:
def merge_string_columns(df, columns, merge_name, separator=' | '):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

gui_name = biomedgraphica_metabolite.copy()
gui_name = merge_string_columns(gui_name, ['HMDB_Name', 'ChEBI_Name', 'IUPAC_Name'], 'Metabolite_Name_List')
gui_name = gui_name[['BioMedGraphica_ID', 'Metabolite_Name_List']]
gui_name

Unnamed: 0,BioMedGraphica_ID,Metabolite_Name_List
0,BMG_MT000001,1-Methylhistidine | N(tele)-methyl-L-histidine...
1,BMG_MT000002,"1,3-Diaminopropane | propane-1,3-diamine | tri..."
2,BMG_MT000003,2-oxobutanoic acid | 2-Ketobutyric acid | 2-ox...
3,BMG_MT000004,2-hydroxybutyric acid | (2S)-2-hydroxybutanoic...
4,BMG_MT000005,"2-methoxyestrone | (1S,10R,11S,15S)-5-hydroxy-..."
...,...,...
218330,BMG_MT218331,L-cysteine hydrochloride
218331,BMG_MT218332,L-cysteine hydrochloride hydrate
218332,BMG_MT218333,sodium glyoxylate
218333,BMG_MT218334,13-HPODE


In [16]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Metabolite'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Metabolite_GUI_Name.csv'
gui_name.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Metabolite\BioMedGraphica_Metabolite_GUI_Name.csv


LLM Name and ID

In [17]:
llm_name_id = biomedgraphica_metabolite.copy()

llm_name_id['HMDB_ID'] = llm_name_id['HMDB_ID'].apply(
    lambda x: ' | '.join(f"HMDB ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)

llm_name_id['PubChem_CID'] = llm_name_id['PubChem_CID'].apply(
    lambda x: ' | '.join(f"PubChem CID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)

llm_name_id['CAS_RN'] = llm_name_id['CAS_RN'].apply(
    lambda x: ' | '.join(f"CAS RN:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)

llm_name_id['ChemSpider_ID'] = llm_name_id['ChemSpider_ID'].apply(
    lambda x: ' | '.join(f"ChemSpider ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)

llm_name_id['PDB_ID'] = llm_name_id['PDB_ID'].apply(
    lambda x: ' | '.join(f"PDB ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)

llm_name_id['ChEBI_ID'] = llm_name_id['ChEBI_ID'].apply(
    lambda x: ' | '.join(f"ChEBI ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)

llm_name_id['KEGG_ID'] = llm_name_id['KEGG_ID'].apply(
    lambda x: ' | '.join(f"KEGG ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)

column_order = ['BioMedGraphica_ID', 'IUPAC_Name', 'HMDB_Name', 'HMDB_ID', 'PubChem_CID', 'ChemSpider_ID', 'CAS_RN', 'ChEBI_Name', 'ChEBI_ID', 'KEGG_ID', 'PDB_ID']
llm_name_id = llm_name_id[column_order]
llm_name_id

Unnamed: 0,BioMedGraphica_ID,IUPAC_Name,HMDB_Name,HMDB_ID,PubChem_CID,ChemSpider_ID,CAS_RN,ChEBI_Name,ChEBI_ID,KEGG_ID,PDB_ID
0,BMG_MT000001,(2S)-2-amino-3-(1-methyl-1H-imidazol-4-yl)prop...,1-Methylhistidine,HMDB ID:HMDB0000001,PubChem CID:92105,ChemSpider ID:83153,CAS RN:332-80-9,N(tele)-methyl-L-histidine,ChEBI ID:50599,KEGG ID:C01152,
1,BMG_MT000002,"propane-1,3-diamine","1,3-Diaminopropane",HMDB ID:HMDB0000002,PubChem CID:428,ChemSpider ID:415,CAS RN:109-76-2,trimethylenediamine,ChEBI ID:15725,KEGG ID:C00986,
2,BMG_MT000003,2-oxobutanoic acid,2-Ketobutyric acid,HMDB ID:HMDB0000005,PubChem CID:58,ChemSpider ID:57,CAS RN:600-18-0,2-oxobutanoate,ChEBI ID:30831 | ChEBI ID:16763,KEGG ID:C00109,
3,BMG_MT000004,(2S)-2-hydroxybutanoic acid,2-Hydroxybutyric acid,HMDB ID:HMDB0000008,PubChem CID:440864,ChemSpider ID:389701,CAS RN:600-15-7 | CAS RN:3347-90-8,2-hydroxybutyric acid,ChEBI ID:1148 | ChEBI ID:50613,KEGG ID:C05984,
4,BMG_MT000005,"(1S,10R,11S,15S)-5-hydroxy-4-methoxy-15-methyl...",2-Methoxyestrone,HMDB ID:HMDB0000010,PubChem CID:440624,ChemSpider ID:389515,CAS RN:362-08-3,2-methoxyestrone,ChEBI ID:1189,KEGG ID:C05299,
...,...,...,...,...,...,...,...,...,...,...,...
218330,BMG_MT218331,,,,,,CAS RN:52-89-1,L-cysteine hydrochloride,ChEBI ID:91247,,
218331,BMG_MT218332,,,,,,CAS RN:7048-04-6,L-cysteine hydrochloride hydrate,ChEBI ID:91248,,
218332,BMG_MT218333,,,,,,CAS RN:2706-75-4,sodium glyoxylate,ChEBI ID:91251,,
218333,BMG_MT218334,,,,,,CAS RN:23017-93-8,13-HPODE,ChEBI ID:91272,,


In [18]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Metabolite'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Metabolite_LLM_Name_ID.csv'
llm_name_id.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Metabolite\BioMedGraphica_Metabolite_LLM_Name_ID.csv


LLM Name and ID Combined

In [19]:
llm_combined = llm_name_id.copy()

def merge_string_columns(df, columns, merge_name, separator=' | '):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

llm_combined = merge_string_columns(llm_combined, llm_combined.columns[llm_combined.columns != 'BioMedGraphica_ID'], 'Names_and_IDs')
llm_combined

Unnamed: 0,BioMedGraphica_ID,Names_and_IDs
0,BMG_MT000001,(2S)-2-amino-3-(1-methyl-1H-imidazol-4-yl)prop...
1,BMG_MT000002,PubChem CID:428 | KEGG ID:C00986 | ChEBI ID:15...
2,BMG_MT000003,KEGG ID:C00109 | 2-oxobutanoic acid | 2-Ketobu...
3,BMG_MT000004,HMDB ID:HMDB0000008 | ChEBI ID:1148 | ChEBI ID...
4,BMG_MT000005,HMDB ID:HMDB0000010 | PubChem CID:440624 | KEG...
...,...,...
218330,BMG_MT218331,ChEBI ID:91247 | L-cysteine hydrochloride | CA...
218331,BMG_MT218332,CAS RN:7048-04-6 | L-cysteine hydrochloride hy...
218332,BMG_MT218333,ChEBI ID:91251 | sodium glyoxylate | CAS RN:27...
218333,BMG_MT218334,13-HPODE | CAS RN:23017-93-8 | ChEBI ID:91272


In [20]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Metabolite'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Metabolite_LLM_Name_ID_Combined.csv'
llm_combined.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Metabolite\BioMedGraphica_Metabolite_LLM_Name_ID_Combined.csv


Display Name

In [21]:
display_name = biomedgraphica_metabolite.copy()

display_name['BMG_Metabolite_Name'] = display_name['HMDB_Name'].fillna(display_name['IUPAC_Name']).fillna(display_name['ChEBI_Name'])
display_name = display_name[['BioMedGraphica_ID', 'BMG_Metabolite_Name']]
display_name

Unnamed: 0,BioMedGraphica_ID,BMG_Metabolite_Name
0,BMG_MT000001,1-Methylhistidine
1,BMG_MT000002,"1,3-Diaminopropane"
2,BMG_MT000003,2-Ketobutyric acid
3,BMG_MT000004,2-Hydroxybutyric acid
4,BMG_MT000005,2-Methoxyestrone
...,...,...
218330,BMG_MT218331,L-cysteine hydrochloride
218331,BMG_MT218332,L-cysteine hydrochloride hydrate
218332,BMG_MT218333,sodium glyoxylate
218333,BMG_MT218334,13-HPODE


In [22]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Metabolite'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Metabolite_Display_Name.csv'
display_name.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Metabolite\BioMedGraphica_Metabolite_Display_Name.csv
