# Example 2: Metabolite workflow

This notebook provides insights on how to use the tool if you have list of metabolites.

# Import modules

In [1]:
import pandas as pd

from pyBiodatafuse import id_mapper
from pyBiodatafuse.analyzer.summarize import BioGraph
from pyBiodatafuse.annotators import molmedb, kegg
from pyBiodatafuse.graph import saver
from pyBiodatafuse.utils import combine_sources, create_or_append_to_metadata

# Load list of metabolites/chemicals/compounds

In [2]:
metabolites_of_interest = """100208
10040286
10041551
10025195
5291
6030
1172
1060
"""

metabolite_list = metabolites_of_interest.split("\n")
len(metabolite_list)

9

In [3]:
data_input = pd.DataFrame(metabolite_list, columns=["identifier"])
data_input.head()

Unnamed: 0,identifier
0,100208
1,10040286
2,10041551
3,10025195
4,5291


### Entity resolution using BridgeDB

In [4]:
bridgedb_df, bridgdb_metadata = id_mapper.bridgedb_xref(
    identifiers=data_input,
    input_species="Human",
    input_datasource="PubChem Compound",
    output_datasource="All",
)
bridgedb_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source
0,100208,PubChem-compound,90560,ChemSpider
1,100208,PubChem-compound,100208,PubChem Compound
2,100208,PubChem-compound,HMDB0244377,HMDB
3,100208,PubChem-compound,OFDNQWIFNXBECV-UHFFFAOYSA-N,InChIKey
4,100208,PubChem-compound,C11280,KEGG Compound


### Transporter inhibited from MolMeDB

In [5]:
(
    molmedb_transporter_inhibited_df,
    molmedb_transporter_inhibited_metadata,
) = molmedb.get_compound_gene_inhibitor(bridgedb_df=bridgedb_df)
molmedb_transporter_inhibited_df.head()

  check_columns_against_constants(


Unnamed: 0,identifier,identifier.source,target,target.source,MolMeDB_transporter_inhibited
0,100208,PubChem-compound,OFDNQWIFNXBECV-UHFFFAOYSA-N,InChIKey,[{'MolMeDB_uniprot_trembl_id': 'Uniprot-TrEMBL...
1,10025195,PubChem-compound,LEJRLSZVESQKJK-UHFFFAOYSA-N,InChIKey,"[{'MolMeDB_uniprot_trembl_id': nan, 'MolMeDB_h..."
2,10040286,PubChem-compound,FYGREZKTJIXWIH-UHFFFAOYSA-N,InChIKey,"[{'MolMeDB_uniprot_trembl_id': nan, 'MolMeDB_h..."
3,10041551,PubChem-compound,OVVBIIBBRZVPAL-UHFFFAOYSA-N,InChIKey,[{'MolMeDB_uniprot_trembl_id': 'Uniprot-TrEMBL...
4,1060,PubChem-compound,LCTONWCANYUPML-UHFFFAOYSA-N,InChIKey,"[{'MolMeDB_uniprot_trembl_id': nan, 'MolMeDB_h..."


### Pathways from KEGG

In [6]:
(
    kegg_df,
    kegg_metadata,
) = kegg.get_pathways(bridgedb_df=bridgedb_df)
kegg_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,KEGG_pathways
0,100208,PubChem-compound,C11280,KEGG Compound,"[{'pathway_id': nan, 'pathway_label': nan, 'pa..."
1,6030,PubChem-compound,C00105,KEGG Compound,"[{'pathway_id': 'path:hsa00240', 'pathway_labe..."
2,1172,PubChem-compound,C00105,KEGG Compound,"[{'pathway_id': 'path:hsa00240', 'pathway_labe..."
3,1060,PubChem-compound,C00022,KEGG Compound,"[{'pathway_id': 'path:hsa00010', 'pathway_labe..."


In [11]:
kegg_df["KEGG_pathways"][1]

[{'pathway_id': 'path:hsa00240',
  'pathway_label': 'Pyrimidine metabolism - Homo sapiens (human)',
  'pathway_compound_counts': 66,
  'pathway_genes': [{'KEGG_id': '10201'},
   {'KEGG_id': '115024'},
   {'KEGG_id': '124583'},
   {'KEGG_id': '129607'},
   {'KEGG_id': '1503'},
   {'KEGG_id': '151531'},
   {'KEGG_id': '1633'},
   {'KEGG_id': '1635'},
   {'KEGG_id': '1723'},
   {'KEGG_id': '1806'},
   {'KEGG_id': '1807'},
   {'KEGG_id': '1841'},
   {'KEGG_id': '1854'},
   {'KEGG_id': '1890'},
   {'KEGG_id': '221264'},
   {'KEGG_id': '22978'},
   {'KEGG_id': '284958'},
   {'KEGG_id': '29922'},
   {'KEGG_id': '30833'},
   {'KEGG_id': '318'},
   {'KEGG_id': '377841'},
   {'KEGG_id': '4830'},
   {'KEGG_id': '4831'},
   {'KEGG_id': '4832'},
   {'KEGG_id': '4833'},
   {'KEGG_id': '4907'},
   {'KEGG_id': '50484'},
   {'KEGG_id': '51020'},
   {'KEGG_id': '51251'},
   {'KEGG_id': '5167'},
   {'KEGG_id': '5169'},
   {'KEGG_id': '51727'},
   {'KEGG_id': '51733'},
   {'KEGG_id': '54963'},
   {'KEGG_i

# Create the BDF graph

In [12]:
combined_df = combine_sources(
    bridgedb_df, df_list=[kegg_df, molmedb_transporter_inhibited_df]
)
combined_df

Unnamed: 0,identifier,identifier.source,target,target.source,KEGG_pathways,MolMeDB_transporter_inhibited
0,100208,PubChem-compound,100208,PubChem Compound,"[{'pathway_id': nan, 'pathway_label': nan, 'pa...",[{'MolMeDB_uniprot_trembl_id': 'Uniprot-TrEMBL...
1,10040286,PubChem-compound,10040286,PubChem Compound,,"[{'MolMeDB_uniprot_trembl_id': nan, 'MolMeDB_h..."
2,10041551,PubChem-compound,10041551,PubChem Compound,,[{'MolMeDB_uniprot_trembl_id': 'Uniprot-TrEMBL...
3,10025195,PubChem-compound,10025195,PubChem Compound,,"[{'MolMeDB_uniprot_trembl_id': nan, 'MolMeDB_h..."
4,5291,PubChem-compound,5291,PubChem Compound,,[{'MolMeDB_uniprot_trembl_id': 'Uniprot-TrEMBL...
5,6030,PubChem-compound,6030,PubChem Compound,"[{'pathway_id': 'path:hsa00240', 'pathway_labe...","[{'MolMeDB_uniprot_trembl_id': nan, 'MolMeDB_h..."
6,1172,PubChem-compound,1172,PubChem Compound,"[{'pathway_id': 'path:hsa00240', 'pathway_labe...","[{'MolMeDB_uniprot_trembl_id': nan, 'MolMeDB_h..."
7,1060,PubChem-compound,1060,PubChem Compound,"[{'pathway_id': 'path:hsa00010', 'pathway_labe...","[{'MolMeDB_uniprot_trembl_id': nan, 'MolMeDB_h..."


In [13]:
combined_metadata = create_or_append_to_metadata(
    bridgdb_metadata, [kegg_metadata, molmedb_transporter_inhibited_metadata]
)

In [14]:
combined_metadata

[{'datasource': 'KEGG',
  'metadata': {'source_version': '115.0+/07-11'},
  'query': {'size': 3,
   'input_type': 'KEGG Compound',
   'number_of_added_edges': 41,
   'number_of_added_nodes': 39,
   'time': '0:00:41.443987',
   'date': '2025-07-11 18:36:35',
   'url': 'https://rest.kegg.jp'}},
 {'datasource': 'MolMeDB',
  'query': {'size': 239,
   'input_type': 'InChIKey',
   'time': '0:00:00.153252',
   'date': '2025-07-11 18:35:45',
   'url': 'https://idsm.elixir-czech.cz/sparql/endpoint/molmedb',
   'number_of_added_nodes': MolMeDB_uniprot_trembl_id    5
   MolMeDB_uniprot_trembl_id    5
   dtype: int64,
   'number_of_added_edges': 5}},
 {'datasource': 'BridgeDB',
  'metadata': {'source_version': {'java.version': '11.0.16',
    'bridgedb.version': '3.0.25',
    'webservice.version': '2.1.7'},
   'data_version': ['DATASOURCENAME: Ensembl',
    'BUILDDATE: 20230311',
    'SERIES: Homo sapiens genes and proteins',
    'DATATYPE: GeneProduct',
    'DATASOURCEVERSION: 108',
    'SCHEMAVER

# Saving the graph

In [15]:
import os

DATA_DIR = "./data/metabolite_workflow"
os.makedirs(DATA_DIR, exist_ok=True)

In [16]:
pygraph = saver.save_graph(
    combined_df=combined_df,
    combined_metadata=combined_metadata,
    graph_name="metabolite_workflow",
    graph_dir=DATA_DIR,
)

Combined DataFrame saved in ./data/metabolite_workflow/metabolite_workflow_df.pkl
Metadata saved in ./data/metabolite_workflow/metabolite_workflow_metadata.pkl
Building graph: 0it [00:00, ?it/s]
Graph is built successfully
Graph saved in: 
 ./data/metabolite_workflow/metabolite_workflow_graph.pkl 
 ./data/metabolite_workflow/metabolite_workflow_graph.gml
Graph saved in ./data/metabolite_workflow/metabolite_workflow_graph.edgelist


In [17]:
print(pygraph)

MultiDiGraph with 0 nodes and 0 edges


# Graph statistics

In [None]:
graph_obj = BioGraph(graph=pygraph)
graph_obj.graph_summary

In [None]:
graph_obj.count_nodes_by_data_source(plot=True)

In [None]:
graph_obj.count_edge_by_data_source(plot=True)

### To check the KEGG for gene input (@Delano, pathway gene count is not accurate)

In [25]:
genes_of_interest = """7350
6198
1499
6528
6714
10000
10891
6194
7068
4193
3709
"""

gene_list = genes_of_interest.split("\n")
data_input = pd.DataFrame(gene_list, columns=["identifier"])
data_input.head()

Unnamed: 0,identifier
0,7350
1,6198
2,1499
3,6528
4,6714


In [26]:
bridgedb_df, bridgdb_metadata = id_mapper.bridgedb_xref(
    identifiers=data_input,
    input_species="Human",
    input_datasource="NCBI Gene",
    output_datasource="All",
)
bridgedb_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source
0,7350,Entrez Gene,2787089,Affy
1,7350,Entrez Gene,8102904,Affy
2,7350,Entrez Gene,2787088,Affy
3,7350,Entrez Gene,GO:0071398,Gene Ontology
4,7350,Entrez Gene,2787087,Affy


In [27]:
print("Number of genes with mapping in BridgeDb:", len(bridgedb_df["identifier"].unique()))
bridgedb_df.head()

Number of genes with mapping in BridgeDb: 11


Unnamed: 0,identifier,identifier.source,target,target.source
0,7350,Entrez Gene,2787089,Affy
1,7350,Entrez Gene,8102904,Affy
2,7350,Entrez Gene,2787088,Affy
3,7350,Entrez Gene,GO:0071398,Gene Ontology
4,7350,Entrez Gene,2787087,Affy


In [28]:
(
    kegg_df,
    kegg_metadata,
) = kegg.get_pathways(bridgedb_df=bridgedb_df)
kegg_df.head()

Getting KEGG IDs: 100%|██████████| 2/2 [00:06<00:00,  3.48s/it]


Unnamed: 0,identifier,identifier.source,target,target.source,KEGG_pathways
0,7350,Entrez Gene,7350,NCBI Gene,"[{'pathway_id': 'path:hsa03320', 'pathway_labe..."
1,6198,Entrez Gene,6198,NCBI Gene,"[{'pathway_id': 'path:hsa01521', 'pathway_labe..."
2,1499,Entrez Gene,1499,NCBI Gene,"[{'pathway_id': 'path:hsa04015', 'pathway_labe..."
3,6528,Entrez Gene,6528,NCBI Gene,"[{'pathway_id': 'path:hsa04918', 'pathway_labe..."
4,6714,Entrez Gene,6714,NCBI Gene,"[{'pathway_id': 'path:hsa01521', 'pathway_labe..."


In [29]:
kegg_df["KEGG_pathways"][1]

[{'pathway_id': 'path:hsa01521',
  'pathway_label': 'EGFR tyrosine kinase inhibitor resistance - Homo sapiens (human)',
  'pathway_gene_counts': 0,
  'pathway_compounds': [{'KEGG_id': None}]},
 {'pathway_id': 'path:hsa01522',
  'pathway_label': 'Endocrine resistance - Homo sapiens (human)',
  'pathway_gene_counts': 0,
  'pathway_compounds': [{'KEGG_id': None}]},
 {'pathway_id': 'path:hsa04012',
  'pathway_label': 'ErbB signaling pathway - Homo sapiens (human)',
  'pathway_gene_counts': 85,
  'pathway_compounds': [{'KEGG_id': 'C00076'},
   {'KEGG_id': 'C00165'},
   {'KEGG_id': 'C01245'},
   {'KEGG_id': 'C05981'}]},
 {'pathway_id': 'path:hsa04066',
  'pathway_label': 'HIF-1 signaling pathway - Homo sapiens (human)',
  'pathway_gene_counts': 109,
  'pathway_compounds': [{'KEGG_id': 'C00002'},
   {'KEGG_id': 'C00007'},
   {'KEGG_id': 'C00022'},
   {'KEGG_id': 'C00024'},
   {'KEGG_id': 'C00026'},
   {'KEGG_id': 'C00031'},
   {'KEGG_id': 'C00072'},
   {'KEGG_id': 'C00076'},
   {'KEGG_id': 'C