# Example: CompoundWiki

In [35]:
import os
import sys

project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
src_path = os.path.join(project_root, "src")

if src_path not in sys.path:
    sys.path.insert(0, src_path)

print("PYTHONPATH:", sys.path[0])

PYTHONPATH: c:\Users\are10\Documents\BAFSTU\code\GitHub\pyBiodatafuse\src


In [36]:
# Import modules
import pandas as pd

from pyBiodatafuse import id_mapper
from pyBiodatafuse.annotators import intact, kegg, compoundwiki
from pyBiodatafuse.graph import generator
from pyBiodatafuse.utils import combine_sources

# Load list of metabolites/chemicals/compounds

In [None]:
metabolites_of_interest = """2504
15996
17368
16526
31742
63596
28499
35718
25212
78675
35107
"""

metabolite_list = metabolites_of_interest.split("\n")
len(metabolite_list)

12

In [38]:
data_input = pd.DataFrame(metabolite_list, columns=["identifier"])
data_input.head()

Unnamed: 0,identifier
0,2157
1,15996
2,17368
3,16526
4,31742


### Entity resolution using BridgeDB

In [39]:
bridgedb_compound_df, bridgdb_metadata = id_mapper.bridgedb_xref(
    identifiers=data_input,
    input_species="Human",
    input_datasource="ChEBI",
    output_datasource="All",
)
bridgedb_compound_df.head(10)

Unnamed: 0,identifier,identifier.source,target,target.source
0,15996,ChEBI,6830,PubChem Compound
1,15996,ChEBI,C00044,KEGG Compound
2,15996,ChEBI,Q392227,Wikidata
3,15996,ChEBI,DB04137,DrugBank
4,15996,ChEBI,6569,ChemSpider
5,15996,ChEBI,86-01-1,CAS
6,15996,ChEBI,CHEMBL1233147,ChEMBL compound
7,15996,ChEBI,HMDB0001273,HMDB
8,15996,ChEBI,CHEBI:15996,ChEBI
9,15996,ChEBI,XKMLYUALXHKNFT-UUOKFMHZSA-N,InChIKey


### Compound Interactions from IntAct

In [25]:
intact_compound_df, intact_compound_metadata = intact.get_compound_interactions(
    bridgedb_compound_df, interaction_type="both"
)
intact_compound_df.head(10)

Querying IntAct for compounds: 100%|██████████| 1/1 [00:12<00:00, 12.50s/it]


Unnamed: 0,identifier,identifier.source,target,target.source,IntAct_compound_interactions
0,15996,ChEBI,CHEBI:15996,ChEBI,"[{'interaction_id': 'EBI-1557199', 'interactor..."
1,17368,ChEBI,CHEBI:17368,ChEBI,"[{'interaction_id': nan, 'interactor_id_A': na..."
2,16526,ChEBI,CHEBI:16526,ChEBI,"[{'interaction_id': nan, 'interactor_id_A': na..."
3,31742,ChEBI,CHEBI:31742,ChEBI,"[{'interaction_id': nan, 'interactor_id_A': na..."
4,63596,ChEBI,CHEBI:63596,ChEBI,"[{'interaction_id': nan, 'interactor_id_A': na..."
5,28499,ChEBI,CHEBI:28499,ChEBI,"[{'interaction_id': nan, 'interactor_id_A': na..."
6,25212,ChEBI,CHEBI:25212,ChEBI,"[{'interaction_id': nan, 'interactor_id_A': na..."
7,35107,ChEBI,CHEBI:35107,ChEBI,"[{'interaction_id': nan, 'interactor_id_A': na..."


In [26]:
intact_compound_df["IntAct_compound_interactions"][0]

[{'interaction_id': 'EBI-1557199',
  'interactor_id_A': 'EBI-722284',
  'interactor_id_B': 'EBI-989053',
  'score': 0.62,
  'biological_role_A': 'enzyme',
  'biological_role_B': 'enzyme target',
  'type': 'enzymatic reaction',
  'detection_method': 'gtpase assay',
  'host_organism': 'In vitro',
  'interactor_A_name': 'rab4a_human',
  'interactor_B_name': 'gtp',
  'interactor_A_species': 'Homo sapiens',
  'interactor_B_species': 'Chemical synthesis (Chemical synthesis)',
  'molecule_A': 'RAB4A',
  'molecule_B': 'gtp',
  'id_A': 'uniprotkb:P20338',
  'id_B': 'CHEBI:15996',
  'pubmed_publication_id': '15377662',
  'intact_link_to': 'EBI-722284'},
 {'interaction_id': 'EBI-22225605',
  'interactor_id_A': 'EBI-9837586',
  'interactor_id_B': 'EBI-989053',
  'score': 0.54,
  'biological_role_A': 'unspecified role',
  'biological_role_B': 'cofactor',
  'type': 'direct interaction',
  'detection_method': 'pull down',
  'host_organism': 'In vitro',
  'interactor_A_name': 'rab32_human',
  'interac

In [27]:
combined_df = combine_sources(
    bridgedb_compound_df,
    [
        intact_compound_df, 
    ],
)

In [28]:
combined_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,IntAct_compound_interactions
0,15996,ChEBI,6830,PubChem Compound,"[{'interaction_id': 'EBI-1557199', 'interactor..."
1,17368,ChEBI,790,PubChem Compound,"[{'interaction_id': nan, 'interactor_id_A': na..."
2,16526,ChEBI,280,PubChem Compound,"[{'interaction_id': nan, 'interactor_id_A': na..."
3,31742,ChEBI,5282149,PubChem Compound,"[{'interaction_id': nan, 'interactor_id_A': na..."
4,63596,ChEBI,173576,PubChem Compound,"[{'interaction_id': nan, 'interactor_id_A': na..."


In [29]:
combined_df, compoundwiki_metadata = compoundwiki.get_compound_annotations(combined_df)

Processing IntAct column for compounds: IntAct_compound_interactions
Found CHEBI IDs in IntAct_compound_interactions: ['CHEBI:15996']
Querying BridgeDb for PubChem IDs from ChEBI...
Found 1 unique PubChem Compound IDs.
Running SPARQL query:
PREFIX wd: <https://compoundcloud.wikibase.cloud/entity/>
PREFIX wdt: <https://compoundcloud.wikibase.cloud/prop/direct/>
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?compound ?compoundLabel ?propEntity ?propLabel ?val ?valLabel WHERE {
  VALUES ?pubchem { "6830" }
  ?compound wdt:P13 ?pubchem .
  ?compound ?prop ?val .
  ?propEntity wikibase:directClaim ?prop .

  OPTIONAL { ?compound rdfs:label ?compoundLabel . FILTER (lang(?compoundLabel) = "en") }
  OPTIONAL { ?propEntity rdfs:label ?propLabel . FILTER (lang(?propLabel) = "en") }
  OPTIONAL { ?val rdfs:label ?valLabel . FILTER (lang(?valLabel) = "en") }
}
ORDER BY ?compound ?propLabel
A:  Empty DataFrame
Columns: []
Index: []
Receive

In [13]:
combined_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,IntAct_compound_interactions,KEGG_pathways
0,15361,ChEBI,107735,PubChem Compound,"[{'interaction_id': 'EBI-6621805', 'interactor...",
1,16393,ChEBI,5280335,PubChem Compound,"[{'interaction_id': 'EBI-26355556', 'interacto...","[{'pathway_id': 'path:hsa00600', 'pathway_labe..."
2,15996,ChEBI,6830,PubChem Compound,"[{'interaction_id': 'EBI-27066219', 'interacto...","[{'pathway_id': 'path:hsa00230', 'pathway_labe..."
3,17440,ChEBI,378,PubChem Compound,"[{'interaction_id': 'EBI-9691115', 'interactor...","[{'pathway_id': nan, 'pathway_label': nan, 'pa..."


In [14]:
combined_df['IntAct_compound_interactions'][1]

[{'interaction_id': 'EBI-26355556',
  'interactor_id_A': 'EBI-26355518',
  'interactor_id_B': 'EBI-26355518',
  'score': 0.4,
  'biological_role_A': 'unspecified role',
  'biological_role_B': 'competitor',
  'type': 'physical association',
  'detection_method': 'pull down',
  'host_organism': 'In vitro',
  'interactor_A_name': 'sphingosine',
  'interactor_B_name': 'sphingosine',
  'interactor_A_species': 'Chemical synthesis (Chemical synthesis)',
  'interactor_B_species': 'Chemical synthesis (Chemical synthesis)',
  'molecule_A': 'sphingosine',
  'molecule_B': 'sphingosine',
  'id_A': 'CHEBI:16393',
  'id_B': 'CHEBI:16393',
  'pubmed_publication_id': '32917722',
  'intact_link_to': '16393'},
 {'interaction_id': 'EBI-26355565',
  'interactor_id_A': 'EBI-26355518',
  'interactor_id_B': 'EBI-26355518',
  'score': 0.4,
  'biological_role_A': 'unspecified role',
  'biological_role_B': 'competitor',
  'type': 'physical association',
  'detection_method': 'pull down',
  'host_organism': 'In v

In [9]:
pygraph = generator.build_networkx_graph(combined_df)

Building graph: 100%|██████████| 2/2 [00:00<00:00, 969.56it/s]


In [10]:
combined_df.shape

(2, 5)

In [11]:
from pyBiodatafuse.graph import cytoscape

cytoscape.load_graph(pygraph, network_name="Test network")

Applying default style...
Applying preferred layout


In [3]:
genes_of_interest = """ENSMUSG00000026295
ENSMUSG00000022877
ENSMUSG00000020914
ENSMUSG00000024747
ENSMUSG00000032081
ENSMUSG00000004035
ENSMUSG00000072949
ENSMUSG00000028970
ENSMUSG00000028937
ENSMUSG00000075044
ENSMUSG00000067274
ENSMUSG00000000001
ENSMUSG00000030619
ENSMUSG00000027490
ENSMUSG00000022472
ENSMUSG00000059552"""

gene_list = genes_of_interest.split("\n")
len(gene_list)

16

In [4]:
data_input = pd.DataFrame(gene_list, columns=["identifier"])
data_input.head()

input_species = "Mouse"

bridgedb_df, bridgedb_metadata = id_mapper.bridgedb_xref(
    identifiers=data_input,
    input_species=input_species,
    input_datasource="Ensembl",
    output_datasource="All",
)

bridgedb_df.head(25)

Unnamed: 0,identifier,identifier.source,target,target.source
0,ENSMUSG00000026295,Ensembl,4434677,Affy
1,ENSMUSG00000026295,Ensembl,Q8K1I3,Uniprot-TrEMBL
2,ENSMUSG00000026295,Ensembl,A_51_P166152,Agilent
3,ENSMUSG00000026295,Ensembl,5314352,Affy
4,ENSMUSG00000026295,Ensembl,4638627,Affy
5,ENSMUSG00000026295,Ensembl,4315932,Affy
6,ENSMUSG00000026295,Ensembl,1418916_a_at,Affy
7,ENSMUSG00000026295,Ensembl,5031793,Affy
8,ENSMUSG00000026295,Ensembl,ENSMUSG00000026295,Ensembl
9,ENSMUSG00000026295,Ensembl,ILMN_2903540,Illumina


In [5]:
kegg_df, kegg_metadata = kegg.get_pathways(bridgedb_df)
kegg_df.head()

Getting KEGG IDs: 100%|██████████| 2/2 [00:09<00:00,  4.61s/it]


Unnamed: 0,identifier,identifier.source,target,target.source,KEGG_pathways
0,ENSMUSG00000026295,Ensembl,75396,NCBI Gene,"[{'pathway_id': nan, 'pathway_label': nan, 'pa..."
1,ENSMUSG00000022877,Ensembl,94175,NCBI Gene,"[{'pathway_id': nan, 'pathway_label': nan, 'pa..."
2,ENSMUSG00000020914,Ensembl,21973,NCBI Gene,"[{'pathway_id': 'path:mmu01524', 'pathway_labe..."
3,ENSMUSG00000024747,Ensembl,26358,NCBI Gene,"[{'pathway_id': 'path:mmu00830', 'pathway_labe..."
4,ENSMUSG00000032081,Ensembl,11814,NCBI Gene,"[{'pathway_id': 'path:mmu03320', 'pathway_labe..."


In [6]:
kegg_df["KEGG_pathways"][3]

[{'pathway_id': 'path:mmu00830',
  'pathway_label': 'Retinol metabolism - Mus musculus (house mouse)',
  'pathway_gene_counts': 101,
  'pathway_compounds': [{'KEGG_id': 'C00376'},
   {'KEGG_id': 'C00473'},
   {'KEGG_id': 'C00777'},
   {'KEGG_id': 'C00778'},
   {'KEGG_id': 'C00899'},
   {'KEGG_id': 'C02075'},
   {'KEGG_id': 'C02094'},
   {'KEGG_id': 'C02110'},
   {'KEGG_id': 'C02588'},
   {'KEGG_id': 'C03455'},
   {'KEGG_id': 'C05914'},
   {'KEGG_id': 'C05915'},
   {'KEGG_id': 'C05916'},
   {'KEGG_id': 'C05917'},
   {'KEGG_id': 'C11061'},
   {'KEGG_id': 'C15492'},
   {'KEGG_id': 'C15493'},
   {'KEGG_id': 'C16677'},
   {'KEGG_id': 'C16678'},
   {'KEGG_id': 'C16679'},
   {'KEGG_id': 'C16680'},
   {'KEGG_id': 'C16681'},
   {'KEGG_id': 'C16682'},
   {'KEGG_id': 'C16683'},
   {'KEGG_id': 'C21797'}]},
 {'pathway_id': 'path:mmu01100',
  'pathway_label': 'Metabolic pathways - Mus musculus (house mouse)',
  'pathway_gene_counts': 0,
  'pathway_compounds': [{'KEGG_id': None}]}]

In [6]:
compounds_of_interest ="""C15492
C00473
C21797
"""

metabolite_list = compounds_of_interest.split("\n")
data_input = pd.DataFrame(metabolite_list, columns=["identifier"])

bridgedb_df_cmp, bridgdb_metadata = id_mapper.bridgedb_xref(
    identifiers=data_input,
    input_species="Mouse",
    input_datasource="KEGG Compound",
    output_datasource="All",
)
bridgedb_df_cmp.head(25)

Unnamed: 0,identifier,identifier.source,target,target.source
0,C15492,KEGG Compound,HMDB11618,HMDB
1,C15492,KEGG Compound,Q27123154,Wikidata
2,C15492,KEGG Compound,394057,ChemSpider
3,C15492,KEGG Compound,HMDB0011618,HMDB
4,C15492,KEGG Compound,OVBOQVAIYMSUDT-HRYGCDPOSA-N,InChIKey
5,C15492,KEGG Compound,52075,ChEBI
6,C15492,KEGG Compound,CHEBI:52075,ChEBI
7,C15492,KEGG Compound,C15492,KEGG Compound
8,C15492,KEGG Compound,115797-14-3,CAS
9,C15492,KEGG Compound,LMPR01090047,LIPID MAPS


In [7]:
kegg_compound_df = kegg.get_compounds(bridgedb_df_cmp)
kegg_compound_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,KEGG_compounds
0,C15492,KEGG Compound,C15492,KEGG Compound,"{'KEGG_id': 'C15492', 'KEGG_compound_name': 'a..."
1,C00473,KEGG Compound,C00473,KEGG Compound,"{'KEGG_id': 'C00473', 'KEGG_compound_name': 'R..."
2,C00473,KEGG Compound,C17276,KEGG Compound,"{'KEGG_id': 'C17276', 'KEGG_compound_name': 'R..."
3,C21797,KEGG Compound,C21797,KEGG Compound,"{'KEGG_id': 'C21797', 'KEGG_compound_name': 'a..."


In [8]:
combined_df = combine_sources(
    bridgedb_df,
    [
        kegg_df,
    ],
)

In [9]:
combined_df, compoundwiki_metadata = compoundwiki.get_compound_annotations(combined_df, kegg_compound_df=kegg_compound_df)

Processing KEGG column for compounds...


KeyError: 'KEGG_pathways'

In [11]:
combined_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,KEGG_pathways
0,ENSMUSG00000026295,Ensembl,ENSMUSG00000026295,Ensembl,"[{'pathway_id': nan, 'pathway_label': nan, 'pa..."
1,ENSMUSG00000022877,Ensembl,ENSMUSG00000022877,Ensembl,"[{'pathway_id': nan, 'pathway_label': nan, 'pa..."
2,ENSMUSG00000020914,Ensembl,ENSMUSG00000020914,Ensembl,"[{'pathway_id': 'path:mmu01524', 'pathway_labe..."
3,ENSMUSG00000024747,Ensembl,ENSMUSG00000024747,Ensembl,"[{'pathway_id': 'path:mmu00830', 'pathway_labe..."
4,ENSMUSG00000032081,Ensembl,ENSMUSG00000032081,Ensembl,"[{'pathway_id': 'path:mmu03320', 'pathway_labe..."
