# Workflow playground for the project

In [1]:
# Import modules
import os

import pandas as pd

from pyBiodatafuse import id_mapper
from pyBiodatafuse.annotators import disgenet, molmedb, opentargets, wikipathways
from pyBiodatafuse.utils import combine_sources

## Load the input list and convert it to a dataframe

In [2]:
# Gene symbols to annotate, one per line (paste your own list here).
genes_of_interest = """AGRN
ALG14
ALG2
CHAT
CHD8
CHRNA1
CHRNB1
CHRND
CHRNE
CHRNG
COL13A1
COLQ
DOK7
SLC22A5"""
# Strip surrounding whitespace and skip blank lines so that a trailing newline,
# an empty line or Windows line endings in a pasted list never produce empty or
# padded identifiers in the mapping step.
gene_list = [symbol.strip() for symbol in genes_of_interest.splitlines() if symbol.strip()]
len(gene_list)

14

In [3]:
# Wrap the gene symbols in a one-column frame named "identifier".
data_input = pd.DataFrame({"identifier": gene_list})
data_input.head()

Unnamed: 0,identifier
0,AGRN
1,ALG14
2,ALG2
3,CHAT
4,CHD8


## Entity resolution using BridgeDB

In [4]:
# Map the input identifiers (declared as human HGNC identifiers) to every
# output data source ("All"). The first return value is the mapping table
# (identifier / identifier.source / target / target.source); the second is
# kept as metadata for this query.
bridgdb_df, bridgdb_metadata = id_mapper.bridgedb_xref(
    identifiers=data_input["identifier"],
    input_species="Human",
    input_datasource="HGNC",
    output_datasource="All",
)
bridgdb_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source
0,AGRN,HGNC,GO:0046872,Gene Ontology
1,AGRN,HGNC,4053565,Affy
2,AGRN,HGNC,4053601,Affy
3,AGRN,HGNC,4053600,Affy
4,AGRN,HGNC,4053602,Affy


## Disease annotation using DisGeNET

In [5]:
# The DisGeNET API key is a secret, so it is read from the environment instead
# of being stored in the notebook. Set it before launching Jupyter, e.g.
#   export DISGENET_API_KEY=<your key>
# Any key that was ever committed in this notebook should be revoked and replaced.
api_key = os.environ.get("DISGENET_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the DISGENET_API_KEY environment variable to your DisGeNET API key."
    )
params = {"source": "CURATED", "format": "json"}  # only curated data
# Annotate the mapped genes with gene-disease associations; the result frame
# carries them in the "DisGeNET" column, the second value is kept as metadata.
disgenet_result, disgenet_metadata = disgenet.get_gene_disease(
    bridgedb_df=bridgdb_df, api_key=api_key, params=params
)
disgenet_result.head()

Unnamed: 0,identifier,identifier.source,target,target.source,DisGeNET
0,AGRN,HGNC,375790,NCBI Gene,"[{'gene_dsi': 0.626, 'gene_dpi': 0.538, 'gene_..."
1,ALG14,HGNC,199857,NCBI Gene,"[{'gene_dsi': 0.722, 'gene_dpi': 0.308, 'gene_..."
2,ALG2,HGNC,85365,NCBI Gene,"[{'gene_dsi': 0.67, 'gene_dpi': 0.423, 'gene_p..."
3,CHAT,HGNC,1103,NCBI Gene,"[{'gene_dsi': 0.52, 'gene_dpi': 0.808, 'gene_p..."
4,CHD8,HGNC,57680,NCBI Gene,"[{'gene_dsi': 0.656, 'gene_dpi': 0.577, 'gene_..."


In [6]:
# Show the full list of disease associations for the row labelled 0.
disgenet_result.loc[0, "DisGeNET"]

[{'gene_dsi': 0.626,
  'gene_dpi': 0.538,
  'gene_pli': 5.4727e-07,
  'protein_class': None,
  'protein_class_name': None,
  'diseaseid': 'C3808739',
  'disease_name': 'MYASTHENIC SYNDROME, CONGENITAL, 8',
  'disease_class': None,
  'disease_class_name': None,
  'disease_type': 'disease',
  'disease_semantic_type': 'Disease or Syndrome',
  'score': 0.8,
  'ei': 1.0,
  'el': None,
  'year_initial': 2009.0,
  'year_final': 2014.0,
  'source': 'CURATED'},
 {'gene_dsi': 0.626,
  'gene_dpi': 0.538,
  'gene_pli': 5.4727e-07,
  'protein_class': None,
  'protein_class_name': None,
  'diseaseid': 'C0751882',
  'disease_name': 'Myasthenic Syndromes, Congenital',
  'disease_class': 'C16;C10',
  'disease_class_name': '   Congenital, Hereditary, and Neonatal Diseases and Abnormalities;    Nervous System Diseases',
  'disease_type': 'disease',
  'disease_semantic_type': 'Disease or Syndrome',
  'score': 0.65,
  'ei': 1.0,
  'el': 'strong',
  'year_initial': 2009.0,
  'year_final': 2020.0,
  'source'

## Gene annotation from OpenTargets (location, GO, Reactome pathways, drugs, diseases)

In [7]:
# Annotate the mapped genes with OpenTargets location data. The result frame
# gains an "OpenTargets_Location" column holding a list of dicts per gene
# (keys such as loc_identifier / subcellular_loc); the second value is kept
# as metadata.
loc_df, opentargets_loc_metadata = opentargets.get_gene_location(bridgedb_df=bridgdb_df)
loc_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_Location
0,AGRN,HGNC,ENSG00000188157,Ensembl,"[{'loc_identifier': 'SL-0243', 'subcellular_lo..."
1,ALG14,HGNC,ENSG00000172339,Ensembl,"[{'loc_identifier': 'SL-0097', 'subcellular_lo..."
2,ALG2,HGNC,ENSG00000119523,Ensembl,"[{'loc_identifier': 'SL-0162', 'subcellular_lo..."
3,CHAT,HGNC,ENSG00000070748,Ensembl,"[{'loc_identifier': nan, 'subcellular_loc': na..."
4,CHD8,HGNC,ENSG00000100888,Ensembl,"[{'loc_identifier': 'SL-0191', 'subcellular_lo..."


In [8]:
# Annotate the mapped genes with Gene Ontology terms from OpenTargets. The
# result frame gains a "GO_Process" column holding a list of dicts per gene
# (keys such as go_id / go_name); the second value is kept as metadata.
go_process_df, opentargets_go_metadata = opentargets.get_gene_go_process(bridgedb_df=bridgdb_df)
go_process_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,GO_Process
0,AGRN,HGNC,ENSG00000188157,Ensembl,"[{'go_id': 'GO:0005796', 'go_name': 'Golgi lum..."
1,ALG14,HGNC,ENSG00000172339,Ensembl,"[{'go_id': 'GO:0006488', 'go_name': 'dolichol-..."
2,ALG2,HGNC,ENSG00000119523,Ensembl,"[{'go_id': 'GO:0046982', 'go_name': 'protein h..."
3,CHAT,HGNC,ENSG00000070748,Ensembl,"[{'go_id': 'GO:0004102', 'go_name': 'choline O..."
4,CHD8,HGNC,ENSG00000100888,Ensembl,"[{'go_id': 'GO:0005654', 'go_name': 'nucleopla..."


In [9]:
# Annotate the mapped genes with Reactome pathways from OpenTargets. The
# result frame gains a "Reactome_Pathways" column holding a list of dicts per
# gene (starting with pathway_name); the second value is kept as metadata.
reactome_process_df, opentargets_process_metadata = opentargets.get_gene_reactome_pathways(
    bridgedb_df=bridgdb_df
)
reactome_process_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,Reactome_Pathways
0,AGRN,HGNC,ENSG00000188157,Ensembl,"[{'pathway_name': 'ECM proteoglycans', 'pathwa..."
1,ALG14,HGNC,ENSG00000172339,Ensembl,[{'pathway_name': 'Biosynthesis of the N-glyca...
2,ALG2,HGNC,ENSG00000119523,Ensembl,[{'pathway_name': 'Biosynthesis of the N-glyca...
3,CHAT,HGNC,ENSG00000070748,Ensembl,"[{'pathway_name': 'Synthesis of PC', 'pathway_..."
4,CHD8,HGNC,ENSG00000100888,Ensembl,[{'pathway_name': 'Deactivation of the beta-ca...


In [10]:
# Annotate the mapped genes with drug interactions from OpenTargets. The
# result frame gains a "ChEMBL_Drugs" column holding a list of dicts per gene
# (keys such as chembl_id / drug_name); entries are NaN-filled for the genes
# previewed below. The second value is kept as metadata.
drug_df, opentargets_drug_metadata = opentargets.get_gene_drug_interactions(bridgedb_df=bridgdb_df)
drug_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,ChEMBL_Drugs
0,AGRN,HGNC,ENSG00000188157,Ensembl,"[{'chembl_id': nan, 'drug_name': nan, 'relatio..."
1,ALG14,HGNC,ENSG00000172339,Ensembl,"[{'chembl_id': nan, 'drug_name': nan, 'relatio..."
2,ALG2,HGNC,ENSG00000119523,Ensembl,"[{'chembl_id': nan, 'drug_name': nan, 'relatio..."
3,CHAT,HGNC,ENSG00000070748,Ensembl,"[{'chembl_id': nan, 'drug_name': nan, 'relatio..."
4,CHD8,HGNC,ENSG00000100888,Ensembl,"[{'chembl_id': nan, 'drug_name': nan, 'relatio..."


In [11]:
# Annotate the mapped genes with disease associations from OpenTargets. The
# result frame gains an "OpenTargets_Diseases" column holding a list of dicts
# per gene (keys such as disease_id / disease_name); entries are NaN-filled
# for the genes previewed below. The second value is kept as metadata.
disease_df, opentargets_disease_metadata = opentargets.get_gene_disease_associations(
    bridgedb_df=bridgdb_df
)
disease_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_Diseases
0,AGRN,HGNC,ENSG00000188157,Ensembl,"[{'disease_id': nan, 'disease_name': nan, 'the..."
1,ALG14,HGNC,ENSG00000172339,Ensembl,"[{'disease_id': nan, 'disease_name': nan, 'the..."
2,ALG2,HGNC,ENSG00000119523,Ensembl,"[{'disease_id': nan, 'disease_name': nan, 'the..."
3,CHAT,HGNC,ENSG00000070748,Ensembl,"[{'disease_id': nan, 'disease_name': nan, 'the..."
4,CHD8,HGNC,ENSG00000100888,Ensembl,"[{'disease_id': nan, 'disease_name': nan, 'the..."


## Annotating genes with inhibitors of their encoded transporters using MolMeDB

In [12]:
# Annotate the mapped genes with transporter inhibitors from MolMeDB. The
# result has one row per Uniprot-TrEMBL target and a "transporter_inhibitor"
# column holding a list of dicts (label, InChIKey, SMILES, ...); the second
# value is kept as metadata.
inhibitor_df, inhibitor_metadata = molmedb.get_transporter_inhibitor(bridgedb_df=bridgdb_df)
inhibitor_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,transporter_inhibitor
0,AGRN,HGNC,A0A087X208,Uniprot-TrEMBL,"[{'label': nan, 'InChIKey': nan, 'SMILES': nan..."
1,AGRN,HGNC,A0A494C0G5,Uniprot-TrEMBL,"[{'label': nan, 'InChIKey': nan, 'SMILES': nan..."
2,AGRN,HGNC,A0A494C1I6,Uniprot-TrEMBL,"[{'label': nan, 'InChIKey': nan, 'SMILES': nan..."
3,AGRN,HGNC,O00468,Uniprot-TrEMBL,"[{'label': nan, 'InChIKey': nan, 'SMILES': nan..."
4,ALG14,HGNC,Q96F25,Uniprot-TrEMBL,"[{'label': nan, 'InChIKey': nan, 'SMILES': nan..."


In [13]:
# Show the inhibitor list for the row labelled 72, an example with non-empty hits.
# NOTE(review): 72 is a hard-coded row label that depends on the current mapping
# output - confirm it still points at the intended transporter after re-running.
inhibitor_df.loc[72, "transporter_inhibitor"]

[{'label': 'Cefepime',
  'InChIKey': 'HVFLCNVBZFFHBT-UHFFFAOYSA-O',
  'SMILES': 'CON=C(C(=O)NC1C(=O)N2C(C(=O)O)=C(C[N+]3(C)CCCC3)CSC12)c1csc(N)n1',
  'pubchem_compound_id': '2623',
  'molmedb_id': 'MM16967',
  'source_doi': 'doi:10.1074/jbc.275.3.1699',
  'source_pmid': '10636865',
  'chebi_id': nan,
  'drugbank_id': nan},
 {'label': 'Cephaloridine',
  'InChIKey': 'CZTQZXZIADLWOZ-UHFFFAOYSA-O',
  'SMILES': 'O=C(Cc1cccs1)NC1C(=O)N2C(C(=O)O)=C(C[n+]3ccccc3)CSC12',
  'pubchem_compound_id': '5773',
  'molmedb_id': 'MM00638',
  'source_doi': 'doi:10.1074/jbc.275.3.1699',
  'source_pmid': '10636865',
  'chebi_id': '3537',
  'drugbank_id': 'DB09008'}]

## Combining all the results into a single dataframe

In [14]:
# Merge the per-source annotation frames into one table; the result carries one
# annotation column per source (DisGeNET, OpenTargets_Location, GO_Process,
# Reactome_Pathways, ChEMBL_Drugs, OpenTargets_Diseases, transporter_inhibitor).
combined_df = combine_sources(
    [disgenet_result, loc_df, go_process_df, reactome_process_df, drug_df, disease_df, inhibitor_df]
)

In [15]:
# Preview the first four rows of the combined annotation table.
combined_df.head(4)

Unnamed: 0,identifier,identifier.source,target,target.source,DisGeNET,OpenTargets_Location,GO_Process,Reactome_Pathways,ChEMBL_Drugs,OpenTargets_Diseases,transporter_inhibitor
0,AGRN,HGNC,375790,NCBI Gene,"[{'gene_dsi': 0.626, 'gene_dpi': 0.538, 'gene_...","[{'loc_identifier': 'SL-0243', 'subcellular_lo...","[{'go_id': 'GO:0005796', 'go_name': 'Golgi lum...","[{'pathway_name': 'ECM proteoglycans', 'pathwa...","[{'chembl_id': nan, 'drug_name': nan, 'relatio...","[{'disease_id': nan, 'disease_name': nan, 'the...","[{'label': nan, 'InChIKey': nan, 'SMILES': nan..."
1,ALG14,HGNC,199857,NCBI Gene,"[{'gene_dsi': 0.722, 'gene_dpi': 0.308, 'gene_...","[{'loc_identifier': 'SL-0097', 'subcellular_lo...","[{'go_id': 'GO:0006488', 'go_name': 'dolichol-...",[{'pathway_name': 'Biosynthesis of the N-glyca...,"[{'chembl_id': nan, 'drug_name': nan, 'relatio...","[{'disease_id': nan, 'disease_name': nan, 'the...","[{'label': nan, 'InChIKey': nan, 'SMILES': nan..."
2,ALG2,HGNC,85365,NCBI Gene,"[{'gene_dsi': 0.67, 'gene_dpi': 0.423, 'gene_p...","[{'loc_identifier': 'SL-0162', 'subcellular_lo...","[{'go_id': 'GO:0046982', 'go_name': 'protein h...",[{'pathway_name': 'Biosynthesis of the N-glyca...,"[{'chembl_id': nan, 'drug_name': nan, 'relatio...","[{'disease_id': nan, 'disease_name': nan, 'the...","[{'label': nan, 'InChIKey': nan, 'SMILES': nan..."
3,CHAT,HGNC,1103,NCBI Gene,"[{'gene_dsi': 0.52, 'gene_dpi': 0.808, 'gene_p...","[{'loc_identifier': nan, 'subcellular_loc': na...","[{'go_id': 'GO:0004102', 'go_name': 'choline O...","[{'pathway_name': 'Synthesis of PC', 'pathway_...","[{'chembl_id': nan, 'drug_name': nan, 'relatio...","[{'disease_id': nan, 'disease_name': nan, 'the...","[{'label': nan, 'InChIKey': nan, 'SMILES': nan..."


# Metabolite workflow

In [16]:
# Metabolite identifiers to annotate, one per line (mapped below as
# "PubChem Compound" identifiers).
metabolites_of_interest = """100208
10040286
10041551
10025195"""
# Strip surrounding whitespace and skip blank lines so that a trailing newline,
# an empty line or Windows line endings in a pasted list never produce empty or
# padded identifiers in the mapping step.
metabolite_list = [
    compound_id.strip()
    for compound_id in metabolites_of_interest.splitlines()
    if compound_id.strip()
]
len(metabolite_list)

4

In [17]:
# Wrap the metabolite identifiers in a one-column frame named "identifier".
# NOTE(review): this rebinds `data_input`, replacing the gene input frame built
# earlier in the notebook.
data_input = pd.DataFrame({"identifier": metabolite_list})
data_input.head()

Unnamed: 0,identifier
0,100208
1,10040286
2,10041551
3,10025195


In [18]:
# Map the metabolite identifiers (declared as "PubChem Compound") to every
# output data source ("All").
# NOTE(review): this rebinds `bridgdb_df` / `bridgdb_metadata`, so the gene
# mappings from the first workflow are no longer available after this cell;
# re-running earlier gene cells afterwards would use the metabolite table.
bridgdb_df, bridgdb_metadata = id_mapper.bridgedb_xref(
    identifiers=data_input["identifier"],
    input_species="Human",
    input_datasource="PubChem Compound",
    output_datasource="All",
)
bridgdb_df.head(100)

Unnamed: 0,identifier,identifier.source,target,target.source
0,100208,PubChem-compound,90560,ChemSpider
1,100208,PubChem-compound,100208,PubChem Compound
2,100208,PubChem-compound,HMDB0244377,HMDB
3,100208,PubChem-compound,OFDNQWIFNXBECV-UHFFFAOYSA-N,InChIKey
4,100208,PubChem-compound,C11280,KEGG Compound
5,100208,PubChem-compound,Q104246146,Wikidata
6,10040286,PubChem-compound,10040286,PubChem Compound
7,10040286,PubChem-compound,CHEMBL602850,ChEMBL compound
8,10040286,PubChem-compound,Q27163373,Wikidata
9,10040286,PubChem-compound,91540,ChEBI


In [20]:
# Annotate the mapped metabolites with the transporters they inhibit, from
# MolMeDB. The result is keyed by the InChIKey target and gains a
# "transporter_inhibited" column holding a list of dicts (uniprot_trembl_id,
# hgcn_id, source_doi, source_pmid); the second value is kept as metadata.
transporter_inhibited_df, transporter_inhibited_metadata = molmedb.get_transporter_inhibited(bridgedb_df=bridgdb_df)
transporter_inhibited_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,transporter_inhibited
0,100208,PubChem-compound,OFDNQWIFNXBECV-UHFFFAOYSA-N,InChIKey,"[{'uniprot_trembl_id': 'P08183', 'hgcn_id': 'A..."
1,10025195,PubChem-compound,LEJRLSZVESQKJK-UHFFFAOYSA-N,InChIKey,"[{'uniprot_trembl_id': 'Q01959', 'hgcn_id': 'S..."
2,10040286,PubChem-compound,FYGREZKTJIXWIH-UHFFFAOYSA-N,InChIKey,"[{'uniprot_trembl_id': 'Q01959', 'hgcn_id': 'S..."
3,10041551,PubChem-compound,OVVBIIBBRZVPAL-UHFFFAOYSA-N,InChIKey,"[{'uniprot_trembl_id': 'P23975', 'hgcn_id': 'S..."


In [21]:
# Show the full list of inhibited transporters for the row labelled 0.
transporter_inhibited_df.loc[0, "transporter_inhibited"]

[{'uniprot_trembl_id': 'P08183',
  'hgcn_id': 'ABCB1',
  'source_doi': 'doi:10.1074/jbc.271.6.3163',
  'source_pmid': '8621716'}]