# Example: Gene Workflow

This notebook demonstrates how to use pyBiodatafuse when you start from a list of genes.
Our current data sources include:
* DisGeNET
* MolMeDB
* OpenTargets
* WikiPathways
* WikiData
* STRING
* Bgee

In [1]:
# Import modules
import os

import pandas as pd

from pyBiodatafuse import id_mapper
from pyBiodatafuse.annotators import disgenet, molmedb  # Example with just two resource
from pyBiodatafuse.utils import combine_sources

## Load the input list and convert it to a dataframe

In [2]:
# Paste one gene symbol per line between the triple quotes.
genes_of_interest = """AGRN
ALG14
ALG2
CHAT
CHD8
CHRNA1
CHRNB1
CHRND
CHRNE
CHRNG
COL13A1
COLQ
DOK7
SLC22A5"""
# Strip surrounding whitespace and skip blank lines so that a trailing newline
# or Windows line endings in a pasted list do not turn into empty or padded
# identifiers further down the workflow.
gene_list = [symbol.strip() for symbol in genes_of_interest.splitlines() if symbol.strip()]
len(gene_list)

14

In [3]:
# Wrap the gene symbols in a one-column frame; the column is named "identifier".
data_input = pd.DataFrame(
    data=gene_list,
    columns=["identifier"],
)
data_input.head(n=5)

Unnamed: 0,identifier
0,AGRN
1,ALG14
2,ALG2
3,CHAT
4,CHD8


## Entity resolution using BridgeDb

In [4]:
# Cross-reference the HGNC symbols through BridgeDb, asking for every output
# datasource ("All"). The call returns the mapping frame plus its metadata.
bridgdb_df, bridgdb_metadata = id_mapper.bridgedb_xref(
    output_datasource="All",
    input_datasource="HGNC",
    input_species="Human",
    identifiers=data_input,
)
bridgdb_df.head(n=5)

Unnamed: 0,identifier,identifier.source,target,target.source
0,AGRN,HGNC,GO:0046872,Gene Ontology
1,AGRN,HGNC,4053565,Affy
2,AGRN,HGNC,4053601,Affy
3,AGRN,HGNC,4053600,Affy
4,AGRN,HGNC,4053602,Affy


## Disease annotation using DisGeNET

In [5]:
# The DisGeNET API key is a secret: read it from the environment instead of
# hardcoding it in the notebook (set DISGENET_API_KEY before starting Jupyter).
api_key = os.environ.get("DISGENET_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the DISGENET_API_KEY environment variable to your DisGeNET API key."
    )
params = {"source": "CURATED", "format": "json"}  # only curated data
# Annotate the BridgeDb mapping with gene-disease associations; the call
# returns the annotated frame plus its metadata.
disgenet_result, disgenet_metadata = disgenet.get_gene_disease(
    bridgedb_df=bridgdb_df, api_key=api_key, params=params
)
disgenet_result.head()

Unnamed: 0,identifier,identifier.source,target,target.source,DisGeNET
0,AGRN,HGNC,375790,NCBI Gene,"[{'gene_dsi': 0.626, 'gene_dpi': 0.538, 'gene_..."
1,ALG14,HGNC,199857,NCBI Gene,"[{'gene_dsi': 0.722, 'gene_dpi': 0.308, 'gene_..."
2,ALG2,HGNC,85365,NCBI Gene,"[{'gene_dsi': 0.67, 'gene_dpi': 0.423, 'gene_p..."
3,CHAT,HGNC,1103,NCBI Gene,"[{'gene_dsi': 0.52, 'gene_dpi': 0.808, 'gene_p..."
4,CHD8,HGNC,57680,NCBI Gene,"[{'gene_dsi': 0.656, 'gene_dpi': 0.577, 'gene_..."


In [6]:
# Show the full list of DisGeNET records attached to the first row (label 0).
disgenet_result.loc[0, "DisGeNET"]

[{'gene_dsi': 0.626,
  'gene_dpi': 0.538,
  'gene_pli': 5.4727e-07,
  'protein_class': None,
  'protein_class_name': None,
  'diseaseid': 'C3808739',
  'disease_name': 'MYASTHENIC SYNDROME, CONGENITAL, 8',
  'disease_class': None,
  'disease_class_name': None,
  'disease_type': 'disease',
  'disease_semantic_type': 'Disease or Syndrome',
  'score': 0.8,
  'ei': 1.0,
  'el': None,
  'year_initial': 2009.0,
  'year_final': 2014.0,
  'source': 'CURATED'},
 {'gene_dsi': 0.626,
  'gene_dpi': 0.538,
  'gene_pli': 5.4727e-07,
  'protein_class': None,
  'protein_class_name': None,
  'diseaseid': 'C0751882',
  'disease_name': 'Myasthenic Syndromes, Congenital',
  'disease_class': 'C16;C10',
  'disease_class_name': '   Congenital, Hereditary, and Neonatal Diseases and Abnormalities;    Nervous System Diseases',
  'disease_type': 'disease',
  'disease_semantic_type': 'Disease or Syndrome',
  'score': 0.65,
  'ei': 1.0,
  'el': 'strong',
  'year_initial': 2009.0,
  'year_final': 2020.0,
  'source'

## Annotating inhibitors of transporters encoded by the genes using MolMeDB

In [7]:
# Annotate the BridgeDb mapping with MolMeDB inhibitor records; the call
# returns the annotated frame plus its metadata.
inhibitor_df, inhibitor_metadata = molmedb.get_gene_mol_inhibitor(
    bridgedb_df=bridgdb_df,
)
inhibitor_df.head(n=5)

Unnamed: 0,identifier,identifier.source,target,target.source,transporter_inhibitor
0,AGRN,HGNC,A0A087X208,Uniprot-TrEMBL,"[{'label': nan, 'InChIKey': nan, 'SMILES': nan..."
1,AGRN,HGNC,A0A494C0G5,Uniprot-TrEMBL,"[{'label': nan, 'InChIKey': nan, 'SMILES': nan..."
2,AGRN,HGNC,A0A494C1I6,Uniprot-TrEMBL,"[{'label': nan, 'InChIKey': nan, 'SMILES': nan..."
3,AGRN,HGNC,O00468,Uniprot-TrEMBL,"[{'label': nan, 'InChIKey': nan, 'SMILES': nan..."
4,ALG14,HGNC,Q96F25,Uniprot-TrEMBL,"[{'label': nan, 'InChIKey': nan, 'SMILES': nan..."


In [8]:
# Row 72 is hard-coded: it is a row whose inhibitor list is populated, unlike
# the NaN-filled rows shown by head(). NOTE(review): the row label depends on
# the input gene list; confirm it after changing the genes.
inhibitor_df.loc[72, "transporter_inhibitor"]

[{'label': 'Cefepime',
  'InChIKey': 'HVFLCNVBZFFHBT-UHFFFAOYSA-O',
  'SMILES': 'CON=C(C(=O)NC1C(=O)N2C(C(=O)O)=C(C[N+]3(C)CCCC3)CSC12)c1csc(N)n1',
  'pubchem_compound_id': '2623',
  'molmedb_id': 'MM16967',
  'source_doi': 'doi:10.1074/jbc.275.3.1699',
  'source_pmid': '10636865',
  'chebi_id': nan,
  'drugbank_id': nan},
 {'label': 'Cephaloridine',
  'InChIKey': 'CZTQZXZIADLWOZ-UHFFFAOYSA-O',
  'SMILES': 'O=C(Cc1cccs1)NC1C(=O)N2C(C(=O)O)=C(C[n+]3ccccc3)CSC12',
  'pubchem_compound_id': '5773',
  'molmedb_id': 'MM00638',
  'source_doi': 'doi:10.1074/jbc.275.3.1699',
  'source_pmid': '10636865',
  'chebi_id': '3537',
  'drugbank_id': 'DB09008'}]

## Combining all the results into a single dataframe

In [9]:
# Bring the DisGeNET and MolMeDB annotation frames together into one table.
combined_df = combine_sources(
    [
        disgenet_result,
        inhibitor_df,
    ]
)

In [10]:
# Preview the first four rows of the combined table.
combined_df.iloc[:4]

Unnamed: 0,identifier,identifier.source,target,target.source,DisGeNET,transporter_inhibitor
0,AGRN,HGNC,375790,NCBI Gene,"[{'gene_dsi': 0.626, 'gene_dpi': 0.538, 'gene_...","[{'label': nan, 'InChIKey': nan, 'SMILES': nan..."
1,ALG14,HGNC,199857,NCBI Gene,"[{'gene_dsi': 0.722, 'gene_dpi': 0.308, 'gene_...","[{'label': nan, 'InChIKey': nan, 'SMILES': nan..."
2,ALG2,HGNC,85365,NCBI Gene,"[{'gene_dsi': 0.67, 'gene_dpi': 0.423, 'gene_p...","[{'label': nan, 'InChIKey': nan, 'SMILES': nan..."
3,CHAT,HGNC,1103,NCBI Gene,"[{'gene_dsi': 0.52, 'gene_dpi': 0.808, 'gene_p...","[{'label': nan, 'InChIKey': nan, 'SMILES': nan..."
