# Example: Polysistice Kidney Disease use case

In [1]:
new_path = r"C:\Users\are10\Documents\BAFSTU\code\pyBioDatafusemain\pyBiodatafuse\examples\usecases"

# Setting up the working directory
import os
import sys

# Define the absolute path to the src directory
src_path = os.path.abspath(os.path.join("..", "..", "..", "src"))
if src_path not in sys.path:
    sys.path.append(src_path)

print(sys.path)  # Verify the correct src path is included

['C:\\Users\\are10\\AppData\\Local\\Programs\\Python\\Python39\\python39.zip', 'C:\\Users\\are10\\AppData\\Local\\Programs\\Python\\Python39\\DLLs', 'C:\\Users\\are10\\AppData\\Local\\Programs\\Python\\Python39\\lib', 'C:\\Users\\are10\\AppData\\Local\\Programs\\Python\\Python39', 'c:\\Users\\are10\\Documents\\BAFSTU\\code\\pyBioDatafusemain\\pyBiodatafuse\\.venv', '', 'c:\\Users\\are10\\Documents\\BAFSTU\\code\\pyBioDatafusemain\\pyBiodatafuse\\.venv\\lib\\site-packages', 'c:\\Users\\are10\\Documents\\BAFSTU\\code\\pyBioDatafusemain\\pyBiodatafuse\\.venv\\lib\\site-packages\\win32', 'c:\\Users\\are10\\Documents\\BAFSTU\\code\\pyBioDatafusemain\\pyBiodatafuse\\.venv\\lib\\site-packages\\win32\\lib', 'c:\\Users\\are10\\Documents\\BAFSTU\\code\\pyBioDatafusemain\\pyBiodatafuse\\.venv\\lib\\site-packages\\Pythonwin', 'c:\\Users\\are10\\Documents\\BAFSTU\\code\\pyBioDatafusemain\\pyBiodatafuse\\src']


In [2]:
# Import modules
import pickle

import pandas as pd
from dotenv import load_dotenv

from pyBiodatafuse import id_mapper
from pyBiodatafuse import human_homologs
from pyBiodatafuse.annotators import (
    bgee,
    disgenet,
    kegg,
    minerva,
    molmedb,
    opentargets,
    pubchem,
    stringdb,
    wikipathways,
)
from pyBiodatafuse.constants import (
    BGEE_GENE_EXPRESSION_LEVELS_COL,
    DISGENET_DISEASE_COL,
    KEGG_COL,
    MINERVA,
    MOLMEDB_PROTEIN_COMPOUND_COL,
    OPENTARGETS_DISEASE_COMPOUND_COL,
    OPENTARGETS_GENE_COMPOUND_COL,
    OPENTARGETS_GO_COL,
    OPENTARGETS_REACTOME_COL,
    PUBCHEM_COMPOUND_ASSAYS_COL,
    STRING_PPI_COL,
    WIKIPATHWAYS,
)
from pyBiodatafuse.graph import generator
from pyBiodatafuse.utils import combine_sources, create_harmonized_input_file

# 1. Entity resolution using BridgeDB

### 1.1. Load the input list and convert it to a dataframe

Small set of mouse data used for debugging.

In [None]:
# TEST Mice Ensembl
genes_of_interest = """ENSMUSG00000067274
ENSMUSG00000000001
ENSMUSG00000084349
ENSMUSG00000025428
ENSMUSG00000044533"""

# TEST Rat Ensembl
# genes_of_interest = """ENSRNOG00060027926
# ENSRNOG00055005387
# ENSRNOG00060018596
# ENSRNOG00060011358
# ENSRNOG00055009275 
# """

# TEST Mice MGI
# genes_of_interest = """14679
# 100043000
# 11946
# 16898
# 11837"""

# TEST Human HGNC
# genes_of_interest = """CHRNG
# DMD
# AHR
# SCN4A
# LC25A1
# HTR3A"""

# TEST Human Ensembl
# genes_of_interest = """ENSG00000072080
# ENSG00000113905
# ENSG00000131747
# ENSG00000165092
# ENSG00000110245
# ENSG00000213366
# ENSG00000184227
# ENSG00000085563
# ENSG00000097021
# ENSG00000149742"""

gene_list = genes_of_interest.split("\n")
len(gene_list)

5

Mouse use case

In [4]:
from pyBiodatafuse import data_loader, id_mapper

data_input = data_loader.create_df_from_dea("data/full_de_genes_treated_vs_untreated_plus_cpm_fc.csv")
data_filtered = data_input[data_input['DE'].isin([1, -1])]
features_filtered = data_filtered['identifier']

gene_list = features_filtered.tolist()
print(len(gene_list))

81


In [5]:
data_input = pd.DataFrame(gene_list, columns=["identifier"])
data_input.head(20)

Unnamed: 0,identifier
0,ENSMUSG00000100426
1,ENSMUSG00000032087
2,ENSMUSG00000024747
3,ENSMUSG00000091813
4,ENSMUSG00000004035
5,ENSMUSG00000001155
6,ENSMUSG00000021336
7,ENSMUSG00000027331
8,ENSMUSG00000028873
9,ENSMUSG00000038486


In [6]:
print(gene_list)

['ENSMUSG00000100426', 'ENSMUSG00000032087', 'ENSMUSG00000024747', 'ENSMUSG00000091813', 'ENSMUSG00000004035', 'ENSMUSG00000001155', 'ENSMUSG00000021336', 'ENSMUSG00000027331', 'ENSMUSG00000028873', 'ENSMUSG00000038486', 'ENSMUSG00000044816', 'ENSMUSG00000003038', 'ENSMUSG00000015451', 'ENSMUSG00000026295', 'ENSMUSG00000056749', 'ENSMUSG00000016756', 'ENSMUSG00000027452', 'ENSMUSG00000027577', 'ENSMUSG00000029368', 'ENSMUSG00000026205', 'ENSMUSG00000027379', 'ENSMUSG00000028715', 'ENSMUSG00000035186', 'ENSMUSG00000043439', 'ENSMUSG00000055629', 'ENSMUSG00000079494', 'ENSMUSG00000045328', 'ENSMUSG00000075044', 'ENSMUSG00000020914', 'ENSMUSG00000001334', 'ENSMUSG00000006398', 'ENSMUSG00000022415', 'ENSMUSG00000021213', 'ENSMUSG00000002870', 'ENSMUSG00000061959', 'ENSMUSG00000001313', 'ENSMUSG00000007950', 'ENSMUSG00000017390', 'ENSMUSG00000025983', 'ENSMUSG00000026049', 'ENSMUSG00000026415', 'ENSMUSG00000030825', 'ENSMUSG00000037628', 'ENSMUSG00000048489', 'ENSMUSG00000051483', 'ENSMUSG0

### 1.2. Query BridgeDB

In [7]:
# Mouse usecase
input_species="Mouse"

bridgedb_df, bridgedb_metadata = id_mapper.bridgedb_xref(
   identifiers=data_input,
    input_species=input_species,
    input_datasource="Ensembl",
    output_datasource="All",
)

# TEST Human data
# bridgedb_df, bridgedb_metadata = id_mapper.bridgedb_xref(
#     identifiers=data_input,
#     input_species="Human",
#     input_datasource="HGNC",
#    output_datasource="All",
# )

bridgedb_df.head(25)

Unnamed: 0,identifier,identifier.source,target,target.source
0,ENSMUSG00000100426,Ensembl,MGI:3782384,MGI
1,ENSMUSG00000100426,Ensembl,ENSMUSG00000100426,Ensembl
2,ENSMUSG00000032087,Ensembl,MGI:2150309,MGI
3,ENSMUSG00000032087,Ensembl,114873,NCBI Gene
4,ENSMUSG00000032087,Ensembl,A0A1L1SQZ7,Uniprot-TrEMBL
5,ENSMUSG00000032087,Ensembl,E9QPR7,Uniprot-TrEMBL
6,ENSMUSG00000032087,Ensembl,Q4VA61,Uniprot-TrEMBL
7,ENSMUSG00000032087,Ensembl,A0A1L1SQ53,Uniprot-TrEMBL
8,ENSMUSG00000032087,Ensembl,ENSMUSG00000032087,Ensembl
9,ENSMUSG00000024747,Ensembl,ENSMUSG00000024747,Ensembl


# 2. Step-by-step graph generation

### 2.X Homologs

In [8]:
ensembl_homologs_df, ensembl_metadata = human_homologs.get_homologs(bridgedb_df=bridgedb_df)
ensembl_homologs_df.head()
homologs = [entry["homolog"] for entry in ensembl_homologs_df["Ensembl_homologs"] if isinstance(entry, dict)]

{"release":"15.9"}


In [9]:
print(homologs)

['ENSG00000177103', 'ENSG00000165092', 'ENSG00000213366', 'ENSG00000160282', 'ENSG00000146039', 'ENSG00000128944', 'ENSG00000134690', 'ENSG00000159164', 'ENSG00000188674', 'ENSG00000224389', 'ENSG00000072080', 'ENSG00000165030', 'ENSG00000154930', 'ENSG00000101204', 'ENSG00000163631', 'ENSG00000213901', 'ENSG00000169679', 'ENSG00000213886', 'ENSG00000273604', 'ENSG00000182272', 'ENSG00000144035', 'ENSG00000138778', 'ENSG00000149742', 'ENSG00000131747', 'ENSG00000160097', 'ENSG00000117399', 'ENSG00000100321', 'ENSG00000073111', 'ENSG00000108830', 'ENSG00000127220', 'ENSG00000109107', 'ENSG00000144395', 'ENSG00000151287', 'ENSG00000162897', 'ENSG00000087076', 'ENSG00000100526', 'ENSG00000165507', 'ENSG00000159228', 'ENSG00000120054', 'ENSG00000149554', 'ENSG00000184545', 'ENSG00000170430', 'ENSG00000123427', 'ENSG00000120800', 'ENSG00000113905', 'ENSG00000169245', 'ENSG00000072571', 'ENSG00000101412', 'ENSG00000136689', 'ENSG00000102837', 'ENSG00000112299', 'ENSG00000168268', 'ENSG000000

In [10]:
input_species="Human"

data_input_hl = pd.DataFrame(homologs, columns=["identifier"])

bridgedb_df, bridgedb_metadata = id_mapper.bridgedb_xref(
    identifiers=data_input_hl,
    input_species="Human",
    input_datasource="Ensembl",
    output_datasource="All",
)

bridgedb_df.head(25)

Unnamed: 0,identifier,identifier.source,target,target.source
0,ENSG00000177103,Ensembl,57453,NCBI Gene
1,ENSG00000177103,Ensembl,DSCAML1,HGNC
2,ENSG00000177103,Ensembl,E9PP71,Uniprot-TrEMBL
3,ENSG00000177103,Ensembl,ENSG00000177103,Ensembl
4,ENSG00000177103,Ensembl,Q8TD84,Uniprot-TrEMBL
5,ENSG00000177103,Ensembl,HGNC:14656,HGNC Accession Number
6,ENSG00000177103,Ensembl,A0A384DVL8,Uniprot-TrEMBL
7,ENSG00000165092,Ensembl,V9HW83,Uniprot-TrEMBL
8,ENSG00000165092,Ensembl,HGNC:402,HGNC Accession Number
9,ENSG00000165092,Ensembl,ENSG00000165092,Ensembl


### 2.1. Gene-Disease edges


In [11]:
load_dotenv()

disgenet_api_key = os.getenv("DISGENET_API_KEY")

In [12]:
disgenet_df, disgenet_metadata = disgenet.get_gene_disease(
    api_key=disgenet_api_key, bridgedb_df=bridgedb_df
)
disgenet_df.head()

Querying DisGeNET: 100%|██████████| 71/71 [00:03<00:00, 19.20it/s]
  disgenet_df, disgenet_metadata = disgenet.get_gene_disease(


In [21]:
disgenet_df[DISGENET_DISEASE_COL][0]

KeyError: 'DISGENET_diseases'

### 2.2 Disease-Compound edges

In [11]:
# Prepare the input to use DISGENET output as seed for OpenTargets
disease_mapping_df = create_harmonized_input_file(disgenet_df, DISGENET_DISEASE_COL, "EFO", "UMLS")
disease_mapping_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source
0,UMLS_C1458155,UMLS,EFO_0003869,EFO
1,UMLS_C0003873,UMLS,EFO_0000685,EFO
2,UMLS_C0028754,UMLS,EFO_0001073,EFO
3,UMLS_C0025517,UMLS,EFO_0000589,EFO
4,UMLS_C2973725,UMLS,EFO_0001361,EFO


In [None]:
(
    opentargets_disease_compound_df,
    opentargets_disease_compound_metadata,
) = opentargets.get_disease_compound_interactions(disease_mapping_df)
opentargets_disease_compound_df.head()

In [None]:
opentargets_disease_compound_df[OPENTARGETS_DISEASE_COMPOUND_COL][0]

### 2.3 Gene-Compound edges

In [13]:
opentargets_compound_df, opentargets_compound_metadata = opentargets.get_gene_compound_interactions(
    bridgedb_df=bridgedb_df
)
opentargets_compound_df.head()

Processing gene-drug interactions: 100%|██████████| 71/71 [00:01<00:00, 57.60it/s]
Mapping PubChem: 100%|██████████| 37/37 [00:17<00:00,  2.10it/s]
  check_columns_against_constants(


Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_gene_compounds
0,ENSG00000072080,Ensembl,ENSG00000072080,Ensembl,"[{'chembl_id': nan, 'drugbank_id': nan, 'compo..."
1,ENSG00000072571,Ensembl,ENSG00000072571,Ensembl,"[{'chembl_id': nan, 'drugbank_id': nan, 'compo..."
2,ENSG00000073111,Ensembl,ENSG00000073111,Ensembl,"[{'chembl_id': nan, 'drugbank_id': nan, 'compo..."
3,ENSG00000085563,Ensembl,ENSG00000085563,Ensembl,"[{'chembl_id': 'CHEMBL:CHEMBL1086218', 'drugba..."
4,ENSG00000087076,Ensembl,ENSG00000087076,Ensembl,"[{'chembl_id': nan, 'drugbank_id': nan, 'compo..."


In [None]:
opentargets_compound_df[OPENTARGETS_GENE_COMPOUND_COL][0]

### 2.4 Gene-Pathways edges

#### Pathways from WikiPathways

In [14]:
wikipathways_df, wikipathways_metadata = wikipathways.get_gene_wikipathways(bridgedb_df=bridgedb_df)
wikipathways_df.head()

Querying WikiPathways: 100%|██████████| 3/3 [01:27<00:00, 29.07s/it]


Unnamed: 0,identifier,identifier.source,target,target.source,WikiPathways
0,ENSG00000072080,Ensembl,6694,NCBI Gene,"[{'pathway_id': 'WP:WP5224', 'pathway_label': ..."
1,ENSG00000072571,Ensembl,3161,NCBI Gene,"[{'pathway_id': 'WP:WP5461', 'pathway_label': ..."
2,ENSG00000073111,Ensembl,4171,NCBI Gene,"[{'pathway_id': 'WP:WP466', 'pathway_label': '..."
3,ENSG00000085563,Ensembl,5243,NCBI Gene,"[{'pathway_id': 'WP:WP299', 'pathway_label': '..."
4,ENSG00000087076,Ensembl,51171,NCBI Gene,"[{'pathway_id': 'WP:WP5276', 'pathway_label': ..."


In [27]:
wikipathways_df[WIKIPATHWAYS][0]

[{'pathway_id': 'WP:WP5224',
  'pathway_label': '2q37 copy number variation syndrome',
  'pathway_gene_count': 153.0}]

#### Pathways from KEGG

In [15]:
kegg_df, kegg_metadata = kegg.get_pathways(bridgedb_df)
kegg_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,KEGG_pathways
0,ENSG00000177103,Ensembl,57453,NCBI Gene,
1,ENSG00000165092,Ensembl,216,NCBI Gene,"[{'pathway_id': 'path:hsa00830', 'pathway_labe..."
2,ENSG00000213366,Ensembl,2946,NCBI Gene,"[{'pathway_id': 'path:hsa00480', 'pathway_labe..."
3,ENSG00000160282,Ensembl,10841,NCBI Gene,"[{'pathway_id': 'path:hsa00340', 'pathway_labe..."
4,ENSG00000146039,Ensembl,10050,NCBI Gene,


In [18]:
kegg_df[KEGG_COL][1]

[{'pathway_id': 'path:mmu04015',
  'pathway_name': 'Rap1 signaling pathway - Mus musculus (house mouse)',
  'gene_count': 213,
  'compounds': ['C00035', 'C00044', 'C00076', 'C00165', 'C00575']},
 {'pathway_id': 'path:mmu04022',
  'pathway_name': 'cGMP-PKG signaling pathway - Mus musculus (house mouse)',
  'gene_count': 171,
  'compounds': ['C00020',
   'C00027',
   'C00076',
   'C00144',
   'C00212',
   'C00238',
   'C00533',
   'C00575',
   'C00942',
   'C01245']},
 {'pathway_id': 'path:mmu04024',
  'pathway_name': 'cAMP signaling pathway - Mus musculus (house mouse)',
  'gene_count': 223,
  'compounds': ['C00020',
   'C00042',
   'C00076',
   'C00080',
   'C00165',
   'C00186',
   'C00212',
   'C00238',
   'C00288',
   'C00334',
   'C00416',
   'C00547',
   'C00575',
   'C00584',
   'C00698',
   'C00780',
   'C00788',
   'C01089',
   'C01245',
   'C01312',
   'C01330',
   'C01996',
   'C03758',
   'C20792',
   'C20793']},
 {'pathway_id': 'path:mmu04062',
  'pathway_name': 'Chemokine 

#### Reactome pathways from OpenTargets

In [16]:
opentargets_reactome_df, opentargets_reactome_metadata = opentargets.get_gene_reactome_pathways(
    bridgedb_df=bridgedb_df
)
opentargets_reactome_df.head()

Processing gene-pathway interactions: 100%|██████████| 71/71 [00:00<00:00, 267.80it/s]
  opentargets_reactome_df, opentargets_reactome_metadata = opentargets.get_gene_reactome_pathways(


Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_reactome
0,ENSG00000072080,Ensembl,ENSG00000072080,Ensembl,"[{'pathway_label': 'Platelet degranulation ', ..."
1,ENSG00000072571,Ensembl,ENSG00000072571,Ensembl,[{'pathway_label': 'Hyaluronan uptake and degr...
2,ENSG00000073111,Ensembl,ENSG00000073111,Ensembl,[{'pathway_label': 'Orc1 removal from chromati...
3,ENSG00000085563,Ensembl,ENSG00000085563,Ensembl,[{'pathway_label': 'Abacavir transmembrane tra...
4,ENSG00000087076,Ensembl,ENSG00000087076,Ensembl,"[{'pathway_label': 'Estrogen biosynthesis', 'p..."


In [None]:
opentargets_reactome_df[OPENTARGETS_REACTOME_COL][0]

### 2.5 Gene Ontology from OpenTargets

In [17]:
opentargets_go_df, opentargets_go_metadata = opentargets.get_gene_go_process(
    bridgedb_df=bridgedb_df
)
opentargets_go_df.head()

Processing gene annotation: 100%|██████████| 71/71 [00:00<00:00, 312.52it/s]
  opentargets_go_df, opentargets_go_metadata = opentargets.get_gene_go_process(


Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_go
0,ENSG00000072080,Ensembl,ENSG00000072080,Ensembl,"[{'go_id': 'GO:0005788', 'go_name': 'endoplasm..."
1,ENSG00000072571,Ensembl,ENSG00000072571,Ensembl,"[{'go_id': 'GO:0005515', 'go_name': 'protein b..."
2,ENSG00000073111,Ensembl,ENSG00000073111,Ensembl,"[{'go_id': 'GO:0005654', 'go_name': 'nucleopla..."
3,ENSG00000085563,Ensembl,ENSG00000085563,Ensembl,"[{'go_id': 'GO:0008559', 'go_name': 'ABC-type ..."
4,ENSG00000087076,Ensembl,ENSG00000087076,Ensembl,"[{'go_id': 'GO:0005515', 'go_name': 'protein b..."


In [31]:
opentargets_go_df[OPENTARGETS_GO_COL][0]

[{'go_id': 'GO:0005788',
  'go_name': 'endoplasmic reticulum lumen',
  'go_type': 'C'},
 {'go_id': 'GO:0031089',
  'go_name': 'platelet dense granule lumen',
  'go_type': 'C'},
 {'go_id': 'GO:0004866',
  'go_name': 'endopeptidase inhibitor activity',
  'go_type': 'F'},
 {'go_id': 'GO:0046849', 'go_name': 'bone remodeling', 'go_type': 'P'},
 {'go_id': 'GO:0062023',
  'go_name': 'collagen-containing extracellular matrix',
  'go_type': 'C'},
 {'go_id': 'GO:0005576', 'go_name': 'extracellular region', 'go_type': 'C'},
 {'go_id': 'GO:0001501',
  'go_name': 'skeletal system development',
  'go_type': 'P'}]

### 2.6. Protein-Protein Interactions

In [18]:
input_species = "Human"
ppi_df, ppi_metadata = stringdb.get_ppi(bridgedb_df=bridgedb_df, species=input_species)
ppi_df.head()

   queryIndex        queryItem              stringId  ncbiTaxonId  \
0           0  ENSG00000144035  9606.ENSP00000272425         9606   
1           1  ENSG00000106927  9606.ENSP00000265132         9606   
2           2  ENSG00000168268  9606.ENSP00000406933         9606   
3           3  ENSG00000182272  9606.ENSP00000328277         9606   
4           4  ENSG00000165092  9606.ENSP00000297785         9606   
..        ...              ...                   ...          ...   
65         65  ENSG00000178828  9606.ENSP00000364263         9606   
66         66  ENSG00000149554  9606.ENSP00000391090         9606   
67         67  ENSG00000137804  9606.ENSP00000499238         9606   
68         68  ENSG00000138778  9606.ENSP00000265148         9606   
69         69  ENSG00000085563  9606.ENSP00000478255         9606   

       taxonName preferredName  \
0   Homo sapiens          NAT8   
1   Homo sapiens          AMBP   
2   Homo sapiens        NT5DC2   
3   Homo sapiens      B4GALNT4   
4

Unnamed: 0,identifier,identifier.source,target,target.source,StringDB_ppi
0,ENSG00000177103,Ensembl,ENSG00000177103,Ensembl,"[{'stringdb_link_to': 'ENSG00000110245', 'Ense..."
1,ENSG00000165092,Ensembl,ENSG00000165092,Ensembl,"[{'stringdb_link_to': 'ENSG00000159228', 'Ense..."
2,ENSG00000213366,Ensembl,ENSG00000213366,Ensembl,"[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."
3,ENSG00000160282,Ensembl,ENSG00000160282,Ensembl,"[{'stringdb_link_to': 'ENSG00000113905', 'Ense..."
4,ENSG00000146039,Ensembl,ENSG00000146039,Ensembl,"[{'stringdb_link_to': 'ENSG00000213901', 'Ense..."


In [8]:
ppi_df[STRING_PPI_COL][3]

[{'stringdb_link_to': 'Rps2',
  'Ensembl': 'Ensembl:ENSMUSP00000092502',
  'score': 0.902,
  'Uniprot-TrEMBL': 'Atp5a1'},
 {'stringdb_link_to': 'Rplp0',
  'Ensembl': 'Ensembl:ENSMUSP00000083705',
  'score': 0.903,
  'Uniprot-TrEMBL': 'Atp5a1'}]

### 2.7 Gene expression edges

In [10]:
bgee_df, bgee_metadata = bgee.get_gene_expression(bridgedb_df=bridgedb_df)
bgee_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,Bgee_gene_expression_levels
0,ENSMUSG00000000001,Ensembl,ENSMUSG00000000001,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a..."
1,ENSMUSG00000025428,Ensembl,ENSMUSG00000025428,Ensembl,"[{'anatomical_entity_id': 'UBERON_0002371', 'a..."
2,ENSMUSG00000044533,Ensembl,ENSMUSG00000044533,Ensembl,"[{'anatomical_entity_id': 'UBERON_0002371', 'a..."
3,ENSMUSG00000067274,Ensembl,ENSMUSG00000067274,Ensembl,"[{'anatomical_entity_id': 'UBERON_0002371', 'a..."
4,ENSMUSG00000084349,Ensembl,ENSMUSG00000084349,Ensembl,"[{'anatomical_entity_id': 'UBERON_0002371', 'a..."


In [15]:
bgee_df[BGEE_GENE_EXPRESSION_LEVELS_COL][1]

[{'anatomical_entity_id': 'UBERON_0002371',
  'anatomical_entity_name': 'bone marrow',
  'expression_level': 98.45455,
  'confidence_level_id': 'CIO_0000029',
  'confidence_level_name': 'high confidence level',
  'developmental_stage_id': 'UBERON_0000104',
  'developmental_stage_name': 'life cycle'},
 {'anatomical_entity_id': 'UBERON_0000955',
  'anatomical_entity_name': 'brain',
  'expression_level': 99.51974,
  'confidence_level_id': 'CIO_0000029',
  'confidence_level_name': 'high confidence level',
  'developmental_stage_id': 'UBERON_0000104',
  'developmental_stage_name': 'life cycle'},
 {'anatomical_entity_id': 'UBERON_0004535',
  'anatomical_entity_name': 'cardiovascular system',
  'expression_level': 99.83003,
  'confidence_level_id': 'CIO_0000029',
  'confidence_level_name': 'high confidence level',
  'developmental_stage_id': 'UBERON_0000104',
  'developmental_stage_name': 'life cycle'},
 {'anatomical_entity_id': 'UBERON_0001007',
  'anatomical_entity_name': 'digestive system'

### 2.8 Transporter Inhibitors

In [19]:
inhibitor_df, inhibitor_metadata = molmedb.get_gene_compound_inhibitor(bridgedb_df=bridgedb_df)
inhibitor_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,MolMeDB_transporter_inhibitor
0,ENSG00000072080,Ensembl,C9J6K0,Uniprot-TrEMBL,"[{'compound_name': nan, 'inchikey': nan, 'smil..."
1,ENSG00000072080,Ensembl,Q13103,Uniprot-TrEMBL,"[{'compound_name': nan, 'inchikey': nan, 'smil..."
2,ENSG00000072571,Ensembl,E5RI30,Uniprot-TrEMBL,"[{'compound_name': nan, 'inchikey': nan, 'smil..."
3,ENSG00000072571,Ensembl,E5RIH2,Uniprot-TrEMBL,"[{'compound_name': nan, 'inchikey': nan, 'smil..."
4,ENSG00000072571,Ensembl,O75330,Uniprot-TrEMBL,"[{'compound_name': nan, 'inchikey': nan, 'smil..."


In [34]:
print(inhibitor_df[MOLMEDB_PROTEIN_COMPOUND_COL][47])
inhibitor_df[inhibitor_df["target"] == "P35499"]

[{'compound_name': nan, 'inchikey': nan, 'smiles': nan, 'compound_cid': nan, 'molmedb_id': nan, 'source_pmid': nan, 'chebi_id': nan, 'drugbank_id': nan, 'uniprot_trembl_id': nan}]


Unnamed: 0,identifier,identifier.source,target,target.source,MolMeDB_transporter_inhibitor


# 3. Generating Graph

### 3.1 Combing all the results into single dataframe

In [20]:
#        bgee_df,
#        disgenet_df,
#        minerva_df,
#        opentargets_reactome_df,
#        opentargets_go_df,
#        opentargets_compound_df,
#        inhibitor_df,

combined_df = combine_sources(
    bridgedb_df,
    [
        wikipathways_df,
        kegg_df,
        ppi_df,
        opentargets_reactome_df,
        opentargets_go_df,
        opentargets_compound_df,
        inhibitor_df
    ],
)
combined_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,WikiPathways,KEGG_pathways,StringDB_ppi,OpenTargets_reactome,OpenTargets_go,OpenTargets_gene_compounds,MolMeDB_transporter_inhibitor
0,ENSG00000177103,Ensembl,ENSG00000177103,Ensembl,"[{'pathway_id': nan, 'pathway_label': nan, 'pa...",,"[{'stringdb_link_to': 'ENSG00000110245', 'Ense...","[{'pathway_label': 'DSCAM interactions', 'path...","[{'go_id': 'GO:0007409', 'go_name': 'axonogene...","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'compound_name': nan, 'inchikey': nan, 'smil..."
1,ENSG00000177103,Ensembl,ENSG00000177103,Ensembl,"[{'pathway_id': nan, 'pathway_label': nan, 'pa...",,"[{'stringdb_link_to': 'ENSG00000110245', 'Ense...","[{'pathway_label': 'DSCAM interactions', 'path...","[{'go_id': 'GO:0007409', 'go_name': 'axonogene...","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'compound_name': nan, 'inchikey': nan, 'smil..."
2,ENSG00000177103,Ensembl,ENSG00000177103,Ensembl,"[{'pathway_id': nan, 'pathway_label': nan, 'pa...",,"[{'stringdb_link_to': 'ENSG00000110245', 'Ense...","[{'pathway_label': 'DSCAM interactions', 'path...","[{'go_id': 'GO:0007409', 'go_name': 'axonogene...","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'compound_name': nan, 'inchikey': nan, 'smil..."
3,ENSG00000165092,Ensembl,ENSG00000165092,Ensembl,"[{'pathway_id': 'WP:WP2855', 'pathway_label': ...","[{'pathway_id': 'path:hsa00830', 'pathway_labe...","[{'stringdb_link_to': 'ENSG00000159228', 'Ense...","[{'pathway_label': 'Fructose catabolism', 'pat...","[{'go_id': 'GO:0051287', 'go_name': 'NAD bindi...","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'compound_name': nan, 'inchikey': nan, 'smil..."
4,ENSG00000165092,Ensembl,ENSG00000165092,Ensembl,"[{'pathway_id': 'WP:WP2855', 'pathway_label': ...","[{'pathway_id': 'path:hsa00830', 'pathway_labe...","[{'stringdb_link_to': 'ENSG00000159228', 'Ense...","[{'pathway_label': 'Fructose catabolism', 'pat...","[{'go_id': 'GO:0051287', 'go_name': 'NAD bindi...","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'compound_name': nan, 'inchikey': nan, 'smil..."


In [20]:
combined_df.shape

(5, 8)

### 3.2 Exporting the database in pickle format

In [21]:
with open("combined_df.pkl", "wb") as out:
    pickle.dump(combined_df, out)
# with open("opentargets_disease_compound_df.pkl", "wb") as out:
#     pickle.dump(opentargets_disease_compound_df, out)

### 3.3 Creating a graph from the annotated dataframe

In [22]:
# combined_df = generator.load_dataframe_from_pickle("combined_df.pkl")
# opentargets_disease_compound_df = generator.load_dataframe_from_pickle(
#     "opentargets_disease_compound_df.pkl"
# )

combined_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,WikiPathways,KEGG_pathways,StringDB_ppi,OpenTargets_reactome,OpenTargets_go,OpenTargets_gene_compounds,MolMeDB_transporter_inhibitor
0,ENSG00000177103,Ensembl,ENSG00000177103,Ensembl,"[{'pathway_id': nan, 'pathway_label': nan, 'pa...",,"[{'stringdb_link_to': 'ENSG00000110245', 'Ense...","[{'pathway_label': 'DSCAM interactions', 'path...","[{'go_id': 'GO:0007409', 'go_name': 'axonogene...","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'compound_name': nan, 'inchikey': nan, 'smil..."
1,ENSG00000177103,Ensembl,ENSG00000177103,Ensembl,"[{'pathway_id': nan, 'pathway_label': nan, 'pa...",,"[{'stringdb_link_to': 'ENSG00000110245', 'Ense...","[{'pathway_label': 'DSCAM interactions', 'path...","[{'go_id': 'GO:0007409', 'go_name': 'axonogene...","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'compound_name': nan, 'inchikey': nan, 'smil..."
2,ENSG00000177103,Ensembl,ENSG00000177103,Ensembl,"[{'pathway_id': nan, 'pathway_label': nan, 'pa...",,"[{'stringdb_link_to': 'ENSG00000110245', 'Ense...","[{'pathway_label': 'DSCAM interactions', 'path...","[{'go_id': 'GO:0007409', 'go_name': 'axonogene...","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'compound_name': nan, 'inchikey': nan, 'smil..."
3,ENSG00000165092,Ensembl,ENSG00000165092,Ensembl,"[{'pathway_id': 'WP:WP2855', 'pathway_label': ...","[{'pathway_id': 'path:hsa00830', 'pathway_labe...","[{'stringdb_link_to': 'ENSG00000159228', 'Ense...","[{'pathway_label': 'Fructose catabolism', 'pat...","[{'go_id': 'GO:0051287', 'go_name': 'NAD bindi...","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'compound_name': nan, 'inchikey': nan, 'smil..."
4,ENSG00000165092,Ensembl,ENSG00000165092,Ensembl,"[{'pathway_id': 'WP:WP2855', 'pathway_label': ...","[{'pathway_id': 'path:hsa00830', 'pathway_labe...","[{'stringdb_link_to': 'ENSG00000159228', 'Ense...","[{'pathway_label': 'Fructose catabolism', 'pat...","[{'go_id': 'GO:0051287', 'go_name': 'NAD bindi...","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'compound_name': nan, 'inchikey': nan, 'smil..."


In [None]:
opentargets_disease_compound_df.head()

In [23]:
pygraph = generator.build_networkx_graph(combined_df)

Building graph: 100%|██████████| 273/273 [00:00<00:00, 1464.94it/s]


### 3.4 Store the graph

In [25]:
with open("networkx_graph_test.pkl", "wb") as out:
    pickle.dump(pygraph, out)

### 3.5 Visualize the graph

In [None]:
# pos = nx.circular_layout(pygraph)

# plt.figure(3, figsize=(30, 30))
# nx.draw(pygraph, pos)
# plt.show()

# 4. Exporting Graph to external sources

### 4.1 Cytoscape
Make sure that the Cytoscape is open

In [24]:
from pyBiodatafuse.graph import cytoscape

cytoscape.load_graph(pygraph, network_name="Test network")

Applying default style...
Applying preferred layout


### 4.2 Neo4j

In [38]:
from pyBiodatafuse.graph import neo4j

neo4j.save_graph_to_graphml(pygraph, "networkx_graph_test.graphml")

##### Steps to load the graph in Neo4j

- Add `.graphml` file in **import** subfolder of the DBMS folder
- Install apoc plugin
- Create `apoc.conf` file:
    ```
    apoc.trigger.enabled=true
    apoc.import.file.enabled=true
    apoc.export.file.enabled=true
    apoc.import.file.use_neo4j_config=true
    ```
- Add `apoc.conf` file to **conf** subfolder of the DBMS folder
- Open Neo4j Browser
- (Optionl, only run if you have imported a graph  before) Remove all the nodes before importing `.graphml` file

    ```MATCH (n) DETACH DELETE n```

- Import `.graphml` file

    ```call apoc.import.graphml('file:///networkx_graph_test.graphml',{readLabels:TRUE})```

- Add indexes after importing the graph for improving the performance of queries

    ```
    create index Gene for (n:Gene) on (n.node_type)
    create index Pathway for (n:Pathway) on (n.node_type)
    create index `Biological Process` for (n:`Biological Process`) on (n.node_type)
    create index `Molecular Function` for (n:`Molecular Function`) on (n.node_type)
    create index `Cellular Component` for (n:`Cellular Component`) on (n.node_type)
    create index Disease for (n:Disease) on (n.node_type)
    create index Compound for (n:Compound) on (n.node_type)
    create index `Side Effect` for (n:`Side Effect`) on (n.node_type)
    ```
    

- Count the number of each node type
    - total (```MATCH (n) RETURN count(n)```) 
        - Gene (```MATCH (n:Gene) RETURN count(n)```)
        - Pathway (```MATCH (n:Pathway) RETURN count(n)```)
            - WikiPathways (```MATCH (n:Pathway {source: "WikiPathways"}) RETURN count(n)```) 
            - OpenTargets, Reactome (```MATCH (n:Pathway {source: "OpenTargets"}) RETURN count(n)```) 
            - MINERVA (```MATCH (n:Pathway {source: "MINERVA"}) RETURN count(n)```) 
        - Biological Process (```MATCH (n:`Biological Process`) RETURN count(n)```) 
        - Molecular Function (```MATCH (n:`Molecular Function`) RETURN count(n)```) 
        - Cellular Component (```MATCH (n:`Cellular Component`) RETURN count(n)```) 
        - Disease (```MATCH (n:Disease) RETURN count(n)```) 
        - Compound (```MATCH (n:Compound) RETURN count(n)```)
        - Side Effect (```MATCH (n:`Side Effect`) RETURN count(n)```) 
- Count the number of each edge type
    - total (```MATCH ()-[r]->() RETURN count(r)```) 
        - interacts_with (```MATCH ()-[r:interacts_with]->() RETURN count(r)```) 
        - part_of (```MATCH ()-[r:part_of]->() RETURN count(r)```) 
            - WikiPathways (```MATCH ()-[r:part_of {source: "WikiPathways"}]->() RETURN count(r)```) 
            - OpenTargets, Reactome (```MATCH ()-[r:part_of {source: "OpenTargets"}]->() RETURN count(r)```) 
            - MINERVA (```MATCH ()-[r:part_of {source: "MINERVA"}]->() RETURN count(r)```) 
        - activates (```MATCH ()-[r:activates]->() RETURN count(r)```) 
        - treats (```MATCH ()-[r:treats]->() RETURN count(r)```) 
        - has_side_effect (```MATCH ()-[r:has_side_effect]->() RETURN count(r)```) 
        - inhibits (```MATCH ()-[r:inhibits]->() RETURN count(r)```) = 71
        - associated_with (```MATCH ()-[r:associated_with]->() RETURN count(r)```) 

- Export the graph as a `.csv` file

    ```call apoc.export.csv.all("networkx_graph_test.csv",{})```