# Example: Polysistice Kidney Disease use case

In [24]:
new_path = r"C:\Users\are10\Documents\BAFSTU\code\pyBioDatafusemain\pyBiodatafuse\examples\usecases"

# Setting up the working directory
import os
import sys

src_path = os.path.abspath(os.path.join("..", "..", "..", "src"))
if src_path not in sys.path:
    sys.path.append(src_path)

print(sys.path)  # Verify the correct src path is included

['C:\\Users\\are10\\AppData\\Local\\Programs\\Python\\Python39\\python39.zip', 'C:\\Users\\are10\\AppData\\Local\\Programs\\Python\\Python39\\DLLs', 'C:\\Users\\are10\\AppData\\Local\\Programs\\Python\\Python39\\lib', 'C:\\Users\\are10\\AppData\\Local\\Programs\\Python\\Python39', 'c:\\Users\\are10\\Documents\\BAFSTU\\code\\pyBioDatafusemain\\pyBiodatafuse\\.venv', '', 'c:\\Users\\are10\\Documents\\BAFSTU\\code\\pyBioDatafusemain\\pyBiodatafuse\\.venv\\lib\\site-packages', 'c:\\Users\\are10\\Documents\\BAFSTU\\code\\pyBioDatafusemain\\pyBiodatafuse\\.venv\\lib\\site-packages\\win32', 'c:\\Users\\are10\\Documents\\BAFSTU\\code\\pyBioDatafusemain\\pyBiodatafuse\\.venv\\lib\\site-packages\\win32\\lib', 'c:\\Users\\are10\\Documents\\BAFSTU\\code\\pyBioDatafusemain\\pyBiodatafuse\\.venv\\lib\\site-packages\\Pythonwin', 'c:\\Users\\are10\\Documents\\BAFSTU\\code\\pyBioDatafusemain\\pyBiodatafuse\\src']


In [25]:
# Import modules
import pickle

import pandas as pd
from dotenv import load_dotenv

from pyBiodatafuse import id_mapper
from pyBiodatafuse import human_homologs
from pyBiodatafuse.annotators import (
    bgee,
    disgenet,
    kegg,
    minerva,
    molmedb,
    opentargets,
    pubchem,
    stringdb,
    wikipathways,
)
from pyBiodatafuse.constants import (
    BGEE_GENE_EXPRESSION_LEVELS_COL,
    DISGENET_DISEASE_COL,
    ENSEMBL_HOMOLOG_COL,
    KEGG_COL,
    MINERVA,
    MOLMEDB_PROTEIN_COMPOUND_COL,
    OPENTARGETS_DISEASE_COMPOUND_COL,
    OPENTARGETS_GENE_COMPOUND_COL,
    OPENTARGETS_GO_COL,
    OPENTARGETS_REACTOME_COL,
    PUBCHEM_COMPOUND_ASSAYS_COL,
    STRING_PPI_COL,
    WIKIPATHWAYS,
)
from pyBiodatafuse.graph import generator
from pyBiodatafuse.utils import combine_sources, create_harmonized_input_file

# 1. Entity resolution using BridgeDB

### 1.1. Load the input list and convert it to a dataframe

Small set of mouse data used for debugging.

In [None]:
# TEST Mice Ensembl
genes_of_interest = """ENSMUSG00000067274
ENSMUSG00000000001
ENSMUSG00000084349
ENSMUSG00000025428
ENSMUSG00000044533"""

# TEST Mice use case = 
genes_of_interest = """ENSMUSG00000026295
ENSMUSG00000022877
ENSMUSG00000020914
ENSMUSG00000024747
ENSMUSG00000032081
ENSMUSG00000004035
ENSMUSG00000072949
ENSMUSG00000028970
ENSMUSG00000028937
ENSMUSG00000075044
"""

# TEST Rat Ensembl
# genes_of_interest = """ENSRNOG00060027926
# ENSRNOG00055005387
# ENSRNOG00060018596
# ENSRNOG00060011358
# ENSRNOG00055009275 
# """

# TEST Human HGNC
# genes_of_interest = """CHRNG
# DMD
# AHR
# SCN4A
# LC25A1
# HTR3A"""

# TEST Human Ensembl
# genes_of_interest = """ENSG00000072080
# ENSG00000113905
# ENSG00000131747
# ENSG00000165092
# ENSG00000110245
# ENSG00000213366
# ENSG00000184227
# ENSG00000085563
# ENSG00000097021
# ENSG00000149742"""

gene_list = genes_of_interest.split("\n")
len(gene_list)

5

Mouse use case

In [27]:
from pyBiodatafuse import data_loader, id_mapper

data_input = data_loader.create_df_from_dea("data/full_de_genes_treated_vs_untreated_plus_cpm_fc.csv")
data_filtered = data_input[data_input['DE'].isin([1, -1])]
features_filtered = data_filtered['identifier']

gene_list = features_filtered.tolist()
print(len(gene_list))

81


In [28]:
data_input = pd.DataFrame(gene_list, columns=["identifier"])
data_input.head(20)

Unnamed: 0,identifier
0,ENSMUSG00000100426
1,ENSMUSG00000032087
2,ENSMUSG00000024747
3,ENSMUSG00000091813
4,ENSMUSG00000004035
5,ENSMUSG00000001155
6,ENSMUSG00000021336
7,ENSMUSG00000027331
8,ENSMUSG00000028873
9,ENSMUSG00000038486


In [29]:
print(gene_list)

['ENSMUSG00000100426', 'ENSMUSG00000032087', 'ENSMUSG00000024747', 'ENSMUSG00000091813', 'ENSMUSG00000004035', 'ENSMUSG00000001155', 'ENSMUSG00000021336', 'ENSMUSG00000027331', 'ENSMUSG00000028873', 'ENSMUSG00000038486', 'ENSMUSG00000044816', 'ENSMUSG00000003038', 'ENSMUSG00000015451', 'ENSMUSG00000026295', 'ENSMUSG00000056749', 'ENSMUSG00000016756', 'ENSMUSG00000027452', 'ENSMUSG00000027577', 'ENSMUSG00000029368', 'ENSMUSG00000026205', 'ENSMUSG00000027379', 'ENSMUSG00000028715', 'ENSMUSG00000035186', 'ENSMUSG00000043439', 'ENSMUSG00000055629', 'ENSMUSG00000079494', 'ENSMUSG00000045328', 'ENSMUSG00000075044', 'ENSMUSG00000020914', 'ENSMUSG00000001334', 'ENSMUSG00000006398', 'ENSMUSG00000022415', 'ENSMUSG00000021213', 'ENSMUSG00000002870', 'ENSMUSG00000061959', 'ENSMUSG00000001313', 'ENSMUSG00000007950', 'ENSMUSG00000017390', 'ENSMUSG00000025983', 'ENSMUSG00000026049', 'ENSMUSG00000026415', 'ENSMUSG00000030825', 'ENSMUSG00000037628', 'ENSMUSG00000048489', 'ENSMUSG00000051483', 'ENSMUSG0

### 1.2. Query BridgeDB

In [30]:
# Mouse usecase
input_species="Mouse"

bridgedb_df, bridgedb_metadata = id_mapper.bridgedb_xref(
   identifiers=data_input,
    input_species=input_species,
    input_datasource="Ensembl",
    output_datasource="All",
)

# TEST Human data
# bridgedb_df, bridgedb_metadata = id_mapper.bridgedb_xref(
#     identifiers=data_input,
#     input_species="Human",
#     input_datasource="Ensembl",
#     output_datasource="All",
# )

bridgedb_df.head(25)

Unnamed: 0,identifier,identifier.source,target,target.source
0,ENSMUSG00000100426,Ensembl,MGI:3782384,MGI
1,ENSMUSG00000100426,Ensembl,ENSMUSG00000100426,Ensembl
2,ENSMUSG00000032087,Ensembl,MGI:2150309,MGI
3,ENSMUSG00000032087,Ensembl,114873,NCBI Gene
4,ENSMUSG00000032087,Ensembl,A0A1L1SQZ7,Uniprot-TrEMBL
5,ENSMUSG00000032087,Ensembl,E9QPR7,Uniprot-TrEMBL
6,ENSMUSG00000032087,Ensembl,Q4VA61,Uniprot-TrEMBL
7,ENSMUSG00000032087,Ensembl,A0A1L1SQ53,Uniprot-TrEMBL
8,ENSMUSG00000032087,Ensembl,ENSMUSG00000032087,Ensembl
9,ENSMUSG00000024747,Ensembl,ENSMUSG00000024747,Ensembl


### 1.3 Homologs

In [31]:
ensembl_homologs_df, ensembl_metadata = human_homologs.get_homologs(bridgedb_df=bridgedb_df)
ensembl_homologs_df.head()


{"release":"15.9"}


Unnamed: 0,identifier,identifier.source,target,target.source,Ensembl_homologs
0,ENSMUSG00000100426,Ensembl,ENSMUSG00000100426,Ensembl,
1,ENSMUSG00000032087,Ensembl,ENSMUSG00000032087,Ensembl,[{'homolog': 'ENSG00000177103'}]
2,ENSMUSG00000024747,Ensembl,ENSMUSG00000024747,Ensembl,[{'homolog': 'ENSG00000165092'}]
3,ENSMUSG00000091813,Ensembl,ENSMUSG00000091813,Ensembl,
4,ENSMUSG00000004035,Ensembl,ENSMUSG00000004035,Ensembl,[{'homolog': 'ENSG00000213366'}]


In [32]:
homologs = ensembl_homologs_df[ENSEMBL_HOMOLOG_COL].apply(
    lambda x: x[0]['homolog'] if isinstance(x, list) and len(x) > 0 and 'homolog' in x[0] else None
).dropna().tolist()

print(homologs)

['ENSG00000177103', 'ENSG00000165092', 'ENSG00000213366', 'ENSG00000160282', 'ENSG00000146039', 'ENSG00000128944', 'ENSG00000134690', 'ENSG00000159164', 'ENSG00000188674', 'ENSG00000224389', 'ENSG00000072080', 'ENSG00000165030', 'ENSG00000154930', 'ENSG00000101204', 'ENSG00000163631', 'ENSG00000213901', 'ENSG00000169679', 'ENSG00000213886', 'ENSG00000273604', 'ENSG00000182272', 'ENSG00000144035', 'ENSG00000138778', 'ENSG00000149742', 'ENSG00000131747', 'ENSG00000160097', 'ENSG00000117399', 'ENSG00000100321', 'ENSG00000073111', 'ENSG00000108830', 'ENSG00000127220', 'ENSG00000109107', 'ENSG00000144395', 'ENSG00000151287', 'ENSG00000162897', 'ENSG00000087076', 'ENSG00000100526', 'ENSG00000165507', 'ENSG00000159228', 'ENSG00000120054', 'ENSG00000149554', 'ENSG00000184545', 'ENSG00000170430', 'ENSG00000123427', 'ENSG00000120800', 'ENSG00000113905', 'ENSG00000169245', 'ENSG00000072571', 'ENSG00000101412', 'ENSG00000136689', 'ENSG00000102837', 'ENSG00000112299', 'ENSG00000168268', 'ENSG000000

### 1.4 Query homologs

In [33]:
input_species="Human"

data_input_hl = pd.DataFrame(homologs, columns=["identifier"])

bridgedb_df_hl, bridgedb_metadata_hl = id_mapper.bridgedb_xref(
    identifiers=data_input_hl,
    input_species="Human",
    input_datasource="Ensembl",
    output_datasource="All",
)

bridgedb_df.head(25)

Unnamed: 0,identifier,identifier.source,target,target.source
0,ENSMUSG00000100426,Ensembl,MGI:3782384,MGI
1,ENSMUSG00000100426,Ensembl,ENSMUSG00000100426,Ensembl
2,ENSMUSG00000032087,Ensembl,MGI:2150309,MGI
3,ENSMUSG00000032087,Ensembl,114873,NCBI Gene
4,ENSMUSG00000032087,Ensembl,A0A1L1SQZ7,Uniprot-TrEMBL
5,ENSMUSG00000032087,Ensembl,E9QPR7,Uniprot-TrEMBL
6,ENSMUSG00000032087,Ensembl,Q4VA61,Uniprot-TrEMBL
7,ENSMUSG00000032087,Ensembl,A0A1L1SQ53,Uniprot-TrEMBL
8,ENSMUSG00000032087,Ensembl,ENSMUSG00000032087,Ensembl
9,ENSMUSG00000024747,Ensembl,ENSMUSG00000024747,Ensembl


# 2. Step-by-step graph generation

### 2.1. Gene-Disease edges


In [34]:
load_dotenv('disgenet.env')

disgenet_api_key = os.getenv("DISGENET_API_KEY")
print(disgenet_api_key)

15eadd18-9b50-466d-a41d-3deb5fad122c


In [35]:
disgenet_df, disgenet_metadata = disgenet.get_gene_disease(
    api_key=disgenet_api_key, bridgedb_df=bridgedb_df_hl
)
disgenet_df.head()

Querying DisGeNET: 100%|██████████| 71/71 [01:10<00:00,  1.01it/s]
  disgenet_df, disgenet_metadata = disgenet.get_gene_disease(


Unnamed: 0,identifier,identifier.source,target,target.source,DISGENET_diseases
0,ENSG00000072080,Ensembl,6694,NCBI Gene,"[{'disease_name': 'Retinal dystrophy', 'HPO': ..."
1,ENSG00000072571,Ensembl,3161,NCBI Gene,"[{'disease_name': 'Cancer, Breast', 'HPO': Non..."
2,ENSG00000073111,Ensembl,4171,NCBI Gene,"[{'disease_name': 'Liver cell carcinoma', 'HPO..."
3,ENSG00000085563,Ensembl,5243,NCBI Gene,"[{'disease_name': 'Breast Neoplasms', 'HPO': '..."
4,ENSG00000087076,Ensembl,51171,NCBI Gene,[{'disease_name': 'Severe myopia (> -6.00 diop...


In [13]:
disgenet_df[DISGENET_DISEASE_COL][0]

[{'disease_name': 'ARCND1',
  'HPO': None,
  'NCI': None,
  'OMIM': 'MIM:602483, MIM:139370',
  'MONDO': 'MONDO:0000107, MONDO:0011234',
  'ORDO': 'ORDO:137888',
  'EFO': None,
  'DO': None,
  'MESH': 'MESH:C538270',
  'UMLS': 'UMLS:C4551996',
  'disease_type': 'disease',
  'disease_umlscui': 'C4551996',
  'score': 0.7,
  'ei': 1.0,
  'el': None},
 {'disease_name': 'Auriculo-condylar syndrome',
  'HPO': None,
  'NCI': None,
  'OMIM': 'MIM:602483',
  'MONDO': 'MONDO:0000107',
  'ORDO': 'ORDO:137888',
  'EFO': None,
  'DO': None,
  'MESH': 'MESH:C538270',
  'UMLS': 'UMLS:C1865295',
  'disease_type': 'disease',
  'disease_umlscui': 'C1865295',
  'score': 0.65,
  'ei': 0.8,
  'el': None},
 {'disease_name': 'Amnestic disorders',
  'HPO': None,
  'NCI': None,
  'OMIM': None,
  'MONDO': 'MONDO:0001152',
  'ORDO': None,
  'EFO': None,
  'DO': 'DOID:10914',
  'MESH': None,
  'UMLS': 'UMLS:C0002625',
  'disease_type': 'disease',
  'disease_umlscui': 'C0002625',
  'score': 0.4,
  'ei': 1.0,
  'el

### 2.2 Disease-Compound edges

In [36]:
# Prepare the input to use DISGENET output as seed for OpenTargets
disease_mapping_df = create_harmonized_input_file(disgenet_df, DISGENET_DISEASE_COL, "EFO", "UMLS")
disease_mapping_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source
0,UMLS_C0006142,UMLS,EFO_0000305,EFO
1,UMLS_C0006142,UMLS,EFO_0003869,EFO
2,UMLS_C1458155,UMLS,EFO_0003869,EFO
3,UMLS_C2239176,UMLS,EFO_0000182,EFO
4,UMLS_C2239176,UMLS,EFO_0000762,EFO


In [37]:
(
    opentargets_disease_compound_df,
    opentargets_disease_compound_metadata,
) = opentargets.get_disease_compound_interactions(disease_mapping_df)
opentargets_disease_compound_df.head()

Processing diseases-drug interactions: 100%|██████████| 342/342 [00:05<00:00, 68.06it/s]
Mapping PubChem: 100%|██████████| 2433/2433 [06:38<00:00,  6.11it/s]
  check_columns_against_constants(


Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_disease_compounds
0,UMLS_C0000786,UMLS,EFO_1001255,EFO,"[{'chembl_id': 'CHEMBL:CHEMBL1276308', 'drugba..."
1,UMLS_C0001175,UMLS,EFO_0000765,EFO,"[{'chembl_id': 'CHEMBL:CHEMBL704', 'drugbank_i..."
2,UMLS_C0002103,UMLS,EFO_0005854,EFO,"[{'chembl_id': 'CHEMBL:CHEMBL1201353', 'drugba..."
3,UMLS_C0002171,UMLS,EFO_0004192,EFO,"[{'chembl_id': 'CHEMBL:CHEMBL1200963', 'drugba..."
4,UMLS_C0002940,UMLS,EFO_0004264,EFO,"[{'chembl_id': 'CHEMBL:CHEMBL1076903', 'drugba..."


In [None]:
opentargets_disease_compound_df[OPENTARGETS_DISEASE_COMPOUND_COL][0]

### 2.3 Gene-Compound edges

In [38]:
opentargets_compound_df, opentargets_compound_metadata = opentargets.get_gene_compound_interactions(
    bridgedb_df=bridgedb_df_hl
)
opentargets_compound_df.head()

Processing gene-drug interactions: 100%|██████████| 71/71 [00:00<00:00, 318.26it/s]
Mapping PubChem: 100%|██████████| 37/37 [00:15<00:00,  2.41it/s]
  check_columns_against_constants(


Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_gene_compounds
0,ENSG00000072080,Ensembl,ENSG00000072080,Ensembl,"[{'chembl_id': nan, 'drugbank_id': nan, 'compo..."
1,ENSG00000072571,Ensembl,ENSG00000072571,Ensembl,"[{'chembl_id': nan, 'drugbank_id': nan, 'compo..."
2,ENSG00000073111,Ensembl,ENSG00000073111,Ensembl,"[{'chembl_id': nan, 'drugbank_id': nan, 'compo..."
3,ENSG00000085563,Ensembl,ENSG00000085563,Ensembl,"[{'chembl_id': 'CHEMBL:CHEMBL1086218', 'drugba..."
4,ENSG00000087076,Ensembl,ENSG00000087076,Ensembl,"[{'chembl_id': nan, 'drugbank_id': nan, 'compo..."


In [None]:
opentargets_compound_df[OPENTARGETS_GENE_COMPOUND_COL][0]

### 2.4 Gene-Pathways edges

#### Pathways from WikiPathways

In [39]:
wikipathways_df, wikipathways_metadata = wikipathways.get_gene_wikipathways(bridgedb_df=bridgedb_df)
wikipathways_df.head()

Querying WikiPathways: 100%|██████████| 4/4 [02:05<00:00, 31.28s/it]


Unnamed: 0,identifier,identifier.source,target,target.source,WikiPathways
0,ENSMUSG00000000934,Ensembl,72960,NCBI Gene,"[{'pathway_id': nan, 'pathway_label': nan, 'pa..."
1,ENSMUSG00000001155,Ensembl,14317,NCBI Gene,"[{'pathway_id': 'WP:WP435', 'pathway_label': '..."
2,ENSMUSG00000001313,Ensembl,11858,NCBI Gene,"[{'pathway_id': nan, 'pathway_label': nan, 'pa..."
3,ENSMUSG00000001334,Ensembl,384061,NCBI Gene,"[{'pathway_id': nan, 'pathway_label': nan, 'pa..."
4,ENSMUSG00000002870,Ensembl,17216,NCBI Gene,"[{'pathway_id': 'WP:WP413', 'pathway_label': '..."


In [None]:
wikipathways_df[WIKIPATHWAYS][0]

[{'pathway_id': 'WP:WP5224',
  'pathway_label': '2q37 copy number variation syndrome',
  'pathway_gene_count': 153.0}]

#### Pathways from KEGG

In [40]:
kegg_df, kegg_metadata = kegg.get_pathways(bridgedb_df)
kegg_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,KEGG_pathways
0,ENSMUSG00000032087,Ensembl,114873,NCBI Gene,
1,ENSMUSG00000024747,Ensembl,26358,NCBI Gene,"[{'pathway_id': 'path:mmu00830', 'pathway_labe..."
2,ENSMUSG00000091813,Ensembl,436059,NCBI Gene,"[{'pathway_id': 'path:mmu00983', 'pathway_labe..."
3,ENSMUSG00000004035,Ensembl,68312,NCBI Gene,"[{'pathway_id': 'path:mmu00480', 'pathway_labe..."
4,ENSMUSG00000001155,Ensembl,14317,NCBI Gene,"[{'pathway_id': 'path:mmu00340', 'pathway_labe..."


In [None]:
kegg_df[KEGG_COL][1]

[{'pathway_id': 'path:mmu04015',
  'pathway_name': 'Rap1 signaling pathway - Mus musculus (house mouse)',
  'gene_count': 213,
  'compounds': ['C00035', 'C00044', 'C00076', 'C00165', 'C00575']},
 {'pathway_id': 'path:mmu04022',
  'pathway_name': 'cGMP-PKG signaling pathway - Mus musculus (house mouse)',
  'gene_count': 171,
  'compounds': ['C00020',
   'C00027',
   'C00076',
   'C00144',
   'C00212',
   'C00238',
   'C00533',
   'C00575',
   'C00942',
   'C01245']},
 {'pathway_id': 'path:mmu04024',
  'pathway_name': 'cAMP signaling pathway - Mus musculus (house mouse)',
  'gene_count': 223,
  'compounds': ['C00020',
   'C00042',
   'C00076',
   'C00080',
   'C00165',
   'C00186',
   'C00212',
   'C00238',
   'C00288',
   'C00334',
   'C00416',
   'C00547',
   'C00575',
   'C00584',
   'C00698',
   'C00780',
   'C00788',
   'C01089',
   'C01245',
   'C01312',
   'C01330',
   'C01996',
   'C03758',
   'C20792',
   'C20793']},
 {'pathway_id': 'path:mmu04062',
  'pathway_name': 'Chemokine 

#### Reactome pathways from OpenTargets

In [41]:
opentargets_reactome_df, opentargets_reactome_metadata = opentargets.get_gene_reactome_pathways(
    bridgedb_df=bridgedb_df_hl
)
opentargets_reactome_df.head()

Processing gene-pathway interactions: 100%|██████████| 71/71 [00:00<00:00, 571.39it/s]
  opentargets_reactome_df, opentargets_reactome_metadata = opentargets.get_gene_reactome_pathways(


Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_reactome
0,ENSG00000072080,Ensembl,ENSG00000072080,Ensembl,"[{'pathway_label': 'Platelet degranulation ', ..."
1,ENSG00000072571,Ensembl,ENSG00000072571,Ensembl,[{'pathway_label': 'Hyaluronan uptake and degr...
2,ENSG00000073111,Ensembl,ENSG00000073111,Ensembl,[{'pathway_label': 'Orc1 removal from chromati...
3,ENSG00000085563,Ensembl,ENSG00000085563,Ensembl,[{'pathway_label': 'Abacavir transmembrane tra...
4,ENSG00000087076,Ensembl,ENSG00000087076,Ensembl,"[{'pathway_label': 'Estrogen biosynthesis', 'p..."


In [None]:
opentargets_reactome_df[OPENTARGETS_REACTOME_COL][0]

### 2.5 Gene Ontology from OpenTargets

In [42]:
opentargets_go_df, opentargets_go_metadata = opentargets.get_gene_go_process(
    bridgedb_df=bridgedb_df_hl
)
opentargets_go_df.head()

Processing gene annotation: 100%|██████████| 71/71 [00:00<00:00, 537.84it/s]
  opentargets_go_df, opentargets_go_metadata = opentargets.get_gene_go_process(


Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_go
0,ENSG00000072080,Ensembl,ENSG00000072080,Ensembl,"[{'go_id': 'GO:0005788', 'go_name': 'endoplasm..."
1,ENSG00000072571,Ensembl,ENSG00000072571,Ensembl,"[{'go_id': 'GO:0005515', 'go_name': 'protein b..."
2,ENSG00000073111,Ensembl,ENSG00000073111,Ensembl,"[{'go_id': 'GO:0005654', 'go_name': 'nucleopla..."
3,ENSG00000085563,Ensembl,ENSG00000085563,Ensembl,"[{'go_id': 'GO:0008559', 'go_name': 'ABC-type ..."
4,ENSG00000087076,Ensembl,ENSG00000087076,Ensembl,"[{'go_id': 'GO:0005515', 'go_name': 'protein b..."


In [None]:
opentargets_go_df[OPENTARGETS_GO_COL][0]

[{'go_id': 'GO:0005788',
  'go_name': 'endoplasmic reticulum lumen',
  'go_type': 'C'},
 {'go_id': 'GO:0031089',
  'go_name': 'platelet dense granule lumen',
  'go_type': 'C'},
 {'go_id': 'GO:0004866',
  'go_name': 'endopeptidase inhibitor activity',
  'go_type': 'F'},
 {'go_id': 'GO:0046849', 'go_name': 'bone remodeling', 'go_type': 'P'},
 {'go_id': 'GO:0062023',
  'go_name': 'collagen-containing extracellular matrix',
  'go_type': 'C'},
 {'go_id': 'GO:0005576', 'go_name': 'extracellular region', 'go_type': 'C'},
 {'go_id': 'GO:0001501',
  'go_name': 'skeletal system development',
  'go_type': 'P'}]

### 2.6. Protein-Protein Interactions

In [43]:
input_species = "Mouse"
ppi_df, ppi_metadata = stringdb.get_ppi(bridgedb_df=bridgedb_df, species=input_species)
ppi_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,StringDB_ppi
0,ENSMUSG00000100426,Ensembl,ENSMUSG00000100426,Ensembl,"[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."
1,ENSMUSG00000032087,Ensembl,ENSMUSG00000032087,Ensembl,"[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."
2,ENSMUSG00000024747,Ensembl,ENSMUSG00000024747,Ensembl,"[{'stringdb_link_to': 'ENSMUSG00000027452', 'E..."
3,ENSMUSG00000091813,Ensembl,ENSMUSG00000091813,Ensembl,"[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."
4,ENSMUSG00000004035,Ensembl,ENSMUSG00000004035,Ensembl,"[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."


In [None]:
ppi_df[STRING_PPI_COL][3]

[{'stringdb_link_to': 'Rps2',
  'Ensembl': 'Ensembl:ENSMUSP00000092502',
  'score': 0.902,
  'Uniprot-TrEMBL': 'Atp5a1'},
 {'stringdb_link_to': 'Rplp0',
  'Ensembl': 'Ensembl:ENSMUSP00000083705',
  'score': 0.903,
  'Uniprot-TrEMBL': 'Atp5a1'}]

### 2.7 Gene expression edges

In [None]:
bgee_df, bgee_metadata = bgee.get_gene_expression(bridgedb_df=bridgedb_df)
bgee_df.head()

KeyboardInterrupt: 

In [None]:
bgee_df[BGEE_GENE_EXPRESSION_LEVELS_COL][1]

[{'anatomical_entity_id': 'UBERON_0002371',
  'anatomical_entity_name': 'bone marrow',
  'expression_level': 98.45455,
  'confidence_level_id': 'CIO_0000029',
  'confidence_level_name': 'high confidence level',
  'developmental_stage_id': 'UBERON_0000104',
  'developmental_stage_name': 'life cycle'},
 {'anatomical_entity_id': 'UBERON_0000955',
  'anatomical_entity_name': 'brain',
  'expression_level': 99.51974,
  'confidence_level_id': 'CIO_0000029',
  'confidence_level_name': 'high confidence level',
  'developmental_stage_id': 'UBERON_0000104',
  'developmental_stage_name': 'life cycle'},
 {'anatomical_entity_id': 'UBERON_0004535',
  'anatomical_entity_name': 'cardiovascular system',
  'expression_level': 99.83003,
  'confidence_level_id': 'CIO_0000029',
  'confidence_level_name': 'high confidence level',
  'developmental_stage_id': 'UBERON_0000104',
  'developmental_stage_name': 'life cycle'},
 {'anatomical_entity_id': 'UBERON_0001007',
  'anatomical_entity_name': 'digestive system'

### 2.8 Transporter Inhibitors

In [44]:
inhibitor_df, inhibitor_metadata = molmedb.get_gene_compound_inhibitor(bridgedb_df=bridgedb_df_hl)
inhibitor_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,MolMeDB_transporter_inhibitor
0,ENSG00000072080,Ensembl,C9J6K0,Uniprot-TrEMBL,"[{'compound_name': nan, 'inchikey': nan, 'smil..."
1,ENSG00000072080,Ensembl,Q13103,Uniprot-TrEMBL,"[{'compound_name': nan, 'inchikey': nan, 'smil..."
2,ENSG00000072571,Ensembl,E5RI30,Uniprot-TrEMBL,"[{'compound_name': nan, 'inchikey': nan, 'smil..."
3,ENSG00000072571,Ensembl,E5RIH2,Uniprot-TrEMBL,"[{'compound_name': nan, 'inchikey': nan, 'smil..."
4,ENSG00000072571,Ensembl,O75330,Uniprot-TrEMBL,"[{'compound_name': nan, 'inchikey': nan, 'smil..."


In [None]:
print(inhibitor_df[MOLMEDB_PROTEIN_COMPOUND_COL][47])
inhibitor_df[inhibitor_df["target"] == "P35499"]

[{'compound_name': nan, 'inchikey': nan, 'smiles': nan, 'compound_cid': nan, 'molmedb_id': nan, 'source_pmid': nan, 'chebi_id': nan, 'drugbank_id': nan, 'uniprot_trembl_id': nan}]


Unnamed: 0,identifier,identifier.source,target,target.source,MolMeDB_transporter_inhibitor


# 3. Generating Graph

### 3.1 Combing all the results into single dataframe

In [45]:
#        bgee_df,
#        disgenet_df,
#        minerva_df,
#        opentargets_reactome_df,
#        opentargets_go_df,
#        opentargets_compound_df,
#        inhibitor_df,
#        kegg_df,

combined_df = combine_sources(
    bridgedb_df,
    [
        wikipathways_df,
        ppi_df,
        ensembl_homologs_df
    ],
)

combined_df_hl = combine_sources(
    bridgedb_df_hl,
    [
        opentargets_go_df,
        disgenet_df,
    ],
)

combined_df_hl.head(10)

merged_df = pd.merge(combined_df, combined_df_hl, on='identifier', how='outer')

for col in merged_df.columns:
    if col.endswith('_x'):
        base_col = col[:-2] 
        merged_df[base_col] = merged_df[col].combine_first(merged_df.get(f'{base_col}_y'))
        merged_df.drop([col, f'{base_col}_y'], axis=1, inplace=True)

merged_df.head(10)

Unnamed: 0,identifier,WikiPathways,StringDB_ppi,Ensembl_homologs,OpenTargets_go,DISGENET_diseases,identifier.source,target,target.source
0,ENSMUSG00000100426,,"[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc...",,,,Ensembl,ENSMUSG00000100426,Ensembl
1,ENSMUSG00000032087,"[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc...",[{'homolog': 'ENSG00000177103'}],,,Ensembl,ENSMUSG00000032087,Ensembl
2,ENSMUSG00000024747,"[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'stringdb_link_to': 'ENSMUSG00000027452', 'E...",[{'homolog': 'ENSG00000165092'}],,,Ensembl,ENSMUSG00000024747,Ensembl
3,ENSMUSG00000091813,"[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc...",,,,Ensembl,ENSMUSG00000091813,Ensembl
4,ENSMUSG00000004035,"[{'pathway_id': 'WP:WP4466', 'pathway_label': ...","[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc...",[{'homolog': 'ENSG00000213366'}],,,Ensembl,ENSMUSG00000004035,Ensembl
5,ENSMUSG00000001155,"[{'pathway_id': 'WP:WP435', 'pathway_label': '...","[{'stringdb_link_to': 'ENSMUSG00000028356', 'E...",[{'homolog': 'ENSG00000160282'}],,,Ensembl,ENSMUSG00000001155,Ensembl
6,ENSMUSG00000021336,"[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'stringdb_link_to': 'ENSMUSG00000026205', 'E...",[{'homolog': 'ENSG00000146039'}],,,Ensembl,ENSMUSG00000021336,Ensembl
7,ENSMUSG00000027331,"[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'stringdb_link_to': 'ENSMUSG00000006398', 'E...",[{'homolog': 'ENSG00000128944'}],,,Ensembl,ENSMUSG00000027331,Ensembl
8,ENSMUSG00000028873,"[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'stringdb_link_to': 'ENSMUSG00000006398', 'E...",[{'homolog': 'ENSG00000134690'}],,,Ensembl,ENSMUSG00000028873,Ensembl
9,ENSMUSG00000038486,"[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'stringdb_link_to': 'ENSMUSG00000022415', 'E...",[{'homolog': 'ENSG00000159164'}],,,Ensembl,ENSMUSG00000038486,Ensembl


In [None]:
combined_df.shape

(8, 9)

### 3.2 Exporting the database in pickle format

In [46]:
with open("combined_df.pkl", "wb") as out:
    pickle.dump(merged_df, out)
# with open("opentargets_disease_compound_df.pkl", "wb") as out:
#     pickle.dump(opentargets_disease_compound_df, out)

### 3.3 Creating a graph from the annotated dataframe

In [None]:
# combined_df = generator.load_dataframe_from_pickle("combined_df.pkl")
# opentargets_disease_compound_df = generator.load_dataframe_from_pickle(
#     "opentargets_disease_compound_df.pkl"
# )

combined_df.head(10)

Unnamed: 0,identifier,identifier.source,target,target.source,WikiPathways,StringDB_ppi,Ensembl_homologs
0,ENSMUSG00000067274,Ensembl,ENSMUSG00000067274,Ensembl,"[{'pathway_id': 'WP:WP163', 'pathway_label': '...","[{'stringdb_link_to': 'ENSMUSG00000025428', 'E...",[{'homolog': 'ENSG00000089157'}]
1,ENSMUSG00000000001,Ensembl,ENSMUSG00000000001,Ensembl,"[{'pathway_id': 'WP:WP2292', 'pathway_label': ...","[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc...",[{'homolog': 'ENSG00000065135'}]
2,ENSMUSG00000084349,Ensembl,ENSMUSG00000084349,Ensembl,,"[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc...",
3,ENSMUSG00000025428,Ensembl,ENSMUSG00000025428,Ensembl,"[{'pathway_id': 'WP:WP295', 'pathway_label': '...","[{'stringdb_link_to': 'ENSMUSG00000044533', 'E...",[{'homolog': 'ENSG00000152234'}]
4,ENSMUSG00000044533,Ensembl,ENSMUSG00000044533,Ensembl,"[{'pathway_id': 'WP:WP163', 'pathway_label': '...","[{'stringdb_link_to': 'ENSMUSG00000025428', 'E...",


In [21]:
opentargets_disease_compound_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_disease_compounds
0,UMLS_C0001125,UMLS,EFO_1000036,EFO,"[{'chembl_id': 'CHEMBL:CHEMBL306823', 'drugban..."
1,UMLS_C0002622,UMLS,EFO_0001072,EFO,"[{'chembl_id': 'CHEMBL:CHEMBL526', 'drugbank_i..."
2,UMLS_C0002622,UMLS,EFO_1001454,EFO,"[{'chembl_id': 'CHEMBL:CHEMBL655', 'drugbank_i..."


In [47]:
pygraph = generator.build_networkx_graph(merged_df)

Building graph: 100%|██████████| 152/152 [00:00<00:00, 2745.70it/s]

[{'stringdb_link_to': nan, 'Ensembl': nan, 'score': nan}]
Skipping row with no valid PPI for: ENSMUSG00000100426
[{'stringdb_link_to': nan, 'Ensembl': nan, 'score': nan}]
Skipping row with no valid PPI for: ENSMUSG00000032087
[{'stringdb_link_to': 'ENSMUSG00000027452', 'Ensembl': 'Ensembl:ENSMUSP00000028944', 'score': 0.636, 'Uniprot-TrEMBL': 'Aldh1a7'}, {'stringdb_link_to': 'ENSMUSG00000028715', 'Ensembl': 'Ensembl:ENSMUSP00000030487', 'score': 0.738, 'Uniprot-TrEMBL': 'Aldh1a7'}]
Adding PPI for: ENSMUSG00000024747
[{'stringdb_link_to': nan, 'Ensembl': nan, 'score': nan}]
Skipping row with no valid PPI for: ENSMUSG00000091813
[{'stringdb_link_to': nan, 'Ensembl': nan, 'score': nan}]
Skipping row with no valid PPI for: ENSMUSG00000004035
[{'stringdb_link_to': 'ENSMUSG00000028356', 'Ensembl': 'Ensembl:ENSMUSP00000030041', 'score': 0.47, 'Uniprot-TrEMBL': 'Ftcd'}, {'stringdb_link_to': 'ENSMUSG00000022877', 'Ensembl': 'Ensembl:ENSMUSP00000023590', 'score': 0.567, 'Uniprot-TrEMBL': 'Ftcd'}




### 3.4 Store the graph

In [22]:
with open("networkx_graph_test.pkl", "wb") as out:
    pickle.dump(pygraph, out)

### 3.5 Visualize the graph

In [None]:
# pos = nx.circular_layout(pygraph)

# plt.figure(3, figsize=(30, 30))
# nx.draw(pygraph, pos)
# plt.show()

# 4. Exporting Graph to external sources

### 4.1 Cytoscape
Make sure that the Cytoscape is open

In [48]:
from pyBiodatafuse.graph import cytoscape

cytoscape.load_graph(pygraph, network_name="Test network")

Applying default style...
Applying preferred layout


### 4.2 Neo4j

In [38]:
from pyBiodatafuse.graph import neo4j

neo4j.save_graph_to_graphml(pygraph, "networkx_graph_test.graphml")

##### Steps to load the graph in Neo4j

- Add `.graphml` file in **import** subfolder of the DBMS folder
- Install apoc plugin
- Create `apoc.conf` file:
    ```
    apoc.trigger.enabled=true
    apoc.import.file.enabled=true
    apoc.export.file.enabled=true
    apoc.import.file.use_neo4j_config=true
    ```
- Add `apoc.conf` file to **conf** subfolder of the DBMS folder
- Open Neo4j Browser
- (Optionl, only run if you have imported a graph  before) Remove all the nodes before importing `.graphml` file

    ```MATCH (n) DETACH DELETE n```

- Import `.graphml` file

    ```call apoc.import.graphml('file:///networkx_graph_test.graphml',{readLabels:TRUE})```

- Add indexes after importing the graph for improving the performance of queries

    ```
    create index Gene for (n:Gene) on (n.node_type)
    create index Pathway for (n:Pathway) on (n.node_type)
    create index `Biological Process` for (n:`Biological Process`) on (n.node_type)
    create index `Molecular Function` for (n:`Molecular Function`) on (n.node_type)
    create index `Cellular Component` for (n:`Cellular Component`) on (n.node_type)
    create index Disease for (n:Disease) on (n.node_type)
    create index Compound for (n:Compound) on (n.node_type)
    create index `Side Effect` for (n:`Side Effect`) on (n.node_type)
    ```
    

- Count the number of each node type
    - total (```MATCH (n) RETURN count(n)```) 
        - Gene (```MATCH (n:Gene) RETURN count(n)```)
        - Pathway (```MATCH (n:Pathway) RETURN count(n)```)
            - WikiPathways (```MATCH (n:Pathway {source: "WikiPathways"}) RETURN count(n)```) 
            - OpenTargets, Reactome (```MATCH (n:Pathway {source: "OpenTargets"}) RETURN count(n)```) 
            - MINERVA (```MATCH (n:Pathway {source: "MINERVA"}) RETURN count(n)```) 
        - Biological Process (```MATCH (n:`Biological Process`) RETURN count(n)```) 
        - Molecular Function (```MATCH (n:`Molecular Function`) RETURN count(n)```) 
        - Cellular Component (```MATCH (n:`Cellular Component`) RETURN count(n)```) 
        - Disease (```MATCH (n:Disease) RETURN count(n)```) 
        - Compound (```MATCH (n:Compound) RETURN count(n)```)
        - Side Effect (```MATCH (n:`Side Effect`) RETURN count(n)```) 
- Count the number of each edge type
    - total (```MATCH ()-[r]->() RETURN count(r)```) 
        - interacts_with (```MATCH ()-[r:interacts_with]->() RETURN count(r)```) 
        - part_of (```MATCH ()-[r:part_of]->() RETURN count(r)```) 
            - WikiPathways (```MATCH ()-[r:part_of {source: "WikiPathways"}]->() RETURN count(r)```) 
            - OpenTargets, Reactome (```MATCH ()-[r:part_of {source: "OpenTargets"}]->() RETURN count(r)```) 
            - MINERVA (```MATCH ()-[r:part_of {source: "MINERVA"}]->() RETURN count(r)```) 
        - activates (```MATCH ()-[r:activates]->() RETURN count(r)```) 
        - treats (```MATCH ()-[r:treats]->() RETURN count(r)```) 
        - has_side_effect (```MATCH ()-[r:has_side_effect]->() RETURN count(r)```) 
        - inhibits (```MATCH ()-[r:inhibits]->() RETURN count(r)```) = 71
        - associated_with (```MATCH ()-[r:associated_with]->() RETURN count(r)```) 

- Export the graph as a `.csv` file

    ```call apoc.export.csv.all("networkx_graph_test.csv",{})```