# Example: Polysistice Kidney Disease use case

In [1]:
new_path = r"C:\Users\are10\Documents\BAFSTU\code\pyBioDatafusemain\pyBiodatafuse\examples\usecases"

# Setting up the working directory
import os
import sys

src_path = os.path.abspath(os.path.join("..", "..", "..", "src"))
if src_path not in sys.path:
    sys.path.append(src_path)

print(sys.path)  # Verify the correct src path is included

['C:\\Users\\are10\\AppData\\Local\\Programs\\Python\\Python39\\python39.zip', 'C:\\Users\\are10\\AppData\\Local\\Programs\\Python\\Python39\\DLLs', 'C:\\Users\\are10\\AppData\\Local\\Programs\\Python\\Python39\\lib', 'C:\\Users\\are10\\AppData\\Local\\Programs\\Python\\Python39', 'c:\\Users\\are10\\Documents\\BAFSTU\\code\\pyBioDatafusemain\\pyBiodatafuse\\.venv', '', 'c:\\Users\\are10\\Documents\\BAFSTU\\code\\pyBioDatafusemain\\pyBiodatafuse\\.venv\\lib\\site-packages', 'c:\\Users\\are10\\Documents\\BAFSTU\\code\\pyBioDatafusemain\\pyBiodatafuse\\.venv\\lib\\site-packages\\win32', 'c:\\Users\\are10\\Documents\\BAFSTU\\code\\pyBioDatafusemain\\pyBiodatafuse\\.venv\\lib\\site-packages\\win32\\lib', 'c:\\Users\\are10\\Documents\\BAFSTU\\code\\pyBioDatafusemain\\pyBiodatafuse\\.venv\\lib\\site-packages\\Pythonwin', 'c:\\Users\\are10\\Documents\\BAFSTU\\code\\pyBioDatafusemain\\pyBiodatafuse\\src']


In [2]:
# Import modules
import pickle

import pandas as pd
from dotenv import load_dotenv

from pyBiodatafuse import human_homologs, id_mapper, testcodeopenai, utils
from pyBiodatafuse.annotators import (
    bgee,
    disgenet,
    intact,
    kegg,
    minerva,
    molmedb,
    opentargets,
    pubchem,
    stringdb,
    wikipathways,
)
from pyBiodatafuse.constants import (
    BGEE_GENE_EXPRESSION_LEVELS_COL,
    DISGENET_DISEASE_COL,
    ENSEMBL_HOMOLOG_COL,
    KEGG_COL,
    MINERVA,
    MOLMEDB_PROTEIN_COMPOUND_COL,
    OPENTARGETS_DISEASE_COMPOUND_COL,
    OPENTARGETS_GENE_COMPOUND_COL,
    OPENTARGETS_GO_COL,
    OPENTARGETS_REACTOME_COL,
    PUBCHEM_COMPOUND_ASSAYS_COL,
    STRING_PPI_COL,
    WIKIPATHWAYS,
)
from pyBiodatafuse.graph import generator
from pyBiodatafuse.utils import combine_sources, combine_with_homologs, create_harmonized_input_file

In [3]:
testcodeopenai.main()

Type 'exit' to quit.

ChatGPT: Test received! How can I assist you today?



# 1. Entity resolution using BridgeDB

### 1.1. Load the input list and convert it to a dataframe

Small set of mouse data used for debugging.

In [4]:
# TEST Mice use case =
genes_of_interest = """ENSMUSG00000026295
ENSMUSG00000022877
ENSMUSG00000020914
ENSMUSG00000024747
ENSMUSG00000032081
ENSMUSG00000004035
ENSMUSG00000072949
ENSMUSG00000028970
ENSMUSG00000028937
ENSMUSG00000075044"""

# TEST Mice Ensembl
genes_of_interest = """ENSMUSG00000067274
ENSMUSG00000000001
ENSMUSG00000030619
ENSMUSG00000027490
ENSMUSG00000022472
ENSMUSG00000059552"""

# genes_of_interest = """ENSG00000141510"""

# TEST Rat Ensembl
# genes_of_interest = """ENSRNOG00060027926
# ENSRNOG00055005387
# ENSRNOG00060018596
# ENSRNOG00060011358
# ENSRNOG00055009275
# """

# TEST Human HGNC
# genes_of_interest = """CHRNG
# DMD
# AHR
# SCN4A
# LC25A1
# HTR3A"""

# TEST Human Ensembl
# genes_of_interest = """ENSG00000072080
# ENSG00000113905
# ENSG00000131747
# ENSG00000165092
# ENSG00000110245
# ENSG00000213366
# ENSG00000184227
# ENSG00000085563
# ENSG00000097021
# ENSG00000149742"""

gene_list = genes_of_interest.split("\n")
len(gene_list)

6

Mouse use case

In [5]:
# from pyBiodatafuse import data_loader, id_mapper

# data_input = data_loader.create_df_from_dea("data/full_de_genes_treated_vs_untreated_plus_cpm_fc.csv")
# data_filtered = data_input[data_input['DE'].isin([1, -1])]
# features_filtered = data_filtered['identifier']

# gene_list = features_filtered.tolist()
# print(len(gene_list))

In [6]:
data_input = pd.DataFrame(gene_list, columns=["identifier"])
data_input.head(20)

Unnamed: 0,identifier
0,ENSMUSG00000067274
1,ENSMUSG00000000001
2,ENSMUSG00000030619
3,ENSMUSG00000027490
4,ENSMUSG00000022472
5,ENSMUSG00000059552


In [7]:
print(gene_list)

['ENSMUSG00000067274', 'ENSMUSG00000000001', 'ENSMUSG00000030619', 'ENSMUSG00000027490', 'ENSMUSG00000022472', 'ENSMUSG00000059552']


### 1.2. Query BridgeDB

In [8]:
# Mouse usecase
input_species = "Mouse"

bridgedb_df, bridgedb_metadata = id_mapper.bridgedb_xref(
    identifiers=data_input,
    input_species=input_species,
    input_datasource="Ensembl",
    output_datasource="All",
)

# TEST Human data
# bridgedb_df, bridgedb_metadata = id_mapper.bridgedb_xref(
#     identifiers=data_input,
#     input_species="Human",
#     input_datasource="Ensembl",
#     output_datasource="All",
# )

bridgedb_df.head(25)

Unnamed: 0,identifier,identifier.source,target,target.source
0,ENSMUSG00000067274,Ensembl,ENSMUSG00000067274,Ensembl
1,ENSMUSG00000067274,Ensembl,10524718,Affy
2,ENSMUSG00000067274,Ensembl,GO:0016020,Gene Ontology
3,ENSMUSG00000067274,Ensembl,GO:0071353,Gene Ontology
4,ENSMUSG00000067274,Ensembl,GO:0070180,Gene Ontology
5,ENSMUSG00000067274,Ensembl,GO:0022626,Gene Ontology
6,ENSMUSG00000067274,Ensembl,GO:1904628,Gene Ontology
7,ENSMUSG00000067274,Ensembl,GO:0022625,Gene Ontology
8,ENSMUSG00000067274,Ensembl,GO:0042277,Gene Ontology
9,ENSMUSG00000067274,Ensembl,GO:0098794,Gene Ontology


### 1.3 Homologs

In [8]:
ensembl_homologs_df, ensembl_metadata = human_homologs.get_homologs(bridgedb_df=bridgedb_df)
ensembl_homologs_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,Ensembl_homologs
0,ENSMUSG00000100426,Ensembl,ENSMUSG00000100426,Ensembl,[{'homolog': nan}]
1,ENSMUSG00000032087,Ensembl,ENSMUSG00000032087,Ensembl,[{'homolog': 'ENSG00000177103'}]
2,ENSMUSG00000024747,Ensembl,ENSMUSG00000024747,Ensembl,[{'homolog': 'ENSG00000165092'}]
3,ENSMUSG00000091813,Ensembl,ENSMUSG00000091813,Ensembl,[{'homolog': nan}]
4,ENSMUSG00000004035,Ensembl,ENSMUSG00000004035,Ensembl,[{'homolog': 'ENSG00000213366'}]


In [9]:
ensembl_homologs_df.head(20)

Unnamed: 0,identifier,identifier.source,target,target.source,Ensembl_homologs
0,ENSMUSG00000100426,Ensembl,ENSMUSG00000100426,Ensembl,[{'homolog': nan}]
1,ENSMUSG00000032087,Ensembl,ENSMUSG00000032087,Ensembl,[{'homolog': 'ENSG00000177103'}]
2,ENSMUSG00000024747,Ensembl,ENSMUSG00000024747,Ensembl,[{'homolog': 'ENSG00000165092'}]
3,ENSMUSG00000091813,Ensembl,ENSMUSG00000091813,Ensembl,[{'homolog': nan}]
4,ENSMUSG00000004035,Ensembl,ENSMUSG00000004035,Ensembl,[{'homolog': 'ENSG00000213366'}]
5,ENSMUSG00000001155,Ensembl,ENSMUSG00000001155,Ensembl,[{'homolog': 'ENSG00000160282'}]
6,ENSMUSG00000021336,Ensembl,ENSMUSG00000021336,Ensembl,[{'homolog': 'ENSG00000146039'}]
7,ENSMUSG00000027331,Ensembl,ENSMUSG00000027331,Ensembl,[{'homolog': 'ENSG00000128944'}]
8,ENSMUSG00000028873,Ensembl,ENSMUSG00000028873,Ensembl,[{'homolog': 'ENSG00000134690'}]
9,ENSMUSG00000038486,Ensembl,ENSMUSG00000038486,Ensembl,[{'homolog': 'ENSG00000159164'}]


In [10]:
homologs = (
    ensembl_homologs_df[ENSEMBL_HOMOLOG_COL]
    .apply(
        lambda x: (
            x[0]["homolog"] if isinstance(x, list) and len(x) > 0 and "homolog" in x[0] else None
        )
    )
    .dropna()
    .tolist()
)

print(homologs)

['ENSG00000177103', 'ENSG00000165092', 'ENSG00000213366', 'ENSG00000160282', 'ENSG00000146039', 'ENSG00000128944', 'ENSG00000134690', 'ENSG00000159164', 'ENSG00000188674', 'ENSG00000224389', 'ENSG00000072080', 'ENSG00000165030', 'ENSG00000154930', 'ENSG00000101204', 'ENSG00000163631', 'ENSG00000213901', 'ENSG00000169679', 'ENSG00000213886', 'ENSG00000273604', 'ENSG00000182272', 'ENSG00000144035', 'ENSG00000138778', 'ENSG00000149742', 'ENSG00000131747', 'ENSG00000160097', 'ENSG00000117399', 'ENSG00000100321', 'ENSG00000073111', 'ENSG00000108830', 'ENSG00000127220', 'ENSG00000109107', 'ENSG00000144395', 'ENSG00000151287', 'ENSG00000162897', 'ENSG00000087076', 'ENSG00000100526', 'ENSG00000165507', 'ENSG00000159228', 'ENSG00000120054', 'ENSG00000149554', 'ENSG00000184545', 'ENSG00000170430', 'ENSG00000123427', 'ENSG00000120800', 'ENSG00000113905', 'ENSG00000169245', 'ENSG00000072571', 'ENSG00000101412', 'ENSG00000136689', 'ENSG00000102837', 'ENSG00000112299', 'ENSG00000168268', 'ENSG000000

### 1.4 Query homologs

In [11]:
input_species = "Human"

data_input_hl = pd.DataFrame(homologs, columns=["identifier"])

bridgedb_df_hl, bridgedb_metadata_hl = id_mapper.bridgedb_xref(
    identifiers=data_input_hl,
    input_species="Human",
    input_datasource="Ensembl",
    output_datasource="All",
)

bridgedb_df.head(25)

Unnamed: 0,identifier,identifier.source,target,target.source
0,ENSMUSG00000100426,Ensembl,MGI:3782384,MGI
1,ENSMUSG00000100426,Ensembl,ENSMUSG00000100426,Ensembl
2,ENSMUSG00000032087,Ensembl,MGI:2150309,MGI
3,ENSMUSG00000032087,Ensembl,114873,NCBI Gene
4,ENSMUSG00000032087,Ensembl,A0A1L1SQZ7,Uniprot-TrEMBL
5,ENSMUSG00000032087,Ensembl,E9QPR7,Uniprot-TrEMBL
6,ENSMUSG00000032087,Ensembl,Q4VA61,Uniprot-TrEMBL
7,ENSMUSG00000032087,Ensembl,A0A1L1SQ53,Uniprot-TrEMBL
8,ENSMUSG00000032087,Ensembl,ENSMUSG00000032087,Ensembl
9,ENSMUSG00000024747,Ensembl,ENSMUSG00000024747,Ensembl


# 2. Step-by-step graph generation

### 2.1. Gene-Disease edges


In [12]:
load_dotenv("disgenet.env")

disgenet_api_key = os.getenv("DISGENET_API_KEY")
print(disgenet_api_key)

15eadd18-9b50-466d-a41d-3deb5fad122c


In [13]:
disgenet_df, disgenet_metadata = disgenet.get_gene_disease(
    api_key=disgenet_api_key, bridgedb_df=bridgedb_df_hl
)
disgenet_df.head()

Querying DisGeNET: 100%|██████████| 71/71 [01:09<00:00,  1.02it/s]
  disgenet_df, disgenet_metadata = disgenet.get_gene_disease(


Unnamed: 0,identifier,identifier.source,target,target.source,DISGENET_diseases
0,ENSG00000072080,Ensembl,6694,NCBI Gene,"[{'disease_name': 'Retinal dystrophy', 'HPO': ..."
1,ENSG00000072571,Ensembl,3161,NCBI Gene,"[{'disease_name': 'Cancer, Breast', 'HPO': Non..."
2,ENSG00000073111,Ensembl,4171,NCBI Gene,"[{'disease_name': 'Liver cell carcinoma', 'HPO..."
3,ENSG00000085563,Ensembl,5243,NCBI Gene,"[{'disease_name': nan, 'HPO': nan, 'NCI': nan,..."
4,ENSG00000087076,Ensembl,51171,NCBI Gene,[{'disease_name': 'Severe myopia (> -6.00 diop...


In [14]:
disgenet_df[DISGENET_DISEASE_COL][0]

[{'disease_name': 'Retinal dystrophy',
  'HPO': 'HPO:HP:0000556',
  'NCI': 'NCI:C35625',
  'OMIM': None,
  'MONDO': 'MONDO:0019118',
  'ORDO': 'ORDO:71862',
  'EFO': None,
  'DO': 'DOID:8501',
  'MESH': 'MESH:D058499',
  'UMLS': 'UMLS:C0854723',
  'disease_type': 'disease',
  'disease_umlscui': 'C0854723',
  'score': 0.4,
  'ei': nan,
  'el': None}]

### 2.2 Disease-Compound edges

In [15]:
# Prepare the input to use DISGENET output as seed for OpenTargets
disease_mapping_df = create_harmonized_input_file(disgenet_df, DISGENET_DISEASE_COL, "EFO", "UMLS")
disease_mapping_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source
0,UMLS_C0006142,UMLS,EFO_0000305,EFO
1,UMLS_C0006142,UMLS,EFO_0003869,EFO
2,UMLS_C1458155,UMLS,EFO_0003869,EFO
3,UMLS_C2239176,UMLS,EFO_0000182,EFO
4,UMLS_C2239176,UMLS,EFO_0000762,EFO


In [16]:
(
    opentargets_disease_compound_df,
    opentargets_disease_compound_metadata,
) = opentargets.get_disease_compound_interactions(disease_mapping_df)
opentargets_disease_compound_df.head()

Processing diseases-drug interactions: 100%|██████████| 292/292 [00:07<00:00, 37.11it/s]
Mapping PubChem: 100%|██████████| 2129/2129 [06:13<00:00,  5.70it/s]
  check_columns_against_constants(


Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_disease_compounds
0,UMLS_C0000786,UMLS,EFO_1001255,EFO,"[{'chembl_id': 'CHEMBL:CHEMBL1276308', 'drugba..."
1,UMLS_C0002171,UMLS,EFO_0004192,EFO,"[{'chembl_id': 'CHEMBL:CHEMBL1200963', 'drugba..."
2,UMLS_C0002940,UMLS,EFO_0004264,EFO,"[{'chembl_id': 'CHEMBL:CHEMBL1076903', 'drugba..."
3,UMLS_C0002940,UMLS,EFO_0009659,EFO,"[{'chembl_id': 'CHEMBL:CHEMBL526', 'drugbank_i..."
4,UMLS_C0002994,UMLS,EFO_0005532,EFO,"[{'chembl_id': 'CHEMBL:CHEMBL2028850', 'drugba..."


In [17]:
opentargets_disease_compound_df[OPENTARGETS_DISEASE_COMPOUND_COL][0]

[{'chembl_id': 'CHEMBL:CHEMBL1276308',
  'drugbank_id': 'DrugBank:DB00834',
  'compound_cid': 'pubchem.compound:55245',
  'compound_name': 'MIFEPRISTONE',
  'clincal_trial_phase': 4.0,
  'is_approved': True,
  'relation': 'treats',
  'adverse_effect_count': 41.0,
  'adverse_effect': [{'name': 'abortion incomplete'},
   {'name': 'haemorrhage'},
   {'name': 'pregnancy'},
   {'name': 'endometritis'},
   {'name': 'induced abortion failed'},
   {'name': 'vaginal haemorrhage'},
   {'name': 'anaemia'},
   {'name': 'muscle spasms'},
   {'name': 'metrorrhagia'},
   {'name': 'abortion induced incomplete'},
   {'name': 'menorrhagia'},
   {'name': 'pain'},
   {'name': 'uterine haemorrhage'},
   {'name': 'post abortion infection'},
   {'name': 'uterine rupture'},
   {'name': 'ectopic pregnancy'},
   {'name': 'blood potassium decreased'},
   {'name': 'syncope'},
   {'name': 'endometritis bacterial'},
   {'name': 'pelvic inflammatory disease'},
   {'name': 'uterine dilation and curettage'},
   {'name

### 2.3 Compound Annotation

#### Compounds from OpenTargets

In [18]:
opentargets_compound_df, opentargets_compound_metadata = opentargets.get_gene_compound_interactions(
    bridgedb_df=bridgedb_df_hl
)
opentargets_compound_df.head()

Processing gene-drug interactions: 100%|██████████| 71/71 [00:00<00:00, 147.70it/s]
Mapping PubChem: 100%|██████████| 37/37 [00:13<00:00,  2.68it/s]
  check_columns_against_constants(


Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_gene_compounds
0,ENSG00000072080,Ensembl,ENSG00000072080,Ensembl,"[{'chembl_id': nan, 'drugbank_id': nan, 'compo..."
1,ENSG00000072571,Ensembl,ENSG00000072571,Ensembl,"[{'chembl_id': nan, 'drugbank_id': nan, 'compo..."
2,ENSG00000073111,Ensembl,ENSG00000073111,Ensembl,"[{'chembl_id': nan, 'drugbank_id': nan, 'compo..."
3,ENSG00000085563,Ensembl,ENSG00000085563,Ensembl,"[{'chembl_id': 'CHEMBL:CHEMBL1086218', 'drugba..."
4,ENSG00000087076,Ensembl,ENSG00000087076,Ensembl,"[{'chembl_id': nan, 'drugbank_id': nan, 'compo..."


In [19]:
opentargets_compound_df[OPENTARGETS_GENE_COMPOUND_COL][0]

[{'chembl_id': nan,
  'drugbank_id': nan,
  'compound_cid': nan,
  'compound_name': nan,
  'clincal_trial_phase': nan,
  'is_approved': nan,
  'relation': nan,
  'adverse_effect_count': nan,
  'adverse_effect': nan}]

#### Compounds from PubChem

In [20]:
pubchem_assay_df, pubchem_assay_metadata = pubchem.get_protein_compound_screened(
    bridgedb_df=bridgedb_df_hl
)

Querying PubChem: 100%|██████████| 11/11 [00:02<00:00,  4.57it/s]
  pubchem_assay_df, pubchem_assay_metadata = pubchem.get_protein_compound_screened(


### 2.4 Gene-Pathways edges

#### Pathways from WikiPathways

In [21]:
wikipathways_df, wikipathways_metadata = wikipathways.get_gene_wikipathways(bridgedb_df=bridgedb_df)
wikipathways_df.head()

Querying WikiPathways: 100%|██████████| 4/4 [01:42<00:00, 25.58s/it]


Unnamed: 0,identifier,identifier.source,target,target.source,WikiPathways
0,ENSMUSG00000000934,Ensembl,72960,NCBI Gene,"[{'pathway_id': nan, 'pathway_label': nan, 'pa..."
1,ENSMUSG00000001155,Ensembl,14317,NCBI Gene,"[{'pathway_id': 'WP:WP435', 'pathway_label': '..."
2,ENSMUSG00000001313,Ensembl,11858,NCBI Gene,"[{'pathway_id': nan, 'pathway_label': nan, 'pa..."
3,ENSMUSG00000001334,Ensembl,384061,NCBI Gene,"[{'pathway_id': nan, 'pathway_label': nan, 'pa..."
4,ENSMUSG00000002870,Ensembl,17216,NCBI Gene,"[{'pathway_id': 'WP:WP413', 'pathway_label': '..."


In [22]:
wikipathways_df[WIKIPATHWAYS][0]

[{'pathway_id': nan, 'pathway_label': nan, 'pathway_gene_count': nan}]

#### Pathways from KEGG

In [23]:
compounds_of_interest = """C01089
C00020
C02571
C00212
C00041
C00152
C00049
C00719
C00114
C00158
C00300
C01026
C00122
C00031
C00025
C00064
C00037
C00135
C00262
C00130
C00294
C00407
C00186
C00123
C00149
C00073
C00137
C00003
C00153
C00079
C00588
C00346
C04230
C00245
C00188
C00082
C00043
C00105
C00106
C00299
C00183"""

metabolite_list = compounds_of_interest.split("\n")
data_input = pd.DataFrame(metabolite_list, columns=["identifier"])

bridgdb_df_cmp, bridgdb_metadata = id_mapper.bridgedb_xref(
    identifiers=data_input,
    input_species="Mouse",
    input_datasource="KEGG Compound",
    output_datasource="All",
)
bridgdb_df_cmp.head()

Unnamed: 0,identifier,identifier.source,target,target.source
0,C01089,KEGG Compound,6971058,PubChem Compound
1,C01089,KEGG Compound,92135,PubChem Compound
2,C01089,KEGG Compound,C01089,KEGG Compound
3,C00020,KEGG Compound,C00020,KEGG Compound
4,C00020,KEGG Compound,6083,PubChem Compound


In [24]:
kegg_df, kegg_metadata = kegg.get_pathways(bridgedb_df)
kegg_df.head()

mmu:114873


mmu:26358
mmu:26358	path:mmu00830
mmu:26358	path:mmu01100

mmu:436059
mmu:436059	path:mmu00983

mmu:68312
mmu:68312	path:mmu00480
mmu:68312	path:mmu00980
mmu:68312	path:mmu00982
mmu:68312	path:mmu00983
mmu:68312	path:mmu01100
mmu:68312	path:mmu01524
mmu:68312	path:mmu05200
mmu:68312	path:mmu05204
mmu:68312	path:mmu05207
mmu:68312	path:mmu05208
mmu:68312	path:mmu05225
mmu:68312	path:mmu05418

mmu:14317
mmu:14317	path:mmu00340
mmu:14317	path:mmu00670
mmu:14317	path:mmu01100

mmu:319848


mmu:51944


mmu:52276


mmu:64051
mmu:64051	path:mmu04512

mmu:98303


mmu:15331


mmu:625018
mmu:625018	path:mmu04610
mmu:625018	path:mmu04936
mmu:625018	path:mmu05133
mmu:625018	path:mmu05150
mmu:625018	path:mmu05171
mmu:625018	path:mmu05322

mmu:75396


mmu:18030
mmu:18030	path:mmu04710

mmu:12763
mmu:12763	path:mmu00520
mmu:12763	path:mmu01250

mmu:68738
mmu:68738	path:mmu00010
mmu:68738	path:mmu00620
mmu:68738	path:mmu00630
mmu:68738	path:mmu00640
mmu:68738	path:mmu01100
mmu:68738	path:

Unnamed: 0,identifier,identifier.source,target,target.source,KEGG_pathways
0,ENSMUSG00000032087,Ensembl,114873,NCBI Gene,"[{'pathway_id': nan, 'pathway_label': nan, 'ge..."
1,ENSMUSG00000024747,Ensembl,26358,NCBI Gene,"[{'pathway_id': 'path:mmu00830', 'pathway_labe..."
2,ENSMUSG00000091813,Ensembl,436059,NCBI Gene,"[{'pathway_id': 'path:mmu00983', 'pathway_labe..."
3,ENSMUSG00000004035,Ensembl,68312,NCBI Gene,"[{'pathway_id': 'path:mmu00480', 'pathway_labe..."
4,ENSMUSG00000001155,Ensembl,14317,NCBI Gene,"[{'pathway_id': 'path:mmu00340', 'pathway_labe..."


In [25]:
kegg_df["KEGG_pathways"][2]

[{'pathway_id': 'path:mmu00983',
  'pathway_label': 'Drug metabolism - other enzymes - Mus musculus (house mouse)',
  'gene_count': 94,
  'compounds': [{'KEGG_identifier': 'C00003'},
   {'KEGG_identifier': 'C00006'},
   {'KEGG_identifier': 'C02320'},
   {'KEGG_identifier': 'C02380'},
   {'KEGG_identifier': 'C04242'},
   {'KEGG_identifier': 'C04646'},
   {'KEGG_identifier': 'C05361'},
   {'KEGG_identifier': 'C06108'},
   {'KEGG_identifier': 'C06837'},
   {'KEGG_identifier': 'C07054'},
   {'KEGG_identifier': 'C07446'},
   {'KEGG_identifier': 'C07447'},
   {'KEGG_identifier': 'C07585'},
   {'KEGG_identifier': 'C07648'},
   {'KEGG_identifier': 'C07649'},
   {'KEGG_identifier': 'C11173'},
   {'KEGG_identifier': 'C11376'},
   {'KEGG_identifier': 'C11736'},
   {'KEGG_identifier': 'C12650'},
   {'KEGG_identifier': 'C12673'},
   {'KEGG_identifier': 'C12739'},
   {'KEGG_identifier': 'C13252'},
   {'KEGG_identifier': 'C16542'},
   {'KEGG_identifier': 'C16543'},
   {'KEGG_identifier': 'C16613'},
 

In [26]:
kegg_compound_df = kegg.get_compounds(bridgdb_df_cmp)
kegg_compound_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,KEGG_compounds
0,C01089,KEGG Compound,C01089,KEGG Compound,"{'KEGG_identifier': 'C01089', 'name': '(R)-3-H..."
1,C00020,KEGG Compound,C00020,KEGG Compound,"{'KEGG_identifier': 'C00020', 'name': 'AMP'}"
2,C02571,KEGG Compound,C02571,KEGG Compound,"{'KEGG_identifier': 'C02571', 'name': 'O-Acety..."
3,C00212,KEGG Compound,C00212,KEGG Compound,"{'KEGG_identifier': 'C00212', 'name': 'Adenosi..."
4,C00041,KEGG Compound,C00041,KEGG Compound,"{'KEGG_identifier': 'C00041', 'name': 'L-Alani..."


In [27]:
kegg_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,KEGG_pathways
0,ENSMUSG00000032087,Ensembl,114873,NCBI Gene,"[{'pathway_id': nan, 'pathway_label': nan, 'ge..."
1,ENSMUSG00000024747,Ensembl,26358,NCBI Gene,"[{'pathway_id': 'path:mmu00830', 'pathway_labe..."
2,ENSMUSG00000091813,Ensembl,436059,NCBI Gene,"[{'pathway_id': 'path:mmu00983', 'pathway_labe..."
3,ENSMUSG00000004035,Ensembl,68312,NCBI Gene,"[{'pathway_id': 'path:mmu00480', 'pathway_labe..."
4,ENSMUSG00000001155,Ensembl,14317,NCBI Gene,"[{'pathway_id': 'path:mmu00340', 'pathway_labe..."


In [28]:
kegg_compound_df["KEGG_compounds"][0]

{'KEGG_identifier': 'C01089', 'name': '(R)-3-Hydroxybutanoate'}

In [29]:
# data_input_compounds = pd.DataFrame(kegg_identifiers, columns=["identifier"])
# data_input_compounds.head()

# bridgdb_df, bridgdb_metadata = id_mapper.bridgedb_xref(
#     identifiers=data_input_compounds,
#     input_species="Mouse",
#     input_datasource="KEGG Compound",
#     output_datasource="PubChem Compound",
# )
# bridgdb_df.head(25)

In [30]:
import requests

chebi_id = "15422"

url = f"http://webservice.bridgedb.org/Human/xrefs/ChEBI/15422"
response = requests.get(url)

if response.status_code == 200:
    # Parse the response to extract metabolite name(s)
    lines = response.text.splitlines()
    print(lines)
    names = [line.split("\t")[2] for line in lines if len(line.split("\t")) > 2]
    print(names if names else "No metabolite names found.")
else:
    print(f"Error: Unable to retrieve data (status code {response.status_code}).")

['5742\tChemspider', 'DB00171\tDrugBank', 'HMDB00538\tHMDB', '5957\tPubChem-compound', 'CHEBI:15422\tChEBI', '15422\tChEBI', 'ZKHQWZAMYRWXGA-KQYNXXCUSA-N\tInChIKey', 'DTXSID6022559\tEPA CompTox', 'C00001491\tKNApSAcK', '56-65-5\tCAS', 'HMDB0000538\tHMDB', 'CHEMBL14249\tChEMBL compound', 'Q80863\tWikidata', 'C00002\tKEGG Compound', '1713\tGuide to Pharmacology', 'D08646\tKEGG Drug']
No metabolite names found.


#### Reactome pathways from OpenTargets

In [31]:
opentargets_reactome_df, opentargets_reactome_metadata = opentargets.get_gene_reactome_pathways(
    bridgedb_df=bridgedb_df_hl
)
opentargets_reactome_df.head()

Processing gene-pathway interactions: 100%|██████████| 71/71 [00:00<00:00, 405.41it/s]
  opentargets_reactome_df, opentargets_reactome_metadata = opentargets.get_gene_reactome_pathways(


Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_reactome
0,ENSG00000072080,Ensembl,ENSG00000072080,Ensembl,"[{'pathway_label': 'Platelet degranulation ', ..."
1,ENSG00000072571,Ensembl,ENSG00000072571,Ensembl,[{'pathway_label': 'Hyaluronan uptake and degr...
2,ENSG00000073111,Ensembl,ENSG00000073111,Ensembl,[{'pathway_label': 'Orc1 removal from chromati...
3,ENSG00000085563,Ensembl,ENSG00000085563,Ensembl,[{'pathway_label': 'Abacavir transmembrane tra...
4,ENSG00000087076,Ensembl,ENSG00000087076,Ensembl,"[{'pathway_label': 'Estrogen biosynthesis', 'p..."


In [32]:
opentargets_reactome_df[OPENTARGETS_REACTOME_COL][0]

[{'pathway_label': 'Platelet degranulation ',
  'pathway_id': 'Reactome:R-HSA-114608'},
 {'pathway_label': 'Regulation of Insulin-like Growth Factor (IGF) transport and uptake by Insulin-like Growth Factor Binding Proteins (IGFBPs)',
  'pathway_id': 'Reactome:R-HSA-381426'},
 {'pathway_label': 'Post-translational protein phosphorylation',
  'pathway_id': 'Reactome:R-HSA-8957275'}]

### 2.5 Gene Ontology from OpenTargets

In [33]:
opentargets_go_df, opentargets_go_metadata = opentargets.get_gene_go_process(
    bridgedb_df=bridgedb_df_hl
)
opentargets_go_df.head()

Processing gene annotation: 100%|██████████| 71/71 [00:00<00:00, 325.19it/s]
  opentargets_go_df, opentargets_go_metadata = opentargets.get_gene_go_process(


Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_go
0,ENSG00000072080,Ensembl,ENSG00000072080,Ensembl,"[{'go_id': 'GO:0005788', 'go_name': 'endoplasm..."
1,ENSG00000072571,Ensembl,ENSG00000072571,Ensembl,"[{'go_id': 'GO:0005515', 'go_name': 'protein b..."
2,ENSG00000073111,Ensembl,ENSG00000073111,Ensembl,"[{'go_id': 'GO:0005654', 'go_name': 'nucleopla..."
3,ENSG00000085563,Ensembl,ENSG00000085563,Ensembl,"[{'go_id': 'GO:0008559', 'go_name': 'ABC-type ..."
4,ENSG00000087076,Ensembl,ENSG00000087076,Ensembl,"[{'go_id': 'GO:0005515', 'go_name': 'protein b..."


In [34]:
opentargets_go_df[OPENTARGETS_GO_COL][0]

[{'go_id': 'GO:0005788',
  'go_name': 'endoplasmic reticulum lumen',
  'go_type': 'C'},
 {'go_id': 'GO:0031089',
  'go_name': 'platelet dense granule lumen',
  'go_type': 'C'},
 {'go_id': 'GO:0004866',
  'go_name': 'endopeptidase inhibitor activity',
  'go_type': 'F'},
 {'go_id': 'GO:0046849', 'go_name': 'bone remodeling', 'go_type': 'P'},
 {'go_id': 'GO:0062023',
  'go_name': 'collagen-containing extracellular matrix',
  'go_type': 'C'},
 {'go_id': 'GO:0005576', 'go_name': 'extracellular region', 'go_type': 'C'},
 {'go_id': 'GO:0001501',
  'go_name': 'skeletal system development',
  'go_type': 'P'}]

### 2.6. Protein-Protein Interactions

In [32]:
input_species = "Mouse"
ppi_df, ppi_metadata = stringdb.get_ppi(bridgedb_df=bridgedb_df, species=input_species)
ppi_df.head()

  ppi_df, ppi_metadata = stringdb.get_ppi(bridgedb_df=bridgedb_df, species=input_species)


In [36]:
ppi_df[STRING_PPI_COL][3]

[{'stringdb_link_to': nan, 'Ensembl': nan, 'score': nan}]

### Intact WIP

In [9]:
intact_df, intact_metadata = intact.get_interactions(bridgedb_df)
intact_df.head()

ERROR:root:Error getting IntAct version


{'ENSMUSG00000067274': ['Q5M8R8', 'S4R1N1', 'P14869', 'D3YVM5'], 'ENSMUSG00000000001': ['Q9DC51'], 'ENSMUSG00000030619': ['Q921E6', 'A0A140LIG5', 'A0A140LIN6', 'A0A5F8MPX8'], 'ENSMUSG00000027490': ['Q9CYB4', 'Q547J6', 'Q61501'], 'ENSMUSG00000022472': ['F8WJJ8', 'Q9CQT7', 'Q3TCG9', 'D6RDE8', 'E9Q2Y9']}
ENSMUSG00000030619 {'ENSMUSG00000030619', 'ENSMUSG00000027490', 'ENSMUSG00000000001', 'ENSMUSG00000067274', 'ENSMUSG00000022472'}
ENSMUSG00000027490 {'ENSMUSG00000030619', 'ENSMUSG00000027490', 'ENSMUSG00000000001', 'ENSMUSG00000067274', 'ENSMUSG00000022472'}
ENSMUSG00000000001 {'ENSMUSG00000030619', 'ENSMUSG00000027490', 'ENSMUSG00000000001', 'ENSMUSG00000067274', 'ENSMUSG00000022472'}
ENSMUSG00000067274 {'ENSMUSG00000030619', 'ENSMUSG00000027490', 'ENSMUSG00000000001', 'ENSMUSG00000067274', 'ENSMUSG00000022472'}
ENSMUSG00000022472 {'ENSMUSG00000030619', 'ENSMUSG00000027490', 'ENSMUSG00000000001', 'ENSMUSG00000067274', 'ENSMUSG00000022472'}
{'ENSMUSG00000030619': ['EBI-904301'], 'ENSMUSG

Unnamed: 0,identifier,identifier.source,target,target.source,IntAct_interactions
0,ENSMUSG00000067274,Ensembl,ENSMUSG00000067274,Ensembl,"[{'interaction_id': 'EBI-7770205', 'interactor..."
1,ENSMUSG00000000001,Ensembl,ENSMUSG00000000001,Ensembl,[]
2,ENSMUSG00000030619,Ensembl,ENSMUSG00000030619,Ensembl,[]
3,ENSMUSG00000027490,Ensembl,ENSMUSG00000027490,Ensembl,[]
4,ENSMUSG00000022472,Ensembl,ENSMUSG00000022472,Ensembl,"[{'interaction_id': 'EBI-7768731', 'interactor..."


In [10]:
intact_df["IntAct_interactions"][0]

[{'interaction_id': 'EBI-7770205',
  'interactor_id_A': 'EBI-7768710',
  'interactor_id_B': 'EBI-529949',
  'binary_interaction_id': 12910167,
  'confidence_values': ['intact-miscore:0.37'],
  'intact_score': 0.37,
  'biological_role_A': 'unspecified role',
  'biological_role_B': 'unspecified role',
  'type': 'physical association',
  'stoichiometry_A': '0-0',
  'stoichiometry_B': '0-0',
  'detection_method': 'two hybrid pooling',
  'detection_method_id': 'MI:0398',
  'host_organism': 'Saccharomyces cerevisiae',
  'interactor_A_name': 'desi1_mouse',
  'interactor_B_name': 'rla0_mouse',
  'interactor_A_species': 'Mus musculus',
  'interactor_B_species': 'Mus musculus',
  'molecule_A': 'Desi1',
  'molecule_B': 'Rplp0',
  'id_A': 'Q9CQT7',
  'id_B': 'P14869',
  'pubmed_publication_id': '22370726',
  'ensembl': 'ENSMUSG00000067274'},
 {'interaction_id': 'EBI-2526357',
  'interactor_id_A': 'EBI-904301',
  'interactor_id_B': 'EBI-529949',
  'binary_interaction_id': 7267158,
  'confidence_val

In [None]:
intact_compound_df, intact_compound_metadata = intact.get_compound_interactions(bridgedb_df)

In [10]:
intact_data = intact.get_compound_related_interactions()
print(intact_data)

Failed to query IntAct.
[]


### 2.7 Gene expression edges

In [8]:
bgee_df, bgee_metadata = bgee.get_gene_expression(bridgedb_df=bridgedb_df)
bgee_df.head()

KeyboardInterrupt: 

In [78]:
bgee_df[BGEE_GENE_EXPRESSION_LEVELS_COL][1]

[{'anatomical_entity_id': 'UBERON_0004535',
  'anatomical_entity_name': 'cardiovascular system',
  'expression_level': 50.40476,
  'confidence_level_id': 'CIO_0000029',
  'confidence_level_name': 'high confidence level',
  'developmental_stage_id': 'UBERON_0000104',
  'developmental_stage_name': 'life cycle'},
 {'anatomical_entity_id': 'UBERON_0001007',
  'anatomical_entity_name': 'digestive system',
  'expression_level': 74.9911,
  'confidence_level_id': 'CIO_0000029',
  'confidence_level_name': 'high confidence level',
  'developmental_stage_id': 'UBERON_0000104',
  'developmental_stage_name': 'life cycle'},
 {'anatomical_entity_id': 'UBERON_0000948',
  'anatomical_entity_name': 'heart',
  'expression_level': 53.08205,
  'confidence_level_id': 'CIO_0000029',
  'confidence_level_name': 'high confidence level',
  'developmental_stage_id': 'UBERON_0000104',
  'developmental_stage_name': 'life cycle'},
 {'anatomical_entity_id': 'UBERON_0002113',
  'anatomical_entity_name': 'kidney',
  'e

### 2.8 Transporter Inhibitors

In [37]:
inhibitor_df, inhibitor_metadata = molmedb.get_gene_compound_inhibitor(bridgedb_df=bridgedb_df_hl)
inhibitor_df.head()

# 3. Generating Graph

### 3.1 Combing all the results into single dataframe

In [37]:
#        bgee_df,
#        disgenet_df,
#        minerva_df,
#        opentargets_reactome_df,
#        opentargets_go_df,
#        opentargets_compound_df,
#        inhibitor_df,
#        kegg_df,
#        wikipathways_df,
#        ppi_df,
#        ensembl_homologs_df,

combined_df = combine_sources(
    bridgedb_df,
    [kegg_df, ppi_df, wikipathways_df, ensembl_homologs_df],
)

combined_df = combine_with_homologs(
    combined_df,
    [
        opentargets_reactome_df,
        opentargets_go_df,
        opentargets_compound_df,
        disgenet_df,
    ],
)


combined_df.head(10)

Unnamed: 0,identifier,identifier.source,target,target.source,KEGG_pathways,StringDB_ppi,WikiPathways,Ensembl_homologs,homolog,OpenTargets_reactome,OpenTargets_go,OpenTargets_gene_compounds,DISGENET_diseases
0,ENSMUSG00000100426,Ensembl,ENSMUSG00000100426,Ensembl,,"[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc...",,[{'homolog': nan}],,,,,
1,ENSMUSG00000032087,Ensembl,ENSMUSG00000032087,Ensembl,"[{'pathway_id': nan, 'pathway_label': nan, 'ge...","[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...",[{'homolog': 'ENSG00000177103'}],ENSG00000177103,"[{'pathway_label': 'DSCAM interactions', 'path...","[{'go_id': 'GO:0007409', 'go_name': 'axonogene...","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'disease_name': 'Alcoholism', 'HPO': 'HPO:HP..."
2,ENSMUSG00000024747,Ensembl,ENSMUSG00000024747,Ensembl,"[{'pathway_id': 'path:mmu00830', 'pathway_labe...","[{'stringdb_link_to': 'ENSMUSG00000027452', 'E...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...",[{'homolog': 'ENSG00000165092'}],ENSG00000165092,"[{'pathway_label': 'Fructose catabolism', 'pat...","[{'go_id': 'GO:0051287', 'go_name': 'NAD bindi...","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'disease_name': 'Melanoma', 'HPO': 'HPO:HP:0..."
3,ENSMUSG00000091813,Ensembl,ENSMUSG00000091813,Ensembl,"[{'pathway_id': 'path:mmu00983', 'pathway_labe...","[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...",[{'homolog': nan}],,,,,
4,ENSMUSG00000004035,Ensembl,ENSMUSG00000004035,Ensembl,"[{'pathway_id': 'path:mmu00480', 'pathway_labe...","[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc...","[{'pathway_id': 'WP:WP4466', 'pathway_label': ...",[{'homolog': 'ENSG00000213366'}],ENSG00000213366,"[{'pathway_label': 'Glutathione conjugation', ...","[{'go_id': 'GO:0043295', 'go_name': 'glutathio...","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'disease_name': 'Carcinoma, Non Small Cell L..."
5,ENSMUSG00000001155,Ensembl,ENSMUSG00000001155,Ensembl,"[{'pathway_id': 'path:mmu00340', 'pathway_labe...","[{'stringdb_link_to': 'ENSMUSG00000028356', 'E...","[{'pathway_id': 'WP:WP435', 'pathway_label': '...",[{'homolog': 'ENSG00000160282'}],ENSG00000160282,"[{'pathway_label': 'Histidine catabolism', 'pa...","[{'go_id': 'GO:0005515', 'go_name': 'protein b...","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...",[{'disease_name': 'Glutamate formiminotransfer...
6,ENSMUSG00000021336,Ensembl,ENSMUSG00000021336,Ensembl,"[{'pathway_id': nan, 'pathway_label': nan, 'ge...","[{'stringdb_link_to': 'ENSMUSG00000026205', 'E...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...",[{'homolog': 'ENSG00000146039'}],ENSG00000146039,"[{'pathway_label': nan, 'pathway_id': nan}]","[{'go_id': 'GO:0035725', 'go_name': 'sodium io...","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'disease_name': nan, 'HPO': nan, 'NCI': nan,..."
7,ENSMUSG00000027331,Ensembl,ENSMUSG00000027331,Ensembl,"[{'pathway_id': nan, 'pathway_label': nan, 'ge...","[{'stringdb_link_to': 'ENSMUSG00000006398', 'E...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...",[{'homolog': 'ENSG00000128944'}],ENSG00000128944,"[{'pathway_label': nan, 'pathway_id': nan}]","[{'go_id': 'GO:0005737', 'go_name': 'cytoplasm...","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...",[{'disease_name': 'Squamous cell carcinoma of ...
8,ENSMUSG00000028873,Ensembl,ENSMUSG00000028873,Ensembl,"[{'pathway_id': nan, 'pathway_label': nan, 'ge...","[{'stringdb_link_to': 'ENSMUSG00000006398', 'E...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...",[{'homolog': 'ENSG00000134690'}],ENSG00000134690,[{'pathway_label': 'Resolution of Sister Chrom...,"[{'go_id': 'GO:0000775', 'go_name': 'chromosom...","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'disease_name': 'Liver cell carcinoma', 'HPO..."
9,ENSMUSG00000038486,Ensembl,ENSMUSG00000038486,Ensembl,"[{'pathway_id': 'path:mmu04512', 'pathway_labe...","[{'stringdb_link_to': 'ENSMUSG00000022415', 'E...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...",[{'homolog': 'ENSG00000159164'}],ENSG00000159164,[{'pathway_label': 'Toxicity of botulinum toxi...,"[{'go_id': 'GO:0030672', 'go_name': 'synaptic ...","[{'chembl_id': 'CHEMBL:CHEMBL1286', 'drugbank_...","[{'disease_name': 'Schizophrenias', 'HPO': 'HP..."


In [21]:
combined_df = combine_sources(
    bridgedb_df,
    [intact_df],
)

In [10]:
combined_df.shape

(5, 5)

### 3.2 Exporting the database in pickle format

In [None]:
with open("combined_df.pkl", "wb") as out:
    pickle.dump(combined_df, out)
# with open("opentargets_disease_compound_df.pkl", "wb") as out:
#     pickle.dump(opentargets_disease_compound_df, out)

### 3.3 Creating a graph from the annotated dataframe

In [11]:
# combined_df = generator.load_dataframe_from_pickle("combined_df.pkl")
# opentargets_disease_compound_df = generator.load_dataframe_from_pickle(
#     "opentargets_disease_compound_df.pkl"
# )

combined_df.head(15)

Unnamed: 0,identifier,identifier.source,target,target.source,IntAct_interactions
0,ENSMUSG00000067274,Ensembl,ENSMUSG00000067274,Ensembl,"[{'interaction_id': 'EBI-6909752', 'interactor..."
1,ENSMUSG00000000001,Ensembl,ENSMUSG00000000001,Ensembl,"[{'interaction_id': 'EBI-22300555', 'interacto..."
2,ENSMUSG00000084349,Ensembl,ENSMUSG00000084349,Ensembl,[]
3,ENSMUSG00000027490,Ensembl,ENSMUSG00000027490,Ensembl,"[{'interaction_id': 'EBI-1186759', 'interactor..."
4,ENSMUSG00000046840,Ensembl,ENSMUSG00000046840,Ensembl,[]


In [84]:
combined_df["KEGG_pathways"][2]

[{'pathway_id': nan,
  'pathway_label': nan,
  'gene_count': nan,
  'compounds': [{'KEGG_identifier': None, 'name': None}]}]

In [85]:
opentargets_disease_compound_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_disease_compounds
0,UMLS_C0000786,UMLS,EFO_1001255,EFO,"[{'chembl_id': 'CHEMBL:CHEMBL1276308', 'drugba..."
1,UMLS_C0001175,UMLS,EFO_0000765,EFO,"[{'chembl_id': 'CHEMBL:CHEMBL704', 'drugbank_i..."
2,UMLS_C0002103,UMLS,EFO_0005854,EFO,"[{'chembl_id': 'CHEMBL:CHEMBL1201353', 'drugba..."
3,UMLS_C0002171,UMLS,EFO_0004192,EFO,"[{'chembl_id': 'CHEMBL:CHEMBL1200963', 'drugba..."
4,UMLS_C0002940,UMLS,EFO_0004264,EFO,"[{'chembl_id': 'CHEMBL:CHEMBL1076903', 'drugba..."


In [38]:
pygraph = generator.build_networkx_graph(
    combined_df,
    disease_compound=opentargets_disease_compound_df,
    pathway_compound=kegg_compound_df,
    homolog_df_list=[
        opentargets_reactome_df,
        opentargets_compound_df,
    ],
)

Building graph:   0%|          | 0/81 [00:00<?, ?it/s]

Building graph: 100%|██████████| 81/81 [00:00<00:00, 500.39it/s]
100%|██████████| 81/81 [00:00<00:00, 1945.18it/s]


In [22]:
pygraph = generator.build_networkx_graph(combined_df)

Building graph: 100%|██████████| 5/5 [00:00<00:00, 2984.84it/s]
Building graph: 100%|██████████| 5/5 [00:00<00:00, 3236.85it/s]


In [23]:
print(pygraph)

MultiDiGraph with 5 nodes and 0 edges


### 3.4 Store the graph

In [40]:
with open("networkx_graph_test.pkl", "wb") as out:
    pickle.dump(pygraph, out)

### 3.5 Visualize the graph

In [89]:
# pos = nx.circular_layout(pygraph)

# plt.figure(3, figsize=(30, 30))
# nx.draw(pygraph, pos)
# plt.show()

# 4. Exporting Graph to external sources

### 4.1 Cytoscape
Make sure that the Cytoscape is open

In [11]:
from pyBiodatafuse.graph import cytoscape

cytoscape.load_graph(pygraph, network_name="Test network")

KeyError: 'source'

### 4.2 Neo4j

In [17]:
from pyBiodatafuse.graph import neo4j

neo4j.save_graph_to_graphml(pygraph, "networkx_graph_usecase_with_homologs.graphml")

NameError: name 'pygraph' is not defined

##### Steps to load the graph in Neo4j

- Add `.graphml` file in **import** subfolder of the DBMS folder
- Install apoc plugin
- Create `apoc.conf` file:
    ```
    apoc.trigger.enabled=true
    apoc.import.file.enabled=true
    apoc.export.file.enabled=true
    apoc.import.file.use_neo4j_config=true
    ```
- Add `apoc.conf` file to **conf** subfolder of the DBMS folder
- Open Neo4j Browser
- (Optionl, only run if you have imported a graph  before) Remove all the nodes before importing `.graphml` file

    ```MATCH (n) DETACH DELETE n```

- Import `.graphml` file

    ```call apoc.import.graphml('file:///networkx_graph_test.graphml',{readLabels:TRUE})```

- Add indexes after importing the graph for improving the performance of queries

    ```
    create index Gene for (n:Gene) on (n.node_type)
    create index Pathway for (n:Pathway) on (n.node_type)
    create index `Biological Process` for (n:`Biological Process`) on (n.node_type)
    create index `Molecular Function` for (n:`Molecular Function`) on (n.node_type)
    create index `Cellular Component` for (n:`Cellular Component`) on (n.node_type)
    create index Disease for (n:Disease) on (n.node_type)
    create index Compound for (n:Compound) on (n.node_type)
    create index `Side Effect` for (n:`Side Effect`) on (n.node_type)
    ```
    

- Count the number of each node type
    - total (```MATCH (n) RETURN count(n)```) 
        - Gene (```MATCH (n:Gene) RETURN count(n)```)
        - Pathway (```MATCH (n:Pathway) RETURN count(n)```)
            - WikiPathways (```MATCH (n:Pathway {source: "WikiPathways"}) RETURN count(n)```) 
            - OpenTargets, Reactome (```MATCH (n:Pathway {source: "OpenTargets"}) RETURN count(n)```) 
            - MINERVA (```MATCH (n:Pathway {source: "MINERVA"}) RETURN count(n)```) 
        - Biological Process (```MATCH (n:`Biological Process`) RETURN count(n)```) 
        - Molecular Function (```MATCH (n:`Molecular Function`) RETURN count(n)```) 
        - Cellular Component (```MATCH (n:`Cellular Component`) RETURN count(n)```) 
        - Disease (```MATCH (n:Disease) RETURN count(n)```) 
        - Compound (```MATCH (n:Compound) RETURN count(n)```)
        - Side Effect (```MATCH (n:`Side Effect`) RETURN count(n)```) 
- Count the number of each edge type
    - total (```MATCH ()-[r]->() RETURN count(r)```) 
        - interacts_with (```MATCH ()-[r:interacts_with]->() RETURN count(r)```) 
        - part_of (```MATCH ()-[r:part_of]->() RETURN count(r)```) 
            - WikiPathways (```MATCH ()-[r:part_of {source: "WikiPathways"}]->() RETURN count(r)```) 
            - OpenTargets, Reactome (```MATCH ()-[r:part_of {source: "OpenTargets"}]->() RETURN count(r)```) 
            - MINERVA (```MATCH ()-[r:part_of {source: "MINERVA"}]->() RETURN count(r)```) 
        - activates (```MATCH ()-[r:activates]->() RETURN count(r)```) 
        - treats (```MATCH ()-[r:treats]->() RETURN count(r)```) 
        - has_side_effect (```MATCH ()-[r:has_side_effect]->() RETURN count(r)```) 
        - inhibits (```MATCH ()-[r:inhibits]->() RETURN count(r)```) = 71
        - associated_with (```MATCH ()-[r:associated_with]->() RETURN count(r)```) 

- Export the graph as a `.csv` file

    ```call apoc.export.csv.all("networkx_graph_test.csv",{})```