# Example: Polysistice Kidney Disease use case

In [3]:
# Setting up the working directory
import os
import sys

src_path = os.path.abspath(os.path.join("..", "..", "..", "src"))
if src_path not in sys.path:
    sys.path.append(src_path)

print(sys.path)  # Verify the correct src path is included

['', '/home/javi/.local/lib/python3.11/site-packages', '/home/javi/pyBiodatafuse-2/examples/usecases/PKD', '/usr/lib/python311.zip', '/usr/lib/python3.11', '/usr/lib/python3.11/lib-dynload', '/home/javi/pyBiodatafuse-2/.venv/lib/python3.11/site-packages', '/home/javi/pyBiodatafuse-2/src']


In [None]:
# Import modules
import pickle

import pandas as pd
from dotenv import load_dotenv

from pyBiodatafuse import id_mapper
from pyBiodatafuse import human_homologs
from pyBiodatafuse.annotators import (
    bgee,
    disgenet,
    kegg,
    newkegg,
    molmedb,
    opentargets,
    pubchem,
    stringdb,
    wikipathways,
)
from pyBiodatafuse.constants import (
    BGEE_GENE_EXPRESSION_LEVELS_COL,
    DISGENET_DISEASE_COL,
    ENSEMBL_HOMOLOG_COL,
    KEGG_COL,
    OPENTARGETS_DISEASE_COMPOUND_COL,
    OPENTARGETS_GENE_COMPOUND_COL,
    OPENTARGETS_GO_COL,
    OPENTARGETS_REACTOME_COL,
    STRING_PPI_COL,
    WIKIPATHWAYS,
)
from pyBiodatafuse.graph import generator
from pyBiodatafuse.utils import combine_sources, create_harmonized_input_file, combine_with_homologs

# 1. Entity resolution using BridgeDB

### 1.1. Load the input list and convert it to a dataframe

Small set of mouse data used for debugging.

In [5]:
# TEST Mice Ensembl
genes_of_interest = """ENSMUSG00000067274
ENSMUSG00000000001
ENSMUSG00000084349
ENSMUSG00000025428
ENSMUSG00000044533"""

# TEST Mice use case = 
genes_of_interest = """ENSMUSG00000026295
ENSMUSG00000022877
ENSMUSG00000020914
ENSMUSG00000024747
ENSMUSG00000032081
ENSMUSG00000004035
ENSMUSG00000072949
ENSMUSG00000028970
ENSMUSG00000028937
ENSMUSG00000075044"""

# TEST Rat Ensembl
# genes_of_interest = """ENSRNOG00060027926
# ENSRNOG00055005387
# ENSRNOG00060018596
# ENSRNOG00060011358
# ENSRNOG00055009275 
# """

# TEST Human HGNC
# genes_of_interest = """CHRNG
# DMD
# AHR
# SCN4A
# LC25A1
# HTR3A"""

# TEST Human Ensembl
# genes_of_interest = """ENSG00000072080
# ENSG00000113905
# ENSG00000131747
# ENSG00000165092
# ENSG00000110245
# ENSG00000213366
# ENSG00000184227
# ENSG00000085563
# ENSG00000097021
# ENSG00000149742"""

gene_list = genes_of_interest.split("\n")
len(gene_list)

10

Mouse use case

In [None]:
# from pyBiodatafuse import data_loader, id_mapper

# data_input = data_loader.create_df_from_dea("data/full_de_genes_treated_vs_untreated_plus_cpm_fc.csv")
# data_filtered = data_input[data_input['DE'].isin([1, -1])]
# features_filtered = data_filtered['identifier']

# gene_list = features_filtered.tolist()
# print(len(gene_list))

In [6]:
data_input = pd.DataFrame(gene_list, columns=["identifier"])
data_input.head(20)

Unnamed: 0,identifier
0,ENSMUSG00000026295
1,ENSMUSG00000022877
2,ENSMUSG00000020914
3,ENSMUSG00000024747
4,ENSMUSG00000032081
5,ENSMUSG00000004035
6,ENSMUSG00000072949
7,ENSMUSG00000028970
8,ENSMUSG00000028937
9,ENSMUSG00000075044


In [None]:
print(gene_list)

### 1.2. Query BridgeDB

In [7]:
# Mouse usecase
input_species="Mouse"

bridgedb_df, bridgedb_metadata = id_mapper.bridgedb_xref(
   identifiers=data_input,
    input_species=input_species,
    input_datasource="Ensembl",
    output_datasource="All",
)

# TEST Human data
# bridgedb_df, bridgedb_metadata = id_mapper.bridgedb_xref(
#     identifiers=data_input,
#     input_species="Human",
#     input_datasource="Ensembl",
#     output_datasource="All",
# )

bridgedb_df.head(25)

Unnamed: 0,identifier,identifier.source,target,target.source
0,ENSMUSG00000026295,Ensembl,Q8K1I3,Uniprot-TrEMBL
1,ENSMUSG00000026295,Ensembl,ENSMUSG00000026295,Ensembl
2,ENSMUSG00000026295,Ensembl,A0A087WSF3,Uniprot-TrEMBL
3,ENSMUSG00000026295,Ensembl,H3BLP2,Uniprot-TrEMBL
4,ENSMUSG00000026295,Ensembl,75396,NCBI Gene
5,ENSMUSG00000022877,Ensembl,ENSMUSG00000022877,Ensembl
6,ENSMUSG00000022877,Ensembl,94175,NCBI Gene
7,ENSMUSG00000022877,Ensembl,A0A0R4J039,Uniprot-TrEMBL
8,ENSMUSG00000022877,Ensembl,A0A338P6H8,Uniprot-TrEMBL
9,ENSMUSG00000020914,Ensembl,ENSMUSG00000020914,Ensembl


### 1.3 Homologs

In [8]:
ensembl_homologs_df, ensembl_metadata = human_homologs.get_homologs(bridgedb_df=bridgedb_df)
ensembl_homologs_df.head()


Unnamed: 0,identifier,identifier.source,target,target.source,Ensembl_homologs
0,ENSMUSG00000026295,Ensembl,ENSMUSG00000026295,Ensembl,[{'homolog': 'ENSG00000072080'}]
1,ENSMUSG00000022877,Ensembl,ENSMUSG00000022877,Ensembl,[{'homolog': 'ENSG00000113905'}]
2,ENSMUSG00000020914,Ensembl,ENSMUSG00000020914,Ensembl,[{'homolog': 'ENSG00000131747'}]
3,ENSMUSG00000024747,Ensembl,ENSMUSG00000024747,Ensembl,[{'homolog': 'ENSG00000165092'}]
4,ENSMUSG00000032081,Ensembl,ENSMUSG00000032081,Ensembl,[{'homolog': 'ENSG00000110245'}]


In [9]:
ensembl_homologs_df.head(20)

Unnamed: 0,identifier,identifier.source,target,target.source,Ensembl_homologs
0,ENSMUSG00000026295,Ensembl,ENSMUSG00000026295,Ensembl,[{'homolog': 'ENSG00000072080'}]
1,ENSMUSG00000022877,Ensembl,ENSMUSG00000022877,Ensembl,[{'homolog': 'ENSG00000113905'}]
2,ENSMUSG00000020914,Ensembl,ENSMUSG00000020914,Ensembl,[{'homolog': 'ENSG00000131747'}]
3,ENSMUSG00000024747,Ensembl,ENSMUSG00000024747,Ensembl,[{'homolog': 'ENSG00000165092'}]
4,ENSMUSG00000032081,Ensembl,ENSMUSG00000032081,Ensembl,[{'homolog': 'ENSG00000110245'}]
5,ENSMUSG00000004035,Ensembl,ENSMUSG00000004035,Ensembl,[{'homolog': 'ENSG00000213366'}]
6,ENSMUSG00000072949,Ensembl,ENSMUSG00000072949,Ensembl,[{'homolog': 'ENSG00000184227'}]
7,ENSMUSG00000028970,Ensembl,ENSMUSG00000028970,Ensembl,[{'homolog': 'ENSG00000085563'}]
8,ENSMUSG00000028937,Ensembl,ENSMUSG00000028937,Ensembl,[{'homolog': 'ENSG00000097021'}]
9,ENSMUSG00000075044,Ensembl,ENSMUSG00000075044,Ensembl,[{'homolog': 'ENSG00000149742'}]


In [10]:
homologs = ensembl_homologs_df[ENSEMBL_HOMOLOG_COL].apply(
    lambda x: x[0]['homolog'] if isinstance(x, list) and len(x) > 0 and 'homolog' in x[0] else None
).dropna().tolist()

print(homologs)

['ENSG00000072080', 'ENSG00000113905', 'ENSG00000131747', 'ENSG00000165092', 'ENSG00000110245', 'ENSG00000213366', 'ENSG00000184227', 'ENSG00000085563', 'ENSG00000097021', 'ENSG00000149742']


### 1.4 Query homologs

In [11]:
input_species="Human"

data_input_hl = pd.DataFrame(homologs, columns=["identifier"])

bridgedb_df_hl, bridgedb_metadata_hl = id_mapper.bridgedb_xref(
    identifiers=data_input_hl,
    input_species="Human",
    input_datasource="Ensembl",
    output_datasource="All",
)

bridgedb_df.head(25)

Unnamed: 0,identifier,identifier.source,target,target.source
0,ENSMUSG00000026295,Ensembl,Q8K1I3,Uniprot-TrEMBL
1,ENSMUSG00000026295,Ensembl,ENSMUSG00000026295,Ensembl
2,ENSMUSG00000026295,Ensembl,A0A087WSF3,Uniprot-TrEMBL
3,ENSMUSG00000026295,Ensembl,H3BLP2,Uniprot-TrEMBL
4,ENSMUSG00000026295,Ensembl,75396,NCBI Gene
5,ENSMUSG00000022877,Ensembl,ENSMUSG00000022877,Ensembl
6,ENSMUSG00000022877,Ensembl,94175,NCBI Gene
7,ENSMUSG00000022877,Ensembl,A0A0R4J039,Uniprot-TrEMBL
8,ENSMUSG00000022877,Ensembl,A0A338P6H8,Uniprot-TrEMBL
9,ENSMUSG00000020914,Ensembl,ENSMUSG00000020914,Ensembl


# 2. Step-by-step graph generation

### 2.1. Gene-Disease edges


In [None]:
load_dotenv('disgenet.env')
disgenet_api_key = os.getenv("DISGENET_API_KEY")


In [None]:
disgenet_df, disgenet_metadata = disgenet.get_gene_disease(
    api_key=disgenet_api_key, bridgedb_df=bridgedb_df_hl
)
disgenet_df.head()

In [None]:
disgenet_df[DISGENET_DISEASE_COL][0]

### 2.2 Disease-Compound edges

In [None]:
# Prepare the input to use DISGENET output as seed for OpenTargets
disease_mapping_df = create_harmonized_input_file(disgenet_df, DISGENET_DISEASE_COL, "EFO", "UMLS")
disease_mapping_df.head()

In [None]:
(
    opentargets_disease_compound_df,
    opentargets_disease_compound_metadata,
) = opentargets.get_disease_compound_interactions(disease_mapping_df)
opentargets_disease_compound_df.head()

In [None]:
opentargets_disease_compound_df[OPENTARGETS_DISEASE_COMPOUND_COL][0]

### 2.3 Compound Annotation

#### Compounds from OpenTargets

In [None]:
opentargets_compound_df, opentargets_compound_metadata = opentargets.get_gene_compound_interactions(
    bridgedb_df=bridgedb_df_hl
)
opentargets_compound_df.head()

In [None]:
opentargets_compound_df[OPENTARGETS_GENE_COMPOUND_COL][1]

#### Compounds from PubChem

In [None]:
pubchem_assay_df, pubchem_assay_metadata = pubchem.get_protein_compound_screened(
        bridgedb_df=bridgedb_df_hl
    )

### 2.4 Gene-Pathways edges

#### Pathways from WikiPathways

In [None]:
wikipathways_df, wikipathways_metadata = wikipathways.get_gene_wikipathways(bridgedb_df=bridgedb_df)
wikipathways_df.head()

In [None]:
wikipathways_df[WIKIPATHWAYS][0]

#### Pathways from KEGG

In [12]:
kegg_df, kegg_metadata = kegg.get_pathways(bridgedb_df)
kegg_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,KEGG_pathways
0,ENSMUSG00000026295,Ensembl,75396,NCBI Gene,"[{'pathway_id': nan, 'pathway_label': nan, 'ge..."
1,ENSMUSG00000022877,Ensembl,94175,NCBI Gene,"[{'pathway_id': nan, 'pathway_label': nan, 'ge..."
2,ENSMUSG00000020914,Ensembl,21973,NCBI Gene,"[{'pathway_id': 'mmu01524', 'pathway_label': '..."
3,ENSMUSG00000024747,Ensembl,26358,NCBI Gene,"[{'pathway_id': 'mmu00830', 'pathway_label': '..."
4,ENSMUSG00000032081,Ensembl,11814,NCBI Gene,"[{'pathway_id': 'mmu03320', 'pathway_label': '..."


In [14]:
kegg_df[KEGG_COL][3]

[{'pathway_id': 'mmu00830',
  'pathway_label': 'Retinol metabolism - Mus musculus (house mouse)',
  'gene_count': 101,
  'compounds': [{'KEGG_identifier': 'C00376'},
   {'KEGG_identifier': 'C00473'},
   {'KEGG_identifier': 'C00777'},
   {'KEGG_identifier': 'C00778'},
   {'KEGG_identifier': 'C00899'},
   {'KEGG_identifier': 'C02075'},
   {'KEGG_identifier': 'C02094'},
   {'KEGG_identifier': 'C02110'},
   {'KEGG_identifier': 'C02588'},
   {'KEGG_identifier': 'C03455'},
   {'KEGG_identifier': 'C05914'},
   {'KEGG_identifier': 'C05915'},
   {'KEGG_identifier': 'C05916'},
   {'KEGG_identifier': 'C05917'},
   {'KEGG_identifier': 'C11061'},
   {'KEGG_identifier': 'C15492'},
   {'KEGG_identifier': 'C15493'},
   {'KEGG_identifier': 'C16677'},
   {'KEGG_identifier': 'C16678'},
   {'KEGG_identifier': 'C16679'},
   {'KEGG_identifier': 'C16680'},
   {'KEGG_identifier': 'C16681'},
   {'KEGG_identifier': 'C16682'},
   {'KEGG_identifier': 'C16683'},
   {'KEGG_identifier': 'C21797'}]},
 {'pathway_id': 

#### Reactome pathways from OpenTargets

In [None]:
opentargets_reactome_df, opentargets_reactome_metadata = opentargets.get_gene_reactome_pathways(
    bridgedb_df=bridgedb_df_hl
)
opentargets_reactome_df.head()

In [None]:
opentargets_reactome_df[OPENTARGETS_REACTOME_COL][0]

### 2.5 Gene Ontology from OpenTargets

In [None]:
opentargets_go_df, opentargets_go_metadata = opentargets.get_gene_go_process(
    bridgedb_df=bridgedb_df_hl
)
opentargets_go_df.head()

In [None]:
opentargets_go_df[OPENTARGETS_GO_COL][0]

### 2.6. Protein-Protein Interactions

In [None]:
input_species = "Mouse"
ppi_df, ppi_metadata = stringdb.get_ppi(bridgedb_df=bridgedb_df, species=input_species)
ppi_df.head()

In [None]:
ppi_df[STRING_PPI_COL][3]

### 2.7 Gene expression edges

In [None]:
bgee_df, bgee_metadata = bgee.get_gene_expression(bridgedb_df=bridgedb_df)
bgee_df.head()

In [None]:
bgee_df[BGEE_GENE_EXPRESSION_LEVELS_COL][1]

### 2.8 Transporter Inhibitors

In [None]:
inhibitor_df, inhibitor_metadata = molmedb.get_gene_compound_inhibitor(bridgedb_df=bridgedb_df_hl)
inhibitor_df.head()

# 3. Generating Graph

### 3.1 Combing all the results into single dataframe

In [None]:
#        bgee_df,
#        disgenet_df,
#        minerva_df,
#        opentargets_reactome_df,
#        opentargets_go_df,
#        opentargets_compound_df,
#        inhibitor_df,
#        kegg_df,

combined_df = combine_sources(
    bridgedb_df,
    [
        wikipathways_df,
        ppi_df,
        ensembl_homologs_df,
        kegg_df
    ],
)

combined_df = combine_with_homologs(
    combined_df,
    [
        opentargets_reactome_df,
        opentargets_go_df,
        opentargets_compound_df,
        disgenet_df,
        inhibitor_df
    ],
)



combined_df.head(10)

In [None]:
combined_df.shape

### 3.2 Exporting the database in pickle format

In [None]:
with open("combined_df.pkl", "wb") as out:
    pickle.dump(combined_df, out)
# with open("opentargets_disease_compound_df.pkl", "wb") as out:
#     pickle.dump(opentargets_disease_compound_df, out)

### 3.3 Creating a graph from the annotated dataframe

In [None]:
# combined_df = generator.load_dataframe_from_pickle("combined_df.pkl")
# opentargets_disease_compound_df = generator.load_dataframe_from_pickle(
#     "opentargets_disease_compound_df.pkl"
# )

combined_df.head(15)

In [None]:
combined_df['KEGG_pathways'][2]

In [None]:
opentargets_disease_compound_df.head()

In [None]:
pygraph = generator.build_networkx_graph(combined_df, homolog_df_list = [opentargets_compound_df,
        disgenet_df])

In [None]:
print(pygraph)

### 3.4 Store the graph

In [None]:
with open("networkx_graph_test.pkl", "wb") as out:
    pickle.dump(pygraph, out)

### 3.5 Visualize the graph

In [None]:
# pos = nx.circular_layout(pygraph)

# plt.figure(3, figsize=(30, 30))
# nx.draw(pygraph, pos)
# plt.show()

# 4. Exporting Graph to external sources

### 4.1 Cytoscape
Make sure that the Cytoscape is open

In [None]:
from pyBiodatafuse.graph import cytoscape

cytoscape.load_graph(pygraph, network_name="Test network")

### 4.2 Neo4j

In [None]:
from pyBiodatafuse.graph import neo4j

neo4j.save_graph_to_graphml(pygraph, "networkx_graph_usecase.graphml")

##### Steps to load the graph in Neo4j

- Add `.graphml` file in **import** subfolder of the DBMS folder
- Install apoc plugin
- Create `apoc.conf` file:
    ```
    apoc.trigger.enabled=true
    apoc.import.file.enabled=true
    apoc.export.file.enabled=true
    apoc.import.file.use_neo4j_config=true
    ```
- Add `apoc.conf` file to **conf** subfolder of the DBMS folder
- Open Neo4j Browser
- (Optionl, only run if you have imported a graph  before) Remove all the nodes before importing `.graphml` file

    ```MATCH (n) DETACH DELETE n```

- Import `.graphml` file

    ```call apoc.import.graphml('file:///networkx_graph_test.graphml',{readLabels:TRUE})```

- Add indexes after importing the graph for improving the performance of queries

    ```
    create index Gene for (n:Gene) on (n.node_type)
    create index Pathway for (n:Pathway) on (n.node_type)
    create index `Biological Process` for (n:`Biological Process`) on (n.node_type)
    create index `Molecular Function` for (n:`Molecular Function`) on (n.node_type)
    create index `Cellular Component` for (n:`Cellular Component`) on (n.node_type)
    create index Disease for (n:Disease) on (n.node_type)
    create index Compound for (n:Compound) on (n.node_type)
    create index `Side Effect` for (n:`Side Effect`) on (n.node_type)
    ```
    

- Count the number of each node type
    - total (```MATCH (n) RETURN count(n)```) 
        - Gene (```MATCH (n:Gene) RETURN count(n)```)
        - Pathway (```MATCH (n:Pathway) RETURN count(n)```)
            - WikiPathways (```MATCH (n:Pathway {source: "WikiPathways"}) RETURN count(n)```) 
            - OpenTargets, Reactome (```MATCH (n:Pathway {source: "OpenTargets"}) RETURN count(n)```) 
            - MINERVA (```MATCH (n:Pathway {source: "MINERVA"}) RETURN count(n)```) 
        - Biological Process (```MATCH (n:`Biological Process`) RETURN count(n)```) 
        - Molecular Function (```MATCH (n:`Molecular Function`) RETURN count(n)```) 
        - Cellular Component (```MATCH (n:`Cellular Component`) RETURN count(n)```) 
        - Disease (```MATCH (n:Disease) RETURN count(n)```) 
        - Compound (```MATCH (n:Compound) RETURN count(n)```)
        - Side Effect (```MATCH (n:`Side Effect`) RETURN count(n)```) 
- Count the number of each edge type
    - total (```MATCH ()-[r]->() RETURN count(r)```) 
        - interacts_with (```MATCH ()-[r:interacts_with]->() RETURN count(r)```) 
        - part_of (```MATCH ()-[r:part_of]->() RETURN count(r)```) 
            - WikiPathways (```MATCH ()-[r:part_of {source: "WikiPathways"}]->() RETURN count(r)```) 
            - OpenTargets, Reactome (```MATCH ()-[r:part_of {source: "OpenTargets"}]->() RETURN count(r)```) 
            - MINERVA (```MATCH ()-[r:part_of {source: "MINERVA"}]->() RETURN count(r)```) 
        - activates (```MATCH ()-[r:activates]->() RETURN count(r)```) 
        - treats (```MATCH ()-[r:treats]->() RETURN count(r)```) 
        - has_side_effect (```MATCH ()-[r:has_side_effect]->() RETURN count(r)```) 
        - inhibits (```MATCH ()-[r:inhibits]->() RETURN count(r)```) = 71
        - associated_with (```MATCH ()-[r:associated_with]->() RETURN count(r)```) 

- Export the graph as a `.csv` file

    ```call apoc.export.csv.all("networkx_graph_test.csv",{})```