# Example: Polycystic Kidney Disease use case

In [1]:
# Import modules
import os
import pickle

import pandas as pd

import pyBiodatafuse.annotators as ann
import pyBiodatafuse.constants as Cons
from pyBiodatafuse import human_homologs, id_mapper
from pyBiodatafuse.graph import cytoscape, generator, neo4j
from pyBiodatafuse.graph.rdf import BDFGraph

# from pyBiodatafuse.graph.rdf.graphdb import GraphDBManager
from pyBiodatafuse.utils import combine_sources, combine_with_homologs, create_harmonized_input_file

  from .autonotebook import tqdm as notebook_tqdm


# 1. Entity resolution using BridgeDB

### 1.1. Load the input list and convert it to a dataframe
Here we use mouse genes (Ensembl gene identifiers) as the starting point.

In [2]:
# Anchor all cached artefacts in a "data" folder under the current working directory.
base_dir = os.path.abspath(os.curdir)
DATA_DIR = os.path.join(base_dir, "data")
# exist_ok=True makes this a no-op when the folder is already there.
os.makedirs(DATA_DIR, exist_ok=True)

In [3]:
# Input identifiers for the use case, one Ensembl identifier per line.
genes_of_interest = """ENSMUSG00000026295
ENSMUSG00000022877
ENSMUSG00000020914
ENSMUSG00000024747
ENSMUSG00000032081
ENSMUSG00000004035
ENSMUSG00000072949
ENSMUSG00000028970
ENSMUSG00000028937
ENSMUSG00000075044
ENSMUSG00000067274
ENSMUSG00000000001
ENSMUSG00000030619
ENSMUSG00000027490
ENSMUSG00000022472
ENSMUSG00000059552"""

# One list entry per line of the block above.
gene_list = genes_of_interest.splitlines()
len(gene_list)

16

In [4]:
# Wrap the identifiers in a single-column frame; this frame is the input of the BridgeDb query below.
data_input = pd.DataFrame({"identifier": gene_list})
data_input.head()

Unnamed: 0,identifier
0,ENSMUSG00000026295
1,ENSMUSG00000022877
2,ENSMUSG00000020914
3,ENSMUSG00000024747
4,ENSMUSG00000032081


### 1.2. Query BridgeDB

In [5]:
# Cache files for the BridgeDb mapping of the mouse gene list and its metadata.
pickle_path = f"{DATA_DIR}/PKD_gene_list.pkl"
metadata_path = f"{DATA_DIR}/PKD_gene_list_metadata.pkl"

# Mouse usecase
input_species = "Mouse"

# Reuse the cache only when BOTH files exist; a dataframe pickle without its metadata
# pickle would otherwise fail on load with FileNotFoundError.
if os.path.exists(pickle_path) and os.path.exists(metadata_path):
    # NOTE: unpickling executes arbitrary code - only load cache files written by this notebook.
    bridgedb_df = pd.read_pickle(pickle_path)
    with open(metadata_path, "rb") as file:
        bridgedb_metadata = pickle.load(file)
else:
    # No complete cache: query BridgeDb and store both results.
    bridgedb_df, bridgedb_metadata = id_mapper.bridgedb_xref(
        identifiers=data_input,
        input_species=input_species,
        input_datasource="Ensembl",
        output_datasource="All",
    )
    bridgedb_df.to_pickle(pickle_path)
    with open(metadata_path, "wb") as file:
        pickle.dump(bridgedb_metadata, file)

bridgedb_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source
0,ENSMUSG00000026295,Ensembl,4434677,Affy
1,ENSMUSG00000026295,Ensembl,Q8K1I3,Uniprot-TrEMBL
2,ENSMUSG00000026295,Ensembl,A_51_P166152,Agilent
3,ENSMUSG00000026295,Ensembl,5314352,Affy
4,ENSMUSG00000026295,Ensembl,4638627,Affy


### 1.3 Homologs

In [6]:
# Cache files for the homolog table and its metadata.
pickle_path = f"{DATA_DIR}/homologs.pkl"
metadata_path = f"{DATA_DIR}/homologs_metadata.pkl"

# The metadata is cached alongside the dataframe so that `ensembl_metadata` is defined
# on both branches (with only the dataframe cached, a cache hit left it undefined).
if os.path.exists(pickle_path) and os.path.exists(metadata_path):
    # NOTE: unpickling executes arbitrary code - only load cache files written by this notebook.
    ensembl_homologs_df = pd.read_pickle(pickle_path)
    with open(metadata_path, "rb") as file:
        ensembl_metadata = pickle.load(file)
else:
    ensembl_homologs_df, ensembl_metadata = human_homologs.get_homologs(bridgedb_df=bridgedb_df)
    ensembl_homologs_df.to_pickle(pickle_path)
    with open(metadata_path, "wb") as file:
        pickle.dump(ensembl_metadata, file)

ensembl_homologs_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,Ensembl_homologs
0,ENSMUSG00000026295,Ensembl,ENSMUSG00000026295,Ensembl,[{'homolog': 'ENSG00000072080'}]
1,ENSMUSG00000022877,Ensembl,ENSMUSG00000022877,Ensembl,[{'homolog': 'ENSG00000113905'}]
2,ENSMUSG00000020914,Ensembl,ENSMUSG00000020914,Ensembl,[{'homolog': 'ENSG00000131747'}]
3,ENSMUSG00000024747,Ensembl,ENSMUSG00000024747,Ensembl,[{'homolog': 'ENSG00000165092'}]
4,ENSMUSG00000032081,Ensembl,ENSMUSG00000032081,Ensembl,[{'homolog': 'ENSG00000110245'}]


In [7]:
def _first_homolog(entry):
    """Return the "homolog" value of the first item of `entry`.

    Gives None when `entry` is not a list, is empty, or its first item has no
    "homolog" key.
    """
    if not isinstance(entry, list) or len(entry) == 0:
        return None
    first_item = entry[0]
    if "homolog" in first_item:
        return first_item["homolog"]
    return None


# One homolog identifier per row; rows without a homolog are dropped.
homolog_series = ensembl_homologs_df[Cons.ENSEMBL_HOMOLOG_COL].apply(_first_homolog)
homologs = homolog_series.dropna().tolist()

len(homologs)

15

### 1.4 Query homologs

In [8]:
# Cache files for the BridgeDb mapping of the human homologs and its metadata.
pickle_path = f"{DATA_DIR}/PKD_homolog_df.pkl"
metadata_path = f"{DATA_DIR}/PKD_homolog_df_metadata.pkl"

input_species = "Human"
data_input_hl = pd.DataFrame(homologs, columns=["identifier"])

# Reuse the cache only when BOTH files exist; a dataframe pickle without its metadata
# pickle would otherwise fail on load with FileNotFoundError.
if os.path.exists(pickle_path) and os.path.exists(metadata_path):
    # NOTE: unpickling executes arbitrary code - only load cache files written by this notebook.
    bridgedb_df_hl = pd.read_pickle(pickle_path)
    with open(metadata_path, "rb") as file:
        bridgedb_metadata_hl = pickle.load(file)
else:
    bridgedb_df_hl, bridgedb_metadata_hl = id_mapper.bridgedb_xref(
        identifiers=data_input_hl,
        # Use the variable set above instead of repeating the literal.
        input_species=input_species,
        input_datasource="Ensembl",
        output_datasource="All",
    )
    bridgedb_df_hl.to_pickle(pickle_path)
    with open(metadata_path, "wb") as file:
        pickle.dump(bridgedb_metadata_hl, file)

bridgedb_df_hl.head()

Unnamed: 0,identifier,identifier.source,target,target.source
0,ENSG00000072080,Ensembl,XM_011511699,RefSeq
1,ENSG00000072080,Ensembl,XM_005246102,RefSeq
2,ENSG00000072080,Ensembl,C9J6K0,Uniprot-TrEMBL
3,ENSG00000072080,Ensembl,GO:0010951,Gene Ontology
4,ENSG00000072080,Ensembl,11729425_a_at,Affy


# 2. Step-by-step graph generation

The following annotators have been used:
- DisGeNet
- OpenTargets
- PubChem
- WikiPathways
- KEGG
- STRING
- IntAct
- Bgee
- MolMeDB

### 2.1. Gene-Disease edges


In [None]:
# NOTE(review): `load_dotenv` is not imported in this notebook's import cell - presumably
# `from dotenv import load_dotenv` (python-dotenv); confirm and add it there.
load_dotenv("disgenet.env")

disgenet_api_key = os.getenv("DISGENET_API_KEY")
# Never print the key itself: cell outputs are saved with the notebook and would leak the secret.
print("DISGENET API key loaded:", bool(disgenet_api_key))

In [None]:
# Gene-disease annotations for the human homolog mapping.
# NOTE(review): `disgenet` is not imported in this notebook's import cell - confirm how it
# should be brought into scope (e.g. via the `ann` alias).
disgenet_df, disgenet_metadata = disgenet.get_gene_disease(
    bridgedb_df=bridgedb_df_hl,
    api_key=disgenet_api_key,
)
disgenet_df.head()

In [None]:
# Inspect the disease annotations of the first row. The column constant is read through the
# `Cons` alias, the only way the constants module is imported in this notebook.
disgenet_df[Cons.DISGENET_DISEASE_COL][0]

### 2.2 Disease-Compound edges

In [None]:
# Prepare the input to use DISGENET output as seed for OpenTargets.
# The column constant is read through the `Cons` alias, the only way the constants
# module is imported in this notebook.
disease_mapping_df = create_harmonized_input_file(
    disgenet_df, Cons.DISGENET_DISEASE_COL, "EFO", "UMLS"
)
disease_mapping_df.head()

In [None]:
# Disease-compound annotations seeded with the harmonized disease identifiers.
# NOTE(review): `opentargets` is not imported in this notebook's import cell - confirm how it
# should be brought into scope (e.g. via the `ann` alias).
opentargets_disease_compound_df, opentargets_disease_compound_metadata = (
    opentargets.get_disease_compound_interactions(disease_mapping_df)
)
opentargets_disease_compound_df.head()

In [None]:
# Inspect the first row; the column constant is read through the `Cons` alias,
# the only way the constants module is imported in this notebook.
opentargets_disease_compound_df[Cons.OPENTARGETS_DISEASE_COMPOUND_COL][0]

### 2.3 Compound Annotation

#### Compounds from OpenTargets

In [None]:
# Gene-compound annotations for the human homolog mapping.
(
    opentargets_compound_df,
    opentargets_compound_metadata,
) = opentargets.get_gene_compound_interactions(bridgedb_df=bridgedb_df_hl)
opentargets_compound_df.head()

In [None]:
# Inspect the first row; the column constant is read through the `Cons` alias,
# the only way the constants module is imported in this notebook.
opentargets_compound_df[Cons.OPENTARGETS_GENE_COMPOUND_COL][0]

#### Compounds from PubChem

In [None]:
# PubChem annotations for the human homolog mapping.
# NOTE(review): `pubchem` is not imported in this notebook's import cell - confirm how it
# should be brought into scope (e.g. via the `ann` alias).
(
    pubchem_assay_df,
    pubchem_assay_metadata,
) = pubchem.get_protein_compound_screened(bridgedb_df=bridgedb_df_hl)

### 2.4 Gene-Pathways edges

#### Pathways from WikiPathways

In [None]:
# WikiPathways annotations for the mouse mapping.
# NOTE(review): `wikipathways` is not imported in this notebook's import cell - confirm how it
# should be brought into scope (e.g. via the `ann` alias).
wikipathways_df, wikipathways_metadata = wikipathways.get_gene_wikipathways(
    bridgedb_df=bridgedb_df,
)
wikipathways_df.head()

In [None]:
# Inspect the first row; the column constant is read through the `Cons` alias,
# the only way the constants module is imported in this notebook.
wikipathways_df[Cons.WIKIPATHWAYS][0]

#### Pathways from KEGG

In [None]:
# KEGG Compound identifiers of the metabolites of interest, one per line.
compounds_of_interest = """C01089
C00020
C02571
C00212
C00041
C00152
C00049
C00719
C00114
C00158
C00300
C01026
C00122
C00031
C00025
C00064
C00037
C00135
C00262
C00130
C00294
C00407
C00186
C00123
C00149
C00073
C00137
C00003
C00153
C00079
C00588
C00346
C04230
C00245
C00188
C00082
C00043
C00105
C00106
C00299
C00183"""

metabolite_list = compounds_of_interest.split("\n")
# Use a dedicated name for the compound input: reusing `data_input` would silently replace
# the gene input that the BridgeDb gene query earlier in the notebook reads when re-run.
data_input_cmp = pd.DataFrame(metabolite_list, columns=["identifier"])

# Map the KEGG Compound identifiers with BridgeDb.
bridgdb_df_cmp, bridgdb_metadata = id_mapper.bridgedb_xref(
    identifiers=data_input_cmp,
    input_species="Mouse",
    input_datasource="KEGG Compound",
    output_datasource="All",
)
bridgdb_df_cmp.head()

In [None]:
# KEGG pathway annotations for the mouse mapping.
# NOTE(review): `kegg` is not imported in this notebook's import cell - confirm how it
# should be brought into scope (e.g. via the `ann` alias).
(kegg_df, kegg_metadata) = kegg.get_pathways(
    bridgedb_df,
)
kegg_df.head()

In [None]:
# Look at the "KEGG_pathways" entry stored under index label 2.
kegg_pathways_column = kegg_df["KEGG_pathways"]
kegg_pathways_column[2]

In [None]:
# KEGG compound annotations for the BridgeDb-mapped metabolite list.
kegg_compound_df = kegg.get_compounds(
    bridgdb_df_cmp,
)
kegg_compound_df.head()

In [None]:
# Display the first rows of `kegg_df` again.
kegg_df.head()

In [None]:
# Look at the "KEGG_compounds" entry stored under index label 0.
kegg_compounds_column = kegg_compound_df["KEGG_compounds"]
kegg_compounds_column[0]

In [29]:
# data_input_compounds = pd.DataFrame(kegg_identifiers, columns=["identifier"])
# data_input_compounds.head()

# bridgdb_df, bridgdb_metadata = id_mapper.bridgedb_xref(
#     identifiers=data_input_compounds,
#     input_species="Mouse",
#     input_datasource="KEGG Compound",
#     output_datasource="PubChem Compound",
# )
# bridgdb_df.head(25)

In [None]:
import requests

# Ad-hoc check: fetch the BridgeDb cross-references of a single ChEBI identifier.
chebi_id = "15422"

# Build the URL from `chebi_id` so that changing the identifier above changes the query.
url = f"http://webservice.bridgedb.org/Human/xrefs/ChEBI/{chebi_id}"
# A timeout keeps the cell from hanging indefinitely when the service does not answer.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    # Parse the response to extract metabolite name(s)
    lines = response.text.splitlines()
    print(lines)
    # Split every tab-separated line once and keep the third field where it exists.
    fields_per_line = [line.split("\t") for line in lines]
    names = [fields[2] for fields in fields_per_line if len(fields) > 2]
    print(names if names else "No metabolite names found.")
else:
    print(f"Error: Unable to retrieve data (status code {response.status_code}).")

#### Reactome pathways from OpenTargets

In [None]:
# Reactome pathway annotations (via OpenTargets) for the human homolog mapping.
(
    opentargets_reactome_df,
    opentargets_reactome_metadata,
) = opentargets.get_gene_reactome_pathways(bridgedb_df=bridgedb_df_hl)
opentargets_reactome_df.head()

In [None]:
# Inspect the first row; the column constant is read through the `Cons` alias,
# the only way the constants module is imported in this notebook.
opentargets_reactome_df[Cons.OPENTARGETS_REACTOME_COL][0]

### 2.5 Gene Ontology from OpenTargets

In [None]:
# Gene Ontology annotations (via OpenTargets) for the human homolog mapping.
(
    opentargets_go_df,
    opentargets_go_metadata,
) = opentargets.get_gene_go_process(bridgedb_df=bridgedb_df_hl)
opentargets_go_df.head()

In [None]:
# Inspect the first row; the column constant is read through the `Cons` alias,
# the only way the constants module is imported in this notebook.
opentargets_go_df[Cons.OPENTARGETS_GO_COL][0]

### 2.6. Protein-Protein Interactions

In [None]:
# Switch back to the mouse mapping for the protein-protein interaction query.
# NOTE(review): `stringdb` is not imported in this notebook's import cell - confirm how it
# should be brought into scope (e.g. via the `ann` alias).
input_species = "Mouse"
ppi_df, ppi_metadata = stringdb.get_ppi(
    species=input_species,
    bridgedb_df=bridgedb_df,
)
ppi_df.head()

In [None]:
# Inspect the row with index label 3; the column constant is read through the `Cons` alias,
# the only way the constants module is imported in this notebook.
ppi_df[Cons.STRING_PPI_COL][3]

### IntAct (work in progress)

In [None]:
# IntAct interaction annotations for the mouse mapping.
# NOTE(review): `intact` is not imported in this notebook's import cell - confirm how it
# should be brought into scope (e.g. via the `ann` alias).
(intact_df, intact_metadata) = intact.get_interactions(
    bridgedb_df,
)
intact_df.head()

In [None]:
# Look at the "IntAct_interactions" entry stored under index label 0.
intact_interactions_column = intact_df["IntAct_interactions"]
intact_interactions_column[0]

In [None]:
# IntAct compound-interaction annotations for the mouse mapping.
(
    intact_compound_df,
    intact_compound_metadata,
) = intact.get_compound_interactions(bridgedb_df)

In [None]:
# Call the IntAct helper without arguments and print whatever it returns.
# NOTE(review): the shape of the returned object is not visible here - confirm before relying on it.
intact_data = intact.get_compound_related_interactions()
print(intact_data)

### 2.7 Gene expression edges

In [None]:
# Bgee gene-expression annotations for the mouse mapping.
# NOTE(review): `bgee` is not imported in this notebook's import cell - confirm how it
# should be brought into scope (e.g. via the `ann` alias).
bgee_df, bgee_metadata = bgee.get_gene_expression(
    bridgedb_df=bridgedb_df,
)
bgee_df.head()

In [None]:
# Inspect the row with index label 1; the column constant is read through the `Cons` alias,
# the only way the constants module is imported in this notebook.
bgee_df[Cons.BGEE_GENE_EXPRESSION_LEVELS_COL][1]

### 2.8 Transporter Inhibitors

In [None]:
# MolMeDB inhibitor annotations for the human homolog mapping.
# NOTE(review): `molmedb` is not imported in this notebook's import cell - confirm how it
# should be brought into scope (e.g. via the `ann` alias).
inhibitor_df, inhibitor_metadata = molmedb.get_gene_compound_inhibitor(
    bridgedb_df=bridgedb_df_hl,
)
inhibitor_df.head()

# 3. Generating Graph

### 3.1 Combining all the results into a single dataframe

In [None]:
# Annotation frames available for combining:
#        bgee_df,
#        disgenet_df,
#        minerva_df,
#        opentargets_reactome_df,
#        opentargets_go_df,
#        opentargets_compound_df,
#        inhibitor_df,
#        kegg_df,
#        wikipathways_df,
#        ppi_df,
#        ensembl_homologs_df,

# Frames merged directly onto the mouse BridgeDb mapping.
direct_annotation_dfs = [kegg_df, ppi_df, wikipathways_df, ensembl_homologs_df]
# Frames queried with the human homolog mapping, merged in through the homolog column.
homolog_annotation_dfs = [
    opentargets_reactome_df,
    opentargets_go_df,
    opentargets_compound_df,
    disgenet_df,
]

combined_df = combine_sources(bridgedb_df, direct_annotation_dfs)
combined_df = combine_with_homologs(combined_df, homolog_annotation_dfs)

combined_df.head(10)

In [21]:
# Keep the IntAct-only combination under its own name. Assigning it to `combined_df` would
# discard the fully annotated frame built in the previous cell, which later cells read
# (e.g. its "KEGG_pathways" column) and export.
intact_combined_df = combine_sources(
    bridgedb_df,
    [intact_df],
)

In [None]:
# Show the (rows, columns) dimensions of the combined dataframe.
combined_df.shape

### 3.2 Exporting the database in pickle format

In [None]:
# Write the combined dataframe to "combined_df.pkl" (path is relative to the working directory).
with open("combined_df.pkl", "wb") as pickle_file:
    pickle.dump(combined_df, pickle_file)
# with open("opentargets_disease_compound_df.pkl", "wb") as out:
#     pickle.dump(opentargets_disease_compound_df, out)

### 3.3 Creating a graph from the annotated dataframe

In [None]:
# Optional: reload the exported frames instead of recomputing them.
# combined_df = generator.load_dataframe_from_pickle("combined_df.pkl")
# opentargets_disease_compound_df = generator.load_dataframe_from_pickle(
#     "opentargets_disease_compound_df.pkl"
# )

# Preview the first 15 rows of the combined frame.
combined_df.head(n=15)

In [None]:
# Look at the "KEGG_pathways" entry stored under index label 2 of the combined frame.
combined_kegg_column = combined_df["KEGG_pathways"]
combined_kegg_column[2]

In [None]:
# Preview the disease-compound frame that is passed to the graph builder below.
opentargets_disease_compound_df.head()

In [None]:
# Build the graph from the combined frame plus the disease-compound and
# pathway-compound frames and the homolog-based annotation frames.
graph_homolog_dfs = [opentargets_reactome_df, opentargets_compound_df]
pygraph = generator.build_networkx_graph(
    combined_df,
    homolog_df_list=graph_homolog_dfs,
    disease_compound=opentargets_disease_compound_df,
    pathway_compound=kegg_compound_df,
)

In [None]:
# Build the graph from the combined frame only.
# NOTE(review): this reassigns `pygraph`, replacing the graph built in the previous cell
# (the one with the disease-compound, pathway-compound and homolog frames) - confirm which
# of the two graphs is meant to be stored and exported below.
pygraph = generator.build_networkx_graph(combined_df)

In [None]:
# Print the string form of the graph object.
print(pygraph)

### 3.4 Store the graph

In [40]:
# Write the graph object to "networkx_graph_test.pkl" (path is relative to the working directory).
with open("networkx_graph_test.pkl", "wb") as graph_file:
    pickle.dump(pygraph, graph_file)

### 3.5 Visualize the graph

In [89]:
# pos = nx.circular_layout(pygraph)

# plt.figure(3, figsize=(30, 30))
# nx.draw(pygraph, pos)
# plt.show()

# 4. Exporting Graph to external sources

### 4.1 Cytoscape
Make sure that Cytoscape is open before running the next cell.

In [None]:
# `cytoscape` is already imported in the import cell at the top of the notebook,
# so it is not re-imported here.
# Send the graph to Cytoscape under the given network name.
cytoscape.load_graph(pygraph, network_name="Test network")

### 4.2 Neo4j

In [None]:
# `neo4j` is already imported in the import cell at the top of the notebook,
# so it is not re-imported here.
# Export the graph as a GraphML file.
neo4j.save_graph_to_graphml(pygraph, "networkx_graph_usecase_with_homologs.graphml")

##### Steps to load the graph in Neo4j

- Add `.graphml` file in **import** subfolder of the DBMS folder
- Install apoc plugin
- Create `apoc.conf` file:
    ```
    apoc.trigger.enabled=true
    apoc.import.file.enabled=true
    apoc.export.file.enabled=true
    apoc.import.file.use_neo4j_config=true
    ```
- Add `apoc.conf` file to **conf** subfolder of the DBMS folder
- Open Neo4j Browser
- (Optional, only run if you have imported a graph before) Remove all the nodes before importing the `.graphml` file

    ```MATCH (n) DETACH DELETE n```

- Import `.graphml` file

    ```call apoc.import.graphml('file:///networkx_graph_usecase_with_homologs.graphml',{readLabels:TRUE})```

- Add indexes after importing the graph for improving the performance of queries

    ```
    create index Gene for (n:Gene) on (n.node_type)
    create index Pathway for (n:Pathway) on (n.node_type)
    create index `Biological Process` for (n:`Biological Process`) on (n.node_type)
    create index `Molecular Function` for (n:`Molecular Function`) on (n.node_type)
    create index `Cellular Component` for (n:`Cellular Component`) on (n.node_type)
    create index Disease for (n:Disease) on (n.node_type)
    create index Compound for (n:Compound) on (n.node_type)
    create index `Side Effect` for (n:`Side Effect`) on (n.node_type)
    ```
    

- Count the number of each node type
    - total (```MATCH (n) RETURN count(n)```) 
        - Gene (```MATCH (n:Gene) RETURN count(n)```)
        - Pathway (```MATCH (n:Pathway) RETURN count(n)```)
            - WikiPathways (```MATCH (n:Pathway {source: "WikiPathways"}) RETURN count(n)```) 
            - OpenTargets, Reactome (```MATCH (n:Pathway {source: "OpenTargets"}) RETURN count(n)```) 
            - MINERVA (```MATCH (n:Pathway {source: "MINERVA"}) RETURN count(n)```) 
        - Biological Process (```MATCH (n:`Biological Process`) RETURN count(n)```) 
        - Molecular Function (```MATCH (n:`Molecular Function`) RETURN count(n)```) 
        - Cellular Component (```MATCH (n:`Cellular Component`) RETURN count(n)```) 
        - Disease (```MATCH (n:Disease) RETURN count(n)```) 
        - Compound (```MATCH (n:Compound) RETURN count(n)```)
        - Side Effect (```MATCH (n:`Side Effect`) RETURN count(n)```) 
- Count the number of each edge type
    - total (```MATCH ()-[r]->() RETURN count(r)```) 
        - interacts_with (```MATCH ()-[r:interacts_with]->() RETURN count(r)```) 
        - part_of (```MATCH ()-[r:part_of]->() RETURN count(r)```) 
            - WikiPathways (```MATCH ()-[r:part_of {source: "WikiPathways"}]->() RETURN count(r)```) 
            - OpenTargets, Reactome (```MATCH ()-[r:part_of {source: "OpenTargets"}]->() RETURN count(r)```) 
            - MINERVA (```MATCH ()-[r:part_of {source: "MINERVA"}]->() RETURN count(r)```) 
        - activates (```MATCH ()-[r:activates]->() RETURN count(r)```) 
        - treats (```MATCH ()-[r:treats]->() RETURN count(r)```) 
        - has_side_effect (```MATCH ()-[r:has_side_effect]->() RETURN count(r)```) 
        - inhibits (```MATCH ()-[r:inhibits]->() RETURN count(r)```) = 71
        - associated_with (```MATCH ()-[r:associated_with]->() RETURN count(r)```) 

- Export the graph as a `.csv` file

    ```call apoc.export.csv.all("networkx_graph_test.csv",{})```