## Example: PCS use case

This notebook shows all the steps to generate the PCS knowledge graph (KG) and run the downstream analysis.

#### Set up the environment

In [5]:
import os

# Switch the working directory to "src" before the package imports below,
# presumably so the local pyBiodatafuse package resolves -- TODO confirm.
# Assumes the kernel starts in the directory that contains "src".
os.chdir(os.path.abspath(os.path.join(os.getcwd(), "src")))

In [6]:
# Import modules
import pickle

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from dotenv import load_dotenv

from pyBiodatafuse import id_mapper
from pyBiodatafuse.annotators import disgenet, minerva, opentargets, stringdb, wikipathways
from pyBiodatafuse.constants import (
    DISGENET_DISEASE_COL,
    MINERVA,
    OPENTARGETS_DISEASE_COMPOUND_COL,
    OPENTARGETS_GENE_COMPOUND_COL,
    OPENTARGETS_GO_COL,
    OPENTARGETS_REACTOME_COL,
    STRING_PPI_COL,
    WIKIPATHWAYS,
)
from pyBiodatafuse.graph import generator
from pyBiodatafuse.utils import (
    combine_sources,
    create_harmonized_input_file,
    create_or_append_to_metadata,
)

# Step back up one level so the relative "examples/..." paths used below resolve
# from the parent of the directory the imports were run from.
os.chdir(os.path.abspath(os.path.join(os.getcwd(), "..")))

### Load the input list and convert it to a dataframe

In [7]:
# Read the gene list of the PCS use case and report how many distinct rows it holds.
input_parts = ("examples", "usecases", "PCS", "PCS_gene_list.csv")
file_path = os.path.join(os.getcwd(), *input_parts)
data_input = pd.read_csv(file_path)
n_unique_rows = data_input.drop_duplicates().shape[0]
print("Total number of genes:", n_unique_rows)
data_input.head()

Total number of genes: 2023


Unnamed: 0,identifier
0,LOC729609
1,LOC105374060
2,DMP1
3,PNLIP
4,OR4N3P


### Entity resolution using BridgeDB

In [8]:
# Map the input gene symbols to other identifier systems with BridgeDb.
# The query needs network access and is slow, so its result is cached on disk and
# reused on later runs. Delete the two pickle files below to force a fresh query.
pcs_dir = os.path.join(os.getcwd(), "examples", "usecases", "PCS")
bridgedb_df_path = os.path.join(pcs_dir, "bridgedb_df.pkl")
bridgedb_metadata_path = os.path.join(pcs_dir, "bridgedb_metadata.pkl")

if os.path.exists(bridgedb_df_path) and os.path.exists(bridgedb_metadata_path):
    # pickle can execute arbitrary code: only load cache files you created or trust.
    bridgedb_df = pd.read_pickle(bridgedb_df_path)
    with open(bridgedb_metadata_path, "rb") as file:
        bridgedb_metadata = pickle.load(file)
else:
    bridgedb_df, bridgedb_metadata = id_mapper.bridgedb_xref(
        identifiers=data_input,
        input_species="Human",
        input_datasource="HGNC",
        output_datasource="All",
    )
    # Persist the fresh result; no need to read it back, it is already in memory.
    bridgedb_df.to_pickle(bridgedb_df_path)
    with open(bridgedb_metadata_path, "wb") as file:
        pickle.dump(bridgedb_metadata, file)

print("Number of genes with mapping in BridgeDb:", len(bridgedb_df["identifier"].unique()))
bridgedb_df.head()

Number of genes with mapping in BridgeDb: 1667


Unnamed: 0,identifier,identifier.source,target,target.source
0,DMP1,HGNC,Q13316,Uniprot-TrEMBL
1,DMP1,HGNC,HGNC:2932,HGNC Accession Number
2,DMP1,HGNC,DMP1,HGNC
3,DMP1,HGNC,ENSG00000152592,Ensembl
4,DMP1,HGNC,1758,NCBI Gene


### Gene to Disease annotation from DISGENET


**ADD your DISGENET API KEY in the main folder**

  **1)** Create a ``.env`` file and add DISGENET_API_KEY to it:

      DISGENET_API_KEY="your-API-key-value"

  **2)** Install *python-dotenv*:
  
      ```
      pip install python-dotenv
      ```

In [15]:
# Read the .env File
# Read the .env file so that the variables it defines become environment variables
load_dotenv()
# Retrieve the key from the environment variable.
# Do not print or display the key: cell outputs are stored inside the notebook
# file and are shared or committed together with it.
disgenet_api_key = os.getenv("DISGENET_API_KEY")

[output removed: it displayed the DISGENET API key — revoke that key and never print secrets in a notebook]


In [None]:
# Gene -> disease annotations from DISGENET, cached on disk in "datasources".
# The API is rate limited (HTTP 429), and a rate-limited run appears to yield a
# frame without the DISGENET_DISEASE_COL column. Such a result is never written to
# the cache, and a cached frame that lacks the column is ignored and queried again.
datasources_dir = os.path.join(os.getcwd(), "examples", "usecases", "PCS", "datasources")
disgenet_df_path = os.path.join(datasources_dir, "disgenet_df.pkl")
disgenet_metadata_path = os.path.join(datasources_dir, "disgenet_metadata.pkl")

disgenet_df = None
if os.path.exists(disgenet_df_path) and os.path.exists(disgenet_metadata_path):
    # pickle can execute arbitrary code: only load cache files you created or trust.
    disgenet_df = pd.read_pickle(disgenet_df_path)
    with open(disgenet_metadata_path, "rb") as file:
        disgenet_metadata = pickle.load(file)
    if DISGENET_DISEASE_COL not in disgenet_df.columns:
        # The cached frame comes from a failed run; fall through to a new query.
        disgenet_df = None

if disgenet_df is None:
    if not disgenet_api_key:
        raise RuntimeError(
            "DISGENET_API_KEY is not set; add it to the .env file before querying DISGENET."
        )
    disgenet_df, disgenet_metadata = disgenet.get_gene_disease(
        api_key=disgenet_api_key, bridgedb_df=bridgedb_df
    )
    if DISGENET_DISEASE_COL not in disgenet_df.columns:
        raise RuntimeError(
            "The DISGENET query returned no '"
            + DISGENET_DISEASE_COL
            + "' column (possibly rate limited, HTTP 429); nothing was cached. "
            "Wait and run this cell again."
        )
    os.makedirs(datasources_dir, exist_ok=True)
    disgenet_df.to_pickle(disgenet_df_path)
    with open(disgenet_metadata_path, "wb") as file:
        pickle.dump(disgenet_metadata, file)

disgenet_df.head()



200




200




200




200




200




200




200




200




200




200




429


In [17]:
disgenet_df[DISGENET_DISEASE_COL][0]

KeyError: 'DISGENET_diseases'

### Add literature-based data
Genes found to be associated with Post-COVID-19

In [None]:
# Table of genes reported in the literature as associated with Post-COVID-19.
pcs_genes_xlsx = os.path.join(
    os.getcwd(), "examples", "usecases", "PCS", "pcs_associated_genes.xlsx"
)
pcs_associated_genes = pd.read_excel(pcs_genes_xlsx)
pcs_associated_genes.head()

#### Define the literature based info

In [None]:
from pyBiodatafuse.constants import LITERATURE_DISEASE_COL, LITERATURE_DISEASE_OUTPUT_DICT

# Attributes of the single literature-derived disease, built from the package template.
literature_disease_attrs = LITERATURE_DISEASE_OUTPUT_DICT.copy()
literature_disease_attrs["disease_name"] = "Post-COVID-19"
literature_disease_attrs["id"] = "C00000"
literature_disease_attrs["source"] = "PMID: 37675861"

# Build the lookup once instead of scanning the "Gene" column for every row.
pcs_gene_symbols = set(pcs_associated_genes["Gene"])


def get_literature_based_info(gene):
    """Return the literature-based disease annotation for one gene.

    :param gene: gene identifier, compared against the "Gene" column of
        ``pcs_associated_genes``.
    :returns: a one-element list holding a dict with the literature disease
        attributes when the gene is in the table, otherwise a dict whose
        "disease_name", "id" and "source" values are NaN.
    """
    if gene in pcs_gene_symbols:
        # Hand every row its own copy so that editing one row's annotation later
        # cannot silently change the annotation of all other matching rows.
        return [dict(literature_disease_attrs)]
    return [{"disease_name": np.nan, "id": np.nan, "source": np.nan}]


# Adds the literature column to disgenet_df in place; it is carried into the
# combined dataframe together with the DISGENET annotations.
disgenet_df[LITERATURE_DISEASE_COL] = disgenet_df["identifier"].apply(get_literature_based_info)

disgenet_df.head()

In [None]:
disgenet_df[disgenet_df["identifier"] == "DMP1"][LITERATURE_DISEASE_COL]

In [None]:
print(pcs_associated_genes["Gene"].isin(disgenet_df["identifier"]).sum())

### Disease to Compound annotation from OpenTargets

##### Prepare the input to use DISGENET output as seed for OpenTargets


In [None]:
disease_mapping_df = create_harmonized_input_file(disgenet_df, DISGENET_DISEASE_COL, "EFO", "UMLS")
disease_mapping_df.head()

##### Disease to Compound annotation

TODO: to run again.

In [None]:
# Disease -> compound annotations from OpenTargets, cached on disk in "datasources".
# The cached pickles are loaded when both exist; otherwise the annotator is queried
# and its result cached. Delete the two files below to force a fresh query.
datasources_dir = os.path.join(os.getcwd(), "examples", "usecases", "PCS", "datasources")
opentargets_disease_compound_df_path = os.path.join(
    datasources_dir, "opentargets_disease_compound_df.pkl"
)
opentargets_disease_compound_metadata_path = os.path.join(
    datasources_dir, "opentargets_disease_compound_metadata.pkl"
)

if os.path.exists(opentargets_disease_compound_df_path) and os.path.exists(
    opentargets_disease_compound_metadata_path
):
    # pickle can execute arbitrary code: only load cache files you created or trust.
    opentargets_disease_compound_df = pd.read_pickle(opentargets_disease_compound_df_path)
    with open(opentargets_disease_compound_metadata_path, "rb") as file:
        opentargets_disease_compound_metadata = pickle.load(file)
else:
    (
        opentargets_disease_compound_df,
        opentargets_disease_compound_metadata,
    ) = opentargets.get_disease_compound_interactions(disease_mapping_df)
    os.makedirs(datasources_dir, exist_ok=True)
    opentargets_disease_compound_df.to_pickle(opentargets_disease_compound_df_path)
    with open(opentargets_disease_compound_metadata_path, "wb") as file:
        pickle.dump(opentargets_disease_compound_metadata, file)

opentargets_disease_compound_df.head()

In [None]:
opentargets_disease_compound_df[OPENTARGETS_DISEASE_COMPOUND_COL][0]

### Gene to Compound annotation from OpenTargets

In [None]:
# Gene -> compound annotations from OpenTargets, cached on disk in "datasources".
# The cached pickles are loaded when both exist; otherwise the annotator is queried
# and its result cached. Delete the two files below to force a fresh query.
datasources_dir = os.path.join(os.getcwd(), "examples", "usecases", "PCS", "datasources")
opentargets_compound_df_path = os.path.join(datasources_dir, "opentargets_compound_df.pkl")
opentargets_compound_metadata_path = os.path.join(
    datasources_dir, "opentargets_compound_metadata.pkl"
)

if os.path.exists(opentargets_compound_df_path) and os.path.exists(
    opentargets_compound_metadata_path
):
    # pickle can execute arbitrary code: only load cache files you created or trust.
    opentargets_compound_df = pd.read_pickle(opentargets_compound_df_path)
    with open(opentargets_compound_metadata_path, "rb") as file:
        opentargets_compound_metadata = pickle.load(file)
else:
    (
        opentargets_compound_df,
        opentargets_compound_metadata,
    ) = opentargets.get_gene_compound_interactions(bridgedb_df=bridgedb_df)
    os.makedirs(datasources_dir, exist_ok=True)
    opentargets_compound_df.to_pickle(opentargets_compound_df_path)
    with open(opentargets_compound_metadata_path, "wb") as file:
        pickle.dump(opentargets_compound_metadata, file)

opentargets_compound_df.head()

In [None]:
opentargets_compound_df[OPENTARGETS_GENE_COMPOUND_COL][3]

### Gene to Pathway annotation from MINERVA

In [None]:
# Gene -> pathway annotations from the MINERVA "COVID19 Disease Map", cached on disk.
# The cached pickles are loaded when both exist; otherwise the annotator is queried
# and its result cached. Delete the two files below to force a fresh query.
datasources_dir = os.path.join(os.getcwd(), "examples", "usecases", "PCS", "datasources")
minerva_df_path = os.path.join(datasources_dir, "minerva_df.pkl")
minerva_metadata_path = os.path.join(datasources_dir, "minerva_metadata.pkl")

if os.path.exists(minerva_df_path) and os.path.exists(minerva_metadata_path):
    # pickle can execute arbitrary code: only load cache files you created or trust.
    minerva_df = pd.read_pickle(minerva_df_path)
    with open(minerva_metadata_path, "rb") as file:
        minerva_metadata = pickle.load(file)
else:
    minerva_df, minerva_metadata = minerva.get_gene_minerva_pathways(
        bridgedb_df, map_name="COVID19 Disease Map"
    )
    os.makedirs(datasources_dir, exist_ok=True)
    minerva_df.to_pickle(minerva_df_path)
    with open(minerva_metadata_path, "wb") as file:
        pickle.dump(minerva_metadata, file)

minerva_df.head()

In [None]:
minerva_df[MINERVA][33]

### Gene to Pathway annotation from WikiPathways

In [None]:
# Gene -> pathway annotations from WikiPathways, cached on disk in "datasources".
# The cached pickles are loaded when both exist; otherwise the annotator is queried
# and its result cached. Delete the two files below to force a fresh query.
datasources_dir = os.path.join(os.getcwd(), "examples", "usecases", "PCS", "datasources")
wikipathways_df_path = os.path.join(datasources_dir, "wikipathways_df.pkl")
wikipathways_metadata_path = os.path.join(datasources_dir, "wikipathways_metadata.pkl")

if os.path.exists(wikipathways_df_path) and os.path.exists(wikipathways_metadata_path):
    # pickle can execute arbitrary code: only load cache files you created or trust.
    wikipathways_df = pd.read_pickle(wikipathways_df_path)
    with open(wikipathways_metadata_path, "rb") as file:
        wikipathways_metadata = pickle.load(file)
else:
    wikipathways_df, wikipathways_metadata = wikipathways.get_gene_wikipathways(
        bridgedb_df=bridgedb_df
    )
    os.makedirs(datasources_dir, exist_ok=True)
    wikipathways_df.to_pickle(wikipathways_df_path)
    with open(wikipathways_metadata_path, "wb") as file:
        pickle.dump(wikipathways_metadata, file)

wikipathways_df.head()

In [None]:
wikipathways_df[WIKIPATHWAYS][3]

### Gene to Reactome Pathway from OpenTargets

In [None]:
# Gene -> Reactome pathway annotations from OpenTargets, cached on disk.
# The cached pickles are loaded when both exist; otherwise the annotator is queried
# and its result cached. Delete the two files below to force a fresh query.
datasources_dir = os.path.join(os.getcwd(), "examples", "usecases", "PCS", "datasources")
opentargets_reactome_df_path = os.path.join(datasources_dir, "opentargets_reactome_df.pkl")
opentargets_reactome_metadata_path = os.path.join(
    datasources_dir, "opentargets_reactome_metadata.pkl"
)

if os.path.exists(opentargets_reactome_df_path) and os.path.exists(
    opentargets_reactome_metadata_path
):
    # pickle can execute arbitrary code: only load cache files you created or trust.
    opentargets_reactome_df = pd.read_pickle(opentargets_reactome_df_path)
    with open(opentargets_reactome_metadata_path, "rb") as file:
        opentargets_reactome_metadata = pickle.load(file)
else:
    (
        opentargets_reactome_df,
        opentargets_reactome_metadata,
    ) = opentargets.get_gene_reactome_pathways(bridgedb_df=bridgedb_df)
    os.makedirs(datasources_dir, exist_ok=True)
    opentargets_reactome_df.to_pickle(opentargets_reactome_df_path)
    with open(opentargets_reactome_metadata_path, "wb") as file:
        pickle.dump(opentargets_reactome_metadata, file)

opentargets_reactome_df.head()

In [None]:
opentargets_reactome_df[OPENTARGETS_REACTOME_COL][2]

### Gene Ontology annotation from OpenTargets

In [None]:
# Gene Ontology annotations from OpenTargets, cached on disk in "datasources".
# The cached pickles are loaded when both exist; otherwise the annotator is queried
# and its result cached. Delete the two files below to force a fresh query.
datasources_dir = os.path.join(os.getcwd(), "examples", "usecases", "PCS", "datasources")
opentargets_go_df_path = os.path.join(datasources_dir, "opentargets_go_df.pkl")
opentargets_go_metadata_path = os.path.join(datasources_dir, "opentargets_go_metadata.pkl")

if os.path.exists(opentargets_go_df_path) and os.path.exists(opentargets_go_metadata_path):
    # pickle can execute arbitrary code: only load cache files you created or trust.
    opentargets_go_df = pd.read_pickle(opentargets_go_df_path)
    with open(opentargets_go_metadata_path, "rb") as file:
        opentargets_go_metadata = pickle.load(file)
else:
    opentargets_go_df, opentargets_go_metadata = opentargets.get_gene_go_process(
        bridgedb_df=bridgedb_df
    )
    os.makedirs(datasources_dir, exist_ok=True)
    opentargets_go_df.to_pickle(opentargets_go_df_path)
    with open(opentargets_go_metadata_path, "wb") as file:
        pickle.dump(opentargets_go_metadata, file)

opentargets_go_df.head()

In [None]:
opentargets_go_df[OPENTARGETS_GO_COL][0]

### Protein-Protein interaction from STRING

In [None]:
# Protein-protein interactions from STRING, cached on disk in "datasources".
# The cached pickles are loaded when both exist; otherwise the annotator is queried
# and its result cached. Delete the two files below to force a fresh query.
datasources_dir = os.path.join(os.getcwd(), "examples", "usecases", "PCS", "datasources")
string_ppi_df_path = os.path.join(datasources_dir, "string_ppi_df.pkl")
string_ppi_metadata_path = os.path.join(datasources_dir, "string_ppi_metadata.pkl")

if os.path.exists(string_ppi_df_path) and os.path.exists(string_ppi_metadata_path):
    # pickle can execute arbitrary code: only load cache files you created or trust.
    string_ppi_df = pd.read_pickle(string_ppi_df_path)
    with open(string_ppi_metadata_path, "rb") as file:
        string_ppi_metadata = pickle.load(file)
else:
    string_ppi_df, string_ppi_metadata = stringdb.get_ppi(bridgedb_df=bridgedb_df)
    os.makedirs(datasources_dir, exist_ok=True)
    string_ppi_df.to_pickle(string_ppi_df_path)
    with open(string_ppi_metadata_path, "wb") as file:
        pickle.dump(string_ppi_metadata, file)

string_ppi_df.head()

In [None]:
string_ppi_df[STRING_PPI_COL][0]

### Combining all the results into a single dataframe

In [27]:
# Per-gene annotation tables to combine with the BridgeDb mapping.
# opentargets_disease_compound_df is not part of this list; it is handed to the
# graph generator separately, while its metadata is still recorded below.
annotation_dfs = [
    disgenet_df,
    opentargets_compound_df,
    minerva_df,
    wikipathways_df,
    opentargets_reactome_df,
    opentargets_go_df,
    string_ppi_df,
]
combined_df = combine_sources(bridgedb_df, annotation_dfs)

# Metadata of every queried source, added to the BridgeDb metadata.
annotation_metadata = [
    disgenet_metadata,
    opentargets_disease_compound_metadata,
    opentargets_compound_metadata,
    minerva_metadata,
    wikipathways_metadata,
    opentargets_reactome_metadata,
    opentargets_go_metadata,
    string_ppi_metadata,
]
combined_metadata = create_or_append_to_metadata(bridgedb_metadata, annotation_metadata)

In [None]:
combined_df.head(4)

In [None]:
combined_df[LITERATURE_DISEASE_COL][0]

In [None]:
combined_metadata

In [None]:
combined_df.shape

In [None]:
combined_df.tail()

##### Exporting the combined data in pickle format

In [33]:
# with open(os.path.join(os.getcwd(), "examples", "usecases", "PCS", "combined_df.pkl"), "wb") as out:
#     pickle.dump(combined_df, out)
# with open(
#     os.path.join(os.getcwd(), "examples", "usecases", "PCS", "combined_metadata.pkl"), "wb"
# ) as file:
#     pickle.dump(combined_metadata, file)

### Creating a graph from the annotated data

In [34]:
# Build the graph from the combined annotations plus the disease-compound table,
# then pickle it next to the other use-case files.
graph_pickle_path = os.path.join(
    os.getcwd(), "examples", "usecases", "PCS", "pcs_networkx_graph.pkl"
)
pygraph = generator.networkx_graph(combined_df, opentargets_disease_compound_df)
with open(graph_pickle_path, "wb") as out:
    pickle.dump(pygraph, out)

# To reuse a previously saved graph instead of rebuilding it:
# with open(graph_pickle_path, "rb") as file:
#     pygraph = pickle.load(file)

### Visualize the graph

In [30]:
# pos = nx.circular_layout(pygraph)

# plt.figure(3, figsize=(30, 30))
# nx.draw(pygraph, pos)
# plt.show()

#### Cytoscape

In [None]:
from pyBiodatafuse.graph import cytoscape

cytoscape.load_graph(pygraph, network_name="PCS network")

#### Neo4j

In [35]:
from pyBiodatafuse.graph import neo4j

# Write the graph as GraphML; the Neo4j import steps below expect this file name.
graphml_path = os.path.join(
    os.getcwd(), "examples", "usecases", "PCS", "pcs_networkx_graph.graphml"
)
neo4j.save_graph_to_graphml(pygraph, output_path=graphml_path)

#### RDF

In [None]:
from pyBiodatafuse.graph import rdf

# Generate the RDF representation of the combined annotations; the combined
# metadata is passed along together with the author and version information.
g = rdf.generate_rdf(
    df=combined_df,
    base_uri="https://biodatafuse.org/example/",
    version_iri="https://biodatafuse.org/usecases/PCS/PCS_usecase.owl",
    orcid="https://orcid.org/0000-0002-4166-7093",
    author="Javier Millan Acosta",
    metadata=combined_metadata,
)

# NOTE(review): the file is named "pcs.rdf" but is serialized as Turtle; a ".ttl"
# name (or format="xml") would match the content -- confirm what consumers expect.
output_path = os.path.join(os.getcwd(), "examples", "usecases", "PCS", "pcs.rdf")
g.serialize(format="turtle", destination=output_path)

##### Steps to load the graph in Neo4j

- Add `.graphml` file in **import** subfolder of the DBMS folder
- Install apoc plugin
- Create `apoc.conf` file:
    ```
    apoc.trigger.enabled=true
    apoc.import.file.enabled=true
    apoc.export.file.enabled=true
    apoc.import.file.use_neo4j_config=true
    ```
- Add `apoc.conf` file to **conf** subfolder of the DBMS folder
- Open Neo4j Browser
- (Optional, only run if you have imported a graph before) Remove all the nodes before importing the `.graphml` file

    ```
    MATCH (n) DETACH DELETE n
    ```

- Import `.graphml` file

    ```
    call apoc.import.graphml('file:///pcs_networkx_graph.graphml',{readLabels:TRUE})
    ```

- Add indexes after importing the graph for improving the performance of queries

    ```
    create index Gene for (n:Gene) on (n.node_type)
    create index Pathway for (n:Pathway) on (n.node_type)
    create index `Biological Process` for (n:`Biological Process`) on (n.node_type)
    create index `Molecular Function` for (n:`Molecular Function`) on (n.node_type)
    create index `Cellular Component` for (n:`Cellular Component`) on (n.node_type)
    create index Disease for (n:Disease) on (n.node_type)
    create index Compound for (n:Compound) on (n.node_type)
    create index `Side Effect` for (n:`Side Effect`) on (n.node_type)
    ```

- Count the number of each node type
    - total (```MATCH (n) RETURN count(n)```) = 19860
        - Gene (```MATCH (n:Gene) RETURN count(n)```) = 1667
        - Pathway (```MATCH (n:Pathway) RETURN count(n)```) = 1847
            - WikiPathways (```MATCH (n:Pathway {source: "WikiPathways"}) RETURN count(n)```) = 678
            - OpenTargets, Reactome (```MATCH (n:Pathway {source: "OpenTargets"}) RETURN count(n)```) = 1154
            - MINERVA (```MATCH (n:Pathway {source: "MINERVA"}) RETURN count(n)```) = 15
        - Biological Process (```MATCH (n:`Biological Process`) RETURN count(n)```) = 4624
        - Molecular Function (```MATCH (n:`Molecular Function`) RETURN count(n)```) = 1327
        - Cellular Component (```MATCH (n:`Cellular Component`) RETURN count(n)```) = 736
        - Disease (```MATCH (n:Disease) RETURN count(n)```) = 2914
            - DISGENET (```MATCH (n:Disease {source: "DISGENET"}) RETURN count(n)```) = 2913
            - Literature (```MATCH (n:Disease {source: "PMID: 37675861"}) RETURN count(n)```) = 1
        - Compound (```MATCH (n:Compound) RETURN count(n)```) = 2244
        - Side Effect (```MATCH (n:`Side Effect`) RETURN count(n)```) = 4501
- Count the number of each edge type
    - total (```MATCH ()-[r]->() RETURN count(r)```) = 101659
        - interacts_with (```MATCH ()-[r:interacts_with]->() RETURN count(r)```) = 16844
        - part_of (```MATCH ()-[r:part_of]->() RETURN count(r)```) = 30066 
            - WikiPathways (```MATCH ()-[r:part_of {source: "WikiPathways"}]->() RETURN count(r)```) = 3174
            - OpenTargets, Reactome (```MATCH ()-[r:part_of {source: "OpenTargets"}]->() RETURN count(r)```) = 26784
            - MINERVA (```MATCH ()-[r:part_of {source: "MINERVA"}]->() RETURN count(r)```) = 108
        - activates (```MATCH ()-[r:activates]->() RETURN count(r)```) = 499
        - treats (```MATCH ()-[r:treats]->() RETURN count(r)```) = 8215
        - has_side_effect (```MATCH ()-[r:has_side_effect]->() RETURN count(r)```) = 38328
        - inhibits (```MATCH ()-[r:inhibits]->() RETURN count(r)```) = 71
        - associated_with (```MATCH ()-[r:associated_with]->() RETURN count(r)```) = 7636
            - DISGENET (```MATCH ()-[r:associated_with {source: "DISGENET"}]->() RETURN count(r)```) = 7607
            - Literature (```MATCH ()-[r:associated_with {source: "PMID: 37675861"}]->() RETURN count(r)```) = 29

- Export the graph as a `.csv` file

    ```call apoc.export.csv.all("pcs_networkx_graph.csv",{})```

### DREAMwalk algorithm

In [None]:
import os

# Move into the "DREAMwalk" working directory, but only when not already there.
# Re-running an unconditional chdir would either fail or descend into the nested
# "DREAMwalk" package directory that the imports in the next cell rely on.
# NOTE(review): later cells read "../pcs_networkx_graph.csv", so the CSV exported
# from Neo4j must sit in the parent of this directory -- confirm the layout.
if os.path.basename(os.getcwd()) == "DREAMwalk":
    new_path = os.getcwd()
else:
    new_path = os.path.join(os.getcwd(), "DREAMwalk")
    os.chdir(new_path)

# Show the directory that the following cells run in
current_dir = os.getcwd()
current_dir

In [None]:
import DREAMwalk.generate_dis_sim as dis_gen
import DREAMwalk.generate_files as gen
import pandas as pd
from DREAMwalk.calculate_drug_scores import find_candidates
from DREAMwalk.generate_embeddings import save_embedding_files
from DREAMwalk.generate_similarity_net import save_sim_graph
from DREAMwalk.predict_associations import predict_dda

In [None]:
# Generate files: read the graph CSV from the parent directory (the Neo4j steps
# above export a file with this name) and preview it.
kg_data = pd.read_csv("../pcs_networkx_graph.csv")
kg_data.head()

In [None]:
kg_data.columns

In [None]:
gen.generate_files(kg_data)

In [None]:
dis_gen.save_dis_sim("../pcs_networkx_graph.csv", "dis_sim.tsv")