# Gene-to-graph workflow

This notebook showcases the steps to generate the BioDataFuse data and graph serializations from a list of genes.

Data sources and annotators used:
- Bgee
- DisGeNET
- OpenTargets
- MINERVA
- WikiPathways
- AOP-Wiki
- MolMeDB
- STRINGDB
- PubChem


In [None]:
# Import modules
import os
import pickle

import pandas as pd
from IPython.display import Image, display

import pyBiodatafuse.annotators as ann
import pyBiodatafuse.constants as Cons
from pyBiodatafuse import id_mapper
from pyBiodatafuse.graph.rdf import BDFGraph
from pyBiodatafuse.graph.rdf.graphdb import GraphDBManager
from pyBiodatafuse.graph.saver import save_graph
from pyBiodatafuse.utils import (
    combine_sources,
    create_harmonized_input_file,
    create_or_append_to_metadata,
)

  from .autonotebook import tqdm as notebook_tqdm


## 1. Entity resolution with BridgeDB

The first step is to input the list of genes to query and retrieve their protein target and synonym identifiers using BridgeDB.

### 1.1. Load the input list and convert it to a dataframe

In [None]:
# NCBI Gene identifiers to annotate, one identifier per line.
genes_of_interest = """7350
6198
1499
6528
6714
10000
10891
6194
7068
4193
3709
"""

# splitlines() does not produce the empty trailing entry that split("\n") yields
# for the final newline; blank or whitespace-only lines are dropped as well so
# that no empty identifier ends up in the input dataframe.
gene_list = [gene.strip() for gene in genes_of_interest.splitlines() if gene.strip()]
data_input = pd.DataFrame(gene_list, columns=["identifier"])
data_input.head()

### 1.2. Query BridgeDB
The results will be stored in the following directories:

In [None]:
# Resolve the output locations relative to the current working directory
base_dir = os.path.abspath(os.getcwd())
DATA_DIR = os.path.join(base_dir, "data")
EXAMPLE_DIR = os.path.join(DATA_DIR, "gene_to_graph_workflow")

# Create both directories; existing ones are left untouched
for output_dir in (DATA_DIR, EXAMPLE_DIR):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Cache files for the BridgeDb mapping and its metadata
pickle_path = f"{EXAMPLE_DIR}/gene_list.pkl"
metadata_path = f"{EXAMPLE_DIR}/gene_list_metadata.pkl"

if os.path.exists(pickle_path):
    # Reuse the cached mapping instead of querying BridgeDb again
    bridgedb_df = pd.read_pickle(pickle_path)
    with open(metadata_path, "rb") as fh:
        bridgedb_metadata = pickle.load(fh)
else:
    # Map the NCBI Gene identifiers to all available data sources and cache the result
    bridgedb_df, bridgedb_metadata = id_mapper.bridgedb_xref(
        identifiers=data_input,
        input_species="Human",
        input_datasource="NCBI Gene",
        output_datasource="All",
    )
    bridgedb_df.to_pickle(pickle_path)
    with open(metadata_path, "wb") as fh:
        pickle.dump(bridgedb_metadata, fh)

In [None]:
# Count the distinct input identifiers present in the mapping (NaN counted, as with unique())
n_mapped_genes = bridgedb_df["identifier"].nunique(dropna=False)
print("Number of genes with mapping in BridgeDb:", n_mapped_genes)
bridgedb_df.head()

## 2. Gene expression
### 2.1. Gene expression from Bgee

In [None]:
# Cache files for the Bgee gene expression annotations
bgee_path = f"{EXAMPLE_DIR}/bgee.pkl"
bgee_metadata_path = f"{EXAMPLE_DIR}/bgee_metadata.pkl"

if os.path.exists(bgee_path):
    # Load the cached annotations
    bgee_df = pd.read_pickle(bgee_path)
    with open(bgee_metadata_path, "rb") as fh:
        bgee_metadata = pickle.load(fh)
else:
    # Query Bgee and cache both the data and the metadata
    bgee_df, bgee_metadata = ann.bgee.get_gene_expression(bridgedb_df=bridgedb_df)
    bgee_df.to_pickle(bgee_path)
    with open(bgee_metadata_path, "wb") as fh:
        pickle.dump(bgee_metadata, fh)

In [None]:
# Preview the first two rows of the Bgee annotations
bgee_df.head(2)

## 3. Disease annotation
### 3.1. Gene to disease annotation with DisGeNET

In [None]:
# Cache files for the DisGeNET annotations
disgenet_path = f"{EXAMPLE_DIR}/disgenet.pkl"
disgenet_metadata_path = f"{EXAMPLE_DIR}/disgenet_metadata.pkl"

# Read the DisGeNET API key from the DISGENET_API_KEY environment variable.
# The previous value was a string literal containing source code rather than an
# actual key, so any live query would have been sent with an invalid key.
# The result is None when the variable is not set.
disgenet_api_key = os.environ.get("DISGENET_API_KEY")

In [None]:
# Cache files for the DisGeNET gene-disease annotations
workflow_dir = os.path.join(base_dir, "data", "gene_to_graph_workflow")
disgenet_path = os.path.join(workflow_dir, "example_disgenet.pkl")
disgenet_metadata_path = os.path.join(workflow_dir, "example_disgenet_metadata.pkl")

if os.path.exists(disgenet_path):
    # Load the cached annotations
    disgenet_df = pd.read_pickle(disgenet_path)
    with open(disgenet_metadata_path, "rb") as fh:
        disgenet_metadata = pickle.load(fh)
else:
    # Query DisGeNET (requires an API key) and cache the results
    disgenet_df, disgenet_metadata = ann.disgenet.get_gene_disease(
        api_key=disgenet_api_key, bridgedb_df=bridgedb_df
    )
    disgenet_df.to_pickle(disgenet_path)
    with open(disgenet_metadata_path, "wb") as fh:
        pickle.dump(disgenet_metadata, fh)

In [None]:
# Preview the first two rows of the DisGeNET annotations
disgenet_df.head(2)

### 3.2. Disease to compound annotation from OpenTargets

In [None]:
# Turn the DisGeNET disease annotations into an EFO-based input table that
# serves as the seed for the OpenTargets disease-compound query
disease_mapping_df = create_harmonized_input_file(
    annotated_df=disgenet_df,
    target_col=Cons.DISGENET_DISEASE_COL,
    target_source=Cons.EFO,
)
disease_mapping_df.head()

In [None]:
# Query OpenTargets for the compounds linked to the mapped diseases.
# The annotator is reached through `ann`; a bare `opentargets` name is never
# imported in this notebook and would raise a NameError.
opentargets_df, opentargets_metadata = ann.opentargets.get_disease_compound_interactions(
    disease_mapping_df
)
# Show a preview only, rather than rendering the whole dataframe
opentargets_df.head()

In [None]:
# Cache files for the OpenTargets disease-compound annotations
opentarget_path = f"{EXAMPLE_DIR}/opentarget_cmpd.pkl"
opentarget_metadata_path = f"{EXAMPLE_DIR}/opentarget_cmpd_metadata.pkl"

if os.path.exists(opentarget_path):
    # Load the cached annotations
    opentargets_df = pd.read_pickle(opentarget_path)
    with open(opentarget_metadata_path, "rb") as fh:
        opentargets_metadata = pickle.load(fh)
else:
    # Query OpenTargets and cache the results
    opentargets_df, opentargets_metadata = ann.opentargets.get_disease_compound_interactions(
        disease_mapping_df
    )
    opentargets_df.to_pickle(opentarget_path)
    with open(opentarget_metadata_path, "wb") as fh:
        pickle.dump(opentargets_metadata, fh)

## 4. Pathways and Gene Ontology terms
### 4.1. Pathways from MINERVA

In [None]:
# Cache files for the MINERVA pathway annotations
minerva_path = f"{EXAMPLE_DIR}/minerva.pkl"
minerva_metadata_path = f"{EXAMPLE_DIR}/minerva_metadata.pkl"

if os.path.exists(minerva_path):
    # Load the cached annotations
    minerva_df = pd.read_pickle(minerva_path)
    with open(minerva_metadata_path, "rb") as fh:
        minerva_metadata = pickle.load(fh)
else:
    # Query the COVID19 Disease Map on MINERVA and cache the results
    minerva_df, minerva_metadata = ann.minerva.get_gene_pathways(
        bridgedb_df, map_name="COVID19 Disease Map"
    )
    minerva_df.to_pickle(minerva_path)
    with open(minerva_metadata_path, "wb") as fh:
        pickle.dump(minerva_metadata, fh)

In [None]:
# Preview the first two rows of the MINERVA annotations
minerva_df.head(2)

### 4.2. Pathways from WikiPathways

In [None]:
# Cache files for the WikiPathways pathway annotations
wikipathways_path = f"{EXAMPLE_DIR}/wikipathways.pkl"
wikipathways_metadata_path = f"{EXAMPLE_DIR}/wikipathways_metadata.pkl"

if os.path.exists(wikipathways_path):
    # Load the cached annotations
    wikipathways_df = pd.read_pickle(wikipathways_path)
    with open(wikipathways_metadata_path, "rb") as fh:
        wikipathways_metadata = pickle.load(fh)
else:
    # Query WikiPathways and cache the results
    wikipathways_df, wikipathways_metadata = ann.wikipathways.get_gene_wikipathways(
        bridgedb_df=bridgedb_df
    )
    wikipathways_df.to_pickle(wikipathways_path)
    with open(wikipathways_metadata_path, "wb") as fh:
        pickle.dump(wikipathways_metadata, fh)

In [None]:
# Compound-gene interactions from WikiPathways (query_interactions=True)
wikipathways_molecular_path = f"{EXAMPLE_DIR}/wikipathways_molecular.pkl"
wikipathways_molecular_metadata_path = f"{EXAMPLE_DIR}/wikipathways_molecular_metadata.pkl"

if os.path.exists(wikipathways_molecular_path):
    # Load the cached annotations
    wikipathways_molecular_df = pd.read_pickle(wikipathways_molecular_path)
    with open(wikipathways_molecular_metadata_path, "rb") as fh:
        wikipathways_molecular_metadata = pickle.load(fh)
else:
    # Query WikiPathways for molecular interactions and cache the results
    wikipathways_molecular_df, wikipathways_molecular_metadata = (
        ann.wikipathways.get_gene_wikipathways(bridgedb_df=bridgedb_df, query_interactions=True)
    )
    wikipathways_molecular_df.to_pickle(wikipathways_molecular_path)
    with open(wikipathways_molecular_metadata_path, "wb") as fh:
        pickle.dump(wikipathways_molecular_metadata, fh)

In [16]:
# Preview the first two rows of the WikiPathways molecular interactions
wikipathways_molecular_df.head(2)

Unnamed: 0,identifier,identifier.source,target,target.source,WikiPathways_molecular
0,10000,Entrez Gene,10000,NCBI Gene,"[{'pathway_id': 'WP:3981', 'pathway_label': 'm..."
1,10891,Entrez Gene,10891,NCBI Gene,"[{'pathway_id': 'WP:5470', 'pathway_label': 'E..."


### 4.3. Reactome pathways from OpenTargets

In [None]:
# Cache files for the Reactome pathway annotations retrieved via OpenTargets
opentargets_reactome_path = f"{EXAMPLE_DIR}/opentargets_reactome.pkl"
opentargets_reactome_metadata_path = f"{EXAMPLE_DIR}/opentargets_reactome_metadata.pkl"

if os.path.exists(opentargets_reactome_path):
    # Load the cached annotations
    opentargets_reactome_df = pd.read_pickle(opentargets_reactome_path)
    with open(opentargets_reactome_metadata_path, "rb") as fh:
        opentargets_reactome_metadata = pickle.load(fh)
else:
    # Query OpenTargets and cache the results
    opentargets_reactome_df, opentargets_reactome_metadata = (
        ann.opentargets.get_gene_reactome_pathways(bridgedb_df=bridgedb_df)
    )
    opentargets_reactome_df.to_pickle(opentargets_reactome_path)
    with open(opentargets_reactome_metadata_path, "wb") as fh:
        pickle.dump(opentargets_reactome_metadata, fh)

In [18]:
# Preview the first two rows of the Reactome pathway annotations
opentargets_reactome_df.head(2)

Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_reactome
0,10000,Entrez Gene,ENSG00000117020,Ensembl,"[{'pathway_label': 'FLT3 Signaling', 'pathway_..."
1,10891,Entrez Gene,ENSG00000109819,Ensembl,[{'pathway_label': 'Transcriptional regulation...


### 4.4. Gene Ontology from OpenTargets

In [None]:
# Cache files for the Gene Ontology annotations retrieved via OpenTargets
opentargets_go_path = f"{EXAMPLE_DIR}/opentargets_go.pkl"
opentargets_go_metadata_path = f"{EXAMPLE_DIR}/opentargets_go_metadata.pkl"

if os.path.exists(opentargets_go_path):
    # Load the cached annotations
    opentargets_go_df = pd.read_pickle(opentargets_go_path)
    with open(opentargets_go_metadata_path, "rb") as fh:
        opentargets_go_metadata = pickle.load(fh)
else:
    # Query OpenTargets and cache the results
    opentargets_go_df, opentargets_go_metadata = ann.opentargets.get_gene_go_process(
        bridgedb_df=bridgedb_df
    )
    opentargets_go_df.to_pickle(opentargets_go_path)
    with open(opentargets_go_metadata_path, "wb") as fh:
        pickle.dump(opentargets_go_metadata, fh)

### 4.5. Adverse Outcome Pathways from AOP-Wiki RDF

In [None]:
# Cache files for the AOP-Wiki annotations
aopwiki_path = f"{EXAMPLE_DIR}/aopwiki.pkl"
aopwiki_metadata_path = f"{EXAMPLE_DIR}/aopwiki_metadata.pkl"

if not os.path.exists(aopwiki_path):
    # The annotator is reached through `ann`; a bare `aopwiki` name is never
    # imported in this notebook and would raise a NameError on a cache miss.
    aopwiki_df, aopwiki_metadata = ann.aopwiki.get_aops(bridgedb_df=bridgedb_df)
    aopwiki_df.to_pickle(aopwiki_path)
    with open(aopwiki_metadata_path, "wb") as file:
        pickle.dump(aopwiki_metadata, file)
else:
    # Load the cached annotations
    aopwiki_df = pd.read_pickle(aopwiki_path)
    with open(aopwiki_metadata_path, "rb") as file:
        aopwiki_metadata = pickle.load(file)

aopwiki_df.head()

In [None]:
# Cache files for the OpenTargets gene-compound annotations
opentargets_gene_path = os.path.join(
    base_dir, "data", "gene_to_graph_workflow", "opentargets_gene_cmpd.pkl"
)
opentargets_gene_metadata_path = os.path.join(
    base_dir, "data", "gene_to_graph_workflow", "opentargets_gene_cmpd_metadata.pkl"
)

if not os.path.exists(opentargets_gene_path):
    # The annotator is reached through `ann`; a bare `opentargets` name is never
    # imported in this notebook and would raise a NameError on a cache miss.
    opentargets_compound_df, opentargets_compound_metadata = (
        ann.opentargets.get_gene_compound_interactions(bridgedb_df=bridgedb_df)
    )
    opentargets_compound_df.to_pickle(opentargets_gene_path)
    with open(opentargets_gene_metadata_path, "wb") as file:
        pickle.dump(opentargets_compound_metadata, file)
else:
    # Load the cached annotations
    opentargets_compound_df = pd.read_pickle(opentargets_gene_path)
    with open(opentargets_gene_metadata_path, "rb") as file:
        opentargets_compound_metadata = pickle.load(file)

### 4.3. Reactome pathways from OpenTargets

In [None]:
# Cache files for the Reactome pathway annotations retrieved via OpenTargets
opentargets_reactome_path = f"{EXAMPLE_DIR}/opentargets_reactome.pkl"
opentargets_reactome_metadata_path = f"{EXAMPLE_DIR}/opentargets_reactome_metadata.pkl"

if not os.path.exists(opentargets_reactome_path):
    # The annotator is reached through `ann`; a bare `opentargets` name is never
    # imported in this notebook and would raise a NameError on a cache miss.
    opentargets_reactome_df, opentargets_reactome_metadata = (
        ann.opentargets.get_gene_reactome_pathways(bridgedb_df=bridgedb_df)
    )
    opentargets_reactome_df.to_pickle(opentargets_reactome_path)
    with open(opentargets_reactome_metadata_path, "wb") as file:
        pickle.dump(opentargets_reactome_metadata, file)
else:
    # Load the cached annotations
    opentargets_reactome_df = pd.read_pickle(opentargets_reactome_path)
    with open(opentargets_reactome_metadata_path, "rb") as file:
        opentargets_reactome_metadata = pickle.load(file)

### 4.4. Gene Ontology from OpenTargets

In [None]:
# Cache files for the Gene Ontology annotations retrieved via OpenTargets
opentargets_go_path = f"{EXAMPLE_DIR}/opentargets_go.pkl"
opentargets_go_metadata_path = f"{EXAMPLE_DIR}/opentargets_go_metadata.pkl"

if not os.path.exists(opentargets_go_path):
    # The annotator is reached through `ann`; a bare `opentargets` name is never
    # imported in this notebook and would raise a NameError on a cache miss.
    opentargets_go_df, opentargets_go_metadata = ann.opentargets.get_gene_go_process(
        bridgedb_df=bridgedb_df
    )
    opentargets_go_df.to_pickle(opentargets_go_path)
    with open(opentargets_go_metadata_path, "wb") as file:
        pickle.dump(opentargets_go_metadata, file)
else:
    # Load the cached annotations
    opentargets_go_df = pd.read_pickle(opentargets_go_path)
    with open(opentargets_go_metadata_path, "rb") as file:
        opentargets_go_metadata = pickle.load(file)

## 5. Compound annotation
### 5.1. Compound annotation from OpenTargets

In [None]:
# Cache files for the OpenTargets gene-compound annotations
opentargets_gene_path = f"{EXAMPLE_DIR}/opentargets_gene_cmpd.pkl"
opentargets_gene_metadata_path = f"{EXAMPLE_DIR}/opentargets_gene_cmpd_metadata.pkl"

if os.path.exists(opentargets_gene_path):
    # Load the cached annotations
    opentargets_compound_df = pd.read_pickle(opentargets_gene_path)
    with open(opentargets_gene_metadata_path, "rb") as fh:
        opentargets_compound_metadata = pickle.load(fh)
else:
    # Query OpenTargets and cache the results
    opentargets_compound_df, opentargets_compound_metadata = (
        ann.opentargets.get_gene_compound_interactions(bridgedb_df=bridgedb_df)
    )
    opentargets_compound_df.to_pickle(opentargets_gene_path)
    with open(opentargets_gene_metadata_path, "wb") as fh:
        pickle.dump(opentargets_compound_metadata, fh)

### 5.2. Screening results of compounds on proteins encoded by genes annotation by PubChem

In [None]:
# Cache files for the PubChem assay annotations
pubchem_path = os.path.join(base_dir, "data", "gene_to_graph_workflow", "example_pubchem.pkl")
pubchem_metadata_path = os.path.join(
    base_dir, "data", "gene_to_graph_workflow", "example_pubchem_metadata.pkl"
)

if not os.path.exists(pubchem_path):
    # The annotator is reached through `ann`; a bare `pubchem` name is never
    # imported in this notebook and would raise a NameError on a cache miss.
    pubchem_assay_df, pubchem_assay_metadata = ann.pubchem.get_protein_compound_screened(
        bridgedb_df=bridgedb_df
    )
    pubchem_assay_df.to_pickle(pubchem_path)
    with open(pubchem_metadata_path, "wb") as file:
        pickle.dump(pubchem_assay_metadata, file)
else:
    # Load the cached annotations
    pubchem_assay_df = pd.read_pickle(pubchem_path)
    with open(pubchem_metadata_path, "rb") as file:
        pubchem_assay_metadata = pickle.load(file)

## 6. Membrane transport annotations
### 6.1 Transporter inhibitor annotation from MolMeDB

In [None]:
# Cache files for the MolMeDB transporter-inhibitor annotations
molmedb_path = os.path.join(
    base_dir, "data", "gene_to_graph_workflow", "example_molmedb_gene_cmpd.pkl"
)
molmedb_metadata_path = os.path.join(
    base_dir, "data", "gene_to_graph_workflow", "example_molmedb_gene_cmpd_metadata.pkl"
)

if not os.path.exists(molmedb_path):
    # The annotator is reached through `ann`; a bare `molmedb` name is never
    # imported in this notebook and would raise a NameError on a cache miss.
    inhibitor_df, inhibitor_metadata = ann.molmedb.get_gene_compound_inhibitor(
        bridgedb_df=bridgedb_df
    )
    inhibitor_df.to_pickle(molmedb_path)
    with open(molmedb_metadata_path, "wb") as file:
        pickle.dump(inhibitor_metadata, file)
else:
    # Load the cached annotations
    inhibitor_df = pd.read_pickle(molmedb_path)
    with open(molmedb_metadata_path, "rb") as file:
        inhibitor_metadata = pickle.load(file)

## 7. Protein-Protein Interactions

### 7.1. Protein-Protein Interactions from STRING

In [None]:
# Cache files for the STRING protein-protein interaction annotations
string_path = os.path.join(base_dir, "data", "gene_to_graph_workflow", "example_string.pkl")
string_metadata_path = os.path.join(
    base_dir, "data", "gene_to_graph_workflow", "example_string_metadata.pkl"
)

if not os.path.exists(string_path):
    # The annotator is reached through `ann`; a bare `stringdb` name is never
    # imported in this notebook and would raise a NameError on a cache miss.
    ppi_df, ppi_metadata = ann.stringdb.get_ppi(bridgedb_df=bridgedb_df)
    ppi_df.to_pickle(string_path)
    with open(string_metadata_path, "wb") as file:
        pickle.dump(ppi_metadata, file)
else:
    # Load the cached annotations
    ppi_df = pd.read_pickle(string_path)
    with open(string_metadata_path, "rb") as file:
        ppi_metadata = pickle.load(file)

## 8. Graph generation 

### 8.1. Combine all data and metadata

In [None]:
# Annotation dataframes to merge onto the BridgeDb mapping, in merge order
annotation_dfs = [
    bgee_df,
    disgenet_df,
    minerva_df,
    wikipathways_df,
    opentargets_reactome_df,
    opentargets_go_df,
    opentargets_compound_df,
    inhibitor_df,
    pubchem_assay_df,
    ppi_df,
    aopwiki_df,
    wikipathways_molecular_df,
]

combined_df = combine_sources(bridgedb_df, annotation_dfs)

In [None]:
# Preview the first two rows of the OpenTargets gene-compound annotations
opentargets_compound_df.head(2)

### 5.2. Screening results of compounds on proteins encoded by genes annotation by PubChem

In [None]:
# Cache files for the PubChem assay annotations
pubchem_path = f"{EXAMPLE_DIR}/pubchem.pkl"
pubchem_metadata_path = f"{EXAMPLE_DIR}/pubchem_metadata.pkl"

if os.path.exists(pubchem_path):
    # Load the cached annotations
    pubchem_assay_df = pd.read_pickle(pubchem_path)
    with open(pubchem_metadata_path, "rb") as fh:
        pubchem_assay_metadata = pickle.load(fh)
else:
    # Query PubChem and cache the results
    pubchem_assay_df, pubchem_assay_metadata = ann.pubchem.get_protein_compound_screened(
        bridgedb_df=bridgedb_df
    )
    pubchem_assay_df.to_pickle(pubchem_path)
    with open(pubchem_metadata_path, "wb") as fh:
        pickle.dump(pubchem_assay_metadata, fh)

In [None]:
# Preview the first two rows of the PubChem assay annotations
pubchem_assay_df.head(2)

### 5.3 Transporter inhibitor annotation from MolMeDB

In [None]:
# Cache files for the MolMeDB transporter-inhibitor annotations
molmedb_path = f"{EXAMPLE_DIR}/molmedb.pkl"
molmedb_metadata_path = f"{EXAMPLE_DIR}/molmedb_metadata.pkl"

if os.path.exists(molmedb_path):
    # Load the cached annotations
    inhibitor_df = pd.read_pickle(molmedb_path)
    with open(molmedb_metadata_path, "rb") as fh:
        inhibitor_metadata = pickle.load(fh)
else:
    # Query MolMeDB and cache the results
    inhibitor_df, inhibitor_metadata = ann.molmedb.get_gene_compound_inhibitor(
        bridgedb_df=bridgedb_df
    )
    inhibitor_df.to_pickle(molmedb_path)
    with open(molmedb_metadata_path, "wb") as fh:
        pickle.dump(inhibitor_metadata, fh)

> **NOTE**: No output for this database.

## 6. Protein-Protein Interactions

### 6.1. Protein-Protein Interactions from STRING

In [None]:
# Cache files for the STRING protein-protein interaction annotations
string_path = f"{EXAMPLE_DIR}/string.pkl"
string_metadata_path = f"{EXAMPLE_DIR}/string_metadata.pkl"

if os.path.exists(string_path):
    # Load the cached annotations
    ppi_df = pd.read_pickle(string_path)
    with open(string_metadata_path, "rb") as fh:
        ppi_metadata = pickle.load(fh)
else:
    # Query STRING and cache the results
    ppi_df, ppi_metadata = ann.stringdb.get_ppi(bridgedb_df=bridgedb_df)
    ppi_df.to_pickle(string_path)
    with open(string_metadata_path, "wb") as fh:
        pickle.dump(ppi_metadata, fh)

In [None]:
> **NOTE**: No interactions were found between the genes.

## 7. Graph generation 

### 7.1. Combine all data and metadata

In [None]:
# Cache files for the combined annotations and the disease-compound table
combined_df_path = f"{EXAMPLE_DIR}/combined_df.pkl"
disease_compound_df_path = f"{EXAMPLE_DIR}/disease_compound_df.pkl"

if os.path.exists(combined_df_path):
    # Load both cached tables
    combined_df = pd.read_pickle(combined_df_path)
    opentargets_df = pd.read_pickle(disease_compound_df_path)
else:
    # Merge every annotation dataframe onto the BridgeDb mapping
    annotation_dfs = [
        bgee_df,
        disgenet_df,
        minerva_df,
        wikipathways_df,
        wikipathways_molecular_df,
        opentargets_reactome_df,
        opentargets_go_df,
        aopwiki_df,
        opentargets_compound_df,
        pubchem_assay_df,
        inhibitor_df,
        ppi_df,
    ]
    combined_df = combine_sources(bridgedb_df, annotation_dfs)

    # Cache the merged table together with the disease-compound table
    combined_df.to_pickle(combined_df_path)
    opentargets_df.to_pickle(disease_compound_df_path)

In [None]:
# Preview the first rows of the combined annotation table
combined_df.head()

### 7.2. Create a graph from the annotated dataframe

In [None]:
# Shuffle the order of the DisGeNET disease entries of every gene
import numpy as np

# Seeded generator so that the shuffled order is reproducible across runs
rng = np.random.default_rng(42)

combined_df["DISGENET_diseases"] = combined_df["DISGENET_diseases"].apply(
    # Only sequences are permuted; anything else is passed through unchanged.
    # NOTE(review): assumes genes without DisGeNET results may hold a scalar such
    # as NaN here — confirm against combine_sources.
    lambda diseases: (
        rng.permutation(diseases) if isinstance(diseases, (list, np.ndarray)) else diseases
    )
)

In [None]:
# Metadata of every annotator, appended to the BridgeDb metadata in this order
annotator_metadata = [
    bgee_metadata,
    disgenet_metadata,
    opentargets_metadata,
    opentargets_compound_metadata,
    inhibitor_metadata,
    pubchem_assay_metadata,
    ppi_metadata,
    wikipathways_metadata,
    minerva_metadata,
    opentargets_reactome_metadata,
    opentargets_go_metadata,
    aopwiki_metadata,
    wikipathways_molecular_metadata,
]

combined_metadata = create_or_append_to_metadata(bridgedb_metadata, annotator_metadata)

We export the combined (meta) data in pickle format:

In [None]:
# Export the combined dataframe into the workflow data directory
combined_df.to_pickle(
    os.path.join(base_dir, "data", "gene_to_graph_workflow", "example_df_shuffled.pkl")
)

# Export the combined metadata.
# NOTE(review): this file and the one below are written relative to the current
# working directory, not to the workflow data directory — confirm this is intended.
with open("example_metadata.pkl", "wb") as out:
    pickle.dump(combined_metadata, out)

# Export the OpenTargets disease-compound dataframe. The file name refers to the
# dataframe; previously `opentargets_metadata` was written here, although that
# metadata is already part of `combined_metadata`.
with open("opentargets_disease_compound_df.pkl", "wb") as out:
    pickle.dump(opentargets_df, out)

### 8.2. Create a graph from the annotated dataframe

In [None]:
# Build the graph from the combined (meta)data and save it under EXAMPLE_DIR
pygraph = save_graph(
    graph_name="gene_to_graph",
    graph_dir=EXAMPLE_DIR,
    combined_df=combined_df,
    combined_metadata=combined_metadata,
    disease_compound=opentargets_df,
)

In [31]:
# Collect the distinct node labels present in the graph
node_types = {node_data[Cons.LABEL] for _, node_data in pygraph.nodes(data=True)}

print(node_types)

{'Cellular Component', 'Biological Process', 'Adverse Outcome', 'Anatomical Entity', 'Pathway', 'Compound', 'Disease', 'Gene', 'Molecular Function', 'Key Event', 'Adverse Outcome Pathway', 'Side Effect', 'Molecular Initiating Event'}


### 7.3. Neo4j

Make sure you have the desktop version open and have a user login and password. 

If you receive the error: `AuthError: {code: Neo.ClientError.Security.Unauthorized} {message: The client is unauthorized due to authentication failure.}`, please make sure you disable authentication in the config file (`dbms.security.auth_enabled=false`). See [here](https://stackoverflow.com/questions/53687901/neo4j-cant-log-in-neo-clienterror-security-unauthorized-the-client-is-unauth).

In [32]:
from pyBiodatafuse.graph import neo4j

# Load the graph into the running Neo4j instance.
# Credentials are taken from the NEO4J_USERNAME / NEO4J_PASSWORD environment
# variables so they do not have to be written into the notebook; the previous
# placeholder values remain the defaults when the variables are not set.
neo4j.load_graph(
    pygraph,
    uri="bolt://localhost:7687",
    username=os.environ.get("NEO4J_USERNAME", "test"),
    password=os.environ.get("NEO4J_PASSWORD", "password"),
)

### 7.4. Cytoscape
Make sure that Cytoscape is open.

In [None]:
import logging

import py4cytoscape as p4c

from pyBiodatafuse.graph import cytoscape

# Silence log output before loading the graph into Cytoscape.
# NOTE(review): logging.disable(logging.CRITICAL) applies to every logger in this
# kernel, not only py4cytoscape, and it is not re-enabled in the visible cells —
# confirm that this is intended.
# Disable all logging for py4cytoscape
logging.disable(logging.CRITICAL)  # Disable all logging
cytoscape.load_graph(pygraph, network_name="Gene-Graph example")  # Load the graph into Cytoscape

In [None]:
# Display an image of the current Cytoscape network in the notebook
p4c.notebook_export_show_image()

### 7.5. GraphML export for Neo4j

In [None]:
from pyBiodatafuse.graph import neo4j

# Write the graph to a GraphML file in the current working directory
graphml_path = "networkx_graph_test.graphml"
neo4j.save_graph_to_graphml(pygraph, graphml_path)

##### Steps to load the graph in Neo4j

- Add `.graphml` file in **import** subfolder of the DBMS folder
- Install apoc plugin
- Create `apoc.conf` file:
    ```
    apoc.trigger.enabled=true
    apoc.import.file.enabled=true
    apoc.export.file.enabled=true
    apoc.import.file.use_neo4j_config=true
    ```
- Add `apoc.conf` file to **conf** subfolder of the DBMS folder
- Open Neo4j Browser
- (Optional, only run if you have imported a graph before) Remove all the nodes before importing the `.graphml` file

    ```MATCH (n) DETACH DELETE n```

- Import `.graphml` file

    ```call apoc.import.graphml('file:///networkx_graph_test.graphml',{readLabels:TRUE})```

- Add indexes after importing the graph for improving the performance of queries

    ```
    create index Gene for (n:Gene) on (n.node_type)
    create index Pathway for (n:Pathway) on (n.node_type)
    create index `Biological Process` for (n:`Biological Process`) on (n.node_type)
    create index `Molecular Function` for (n:`Molecular Function`) on (n.node_type)
    create index `Cellular Component` for (n:`Cellular Component`) on (n.node_type)
    create index Disease for (n:Disease) on (n.node_type)
    create index Compound for (n:Compound) on (n.node_type)
    create index `Side Effect` for (n:`Side Effect`) on (n.node_type)
    ```
    

- Count the number of each node type
    - total (```MATCH (n) RETURN count(n)```) 
        - Gene (```MATCH (n:Gene) RETURN count(n)```)
        - Pathway (```MATCH (n:Pathway) RETURN count(n)```)
            - WikiPathways (```MATCH (n:Pathway {source: "WikiPathways"}) RETURN count(n)```) 
            - OpenTargets, Reactome (```MATCH (n:Pathway {source: "OpenTargets"}) RETURN count(n)```) 
            - MINERVA (```MATCH (n:Pathway {source: "MINERVA"}) RETURN count(n)```) 
        - Biological Process (```MATCH (n:`Biological Process`) RETURN count(n)```) 
        - Molecular Function (```MATCH (n:`Molecular Function`) RETURN count(n)```) 
        - Cellular Component (```MATCH (n:`Cellular Component`) RETURN count(n)```) 
        - Disease (```MATCH (n:Disease) RETURN count(n)```) 
        - Compound (```MATCH (n:Compound) RETURN count(n)```)
        - Side Effect (```MATCH (n:`Side Effect`) RETURN count(n)```) 
- Count the number of each edge type
    - total (```MATCH ()-[r]->() RETURN count(r)```) 
        - interacts_with (```MATCH ()-[r:interacts_with]->() RETURN count(r)```) 
        - part_of (```MATCH ()-[r:part_of]->() RETURN count(r)```) 
            - WikiPathways (```MATCH ()-[r:part_of {source: "WikiPathways"}]->() RETURN count(r)```) 
            - OpenTargets, Reactome (```MATCH ()-[r:part_of {source: "OpenTargets"}]->() RETURN count(r)```) 
            - MINERVA (```MATCH ()-[r:part_of {source: "MINERVA"}]->() RETURN count(r)```) 
        - activates (```MATCH ()-[r:activates]->() RETURN count(r)```) 
        - treats (```MATCH ()-[r:treats]->() RETURN count(r)```) 
        - has_side_effect (```MATCH ()-[r:has_side_effect]->() RETURN count(r)```) 
        - inhibits (```MATCH ()-[r:inhibits]->() RETURN count(r)```) = 71
        - associated_with (```MATCH ()-[r:associated_with]->() RETURN count(r)```) 

- Export the graph as a `.csv` file

    ```call apoc.export.csv.all("networkx_graph_test.csv",{})```

#### 8.5. RDF

In [None]:
# Settings that identify the RDF graph and its author
bdf_settings = {
    "base_uri": "https://biodatafuse.org/example/",
    "version_iri": "https://biodatafuse.org/example/test.owl",
    "orcid": "https://orcid.org/0000-0002-4166-7093",
    "author": "Javier Millan Acosta",
}

# Instantiate the BDFGraph and populate it from the combined (meta)data
bdf = BDFGraph(**bdf_settings)
bdf.generate_rdf(combined_df, combined_metadata)

In [None]:
# Serialize the RDF graph as Turtle into the workflow data directory
rdf_graph_path = os.path.join(
    base_dir, "data", "gene_to_graph_workflow", "BDF_example_graph.ttl"
)
bdf.serialize(rdf_graph_path, format="ttl")

##### 7.5.1. Generate prefixes SHACL

SHACL graphs defining namespaces and prefixes can be loaded into SPARQL endpoints to avoid having to declare prefixes in the query.

In [None]:
# Use without parameters (defaults, does not save file)
bdf.shacl_prefixes()
# or use with parameters to save the prefixes graph next to the other outputs.
# The previous path contained an extra "examples" component and a duplicated
# "gene_to_graph_workflow" component, pointing to a directory that this notebook
# never creates.
bdf.shacl_prefixes(
    path=os.path.join(
        base_dir,
        "data",
        "gene_to_graph_workflow",
        "BDF_prefixes.ttl",
    ),
    namespaces=None,  # Optional, add more namespaces with a dictionary of {prefix:namespace,}
)

##### 7.5.2. Use [`shexer`](https://github.com/DaniFdezAlvarez/shexer/) to retrieve the RDF shapes

The `shexer` library is used to retrieve the shapes of the graph in SHACL (https://www.w3.org/TR/shacl/) and ShEx (https://shex.io/shex-semantics/).

- **SHACL**

In [None]:
# Use without parameters (defaults)
# bdf.shacl()

# Or use with parameters: output locations for the SHACL shapes and their diagram
shacl_dir = os.path.join(base_dir, "data", "gene_to_graph_workflow")
shacl_ttl_path = os.path.join(shacl_dir, "BDF_example_shacl.ttl")  # TTL serialization
shacl_png_path = os.path.join(shacl_dir, "BDF_example_shacl.png")  # UML diagram

bdf.shacl(
    path=shacl_ttl_path,
    threshold=0.001,
    uml_figure_path=shacl_png_path,
)

# Display the UML figure
display(Image(shacl_png_path))

- **ShEx**

In [None]:
# Use without parameters (defaults)
# bdf.shex()

# Or use with parameters: output locations for the ShEx shapes and their diagram
shex_dir = os.path.join(base_dir, "data", "gene_to_graph_workflow")
shex_ttl_path = os.path.join(shex_dir, "BDF_example_shex.ttl")  # shapes serialization
shex_png_path = os.path.join(shex_dir, "BDF_example_shex.png")  # UML diagram

bdf.shex(
    path=shex_ttl_path,
    threshold=0.001,
    uml_figure_path=shex_png_path,
)

# Display the UML figure
display(Image(shex_png_path))

##### 7.5.3 Set up a GraphDB instance
- Download the latest version of GraphDB from https://graphdb.ontotext.com/
- Run from a direct access or terminal
- Open the GraphDB Workbench in your web browser by navigating to `http://localhost:7200`

The `GraphDBManager` allows you to perform simple operations on your GraphDB location, such as creating a new repository, deleting a repository, importing RDF data into a repository, querying the loaded graphs and retrieving visualizations. For more advanced operations, you can use the GraphDB Workbench or the GraphDB API.

- Create a new repository

In [None]:
# GraphDB configuration
base_url = "http://localhost:7200"
repository_name = "gene_to_graph_workflow"
# Credentials are read from the GRAPHDB_USERNAME / GRAPHDB_PASSWORD environment
# variables so they do not have to be written into the notebook; the previous
# values remain the defaults when the variables are not set.
username = os.environ.get("GRAPHDB_USERNAME", "admin")
password = os.environ.get("GRAPHDB_PASSWORD", "root")

print("Creating repository...")
GraphDBManager.create_repository(base_url, repository_name, username, password)

print("Listing repositories...")
repositories = GraphDBManager.list_repositories(base_url, username, password)
print("Repositories:", repositories)

- Load the RDF data into the repository

In [None]:
# Connection arguments shared by the GraphDB calls in this cell
graphdb_target = (base_url, repository_name, username, password)

print("Uploading RDF graph to GraphDB...")
GraphDBManager.upload_to_graphdb(*graphdb_target, bdf, file_format="turtle")

print("Counting triples in the repository...")
triple_count = GraphDBManager.count_triples(*graphdb_target)
print(f"Triple count: {str(triple_count)}")

- Load the prefixes SHACL graph into the repository

In [None]:
# Connection arguments shared by the GraphDB calls in this cell
graphdb_target = (base_url, repository_name, username, password)

print("Uploading Prefixes SHACL graph to GraphDB...")
prefixes_graph = bdf.shacl_prefixes()
GraphDBManager.upload_to_graphdb(*graphdb_target, prefixes_graph)

print("Counting triples in the repository...")
triple_count = GraphDBManager.count_triples(*graphdb_target)
print(f"Triple count: {str(triple_count)}")

- Delete the repository

In [None]:
# Login arguments shared by the GraphDB calls in this cell
graphdb_login = (username, password)

print("Deleting repository...")
# Show the repositories before the deletion
repositories = GraphDBManager.list_repositories(base_url, *graphdb_login)
print("Repositories:", repositories)

GraphDBManager.delete_repository(base_url, repository_name, *graphdb_login)

# Show the repositories after the deletion
print("Listing repositories...")
repositories = GraphDBManager.list_repositories(base_url, *graphdb_login)
print("Repositories:", repositories)