# Gene-to-graph workflow

This notebook showcases the steps to generate the BioDataFuse data and graph serializations from a list of genes.


In [1]:
# Import modules
import os
import pickle

import pandas as pd
from IPython.display import Image, display

from pyBiodatafuse import id_mapper
from pyBiodatafuse.annotators import (
    bgee,
    disgenet,
    minerva,
    molmedb,
    opentargets,
    pubchem,
    stringdb,
    wikipathways,
)
from pyBiodatafuse.graph import generator
from pyBiodatafuse.graph.rdf import BDFGraph
from pyBiodatafuse.graph.saver import save_graph
from pyBiodatafuse.utils import (
    combine_sources,
    create_harmonized_input_file,
    create_or_append_to_metadata,
)

  from .autonotebook import tqdm as notebook_tqdm


## 1. Entity resolution with BridgeDB

The first step is to input the list of genes to query and retrieve their protein target and synonym identifiers using BridgeDB.

### 1.1. Load the input list and convert it to a dataframe

In [2]:
# NCBI Gene identifiers to annotate, one per line.
genes_of_interest = """7350
6198
1499
6528
6714
10000
10891
6194
7068
4193
3709
"""

# The literal ends with a newline, so a plain split("\n") would append an empty
# identifier; drop blank lines and surrounding whitespace instead.
gene_list = [gene.strip() for gene in genes_of_interest.splitlines() if gene.strip()]
data_input = pd.DataFrame(gene_list, columns=["identifier"])
data_input.head()

Unnamed: 0,identifier
0,7350
1,6198
2,1499
3,6528
4,6714


### 1.2. Query BridgeDB
The results will be stored in the following directories:

In [3]:
# Output locations are resolved relative to the directory the notebook runs in.
base_dir = os.path.abspath(os.getcwd())
DATA_DIR = os.path.join(base_dir, "data")
EXAMPLE_DIR = os.path.join(DATA_DIR, "gene_to_graph_workflow")

# Create the parent first, then the workflow-specific folder; both calls are
# no-ops when the directory already exists.
for directory in (DATA_DIR, EXAMPLE_DIR):
    os.makedirs(directory, exist_ok=True)

In [4]:
# Cache files for the BridgeDb mapping table and its metadata.
pickle_path = os.path.join(EXAMPLE_DIR, "gene_list.pkl")
metadata_path = os.path.join(EXAMPLE_DIR, "gene_list_metadata.pkl")

# Reuse the cache only when BOTH files exist; if either one is missing (e.g. an
# interrupted earlier run), query BridgeDb again and rewrite both files.
# NOTE: pickle.load is only appropriate here because these files are written by
# this notebook; never point these paths at untrusted files.
if os.path.exists(pickle_path) and os.path.exists(metadata_path):
    bridgedb_df = pd.read_pickle(pickle_path)
    with open(metadata_path, "rb") as file:
        bridgedb_metadata = pickle.load(file)
else:
    bridgedb_df, bridgedb_metadata = id_mapper.bridgedb_xref(
        identifiers=data_input,
        input_species="Human",
        input_datasource="NCBI Gene",
        output_datasource="All",
    )
    bridgedb_df.to_pickle(pickle_path)
    with open(metadata_path, "wb") as file:
        pickle.dump(bridgedb_metadata, file)

In [5]:
# Count distinct input identifiers present in the BridgeDb result, then preview it.
n_mapped_genes = len(bridgedb_df["identifier"].unique())
print("Number of genes with mapping in BridgeDb:", n_mapped_genes)
bridgedb_df.head()

Number of genes with mapping in BridgeDb: 11


Unnamed: 0,identifier,identifier.source,target,target.source
0,7350,Entrez Gene,2787089,Affy
1,7350,Entrez Gene,8102904,Affy
2,7350,Entrez Gene,2787088,Affy
3,7350,Entrez Gene,GO:0071398,Gene Ontology
4,7350,Entrez Gene,2787087,Affy


## 2. Gene expression
### 2.1. Gene expression from Bgee

In [6]:
# Cache files for the Bgee gene-expression annotations and their metadata.
bgee_path = os.path.join(EXAMPLE_DIR, "bgee.pkl")
bgee_metadata_path = os.path.join(EXAMPLE_DIR, "bgee_metadata.pkl")

# Reuse the cache only when BOTH files exist; otherwise query Bgee again and
# rewrite both, so a missing metadata file cannot break the load branch.
# NOTE: only unpickle files this notebook wrote itself.
if os.path.exists(bgee_path) and os.path.exists(bgee_metadata_path):
    bgee_df = pd.read_pickle(bgee_path)
    with open(bgee_metadata_path, "rb") as file:
        bgee_metadata = pickle.load(file)
else:
    bgee_df, bgee_metadata = bgee.get_gene_expression(bridgedb_df=bridgedb_df)
    bgee_df.to_pickle(bgee_path)
    with open(bgee_metadata_path, "wb") as file:
        pickle.dump(bgee_metadata, file)

In [7]:
# Preview the first two rows returned by the Bgee annotator.
bgee_df.head(2)

Unnamed: 0,identifier,identifier.source,target,target.source,Bgee_gene_expression_levels
0,10000,Entrez Gene,ENSG00000117020,Ensembl,"[{'anatomical_entity_id': 'UBERON:0000178', 'a..."
1,10891,Entrez Gene,ENSG00000109819,Ensembl,"[{'anatomical_entity_id': 'UBERON:0000178', 'a..."


## 3. Disease annotation
### 3.1. Gene to disease annotation with DisGeNET

In [8]:
# Cache files for the DisGeNET gene-disease annotations and their metadata.
disgenet_path = os.path.join(EXAMPLE_DIR, "disgenet.pkl")
disgenet_metadata_path = os.path.join(EXAMPLE_DIR, "disgenet_metadata.pkl")

# Credentials must not be committed with the notebook: read the key from the
# DISGENET_API_KEY environment variable instead. Any key that was previously
# hardcoded here should be considered leaked and be revoked.
DISGENET_API_KEY = os.environ.get("DISGENET_API_KEY")

# Reuse the cache only when BOTH files exist; the API key is needed only when
# DisGeNET actually has to be queried.
# NOTE: only unpickle files this notebook wrote itself.
if os.path.exists(disgenet_path) and os.path.exists(disgenet_metadata_path):
    disgenet_df = pd.read_pickle(disgenet_path)
    with open(disgenet_metadata_path, "rb") as file:
        disgenet_metadata = pickle.load(file)
else:
    if not DISGENET_API_KEY:
        raise ValueError(
            "No cached DisGeNET results found and the DISGENET_API_KEY "
            "environment variable is not set."
        )
    disgenet_df, disgenet_metadata = disgenet.get_gene_disease(
        api_key=DISGENET_API_KEY, bridgedb_df=bridgedb_df
    )

    disgenet_df.to_pickle(disgenet_path)
    with open(disgenet_metadata_path, "wb") as file:
        pickle.dump(disgenet_metadata, file)

In [9]:
# Preview the first two rows returned by the DisGeNET annotator.
disgenet_df.head(2)

Unnamed: 0,identifier,identifier.source,target,target.source,DISGENET_diseases
0,10000,Entrez Gene,10000,NCBI Gene,"[{'disease_name': 'melanoma', 'HPO': 'HPO:HP:0..."
1,10891,Entrez Gene,10891,NCBI Gene,"[{'disease_name': 'Diabetes Mellitus, Non-Insu..."


### 3.2. Disease to compound annotation from OpenTargets

In [10]:
# Prepare the input to use DISGENET output as seed for OpenTargets
import pyBiodatafuse.constants as Cons

# Build an identifier/target mapping table from the DisGeNET disease column.
# NOTE(review): judging by the preview below (identifier.source = UMLS,
# target.source = EFO), Cons.EFO is the target and Cons.UMLS the source
# identifier type — confirm against create_harmonized_input_file's signature.
disease_mapping_df = create_harmonized_input_file(
    disgenet_df, Cons.DISGENET_DISEASE_COL, Cons.EFO, Cons.UMLS
)
disease_mapping_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source
0,UMLS_C0025202,UMLS,EFO_0000756,EFO
1,UMLS_C0376358,UMLS,EFO_0000673,EFO
2,UMLS_C0376358,UMLS,EFO_0001663,EFO
3,UMLS_C1621958,UMLS,EFO_0000519,EFO
4,UMLS_C0017636,UMLS,EFO_0000519,EFO


In [11]:
# Cache files for the OpenTargets disease-compound annotations and their metadata.
opentarget_path = os.path.join(EXAMPLE_DIR, "opentarget_cmpd.pkl")
opentarget_metadata_path = os.path.join(EXAMPLE_DIR, "opentarget_cmpd_metadata.pkl")

# Reuse the cache only when BOTH files exist; otherwise query OpenTargets again
# and rewrite both, so a missing metadata file cannot break the load branch.
# NOTE: only unpickle files this notebook wrote itself.
if os.path.exists(opentarget_path) and os.path.exists(opentarget_metadata_path):
    opentargets_df = pd.read_pickle(opentarget_path)
    with open(opentarget_metadata_path, "rb") as file:
        opentargets_metadata = pickle.load(file)
else:
    opentargets_df, opentargets_metadata = opentargets.get_disease_compound_interactions(
        disease_mapping_df
    )
    opentargets_df.to_pickle(opentarget_path)
    with open(opentarget_metadata_path, "wb") as file:
        pickle.dump(opentargets_metadata, file)

> NOTE: No drugs were found for the diseases of interest.

## 4. Pathways and Gene Ontology terms
### 4.1. Pathways from MINERVA

In [12]:
# Cache files for the MINERVA pathway annotations and their metadata.
minerva_path = os.path.join(EXAMPLE_DIR, "minerva.pkl")
minerva_metadata_path = os.path.join(EXAMPLE_DIR, "minerva_metadata.pkl")

# Reuse the cache only when BOTH files exist; otherwise query MINERVA again and
# rewrite both, so a missing metadata file cannot break the load branch.
# NOTE: only unpickle files this notebook wrote itself.
if os.path.exists(minerva_path) and os.path.exists(minerva_metadata_path):
    minerva_df = pd.read_pickle(minerva_path)
    with open(minerva_metadata_path, "rb") as file:
        minerva_metadata = pickle.load(file)
else:
    # Pathways are looked up in the "COVID19 Disease Map" project.
    minerva_df, minerva_metadata = minerva.get_gene_pathways(
        bridgedb_df, map_name="COVID19 Disease Map"
    )
    minerva_df.to_pickle(minerva_path)
    with open(minerva_metadata_path, "wb") as file:
        pickle.dump(minerva_metadata, file)

In [13]:
# Preview the first two rows returned by the MINERVA annotator.
minerva_df.head(2)

Unnamed: 0,identifier,identifier.source,target,target.source,MINERVA_pathways
0,10000,Entrez Gene,ENSG00000117020,Ensembl,"[{'pathway_id': nan, 'pathway_label': nan, 'pa..."
1,10891,Entrez Gene,ENSG00000109819,Ensembl,"[{'pathway_id': nan, 'pathway_label': nan, 'pa..."


### 4.2. Pathways from WikiPathways

In [14]:
# Cache files for the WikiPathways pathway annotations and their metadata.
wikipathways_path = os.path.join(EXAMPLE_DIR, "wikipathways.pkl")
wikipathways_metadata_path = os.path.join(EXAMPLE_DIR, "wikipathways_metadata.pkl")

# Reuse the cache only when BOTH files exist; otherwise query WikiPathways again
# and rewrite both, so a missing metadata file cannot break the load branch.
# NOTE: only unpickle files this notebook wrote itself.
if os.path.exists(wikipathways_path) and os.path.exists(wikipathways_metadata_path):
    wikipathways_df = pd.read_pickle(wikipathways_path)
    with open(wikipathways_metadata_path, "rb") as file:
        wikipathways_metadata = pickle.load(file)
else:
    wikipathways_df, wikipathways_metadata = wikipathways.get_gene_wikipathways(
        bridgedb_df=bridgedb_df
    )
    wikipathways_df.to_pickle(wikipathways_path)
    with open(wikipathways_metadata_path, "wb") as file:
        pickle.dump(wikipathways_metadata, file)

In [15]:
# Preview the first two rows returned by the WikiPathways annotator.
wikipathways_df.head(2)

Unnamed: 0,identifier,identifier.source,target,target.source,WikiPathways
0,10000,Entrez Gene,10000,NCBI Gene,"[{'pathway_id': 'WP:WP4746', 'pathway_label': ..."
1,10891,Entrez Gene,10891,NCBI Gene,"[{'pathway_id': 'WP:WP4746', 'pathway_label': ..."


### Alternatively, you can retrieve the molecular interactions for genes and metabolites from WikiPathways

In [16]:
# Cache files for the WikiPathways molecular-interaction annotations and their metadata.
wikipathways_mol_path = os.path.join(EXAMPLE_DIR, "wikipathways_mol.pkl")
wikipathways_mol_metadata_path = os.path.join(EXAMPLE_DIR, "wikipathways_mol_metadata.pkl")

# Reuse the cache only when BOTH files exist; otherwise query WikiPathways again
# and rewrite both, so a missing metadata file cannot break the load branch.
# NOTE: only unpickle files this notebook wrote itself.
if os.path.exists(wikipathways_mol_path) and os.path.exists(wikipathways_mol_metadata_path):
    wikipathways_mol_df = pd.read_pickle(wikipathways_mol_path)
    with open(wikipathways_mol_metadata_path, "rb") as file:
        wikipathways_mol_metadata = pickle.load(file)
else:
    # Same annotator as the pathway query, with query_interactions switched on.
    wikipathways_mol_df, wikipathways_mol_metadata = wikipathways.get_gene_wikipathways(
        bridgedb_df=bridgedb_df, query_interactions=True
    )
    wikipathways_mol_df.to_pickle(wikipathways_mol_path)
    with open(wikipathways_mol_metadata_path, "wb") as file:
        pickle.dump(wikipathways_mol_metadata, file)

In [17]:
# Preview the first two rows of the WikiPathways molecular-interaction query.
wikipathways_mol_df.head(2)

Unnamed: 0,identifier,identifier.source,target,target.source,WikiPathways_molecular
0,10000,Entrez Gene,10000,NCBI Gene,"[{'pathway_id': 'WP:4746', 'pathway_label': 'T..."
1,10891,Entrez Gene,10891,NCBI Gene,"[{'pathway_id': 'WP:5294', 'pathway_label': 'S..."


### 4.3. Reactome pathways from OpenTargets

In [18]:
# Cache files for the Reactome pathway annotations (via OpenTargets) and their metadata.
opentargets_reactome_path = os.path.join(EXAMPLE_DIR, "opentargets_reactome.pkl")
opentargets_reactome_metadata_path = os.path.join(
    EXAMPLE_DIR, "opentargets_reactome_metadata.pkl"
)

# Reuse the cache only when BOTH files exist; otherwise query OpenTargets again
# and rewrite both, so a missing metadata file cannot break the load branch.
# NOTE: only unpickle files this notebook wrote itself.
if os.path.exists(opentargets_reactome_path) and os.path.exists(
    opentargets_reactome_metadata_path
):
    opentargets_reactome_df = pd.read_pickle(opentargets_reactome_path)
    with open(opentargets_reactome_metadata_path, "rb") as file:
        opentargets_reactome_metadata = pickle.load(file)
else:
    opentargets_reactome_df, opentargets_reactome_metadata = opentargets.get_gene_reactome_pathways(
        bridgedb_df=bridgedb_df
    )
    opentargets_reactome_df.to_pickle(opentargets_reactome_path)
    with open(opentargets_reactome_metadata_path, "wb") as file:
        pickle.dump(opentargets_reactome_metadata, file)

In [19]:
# Preview the first two rows of the OpenTargets Reactome pathway query.
opentargets_reactome_df.head(2)

Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_reactome
0,10000,Entrez Gene,ENSG00000117020,Ensembl,"[{'pathway_label': 'FLT3 Signaling', 'pathway_..."
1,10891,Entrez Gene,ENSG00000109819,Ensembl,[{'pathway_label': 'Transcriptional regulation...


### 4.4. Gene Ontology from OpenTargets

In [20]:
# Cache files for the Gene Ontology annotations (via OpenTargets) and their metadata.
opentargets_go_path = os.path.join(EXAMPLE_DIR, "opentargets_go.pkl")
opentargets_go_metadata_path = os.path.join(EXAMPLE_DIR, "opentargets_go_metadata.pkl")

# Reuse the cache only when BOTH files exist; otherwise query OpenTargets again
# and rewrite both, so a missing metadata file cannot break the load branch.
# NOTE: only unpickle files this notebook wrote itself.
if os.path.exists(opentargets_go_path) and os.path.exists(opentargets_go_metadata_path):
    opentargets_go_df = pd.read_pickle(opentargets_go_path)
    with open(opentargets_go_metadata_path, "rb") as file:
        opentargets_go_metadata = pickle.load(file)
else:
    opentargets_go_df, opentargets_go_metadata = opentargets.get_gene_go_process(
        bridgedb_df=bridgedb_df
    )
    opentargets_go_df.to_pickle(opentargets_go_path)
    with open(opentargets_go_metadata_path, "wb") as file:
        pickle.dump(opentargets_go_metadata, file)

In [21]:
# Preview the first two rows of the OpenTargets Gene Ontology query.
opentargets_go_df.head(2)

Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_go
0,10000,Entrez Gene,ENSG00000117020,Ensembl,"[{'go_id': 'GO:0004711', 'go_name': 'ribosomal..."
1,10891,Entrez Gene,ENSG00000109819,Ensembl,"[{'go_id': 'GO:0022904', 'go_name': 'respirato..."


## 5. Compound annotation
### 5.1. Compound annotation from OpenTargets

In [22]:
# Cache files for the OpenTargets gene-compound annotations and their metadata.
opentargets_gene_path = os.path.join(EXAMPLE_DIR, "opentargets_gene_cmpd.pkl")
opentargets_gene_metadata_path = os.path.join(EXAMPLE_DIR, "opentargets_gene_cmpd_metadata.pkl")

# Reuse the cache only when BOTH files exist; otherwise query OpenTargets again
# and rewrite both, so a missing metadata file cannot break the load branch.
# NOTE: only unpickle files this notebook wrote itself.
if os.path.exists(opentargets_gene_path) and os.path.exists(opentargets_gene_metadata_path):
    opentargets_compound_df = pd.read_pickle(opentargets_gene_path)
    with open(opentargets_gene_metadata_path, "rb") as file:
        opentargets_compound_metadata = pickle.load(file)
else:
    opentargets_compound_df, opentargets_compound_metadata = (
        opentargets.get_gene_compound_interactions(bridgedb_df=bridgedb_df)
    )
    opentargets_compound_df.to_pickle(opentargets_gene_path)
    with open(opentargets_gene_metadata_path, "wb") as file:
        pickle.dump(opentargets_compound_metadata, file)

In [23]:
# Preview the first two rows of the OpenTargets gene-compound query.
opentargets_compound_df.head(2)

Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_gene_compounds
0,10000,Entrez Gene,ENSG00000117020,Ensembl,"[{'chembl_id': 'CHEMBL:CHEMBL2177390', 'drugba..."
1,10891,Entrez Gene,ENSG00000109819,Ensembl,"[{'chembl_id': nan, 'drugbank_id': nan, 'compo..."


### 5.2. Compound screening results from PubChem for the proteins encoded by the genes

In [24]:
# Cache files for the PubChem screening annotations and their metadata.
pubchem_path = os.path.join(EXAMPLE_DIR, "pubchem.pkl")
pubchem_metadata_path = os.path.join(EXAMPLE_DIR, "pubchem_metadata.pkl")

# Reuse the cache only when BOTH files exist; otherwise query PubChem again and
# rewrite both, so a missing metadata file cannot break the load branch.
# NOTE: only unpickle files this notebook wrote itself.
if os.path.exists(pubchem_path) and os.path.exists(pubchem_metadata_path):
    pubchem_assay_df = pd.read_pickle(pubchem_path)
    with open(pubchem_metadata_path, "rb") as file:
        pubchem_assay_metadata = pickle.load(file)
else:
    pubchem_assay_df, pubchem_assay_metadata = pubchem.get_protein_compound_screened(
        bridgedb_df=bridgedb_df
    )
    pubchem_assay_df.to_pickle(pubchem_path)
    with open(pubchem_metadata_path, "wb") as file:
        pickle.dump(pubchem_assay_metadata, file)

In [25]:
# Preview the first two rows returned by the PubChem annotator.
pubchem_assay_df.head(2)

Unnamed: 0,identifier,identifier.source,target,target.source,PubChem_assays
0,10000,Entrez Gene,A0A5F9ZGY0,Uniprot-TrEMBL,"[{'pubchem_assay_id': nan, 'assay_type': nan, ..."
1,10000,Entrez Gene,A0A5F9ZGZ4,Uniprot-TrEMBL,"[{'pubchem_assay_id': nan, 'assay_type': nan, ..."


## 6. Membrane transport annotations
### 6.1. Transporter inhibitor annotation from MolMeDB

In [26]:
# Cache files for the MolMeDB transporter-inhibitor annotations and their metadata.
molmedb_path = os.path.join(EXAMPLE_DIR, "molmedb_gene_cmpd.pkl")
molmedb_metadata_path = os.path.join(EXAMPLE_DIR, "molmedb_gene_cmpd_metadata.pkl")

# Reuse the cache only when BOTH files exist; otherwise query MolMeDB again and
# rewrite both, so a missing metadata file cannot break the load branch.
# NOTE: only unpickle files this notebook wrote itself.
if os.path.exists(molmedb_path) and os.path.exists(molmedb_metadata_path):
    inhibitor_df = pd.read_pickle(molmedb_path)
    with open(molmedb_metadata_path, "rb") as file:
        inhibitor_metadata = pickle.load(file)
else:
    inhibitor_df, inhibitor_metadata = molmedb.get_gene_compound_inhibitor(bridgedb_df=bridgedb_df)
    inhibitor_df.to_pickle(molmedb_path)
    with open(molmedb_metadata_path, "wb") as file:
        pickle.dump(inhibitor_metadata, file)

> NOTE: No annotations found here

## 7. Protein-Protein Interactions

### 7.1. Protein-Protein Interactions from STRING

In [27]:
# Cache files for the STRING protein-protein interactions and their metadata.
string_path = os.path.join(EXAMPLE_DIR, "string.pkl")
string_metadata_path = os.path.join(EXAMPLE_DIR, "string_metadata.pkl")

# Reuse the cache only when BOTH files exist; otherwise query STRING again and
# rewrite both, so a missing metadata file cannot break the load branch.
# NOTE: only unpickle files this notebook wrote itself.
if os.path.exists(string_path) and os.path.exists(string_metadata_path):
    ppi_df = pd.read_pickle(string_path)
    with open(string_metadata_path, "rb") as file:
        ppi_metadata = pickle.load(file)
else:
    ppi_df, ppi_metadata = stringdb.get_ppi(bridgedb_df=bridgedb_df)
    ppi_df.to_pickle(string_path)
    with open(string_metadata_path, "wb") as file:
        pickle.dump(ppi_metadata, file)

In [28]:
# Preview the first two rows returned by the STRING annotator.
ppi_df.head(2)

> NOTE: No interactions found here

## 8. Graph generation 

### 8.1. Combine all data and metadata

In [29]:
# Annotation tables to merge onto the BridgeDb mapping, in the order they are
# handed to combine_sources.
annotation_dfs = [
    bgee_df,
    disgenet_df,
    minerva_df,
    wikipathways_df,
    opentargets_reactome_df,
    opentargets_go_df,
    opentargets_compound_df,
    inhibitor_df,
    pubchem_assay_df,
    ppi_df,
    wikipathways_mol_df,
]
combined_df = combine_sources(bridgedb_df, annotation_dfs)

# NOTE(review): this pickle is written before the DisGeNET column is shuffled in
# a later cell, so it holds the unshuffled DisGeNET data — confirm that is intended.
combined_df_path = f"{EXAMPLE_DIR}/combined_df.pkl"
combined_df.to_pickle(combined_df_path)

In [30]:
# Preview the first two rows of the combined annotation table.
combined_df.head(2)

Unnamed: 0,identifier,identifier.source,target,target.source,Bgee_gene_expression_levels,DISGENET_diseases,MINERVA_pathways,WikiPathways,OpenTargets_reactome,OpenTargets_go,OpenTargets_gene_compounds,PubChem_assays,WikiPathways_molecular
0,7350,Entrez Gene,ENSG00000109424,Ensembl,"[{'anatomical_entity_id': 'UBERON:0000955', 'a...","[{'disease_name': 'Obesity', 'HPO': 'HPO:HP:00...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': 'WP:WP4746', 'pathway_label': ...","[{'pathway_label': 'Mitochondrial Uncoupling',...","[{'go_id': 'GO:1990542', 'go_name': 'mitochond...","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'pubchem_assay_id': nan, 'assay_type': nan, ...","[{'pathway_id': 'WP:4746', 'pathway_label': 'T..."
1,6198,Entrez Gene,ENSG00000108443,Ensembl,"[{'anatomical_entity_id': 'UBERON:0000178', 'a...","[{'disease_name': 'Colorectal Carcinoma', 'HPO...","[{'pathway_id': 'MINERVA:939', 'pathway_label'...","[{'pathway_id': 'WP:WP4746', 'pathway_label': ...",[{'pathway_label': 'mTORC1-mediated signalling...,"[{'go_id': 'GO:0043201', 'go_name': 'response ...","[{'chembl_id': 'CHEMBL:CHEMBL3545076', 'drugba...","[{'pubchem_assay_id': nan, 'assay_type': nan, ...","[{'pathway_id': 'WP:4936', 'pathway_label': 'P..."


Because DisGeNET data is not open access, the following code chunk shuffles the DisGeNET-queried data before the graphs are serialized and displayed.

In [31]:
# Shuffle the entries inside each DISGENET_diseases cell.
import numpy as np

# A fixed seed makes the shuffle reproducible under Restart Kernel -> Run All;
# the unseeded global RNG produced a different graph on every run.
SHUFFLE_SEED = 42
shuffle_rng = np.random.default_rng(SHUFFLE_SEED)

# Generator.permutation returns a shuffled copy of each cell's value.
combined_df["DISGENET_diseases"] = combined_df["DISGENET_diseases"].apply(
    shuffle_rng.permutation
)

In [32]:
# Metadata records from every annotator queried above, gathered in one list
# before being merged with the BridgeDb metadata.
annotator_metadata = [
    bgee_metadata,
    disgenet_metadata,
    opentargets_metadata,
    opentargets_compound_metadata,
    inhibitor_metadata,
    pubchem_assay_metadata,
    ppi_metadata,
    wikipathways_metadata,
    minerva_metadata,
    opentargets_reactome_metadata,
    opentargets_go_metadata,
    wikipathways_mol_metadata,
]
combined_metadata = create_or_append_to_metadata(bridgedb_metadata, annotator_metadata)

### 8.2. Create a graph from the annotated dataframe

In [33]:
# Build the graph from the combined table and metadata and write the outputs
# into EXAMPLE_DIR using the "examples" name; the returned graph object is kept
# for the Cytoscape step.
pygraph = save_graph(
    graph_name="examples",
    graph_dir=EXAMPLE_DIR,
    combined_df=combined_df,
    combined_metadata=combined_metadata,
    disease_compound=opentargets_df,
)

Combined DataFrame saved in /Users/yojana/Documents/GitHub/pyBiodatafuse/examples/data/gene_to_graph_workflow/examples_df.pkl
Metadata saved in /Users/yojana/Documents/GitHub/pyBiodatafuse/examples/data/gene_to_graph_workflow/examples_metadata.pkl
Building graph: 100%|██████████| 78/78 [00:00<00:00, 138.04it/s]
Graph is built successfully
Graph saved in /Users/yojana/Documents/GitHub/pyBiodatafuse/examples/data/gene_to_graph_workflow/examples_graph.pkl and /Users/yojana/Documents/GitHub/pyBiodatafuse/examples/data/gene_to_graph_workflow/examples_graph.gml
Graph saved in /Users/yojana/Documents/GitHub/pyBiodatafuse/examples/data/gene_to_graph_workflow/examples_graph.edgelist


### 8.3. Cytoscape
Make sure that Cytoscape is open before running the following cells.

In [None]:
from pyBiodatafuse.graph import cytoscape

# Load the generated graph into Cytoscape under a fixed network name.
# Cytoscape must already be open when this runs.
cytoscape.load_graph(pygraph, network_name="Gene to Graph Workflow")

In [None]:
import py4cytoscape as p4c

# NOTE(review): presumably exports the current Cytoscape network view and shows
# it inline in the notebook — confirm against the py4cytoscape documentation.
p4c.notebook_export_show_image()