# Example: Graph generation from a gene list

This notebook shows how to use pyBiodatafuse to annotate a list of genes from several data sources and to generate a knowledge graph (KG) from the combined annotations.

In [1]:
import os
import sys

# Folder that holds this notebook. It defaults to the directory the kernel was
# started in; set PYBIODATAFUSE_EXAMPLES_DIR to point somewhere else instead of
# hard-coding a machine-specific absolute path in the notebook.
new_path = os.path.abspath(os.environ.get("PYBIODATAFUSE_EXAMPLES_DIR", os.getcwd()))

# Set the current working directory so that relative paths used later
# (e.g. the pickle and GraphML outputs) resolve against that folder.
os.chdir(new_path)

# Make the in-repo package importable: `../src` relative to the examples folder.
# Computed from `new_path` (not from wherever the kernel happened to start) and
# only appended once so re-running the cell does not grow sys.path.
project_root = os.path.abspath(os.path.join(new_path, "..", "src"))
if project_root not in sys.path:
    sys.path.append(project_root)

current_dir = os.getcwd()
print("Current directory:", current_dir)

Current directory: C:\Users\are10\Documents\BAFSTU\code\pyBioDatafusemain\pyBiodatafuse\examples


In [2]:
# Import modules
import pickle

import pandas as pd
from dotenv import load_dotenv

from pyBiodatafuse import id_mapper
from pyBiodatafuse.annotators import (
    bgee,
    disgenet,
    kegg,
    minerva,
    molmedb,
    opentargets,
    pubchem,
    stringdb,
    wikipathways,
)
from pyBiodatafuse.constants import (
    BGEE_GENE_EXPRESSION_LEVELS_COL,
    DISGENET_DISEASE_COL,
    KEGG_COL_NAME,
    MINERVA,
    MOLMEDB_PROTEIN_COMPOUND_COL,
    OPENTARGETS_DISEASE_COMPOUND_COL,
    OPENTARGETS_GENE_COMPOUND_COL,
    OPENTARGETS_GO_COL,
    OPENTARGETS_REACTOME_COL,
    PUBCHEM_COMPOUND_ASSAYS_COL,
    STRING_PPI_COL,
    WIKIPATHWAYS,
)
from pyBiodatafuse.graph import generator
from pyBiodatafuse.utils import combine_sources, create_harmonized_input_file

### Load the input list and convert it to a dataframe

In [3]:
# genes_of_interest = """AGRN
# ALG14
# ALG2
# CHAT
# CHD8
# CHRNA1
# CHRNB1
# CHRND
# CHRNE
# CHRNG
# COL13A1
# COLQ
# DOK7
# DPAGT1
# GFPT1
# GMPPB
# LAMA5
# LAMB2
# LRP4
# MUSK
# MYO9A
# PLEC
# PREPL
# PURA
# RAPSN
# RPH3A
# SCN4A
# SLC18A3
# SLC25A1
# SLC5A7
# SNAP25
# SYT2
# TOR1AIP1
# UNC13A
# VAMP1"""
# genes_of_interest = """ENSMUSG00000059552
# ENSMUSG00000035847
# ENSMUSG00000016495
# ENSMUSP00000029201
# ENSMUSG00000073421
# ENSMUSG00000076883
# ENSMUSG00000013663
# ENSMUSG00000017146
# ENSMUSG00000021250
# ENSMUSG00000046532
# ENSMUSG00000026950
# ENSMUSG00000006782
# ENSMUSG00000020287
# ENSMUSP00000035037
# ENSMUSG00000051457
# ENSMUSP00000013807
# ENSMUSG00000029104
# ENSMUSG00000037716
# ENSMUSG00000027262
# ENSMUSG00000020052
# ENSMUSG00000064341
# ENSMUSG00000027512
# """

# Mouse Ensembl gene IDs (the active input for this run)
genes_of_interest = """ENSMUSG00000067274
ENSMUSG00000000001
ENSMUSG00000084349
ENSMUSG00000025428
ENSMUSG00000044533"""

# Rat Ensembl
# genes_of_interest = """ENSRNOG00060027926
# ENSRNOG00055005387
# ENSRNOG00060018596
# ENSRNOG00060011358
# ENSRNOG00055009275
# """

# Mouse HGNC
# genes_of_interest = """Rpl3
# Atp5f1a
# Rps2
# Rplp0
# Gnai3"""

# Mouse MGI
# NOTE(review): these numeric values look like NCBI Gene IDs (compare the
# "NCBI Gene" targets in the BridgeDb output), not MGI accessions such as
# MGI:95773 - confirm which input_datasource they should be used with.
# genes_of_interest = """14679
# 100043000
# 11946
# 16898
# 11837"""

# Human HGNC
# genes_of_interest = """CHRNG
# DMD
# AHR
# SCN4A
# SLC25A1
# HTR3A"""

# Human Ensembl
# genes_of_interest = """ENSG00000196811
# ENSG00000198947
# ENSG00000106546
# ENSG00000007314
# ENSG00000183048
# ENSG00000166736"""

# One identifier per line. Strip surrounding whitespace and drop blank lines so
# that a trailing newline or stray space in the block above (as in some of the
# commented alternatives) does not become a bogus identifier.
gene_list = [gene.strip() for gene in genes_of_interest.splitlines() if gene.strip()]
len(gene_list)

5

In [4]:
# from pyBiodatafuse import data_loader, id_mapper

# data_input = data_loader.create_df_from_dea("full_de_genes_treated_vs_untreated_plus_cpm_fc.csv")
# data_filtered = data_input[data_input['DE'].isin([1, -1])]
# features_filtered = data_filtered['identifier']

# gene_list = features_filtered.tolist()
# print(len(gene_list))

In [5]:
# Wrap the identifiers in a single-column dataframe named "identifier",
# which is what is passed to the BridgeDb mapping step as `identifiers`.
data_input = pd.DataFrame({"identifier": gene_list})
data_input.head(20)

Unnamed: 0,identifier
0,ENSMUSG00000067274
1,ENSMUSG00000000001
2,ENSMUSG00000084349
3,ENSMUSG00000025428
4,ENSMUSG00000044533


### Entity resolution using BridgeDb

In [6]:
# Species of the input identifiers; the same value is passed to the STRING query later on
input_species = "Mouse"

# Map the input Ensembl identifiers with BridgeDb, requesting all output datasources.
# The call returns the mapping dataframe together with a metadata object.
bridgedb_df, bridgedb_metadata = id_mapper.bridgedb_xref(
    identifiers=data_input,
    input_species=input_species,
    input_datasource="Ensembl",
    output_datasource="All",
)
# Alternative call for a human gene-symbol input:
# bridgedb_df, bridgedb_metadata = id_mapper.bridgedb_xref(
#     identifiers=data_input,
#     input_species="Human",
#     input_datasource="HGNC",
#     output_datasource="All",
# )

# Distinct target datasources found in the mapping, followed by a preview of the rows
print(bridgedb_df["target.source"].unique())
bridgedb_df.head(25)



['Ensembl' 'Uniprot-TrEMBL' 'NCBI Gene' 'MGI']


Unnamed: 0,identifier,identifier.source,target,target.source
0,ENSMUSG00000067274,Ensembl,ENSMUSG00000067274,Ensembl
1,ENSMUSG00000067274,Ensembl,Q5M8R8,Uniprot-TrEMBL
2,ENSMUSG00000067274,Ensembl,S4R1N1,Uniprot-TrEMBL
3,ENSMUSG00000067274,Ensembl,11837,NCBI Gene
4,ENSMUSG00000067274,Ensembl,P14869,Uniprot-TrEMBL
5,ENSMUSG00000067274,Ensembl,MGI:1927636,MGI
6,ENSMUSG00000067274,Ensembl,D3YVM5,Uniprot-TrEMBL
7,ENSMUSG00000000001,Ensembl,MGI:95773,MGI
8,ENSMUSG00000000001,Ensembl,Q9DC51,Uniprot-TrEMBL
9,ENSMUSG00000000001,Ensembl,ENSMUSG00000000001,Ensembl


### Gene expression from Bgee

In [1]:
# Query Bgee for the BridgeDb-mapped identifiers; the call returns the
# annotated dataframe together with a metadata object.
bgee_df, bgee_metadata = bgee.get_gene_expression(bridgedb_df=bridgedb_df)
bgee_df.head()

NameError: name 'bgee' is not defined

In [43]:
# Inspect entry 1 of the Bgee expression column to see the structure of one annotation
bgee_df[BGEE_GENE_EXPRESSION_LEVELS_COL][1]

[{'anatomical_entity_id': 'UBERON_0002371',
  'anatomical_entity_name': 'bone marrow',
  'expression_level': 98.45455,
  'confidence_level_id': 'CIO_0000029',
  'confidence_level_name': 'high confidence level',
  'developmental_stage_id': 'UBERON_0000104',
  'developmental_stage_name': 'life cycle'},
 {'anatomical_entity_id': 'UBERON_0000955',
  'anatomical_entity_name': 'brain',
  'expression_level': 99.51974,
  'confidence_level_id': 'CIO_0000029',
  'confidence_level_name': 'high confidence level',
  'developmental_stage_id': 'UBERON_0000104',
  'developmental_stage_name': 'life cycle'},
 {'anatomical_entity_id': 'UBERON_0004535',
  'anatomical_entity_name': 'cardiovascular system',
  'expression_level': 99.83003,
  'confidence_level_id': 'CIO_0000029',
  'confidence_level_name': 'high confidence level',
  'developmental_stage_id': 'UBERON_0000104',
  'developmental_stage_name': 'life cycle'},
 {'anatomical_entity_id': 'UBERON_0001007',
  'anatomical_entity_name': 'digestive system'

### Disease annotation from DisGeNET


Add your DisGeNET API key to the main folder:

**1)** Create a `.env` file and add `DISGENET_API_KEY` to it:

DISGENET_API_KEY="your-API-key-value"

**2)** Install *python-dotenv*:
```
pip install python-dotenv
```

In [8]:
# Load the variables defined in the .env file
load_dotenv()
# Look up the DisGeNET key in the environment; the result is None when the variable is not set
disgenet_api_key = os.environ.get("DISGENET_API_KEY")

In [None]:
# Query DisGeNET for gene-disease annotations using the API key loaded above;
# the call returns the annotated dataframe together with a metadata object.
disgenet_df, disgenet_metadata = disgenet.get_gene_disease(
    api_key=disgenet_api_key, bridgedb_df=bridgedb_df
)
disgenet_df.head()

In [10]:
# Inspect entry 0 of the DisGeNET disease column to see the structure of one annotation
disgenet_df[DISGENET_DISEASE_COL][0]

[{'disease_name': 'Mammary Neoplasms',
  'HPO': 'HPO_HP:0100013',
  'NCI': 'NCI_C2910',
  'OMIM': '',
  'MONDO': 'MONDO_0021100',
  'ORDO': '',
  'EFO': 'EFO_0003869',
  'DO': 'DO_3459, DO_1612',
  'MESH': 'MESH_D001943',
  'UMLS': 'UMLS_C1458155',
  'disease_type': 'disease',
  'disease_umlscui': 'C1458155',
  'score': 0.8500000000000001,
  'ei': 0.9176470588235294,
  'el': None},
 {'disease_name': 'Rheumatoid Arthritis',
  'HPO': 'HPO_HP:0001370',
  'NCI': 'NCI_C2884',
  'OMIM': 'OMIM_607218, OMIM_180300',
  'MONDO': 'MONDO_0008383',
  'ORDO': '',
  'EFO': 'EFO_0000685',
  'DO': 'DO_7148',
  'MESH': 'MESH_D001172',
  'UMLS': 'UMLS_C0003873',
  'disease_type': 'disease',
  'disease_umlscui': 'C0003873',
  'score': 0.8,
  'ei': 0.9523809523809523,
  'el': None},
 {'disease_name': 'Obesity',
  'HPO': 'HPO_HP:0001513',
  'NCI': 'NCI_C159658, NCI_C3283',
  'OMIM': 'OMIM_601665',
  'MONDO': 'MONDO_0011122, MONDO_0019182',
  'ORDO': '',
  'EFO': 'EFO_0001073',
  'DO': 'DO_9970',
  'MESH': '

### Disease to compound annotation from OpenTargets

In [11]:
# Prepare the input to use the DisGeNET output as seed for OpenTargets:
# build an identifier/target mapping table from the disease annotations
# (the saved output below shows UMLS identifiers paired with EFO targets).
disease_mapping_df = create_harmonized_input_file(disgenet_df, DISGENET_DISEASE_COL, "EFO", "UMLS")
disease_mapping_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source
0,UMLS_C1458155,UMLS,EFO_0003869,EFO
1,UMLS_C0003873,UMLS,EFO_0000685,EFO
2,UMLS_C0028754,UMLS,EFO_0001073,EFO
3,UMLS_C0025517,UMLS,EFO_0000589,EFO
4,UMLS_C2973725,UMLS,EFO_0001361,EFO


In [12]:
# Query OpenTargets for compounds linked to the diseases in the mapping table;
# the call returns the annotated dataframe together with a metadata object.
(
    opentargets_disease_compound_df,
    opentargets_disease_compound_metadata,
) = opentargets.get_disease_compound_interactions(disease_mapping_df)
opentargets_disease_compound_df.head()

  check_columns_against_constants(
  check_columns_against_constants(


Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_disease_compounds
0,UMLS_C0000786,UMLS,EFO_1001255,EFO,"[{'chembl_id': 'CHEMBL1276308', 'drugbank_id':..."
1,UMLS_C0000889,UMLS,EFO_1000660,EFO,"[{'chembl_id': 'CHEMBL1431', 'drugbank_id': 'D..."
2,UMLS_C0002940,UMLS,EFO_0004264,EFO,"[{'chembl_id': 'CHEMBL1491', 'drugbank_id': 'D..."
3,UMLS_C0002940,UMLS,EFO_0009659,EFO,"[{'chembl_id': 'CHEMBL526', 'drugbank_id': 'DB..."
4,UMLS_C0003873,UMLS,EFO_0000685,EFO,"[{'chembl_id': 'CHEMBL2103743', 'drugbank_id':..."


In [13]:
# Inspect entry 0 of the disease-compound column to see the structure of one annotation
opentargets_disease_compound_df[OPENTARGETS_DISEASE_COMPOUND_COL][0]

[{'chembl_id': 'CHEMBL1276308',
  'drugbank_id': 'DB00834',
  'compound_cid': '55245',
  'compound_name': 'MIFEPRISTONE',
  'clincal_trial_phase': 4.0,
  'is_approved': True,
  'relation': 'treats',
  'adverse_effect_count': 35.0,
  'adverse_effect': [{'name': 'abortion incomplete'},
   {'name': 'haemorrhage'},
   {'name': 'pregnancy'},
   {'name': 'endometritis'},
   {'name': 'induced abortion failed'},
   {'name': 'vaginal haemorrhage'},
   {'name': 'anaemia'},
   {'name': 'muscle spasms'},
   {'name': 'metrorrhagia'},
   {'name': 'abortion induced incomplete'},
   {'name': 'menorrhagia'},
   {'name': 'pain'},
   {'name': 'uterine haemorrhage'},
   {'name': 'post abortion infection'},
   {'name': 'uterine rupture'},
   {'name': 'ectopic pregnancy'},
   {'name': 'blood potassium decreased'},
   {'name': 'syncope'},
   {'name': 'endometritis bacterial'},
   {'name': 'pelvic inflammatory disease'},
   {'name': 'uterine dilation and curettage'},
   {'name': 'haemorrhagic anaemia'},
   {'

### Pathways from MINERVA

In [None]:
# Query MINERVA for pathways of the mapped genes in the map called
# "COVID19 Disease Map"; the call returns the annotated dataframe together
# with a metadata object.
minerva_df, minerva_metadata = minerva.get_gene_minerva_pathways(
    bridgedb_df, map_name="COVID19 Disease Map"
)
minerva_df.head()

In [None]:
# Display the metadata object returned alongside minerva_df
minerva_metadata

In [None]:
# Display the MINERVA metadata object.
# NOTE(review): this cell repeats the previous cell verbatim - confirm whether it is still needed.
minerva_metadata

In [None]:
# Inspect entry 0 of the MINERVA column to see the structure of one annotation
minerva_df[MINERVA][0]

### Pathways from WikiPathways

In [24]:
# Query WikiPathways for pathways of the mapped genes; the call returns the
# annotated dataframe together with a metadata object.
wikipathways_df, wikipathways_metadata = wikipathways.get_gene_wikipathways(bridgedb_df=bridgedb_df)
wikipathways_df.head()

Querying WikiPathways: 100%|██████████| 1/1 [00:18<00:00, 18.22s/it]


Unnamed: 0,identifier,identifier.source,target,target.source,WikiPathways
0,ENSMUSG00000000001,Ensembl,14679,NCBI Gene,"[{'pathway_id': 'WP:WP2292', 'pathway_label': ..."
1,ENSMUSG00000025428,Ensembl,11946,NCBI Gene,"[{'pathway_id': 'WP:WP295', 'pathway_label': '..."
2,ENSMUSG00000044533,Ensembl,16898,NCBI Gene,"[{'pathway_id': 'WP:WP163', 'pathway_label': '..."
3,ENSMUSG00000067274,Ensembl,11837,NCBI Gene,"[{'pathway_id': 'WP:WP163', 'pathway_label': '..."


In [44]:
# Inspect entry 0 of the WikiPathways column to see the structure of one annotation
wikipathways_df[WIKIPATHWAYS][0]

[{'pathway_id': 'WP:WP2292',
  'pathway_label': 'Chemokine signaling pathway',
  'pathway_gene_count': 187},
 {'pathway_id': 'WP:WP553',
  'pathway_label': 'Calcium regulation in cardiac cells',
  'pathway_gene_count': 145},
 {'pathway_id': 'WP:WP232',
  'pathway_label': 'G protein signaling pathways',
  'pathway_gene_count': 91},
 {'pathway_id': 'WP:WP57',
  'pathway_label': 'Signal transduction of S1P receptor',
  'pathway_gene_count': 22}]

### Reactome pathways from OpenTargets

In [7]:
# Query OpenTargets for Reactome pathways of the mapped genes; the call
# returns the annotated dataframe together with a metadata object.
opentargets_reactome_df, opentargets_reactome_metadata = opentargets.get_gene_reactome_pathways(
    bridgedb_df=bridgedb_df
)
opentargets_reactome_df.head()

  opentargets_reactome_df, opentargets_reactome_metadata = opentargets.get_gene_reactome_pathways(


In [20]:
# Inspect entry 0 of the Reactome column to see the structure of one annotation
opentargets_reactome_df[OPENTARGETS_REACTOME_COL][0]

[{'pathway_label': 'Endogenous sterols', 'pathway_id': 'R-HSA-211976'},
 {'pathway_label': 'PPARA activates gene expression',
  'pathway_id': 'R-HSA-1989781'},
 {'pathway_label': 'Phase I - Functionalization of compounds',
  'pathway_id': 'R-HSA-211945'},
 {'pathway_label': 'Xenobiotics', 'pathway_id': 'R-HSA-211981'},
 {'pathway_label': 'Aryl hydrocarbon receptor signalling',
  'pathway_id': 'R-HSA-8937144'}]

### Gene Ontology from OpenTargets

In [21]:
# Query OpenTargets for Gene Ontology annotations of the mapped genes; the call
# returns the annotated dataframe together with a metadata object.
# NOTE(review): the saved output below lists human HGNC identifiers, so it
# appears to come from a run with the human gene list - re-run to refresh it.
opentargets_go_df, opentargets_go_metadata = opentargets.get_gene_go_process(
    bridgedb_df=bridgedb_df
)
opentargets_go_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_go
0,AHR,HGNC,ENSG00000106546,Ensembl,"[{'go_id': 'GO:0005667', 'go_name': 'transcrip..."
1,CHRNG,HGNC,ENSG00000196811,Ensembl,"[{'go_id': 'GO:0015464', 'go_name': 'acetylcho..."
2,DMD,HGNC,ENSG00000198947,Ensembl,"[{'go_id': 'GO:0016010', 'go_name': 'dystrophi..."
3,HTR3A,HGNC,ENSG00000166736,Ensembl,"[{'go_id': 'GO:1904602', 'go_name': 'serotonin..."
4,SCN4A,HGNC,ENSG00000007314,Ensembl,"[{'go_id': 'GO:0035725', 'go_name': 'sodium io..."


In [None]:
# Inspect entry 0 of the GO column to see the structure of one annotation
opentargets_go_df[OPENTARGETS_GO_COL][0]

### Compounds from OpenTargets

In [None]:
# Query OpenTargets for compounds interacting with the mapped genes; the call
# returns the annotated dataframe together with a metadata object.
opentargets_compound_df, opentargets_compound_metadata = opentargets.get_gene_compound_interactions(
    bridgedb_df=bridgedb_df
)
opentargets_compound_df.head()

In [None]:
# Inspect entry 0 of the gene-compound column to see the structure of one annotation
opentargets_compound_df[OPENTARGETS_GENE_COMPOUND_COL][0]

### Transporter inhibitors from MolMeDB

In [None]:
# Query MolMeDB for compound inhibitors of the mapped genes; the call returns
# the annotated dataframe together with a metadata object.
inhibitor_df, inhibitor_metadata = molmedb.get_gene_compound_inhibitor(bridgedb_df=bridgedb_df)
inhibitor_df.head()

In [None]:
# Print entry 47 of the MolMeDB column, then list the rows whose target is P35499.
# NOTE(review): both the row number and the target ID are hard-coded and look
# like leftovers from a different (larger, possibly human) input list - confirm
# that they exist for the identifiers currently being annotated.
print(inhibitor_df[MOLMEDB_PROTEIN_COMPOUND_COL][47])
inhibitor_df[inhibitor_df["target"] == "P35499"]

### Screening results of compounds on the proteins encoded by the genes, annotated from PubChem

In [None]:
# Query PubChem for compounds screened against the mapped proteins; the call
# returns the annotated dataframe together with a metadata object.
pubchem_assay_df, pubchem_assay_metadata = pubchem.get_protein_compound_screened(
    bridgedb_df=bridgedb_df
)
pubchem_assay_df.head()

In [None]:
# Inspect entry 0 of the PubChem assay column to see the structure of one annotation
pubchem_assay_df[PUBCHEM_COMPOUND_ASSAYS_COL][0]

### Protein-Protein interactions from STRING

In [23]:
# Query STRING for protein-protein interactions, passing the species chosen in
# the BridgeDb step; the call returns the annotated dataframe together with a
# metadata object.
ppi_df, ppi_metadata = stringdb.get_ppi(bridgedb_df=bridgedb_df, species=input_species)
ppi_df.head()

  check_columns_against_constants(


Unnamed: 0,identifier,identifier.source,target,target.source,StringDB_ppi
0,ENSMUSG00000067274,Ensembl,ENSMUSG00000067274,Ensembl,"[{'stringdb_link_to': 'Atp5a1', 'Ensembl': 'EN..."
1,ENSMUSG00000000001,Ensembl,ENSMUSG00000000001,Ensembl,"[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."
2,ENSMUSG00000084349,Ensembl,ENSMUSG00000084349,Ensembl,"[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."
3,ENSMUSG00000025428,Ensembl,ENSMUSG00000025428,Ensembl,"[{'stringdb_link_to': 'Rps2', 'Ensembl': 'ENSM..."
4,ENSMUSG00000044533,Ensembl,ENSMUSG00000044533,Ensembl,"[{'stringdb_link_to': 'Atp5a1', 'Ensembl': 'EN..."


In [42]:
# Inspect entry 3 of the STRING column to see the structure of one annotation
ppi_df[STRING_PPI_COL][3]

[{'stringdb_link_to': 'Rps2',
  'Ensembl': 'ENSMUSP00000092502',
  'score': 0.902,
  'string_id': '10090.ENSMUSP00000092502',
  'uniprot_id': 'P25444'},
 {'stringdb_link_to': 'Rplp0',
  'Ensembl': 'ENSMUSP00000083705',
  'score': 0.903,
  'string_id': '10090.ENSMUSP00000083705',
  'uniprot_id': 'P14869'}]

### Pathways from KEGG

In [None]:
# Query KEGG for pathways of the mapped genes; the call returns the annotated
# dataframe together with a metadata object.
# NOTE(review): the saved output below ends in a KeyboardInterrupt, so this
# call was not left to finish in the recorded run.
kegg_df, kegg_metadata = kegg.get_pathways(bridgedb_df)
kegg_df.head()

{'pathway_id': 'path:mmu03010', 'gene_count': 264, 'compounds': []}
{'pathway_id': 'path:mmu05171', 'gene_count': 333, 'compounds': ['C00027', 'C00046', 'C00165', 'C00290', 'C00533', 'C00704', 'C00873', 'C02135', 'C15850', 'C20640']}
{'pathway_id': 'path:mmu04015', 'gene_count': 213, 'compounds': ['C00035', 'C00044', 'C00076', 'C00165', 'C00575']}
{'pathway_id': 'path:mmu04022', 'gene_count': 171, 'compounds': ['C00020', 'C00027', 'C00076', 'C00144', 'C00212', 'C00238', 'C00533', 'C00575', 'C00942', 'C01245']}
{'pathway_id': 'path:mmu04024', 'gene_count': 223, 'compounds': ['C00020', 'C00042', 'C00076', 'C00080', 'C00165', 'C00186', 'C00212', 'C00238', 'C00288', 'C00334', 'C00416', 'C00547', 'C00575', 'C00584', 'C00698', 'C00780', 'C00788', 'C01089', 'C01245', 'C01312', 'C01330', 'C01996', 'C03758', 'C20792', 'C20793']}


KeyboardInterrupt: 

In [8]:
# Inspect entry 1 of the KEGG column to see the structure of one annotation
kegg_df[KEGG_COL_NAME][1]

NameError: name 'kegg_df' is not defined

### Combining all the results into a single dataframe

In [30]:
# Combine the BridgeDb mapping with the annotation dataframes selected for this run.
# Annotation dataframes currently left out (add them to the list to include them):
#   disgenet_df, minerva_df, opentargets_reactome_df, opentargets_go_df,
#   opentargets_compound_df, inhibitor_df, pubchem_assay_df
combined_df = combine_sources(bridgedb_df, [bgee_df, wikipathways_df, kegg_df, ppi_df])
combined_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,Bgee_gene_expression_levels,WikiPathways,KEGG_pathways,StringDB_ppi
0,ENSMUSG00000067274,Ensembl,ENSMUSG00000067274,Ensembl,"[{'anatomical_entity_id': 'UBERON_0002371', 'a...","[{'pathway_id': 'WP:WP163', 'pathway_label': '...","{'KEGG_id': 'mmu:11837', 'pathways': [{'pathwa...","[{'stringdb_link_to': 'Atp5a1', 'Ensembl': 'EN..."
1,ENSMUSG00000000001,Ensembl,ENSMUSG00000000001,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a...","[{'pathway_id': 'WP:WP2292', 'pathway_label': ...","{'KEGG_id': 'mmu:14679', 'pathways': [{'pathwa...","[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."
2,ENSMUSG00000084349,Ensembl,ENSMUSG00000084349,Ensembl,"[{'anatomical_entity_id': 'UBERON_0002371', 'a...",,,"[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."
3,ENSMUSG00000025428,Ensembl,ENSMUSG00000025428,Ensembl,"[{'anatomical_entity_id': 'UBERON_0002371', 'a...","[{'pathway_id': 'WP:WP295', 'pathway_label': '...","{'KEGG_id': 'mmu:11946', 'pathways': [{'pathwa...","[{'stringdb_link_to': 'Rps2', 'Ensembl': 'ENSM..."
4,ENSMUSG00000044533,Ensembl,ENSMUSG00000044533,Ensembl,"[{'anatomical_entity_id': 'UBERON_0002371', 'a...","[{'pathway_id': 'WP:WP163', 'pathway_label': '...","{'KEGG_id': 'mmu:16898', 'pathways': [{'pathwa...","[{'stringdb_link_to': 'Atp5a1', 'Ensembl': 'EN..."


In [31]:
# (rows, columns) of the combined dataframe
combined_df.shape

(5, 8)

### Exporting the database in pickle format

In [33]:
# Persist the combined dataframe as a pickle file in the current working directory
with open("combined_df.pkl", mode="wb") as pickle_file:
    pickle.dump(combined_df, pickle_file)
# Same export for the disease-compound dataframe (disabled):
# with open("opentargets_disease_compound_df.pkl", "wb") as out:
#     pickle.dump(opentargets_disease_compound_df, out)

## Creating a graph from the annotated dataframe

In [34]:
# Optional: reload the dataframes from the pickle files written earlier instead
# of re-running the annotators (uncomment to use).
# combined_df = generator.load_dataframe_from_pickle("combined_df.pkl")
# opentargets_disease_compound_df = generator.load_dataframe_from_pickle(
#     "opentargets_disease_compound_df.pkl"
# )

combined_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,Bgee_gene_expression_levels,WikiPathways,KEGG_pathways,StringDB_ppi
0,ENSMUSG00000067274,Ensembl,ENSMUSG00000067274,Ensembl,"[{'anatomical_entity_id': 'UBERON_0002371', 'a...","[{'pathway_id': 'WP:WP163', 'pathway_label': '...","{'KEGG_id': 'mmu:11837', 'pathways': [{'pathwa...","[{'stringdb_link_to': 'Atp5a1', 'Ensembl': 'EN..."
1,ENSMUSG00000000001,Ensembl,ENSMUSG00000000001,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a...","[{'pathway_id': 'WP:WP2292', 'pathway_label': ...","{'KEGG_id': 'mmu:14679', 'pathways': [{'pathwa...","[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."
2,ENSMUSG00000084349,Ensembl,ENSMUSG00000084349,Ensembl,"[{'anatomical_entity_id': 'UBERON_0002371', 'a...",,,"[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."
3,ENSMUSG00000025428,Ensembl,ENSMUSG00000025428,Ensembl,"[{'anatomical_entity_id': 'UBERON_0002371', 'a...","[{'pathway_id': 'WP:WP295', 'pathway_label': '...","{'KEGG_id': 'mmu:11946', 'pathways': [{'pathwa...","[{'stringdb_link_to': 'Rps2', 'Ensembl': 'ENSM..."
4,ENSMUSG00000044533,Ensembl,ENSMUSG00000044533,Ensembl,"[{'anatomical_entity_id': 'UBERON_0002371', 'a...","[{'pathway_id': 'WP:WP163', 'pathway_label': '...","{'KEGG_id': 'mmu:16898', 'pathways': [{'pathwa...","[{'stringdb_link_to': 'Atp5a1', 'Ensembl': 'EN..."


In [34]:
# Preview the disease-compound dataframe produced in the OpenTargets section
opentargets_disease_compound_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_disease_compounds
0,UMLS_C0000786,UMLS,EFO_1001255,EFO,"[{'chembl_id': 'CHEMBL1276308', 'drugbank_id':..."
1,UMLS_C0000889,UMLS,EFO_1000660,EFO,"[{'chembl_id': 'CHEMBL1431', 'drugbank_id': 'D..."
2,UMLS_C0002940,UMLS,EFO_0004264,EFO,"[{'chembl_id': 'CHEMBL1491', 'drugbank_id': 'D..."
3,UMLS_C0002940,UMLS,EFO_0009659,EFO,"[{'chembl_id': 'CHEMBL526', 'drugbank_id': 'DB..."
4,UMLS_C0003873,UMLS,EFO_0000685,EFO,"[{'chembl_id': 'CHEMBL2103743', 'drugbank_id':..."


In [36]:
# Build the graph object from the combined annotation dataframe
pygraph = generator.build_networkx_graph(combined_df)

Building graph:   0%|          | 0/5 [00:00<?, ?it/s]

Building graph: 100%|██████████| 5/5 [00:00<00:00, 108.62it/s]


### Store the graph

In [37]:
# Persist the graph object as a pickle file in the current working directory
with open("networkx_graph_test.pkl", mode="wb") as graph_file:
    pickle.dump(pygraph, graph_file)

## Visualize the graph

In [None]:
# pos = nx.circular_layout(pygraph)

# plt.figure(3, figsize=(30, 30))
# nx.draw(pygraph, pos)
# plt.show()

#### Cytoscape
Make sure that Cytoscape is open before running the next cell.

In [41]:
# Imported here rather than in the import cell at the top.
# NOTE(review): presumably so the rest of the notebook runs without the
# Cytoscape tooling installed - confirm before moving it.
from pyBiodatafuse.graph import cytoscape

# Load the graph into Cytoscape as a network named "Test network"
cytoscape.load_graph(pygraph, network_name="Test network")

Applying default style...
Applying preferred layout


#### Neo4j

In [38]:
# This is the project's own `neo4j` helper module (pyBiodatafuse.graph.neo4j),
# imported here rather than in the import cell at the top.
from pyBiodatafuse.graph import neo4j

# Write the graph to a GraphML file, which the Neo4j steps below import
neo4j.save_graph_to_graphml(pygraph, "networkx_graph_test.graphml")

##### Steps to load the graph in Neo4j

- Add `.graphml` file in **import** subfolder of the DBMS folder
- Install apoc plugin
- Create `apoc.conf` file:
    ```
    apoc.trigger.enabled=true
    apoc.import.file.enabled=true
    apoc.export.file.enabled=true
    apoc.import.file.use_neo4j_config=true
    ```
- Add `apoc.conf` file to **conf** subfolder of the DBMS folder
- Open Neo4j Browser
- (Optional, only run if you have imported a graph before) Remove all the nodes before importing the `.graphml` file

    ```MATCH (n) DETACH DELETE n```

- Import `.graphml` file

    ```call apoc.import.graphml('file:///networkx_graph_test.graphml',{readLabels:TRUE})```

- Add indexes after importing the graph for improving the performance of queries

    ```
    create index Gene for (n:Gene) on (n.node_type)
    create index Pathway for (n:Pathway) on (n.node_type)
    create index `Biological Process` for (n:`Biological Process`) on (n.node_type)
    create index `Molecular Function` for (n:`Molecular Function`) on (n.node_type)
    create index `Cellular Component` for (n:`Cellular Component`) on (n.node_type)
    create index Disease for (n:Disease) on (n.node_type)
    create index Compound for (n:Compound) on (n.node_type)
    create index `Side Effect` for (n:`Side Effect`) on (n.node_type)
    ```
    

- Count the number of each node type
    - total (```MATCH (n) RETURN count(n)```) 
        - Gene (```MATCH (n:Gene) RETURN count(n)```)
        - Pathway (```MATCH (n:Pathway) RETURN count(n)```)
            - WikiPathways (```MATCH (n:Pathway {source: "WikiPathways"}) RETURN count(n)```) 
            - OpenTargets, Reactome (```MATCH (n:Pathway {source: "OpenTargets"}) RETURN count(n)```) 
            - MINERVA (```MATCH (n:Pathway {source: "MINERVA"}) RETURN count(n)```) 
        - Biological Process (```MATCH (n:`Biological Process`) RETURN count(n)```) 
        - Molecular Function (```MATCH (n:`Molecular Function`) RETURN count(n)```) 
        - Cellular Component (```MATCH (n:`Cellular Component`) RETURN count(n)```) 
        - Disease (```MATCH (n:Disease) RETURN count(n)```) 
        - Compound (```MATCH (n:Compound) RETURN count(n)```)
        - Side Effect (```MATCH (n:`Side Effect`) RETURN count(n)```) 
- Count the number of each edge type
    - total (```MATCH ()-[r]->() RETURN count(r)```) 
        - interacts_with (```MATCH ()-[r:interacts_with]->() RETURN count(r)```) 
        - part_of (```MATCH ()-[r:part_of]->() RETURN count(r)```) 
            - WikiPathways (```MATCH ()-[r:part_of {source: "WikiPathways"}]->() RETURN count(r)```) 
            - OpenTargets, Reactome (```MATCH ()-[r:part_of {source: "OpenTargets"}]->() RETURN count(r)```) 
            - MINERVA (```MATCH ()-[r:part_of {source: "MINERVA"}]->() RETURN count(r)```) 
        - activates (```MATCH ()-[r:activates]->() RETURN count(r)```) 
        - treats (```MATCH ()-[r:treats]->() RETURN count(r)```) 
        - has_side_effect (```MATCH ()-[r:has_side_effect]->() RETURN count(r)```) 
        - inhibits (```MATCH ()-[r:inhibits]->() RETURN count(r)```) = 71
        - associated_with (```MATCH ()-[r:associated_with]->() RETURN count(r)```) 

- Export the graph as a `.csv` file

    ```call apoc.export.csv.all("networkx_graph_test.csv",{})```