# Example: Graph generation from a gene

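This notebook shows how to use pyBiodatafuse to build a knowledge graph (KG) from a list of genes: the gene identifiers are mapped with BridgeDb, annotated from several data sources, combined into a single dataframe, and finally converted into a graph.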

In [1]:
import os

# Root of the local pyBiodatafuse checkout; adjust this to your own machine.
# A raw string is used so the Windows backslashes are taken literally:
# "\B" and "\p" are not valid escape sequences in a normal string literal
# and trigger a warning on recent Python versions.
new_path = r"E:\BioDataFuse\pyBiodatafuse"

# Set the current working directory, so that relative file names used later
# in the notebook (e.g. the exported pickle files) resolve against it.
os.chdir(new_path)

# Report the working directory that is now in effect
current_dir = os.getcwd()
print("Current directory:", current_dir)

Current directory: E:\BioDataFuse\pyBiodatafuse


In [2]:
# Import modules
import pickle

import pandas as pd
from dotenv import load_dotenv

from pyBiodatafuse import id_mapper
from pyBiodatafuse.annotators import (
    bgee,
    disgenet,
    minerva,
    molmedb,
    opentargets,
    pubchem,
    stringdb,
    wikipathways,
)
from pyBiodatafuse.constants import (
    BGEE_GENE_EXPRESSION_LEVELS_COL,
    DISGENET_DISEASE_COL,
    MINERVA,
    MOLMEDB_PROTEIN_COMPOUND_COL,
    OPENTARGETS_DISEASE_COMPOUND_COL,
    OPENTARGETS_GENE_COMPOUND_COL,
    OPENTARGETS_GO_COL,
    OPENTARGETS_REACTOME_COL,
    PUBCHEM_COMPOUND_ASSAYS_COL,
    STRING_PPI_COL,
    WIKIPATHWAYS,
)
from pyBiodatafuse.graph import generator
from pyBiodatafuse.utils import combine_sources, create_harmonized_input_file

### Load the input list and convert it to a dataframe

In [3]:
# Genes of interest, one HGNC gene symbol per line.
# To annotate a different gene set, replace the symbols below, e.g. with a
# single gene ("DMD", "AHR", "CHRNG", "HOXA10") or with the larger panel that
# was kept (commented out) in the original notebook:
#   AAGRN, ALG14, ALG2, CHAT, CHD8, CHRNA1, CHRNB1, CHRND, CHRNE, CHRNG,
#   COL13A1, COLQ, DOK7, DPAGT1, GFPT1, GMPPB, LAMA5, LAMB2, LRP4, MUSK,
#   MYO9A, PLEC, PREPL, PURA, RAPSN, RPH3A, SCN4A, SLC18A3, SLC25A1, SLC5A7,
#   SNAP25, SYT2, TOR1AIP1, UNC13A, VAMP1
genes_of_interest = """CHRNG
DMD
AHR
SCN4A
SLC25A1
HTR3A"""

# splitlines() handles any newline convention; blank lines and surrounding
# whitespace are dropped so they cannot end up as empty/dirty identifiers.
gene_list = [symbol.strip() for symbol in genes_of_interest.splitlines() if symbol.strip()]
len(gene_list)

6

In [4]:
# Wrap the gene symbols in a one-column table; the column is named
# "identifier" and holds one gene symbol per row.
data_input = pd.DataFrame({"identifier": gene_list})
data_input.head()

Unnamed: 0,identifier
0,CHRNG
1,DMD
2,AHR
3,SCN4A
4,SLC25A1


### Entity resolution using BridgeDB

In [5]:
# Map the identifiers in data_input (human HGNC symbols) to all other data
# sources ("All"). The call returns two values: the mapping table, with one
# row per (identifier, target) pair as shown in the output below, and a second
# value that is kept as bridgdb_metadata.
bridgdb_df, bridgdb_metadata = id_mapper.bridgedb_xref(
    identifiers=data_input,
    input_species="Human",
    input_datasource="HGNC",
    output_datasource="All",
)
bridgdb_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source
0,CHRNG,HGNC,GO:0042391,Gene Ontology
1,CHRNG,HGNC,A_23_P5718,Agilent
2,CHRNG,HGNC,GO:0016021,Gene Ontology
3,CHRNG,HGNC,GO:0016020,Gene Ontology
4,CHRNG,HGNC,GO:0006936,Gene Ontology


### Gene expression from Bgee

In [6]:
# Query Bgee for gene expression, using the BridgeDb mapping table as input.
# Returns the annotated table and a second value kept as bgee_metadata.
bgee_df, bgee_metadata = bgee.get_gene_expression(bridgedb_df=bridgdb_df)
bgee_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,Bgee_gene_expression_levels
0,AHR,HGNC,ENSG00000106546,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a..."
1,CHRNG,HGNC,ENSG00000196811,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a..."
2,DMD,HGNC,ENSG00000198947,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a..."
3,HTR3A,HGNC,ENSG00000166736,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a..."
4,SCN4A,HGNC,ENSG00000007314,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a..."


In [7]:
# Inspect the Bgee annotation stored for the row with index label 0
bgee_df[BGEE_GENE_EXPRESSION_LEVELS_COL][0]

[{'anatomical_entity_id': 'UBERON_0000178',
  'anatomical_entity_name': 'blood',
  'expression_level': 80.23869999999998,
  'confidence_level_id': 'CIO_0000029',
  'confidence_level_name': 'high confidence level',
  'developmental_stage_id': 'UBERON_0000104',
  'developmental_stage_name': 'life cycle'},
 {'anatomical_entity_id': 'UBERON_0002371',
  'anatomical_entity_name': 'bone marrow',
  'expression_level': 77.48559,
  'confidence_level_id': 'CIO_0000029',
  'confidence_level_name': 'high confidence level',
  'developmental_stage_id': 'UBERON_0000104',
  'developmental_stage_name': 'life cycle'},
 {'anatomical_entity_id': 'UBERON_0000955',
  'anatomical_entity_name': 'brain',
  'expression_level': 64.3383,
  'confidence_level_id': 'CIO_0000029',
  'confidence_level_name': 'high confidence level',
  'developmental_stage_id': 'UBERON_0000104',
  'developmental_stage_name': 'life cycle'},
 {'anatomical_entity_id': 'UBERON_0000310',
  'anatomical_entity_name': 'breast',
  'expression_le

### Disease annotation from DisGeNET


Querying DisGeNET requires an API key. Add your DisGeNET API key to a `.env` file in the main folder:

**1)** Create a `.env` file and add `DISGENET_API_KEY` to it:

DISGENET_API_KEY="your-API-key-value"

**2)** Install *python-dotenv*:
```
pip install python-dotenv
```

In [8]:
# Read the .env file so that the variables it defines become available
# through the process environment
load_dotenv()
# Retrieve the key from the environment variable.
# NOTE: os.getenv returns None when DISGENET_API_KEY is not set, so a missing
# key is not detected in this cell.
disgenet_api_key = os.getenv("DISGENET_API_KEY")

In [9]:
# Query DisGeNET for gene-disease associations, using the API key loaded above
# and the BridgeDb mapping table. Returns the annotated table and a second
# value kept as disgenet_metadata.
disgenet_df, disgenet_metadata = disgenet.get_gene_disease(
    api_key=disgenet_api_key, bridgedb_df=bridgdb_df
)
disgenet_df.head()



Unnamed: 0,identifier,identifier.source,target,target.source,DISGENET_diseases
0,AHR,HGNC,196,NCBI Gene,"[{'disease_name': 'Mammary Neoplasms', 'HPO': ..."
1,CHRNG,HGNC,1146,NCBI Gene,[{'disease_name': 'Multiple pterygium syndrome...
2,DMD,HGNC,1756,NCBI Gene,"[{'disease_name': 'Muscular Dystrophy, Duchenn..."
3,HTR3A,HGNC,3359,NCBI Gene,"[{'disease_name': 'Schizophrenia', 'HPO': 'HPO..."
4,SCN4A,HGNC,6329,NCBI Gene,[{'disease_name': 'Potassium aggravated myoton...


In [10]:
# Inspect the DisGeNET disease annotations stored for the row with index label 0
disgenet_df[DISGENET_DISEASE_COL][0]

[{'disease_name': 'Mammary Neoplasms',
  'HPO': 'HPO_HP:0100013',
  'NCI': 'NCI_C2910',
  'OMIM': '',
  'MONDO': 'MONDO_0021100',
  'ORDO': '',
  'EFO': 'EFO_0003869',
  'DO': 'DO_3459, DO_1612',
  'MESH': 'MESH_D001943',
  'UMLS': 'UMLS_C1458155',
  'disease_type': 'disease',
  'disease_umlscui': 'C1458155',
  'score': 0.8500000000000001,
  'ei': 0.9176470588235294,
  'el': None},
 {'disease_name': 'Rheumatoid Arthritis',
  'HPO': 'HPO_HP:0001370',
  'NCI': 'NCI_C2884',
  'OMIM': 'OMIM_607218, OMIM_180300',
  'MONDO': 'MONDO_0008383',
  'ORDO': '',
  'EFO': 'EFO_0000685',
  'DO': 'DO_7148',
  'MESH': 'MESH_D001172',
  'UMLS': 'UMLS_C0003873',
  'disease_type': 'disease',
  'disease_umlscui': 'C0003873',
  'score': 0.8,
  'ei': 0.9523809523809523,
  'el': None},
 {'disease_name': 'Obesity',
  'HPO': 'HPO_HP:0001513',
  'NCI': 'NCI_C159658, NCI_C3283',
  'OMIM': 'OMIM_601665',
  'MONDO': 'MONDO_0011122, MONDO_0019182',
  'ORDO': '',
  'EFO': 'EFO_0001073',
  'DO': 'DO_9970',
  'MESH': '

### Disease to compound annotation from OpenTargets

In [11]:
# Prepare the input to use DISGENET output as seed for OpenTargets.
# The disease annotations in DISGENET_DISEASE_COL are reshaped into a mapping
# table in the same layout as the BridgeDb output (identifier / target
# columns); in the result below, UMLS IDs are the identifiers and EFO IDs the
# targets.
disease_mapping_df = create_harmonized_input_file(disgenet_df, DISGENET_DISEASE_COL, "EFO", "UMLS")
disease_mapping_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source
0,UMLS_C1458155,UMLS,EFO_0003869,EFO
1,UMLS_C0003873,UMLS,EFO_0000685,EFO
2,UMLS_C0028754,UMLS,EFO_0001073,EFO
3,UMLS_C0025517,UMLS,EFO_0000589,EFO
4,UMLS_C2973725,UMLS,EFO_0001361,EFO


In [12]:
# Query OpenTargets for disease-compound interactions, seeded with the
# disease mapping table built from the DisGeNET output.
# Returns the annotated table and a second value kept as metadata.
(
    opentargets_disease_compound_df,
    opentargets_disease_compound_metadata,
) = opentargets.get_disease_compound_interactions(disease_mapping_df)
opentargets_disease_compound_df.head()

  check_columns_against_constants(
  check_columns_against_constants(


Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_disease_compounds
0,UMLS_C0000786,UMLS,EFO_1001255,EFO,"[{'chembl_id': 'CHEMBL1276308', 'drugbank_id':..."
1,UMLS_C0000889,UMLS,EFO_1000660,EFO,"[{'chembl_id': 'CHEMBL1431', 'drugbank_id': 'D..."
2,UMLS_C0002940,UMLS,EFO_0004264,EFO,"[{'chembl_id': 'CHEMBL1491', 'drugbank_id': 'D..."
3,UMLS_C0002940,UMLS,EFO_0009659,EFO,"[{'chembl_id': 'CHEMBL526', 'drugbank_id': 'DB..."
4,UMLS_C0003873,UMLS,EFO_0000685,EFO,"[{'chembl_id': 'CHEMBL2103743', 'drugbank_id':..."


In [13]:
# Inspect the compounds annotated to the disease in the row with index label 0
opentargets_disease_compound_df[OPENTARGETS_DISEASE_COMPOUND_COL][0]

[{'chembl_id': 'CHEMBL1276308',
  'drugbank_id': 'DB00834',
  'compound_cid': '55245',
  'compound_name': 'MIFEPRISTONE',
  'clincal_trial_phase': 4.0,
  'is_approved': True,
  'relation': 'treats',
  'adverse_effect_count': 35.0,
  'adverse_effect': [{'name': 'abortion incomplete'},
   {'name': 'haemorrhage'},
   {'name': 'pregnancy'},
   {'name': 'endometritis'},
   {'name': 'induced abortion failed'},
   {'name': 'vaginal haemorrhage'},
   {'name': 'anaemia'},
   {'name': 'muscle spasms'},
   {'name': 'metrorrhagia'},
   {'name': 'abortion induced incomplete'},
   {'name': 'menorrhagia'},
   {'name': 'pain'},
   {'name': 'uterine haemorrhage'},
   {'name': 'post abortion infection'},
   {'name': 'uterine rupture'},
   {'name': 'ectopic pregnancy'},
   {'name': 'blood potassium decreased'},
   {'name': 'syncope'},
   {'name': 'endometritis bacterial'},
   {'name': 'pelvic inflammatory disease'},
   {'name': 'uterine dilation and curettage'},
   {'name': 'haemorrhagic anaemia'},
   {'

### Pathways from MINERVA

In [14]:
# Query MINERVA for pathways of the mapped genes, restricted to the
# "COVID19 Disease Map" project. Returns the annotated table and a second
# value kept as minerva_metadata.
minerva_df, minerva_metadata = minerva.get_gene_minerva_pathways(
    bridgdb_df, map_name="COVID19 Disease Map"
)
minerva_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,MINERVA
0,AHR,HGNC,ENSG00000106546,Ensembl,"[{'pathway_id': 953.0, 'pathway_label': 'Kynur..."
1,CHRNG,HGNC,ENSG00000196811,Ensembl,"[{'pathway_id': nan, 'pathway_label': nan, 'pa..."
2,DMD,HGNC,ENSG00000198947,Ensembl,"[{'pathway_id': nan, 'pathway_label': nan, 'pa..."
3,HTR3A,HGNC,ENSG00000166736,Ensembl,"[{'pathway_id': nan, 'pathway_label': nan, 'pa..."
4,SCN4A,HGNC,ENSG00000007314,Ensembl,"[{'pathway_id': nan, 'pathway_label': nan, 'pa..."


In [15]:
# Inspect the MINERVA pathway annotations for the row with index label 0
minerva_df[MINERVA][0]

[{'pathway_id': 953.0,
  'pathway_label': 'Kynurenine synthesis pathway',
  'pathway_gene_count': 45.0}]

### Pathways from WikiPathways

In [16]:
# Query WikiPathways for pathways of the mapped genes.
# Returns the annotated table and a second value kept as wikipathways_metadata.
wikipathways_df, wikipathways_metadata = wikipathways.get_gene_wikipathways(bridgedb_df=bridgdb_df)
wikipathways_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,WikiPathways
0,AHR,HGNC,196,NCBI Gene,"[{'pathway_id': 'WP4673', 'pathway_label': 'Ma..."
1,CHRNG,HGNC,1146,NCBI Gene,"[{'pathway_id': nan, 'pathway_label': nan, 'pa..."
2,DMD,HGNC,1756,NCBI Gene,"[{'pathway_id': 'WP5356', 'pathway_label': 'Af..."
3,HTR3A,HGNC,3359,NCBI Gene,"[{'pathway_id': 'WP706', 'pathway_label': 'Sud..."
4,SCN4A,HGNC,6329,NCBI Gene,"[{'pathway_id': nan, 'pathway_label': nan, 'pa..."


In [17]:
# Inspect the WikiPathways annotations for the row with index label 0
wikipathways_df[WIKIPATHWAYS][0]

[{'pathway_id': 'WP4673',
  'pathway_label': 'Male infertility',
  'pathway_gene_count': 145.0},
 {'pathway_id': 'WP5088',
  'pathway_label': 'Prostaglandin signaling',
  'pathway_gene_count': 31.0},
 {'pathway_id': 'WP3869',
  'pathway_label': 'Cannabinoid receptor signaling',
  'pathway_gene_count': 31.0},
 {'pathway_id': 'WP2882',
  'pathway_label': 'Nuclear receptors meta-pathway',
  'pathway_gene_count': 318.0},
 {'pathway_id': 'WP5130',
  'pathway_label': 'Th17 cell differentiation pathway',
  'pathway_gene_count': 71.0},
 {'pathway_id': 'WP2873',
  'pathway_label': 'Aryl hydrocarbon receptor pathway',
  'pathway_gene_count': 46.0},
 {'pathway_id': 'WP3893',
  'pathway_label': 'Development and heterogeneity of the ILC family',
  'pathway_gene_count': 32.0},
 {'pathway_id': 'WP465',
  'pathway_label': 'Tryptophan metabolism',
  'pathway_gene_count': 32.0},
 {'pathway_id': 'WP5115',
  'pathway_label': 'Network map of SARS-CoV-2 signaling',
  'pathway_gene_count': 276.0},
 {'pathway

### Reactome pathways from OpenTargets

In [18]:
# Query OpenTargets for Reactome pathways of the mapped genes.
# Returns the annotated table and a second value kept as metadata.
opentargets_reactome_df, opentargets_reactome_metadata = opentargets.get_gene_reactome_pathways(
    bridgedb_df=bridgdb_df
)
opentargets_reactome_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_reactome
0,AHR,HGNC,ENSG00000106546,Ensembl,"[{'pathway_label': 'Endogenous sterols', 'path..."
1,CHRNG,HGNC,ENSG00000196811,Ensembl,[{'pathway_label': 'Highly sodium permeable po...
2,DMD,HGNC,ENSG00000198947,Ensembl,[{'pathway_label': 'Striated Muscle Contractio...
3,HTR3A,HGNC,ENSG00000166736,Ensembl,[{'pathway_label': 'Neurotransmitter receptors...
4,SCN4A,HGNC,ENSG00000007314,Ensembl,[{'pathway_label': 'Phase 0 - rapid depolarisa...


In [19]:
# Inspect the Reactome pathway annotations for the row with index label 0
opentargets_reactome_df[OPENTARGETS_REACTOME_COL][0]

[{'pathway_label': 'Endogenous sterols', 'pathway_id': 'R-HSA-211976'},
 {'pathway_label': 'PPARA activates gene expression',
  'pathway_id': 'R-HSA-1989781'},
 {'pathway_label': 'Phase I - Functionalization of compounds',
  'pathway_id': 'R-HSA-211945'},
 {'pathway_label': 'Xenobiotics', 'pathway_id': 'R-HSA-211981'},
 {'pathway_label': 'Aryl hydrocarbon receptor signalling',
  'pathway_id': 'R-HSA-8937144'}]

### Gene Ontology from OpenTargets

In [20]:
# Query OpenTargets for Gene Ontology annotations of the mapped genes.
# Returns the annotated table and a second value kept as metadata.
opentargets_go_df, opentargets_go_metadata = opentargets.get_gene_go_process(bridgedb_df=bridgdb_df)
opentargets_go_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_go
0,AHR,HGNC,ENSG00000106546,Ensembl,"[{'go_id': 'GO:0005667', 'go_name': 'transcrip..."
1,CHRNG,HGNC,ENSG00000196811,Ensembl,"[{'go_id': 'GO:0015464', 'go_name': 'acetylcho..."
2,DMD,HGNC,ENSG00000198947,Ensembl,"[{'go_id': 'GO:0016010', 'go_name': 'dystrophi..."
3,HTR3A,HGNC,ENSG00000166736,Ensembl,"[{'go_id': 'GO:1904602', 'go_name': 'serotonin..."
4,SCN4A,HGNC,ENSG00000007314,Ensembl,"[{'go_id': 'GO:0035725', 'go_name': 'sodium io..."


In [21]:
# Inspect the Gene Ontology annotations for the row with index label 0
opentargets_go_df[OPENTARGETS_GO_COL][0]

[{'go_id': 'GO:0005667',
  'go_name': 'transcription regulator complex',
  'go_type': 'C'},
 {'go_id': 'GO:0004879',
  'go_name': 'nuclear receptor activity',
  'go_type': 'F'},
 {'go_id': 'GO:0005634', 'go_name': 'nucleus', 'go_type': 'C'},
 {'go_id': 'GO:0046982',
  'go_name': 'protein heterodimerization activity',
  'go_type': 'F'},
 {'go_id': 'GO:0009410',
  'go_name': 'response to xenobiotic stimulus',
  'go_type': 'P'},
 {'go_id': 'GO:0000976',
  'go_name': 'transcription cis-regulatory region binding',
  'go_type': 'F'},
 {'go_id': 'GO:0005829', 'go_name': 'cytosol', 'go_type': 'C'},
 {'go_id': 'GO:0030888',
  'go_name': 'regulation of B cell proliferation',
  'go_type': 'P'},
 {'go_id': 'GO:0009636',
  'go_name': 'response to toxic substance',
  'go_type': 'P'},
 {'go_id': 'GO:0051879', 'go_name': 'Hsp90 protein binding', 'go_type': 'F'},
 {'go_id': 'GO:0001094',
  'go_name': 'TFIID-class transcription factor complex binding',
  'go_type': 'F'},
 {'go_id': 'GO:0001568',
  'go_n

### Compounds from OpenTargets

In [22]:
# Query OpenTargets for gene-compound interactions of the mapped genes.
# Returns the annotated table and a second value kept as metadata.
opentargets_compound_df, opentargets_compound_metadata = opentargets.get_gene_compound_interactions(
    bridgedb_df=bridgdb_df
)
opentargets_compound_df.head()

  check_columns_against_constants(
  check_columns_against_constants(


Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_gene_compounds
0,AHR,HGNC,ENSG00000106546,Ensembl,"[{'chembl_id': 'CHEMBL259571', 'drugbank_id': ..."
1,CHRNG,HGNC,ENSG00000196811,Ensembl,"[{'chembl_id': 'CHEMBL1200641', 'drugbank_id':..."
2,DMD,HGNC,ENSG00000198947,Ensembl,"[{'chembl_id': 'CHEMBL2108278', 'drugbank_id':..."
3,HTR3A,HGNC,ENSG00000166736,Ensembl,"[{'chembl_id': 'CHEMBL56564', 'drugbank_id': '..."
4,SCN4A,HGNC,ENSG00000007314,Ensembl,"[{'chembl_id': 'CHEMBL1077896', 'drugbank_id':..."


In [23]:
# Inspect the compound annotations for the row with index label 0
opentargets_compound_df[OPENTARGETS_GENE_COMPOUND_COL][0]

[{'chembl_id': 'CHEMBL259571',
  'drugbank_id': 'DB06083',
  'compound_cid': '6439522',
  'compound_name': 'TAPINAROF',
  'clincal_trial_phase': 4.0,
  'is_approved': True,
  'relation': 'activates',
  'adverse_effect_count': nan,
  'adverse_effect': None}]

### Transporter inhibitors from MolMeDB

In [25]:
# Query MolMeDB for compounds that inhibit the transporters encoded by the
# mapped genes. Returns the annotated table and a second value kept as
# inhibitor_metadata.
inhibitor_df, inhibitor_metadata = molmedb.get_gene_compound_inhibitor(bridgedb_df=bridgdb_df)
inhibitor_df.head()

  inhibitor_df, inhibitor_metadata = molmedb.get_gene_compound_inhibitor(bridgedb_df=bridgdb_df)


In [None]:
# Spot-check of the MolMeDB result: print the annotation stored under index
# label 47 and show the rows whose target is "P35499".
# NOTE(review): both the label 47 and the target "P35499" are hard-coded for
# the gene list used above; this cell has no recorded execution, so confirm
# that they exist in inhibitor_df before relying on it.
print(inhibitor_df[MOLMEDB_PROTEIN_COMPOUND_COL][47])
inhibitor_df[inhibitor_df["target"] == "P35499"]

### Screening results of compounds on the proteins encoded by the genes, from PubChem

In [27]:
# Query PubChem for compounds screened against the proteins of the mapped
# genes. Returns the annotated table and a second value kept as
# pubchem_assay_metadata.
pubchem_assay_df, pubchem_assay_metadata = pubchem.get_protein_compound_screened(
    bridgedb_df=bridgdb_df
)
pubchem_assay_df.head()

  pubchem_assay_df, pubchem_assay_metadata = pubchem.get_protein_compound_screened(


In [None]:
# Inspect the PubChem assay annotations for the row with index label 0
pubchem_assay_df[PUBCHEM_COMPOUND_ASSAYS_COL][0]

[{'pubchem_assay_id': nan,
  'assay_type': nan,
  'outcome': nan,
  'compound_cid': nan,
  'compound_name': nan,
  'smiles': nan,
  'inchi': nan}]

### Protein-Protein interactions from STRING

In [28]:
# Query STRING for protein-protein interactions of the mapped genes.
# Returns the annotated table and a second value kept as ppi_metadata.
ppi_df, ppi_metadata = stringdb.get_ppi(bridgedb_df=bridgdb_df)
ppi_df.head()

  check_columns_against_constants(


Unnamed: 0,identifier,identifier.source,target,target.source,StringDB_ppi
0,CHRNG,HGNC,ENSG00000196811,Ensembl,"[{'stringdb_link_to': 'SCN4A', 'Ensembl': 'ENS..."
1,DMD,HGNC,ENSG00000198947,Ensembl,"[{'stringdb_link_to': 'SCN4A', 'Ensembl': 'ENS..."
2,AHR,HGNC,ENSG00000106546,Ensembl,"[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."
3,SCN4A,HGNC,ENSG00000007314,Ensembl,"[{'stringdb_link_to': 'DMD', 'Ensembl': 'ENSP0..."
4,SLC25A1,HGNC,ENSG00000100075,Ensembl,"[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."


In [29]:
# Inspect the STRING interactions for the row with index label 3
# (SCN4A in the table shown above)
ppi_df[STRING_PPI_COL][3]

[{'stringdb_link_to': 'DMD', 'Ensembl': 'ENSP00000354923', 'score': 0.475},
 {'stringdb_link_to': 'CHRNG', 'Ensembl': 'ENSP00000498757', 'score': 0.454}]

### Combining all the results into a single dataframe

In [30]:
# Merge the gene-centred annotation tables into one dataframe.
# opentargets_disease_compound_df is intentionally not part of this list: it
# is keyed on diseases rather than genes and is handed to the graph generator
# separately further below.
combined_df = combine_sources(
    [
        bgee_df,
        disgenet_df,
        minerva_df,
        wikipathways_df,
        opentargets_reactome_df,
        opentargets_go_df,
        opentargets_compound_df,
        inhibitor_df,
        pubchem_assay_df,
        ppi_df,
    ]
)
combined_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,Bgee_gene_expression_levels,DISGENET_diseases,MINERVA,WikiPathways,OpenTargets_reactome,OpenTargets_go,OpenTargets_gene_compounds,StringDB_ppi
0,AHR,HGNC,ENSG00000106546,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a...","[{'disease_name': 'Mammary Neoplasms', 'HPO': ...","[{'pathway_id': 953.0, 'pathway_label': 'Kynur...","[{'pathway_id': 'WP4673', 'pathway_label': 'Ma...","[{'pathway_label': 'Endogenous sterols', 'path...","[{'go_id': 'GO:0005667', 'go_name': 'transcrip...","[{'chembl_id': 'CHEMBL259571', 'drugbank_id': ...","[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."
1,CHRNG,HGNC,ENSG00000196811,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a...",[{'disease_name': 'Multiple pterygium syndrome...,"[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...",[{'pathway_label': 'Highly sodium permeable po...,"[{'go_id': 'GO:0015464', 'go_name': 'acetylcho...","[{'chembl_id': 'CHEMBL1200641', 'drugbank_id':...","[{'stringdb_link_to': 'SCN4A', 'Ensembl': 'ENS..."
2,DMD,HGNC,ENSG00000198947,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a...","[{'disease_name': 'Muscular Dystrophy, Duchenn...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': 'WP5356', 'pathway_label': 'Af...",[{'pathway_label': 'Striated Muscle Contractio...,"[{'go_id': 'GO:0016010', 'go_name': 'dystrophi...","[{'chembl_id': 'CHEMBL2108278', 'drugbank_id':...","[{'stringdb_link_to': 'SCN4A', 'Ensembl': 'ENS..."
3,HTR3A,HGNC,ENSG00000166736,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a...","[{'disease_name': 'Schizophrenia', 'HPO': 'HPO...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': 'WP706', 'pathway_label': 'Sud...",[{'pathway_label': 'Neurotransmitter receptors...,"[{'go_id': 'GO:1904602', 'go_name': 'serotonin...","[{'chembl_id': 'CHEMBL56564', 'drugbank_id': '...","[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."
4,SCN4A,HGNC,ENSG00000007314,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a...",[{'disease_name': 'Potassium aggravated myoton...,"[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...",[{'pathway_label': 'Phase 0 - rapid depolarisa...,"[{'go_id': 'GO:0035725', 'go_name': 'sodium io...","[{'chembl_id': 'CHEMBL1077896', 'drugbank_id':...","[{'stringdb_link_to': 'DMD', 'Ensembl': 'ENSP0..."


In [33]:
# (rows, columns) of the combined annotation table
combined_df.shape

(6, 12)

### Exporting the database in pickle format

In [34]:
# Serialise both result tables into the working directory, one pickle file
# per table, so the graph-building part below can be run from these files.
for file_name, frame in (
    ("combined_df.pkl", combined_df),
    ("opentargets_disease_compound_df.pkl", opentargets_disease_compound_df),
):
    with open(file_name, "wb") as out:
        pickle.dump(frame, out)

## Creating a graph from the annotated dataframe

In [35]:
# Reload both tables from the pickle files written in the export step above.
# NOTE(review): only load pickle files you created yourself; presumably this
# helper unpickles the file, which is unsafe for untrusted input - confirm.
combined_df = generator.load_dataframe_from_pickle("combined_df.pkl")
opentargets_disease_compound_df = generator.load_dataframe_from_pickle(
    "opentargets_disease_compound_df.pkl"
)

combined_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,Bgee_gene_expression_levels,DISGENET_diseases,MINERVA,WikiPathways,OpenTargets_reactome,OpenTargets_go,OpenTargets_gene_compounds,StringDB_ppi
0,AHR,HGNC,ENSG00000106546,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a...","[{'disease_name': 'Mammary Neoplasms', 'HPO': ...","[{'pathway_id': 953.0, 'pathway_label': 'Kynur...","[{'pathway_id': 'WP4673', 'pathway_label': 'Ma...","[{'pathway_label': 'Endogenous sterols', 'path...","[{'go_id': 'GO:0005667', 'go_name': 'transcrip...","[{'chembl_id': 'CHEMBL259571', 'drugbank_id': ...","[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."
1,CHRNG,HGNC,ENSG00000196811,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a...",[{'disease_name': 'Multiple pterygium syndrome...,"[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...",[{'pathway_label': 'Highly sodium permeable po...,"[{'go_id': 'GO:0015464', 'go_name': 'acetylcho...","[{'chembl_id': 'CHEMBL1200641', 'drugbank_id':...","[{'stringdb_link_to': 'SCN4A', 'Ensembl': 'ENS..."
2,DMD,HGNC,ENSG00000198947,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a...","[{'disease_name': 'Muscular Dystrophy, Duchenn...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': 'WP5356', 'pathway_label': 'Af...",[{'pathway_label': 'Striated Muscle Contractio...,"[{'go_id': 'GO:0016010', 'go_name': 'dystrophi...","[{'chembl_id': 'CHEMBL2108278', 'drugbank_id':...","[{'stringdb_link_to': 'SCN4A', 'Ensembl': 'ENS..."
3,HTR3A,HGNC,ENSG00000166736,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a...","[{'disease_name': 'Schizophrenia', 'HPO': 'HPO...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': 'WP706', 'pathway_label': 'Sud...",[{'pathway_label': 'Neurotransmitter receptors...,"[{'go_id': 'GO:1904602', 'go_name': 'serotonin...","[{'chembl_id': 'CHEMBL56564', 'drugbank_id': '...","[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."
4,SCN4A,HGNC,ENSG00000007314,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a...",[{'disease_name': 'Potassium aggravated myoton...,"[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...",[{'pathway_label': 'Phase 0 - rapid depolarisa...,"[{'go_id': 'GO:0035725', 'go_name': 'sodium io...","[{'chembl_id': 'CHEMBL1077896', 'drugbank_id':...","[{'stringdb_link_to': 'DMD', 'Ensembl': 'ENSP0..."


In [36]:
# Preview the reloaded disease-compound table
opentargets_disease_compound_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_disease_compounds
0,UMLS_C0000786,UMLS,EFO_1001255,EFO,"[{'chembl_id': 'CHEMBL1276308', 'drugbank_id':..."
1,UMLS_C0000889,UMLS,EFO_1000660,EFO,"[{'chembl_id': 'CHEMBL1431', 'drugbank_id': 'D..."
2,UMLS_C0002940,UMLS,EFO_0004264,EFO,"[{'chembl_id': 'CHEMBL1491', 'drugbank_id': 'D..."
3,UMLS_C0002940,UMLS,EFO_0009659,EFO,"[{'chembl_id': 'CHEMBL526', 'drugbank_id': 'DB..."
4,UMLS_C0003873,UMLS,EFO_0000685,EFO,"[{'chembl_id': 'CHEMBL2103743', 'drugbank_id':..."


In [38]:
# Build the graph from the combined gene annotations and the
# disease-compound table
pygraph = generator.networkx_graph(combined_df, opentargets_disease_compound_df)

### Store the graph

In [39]:
# Persist the graph object as a pickle file in the working directory
with open("networkx_graph.pkl", "wb") as out:
    pickle.dump(pygraph, out)

## Visualize the graph

In [None]:
# pos = nx.circular_layout(pygraph)

# plt.figure(3, figsize=(30, 30))
# nx.draw(pygraph, pos)
# plt.show()

In [40]:
# Exporters for the graph; imported here rather than in the import cell at
# the top of the notebook.
from pyBiodatafuse.graph import cytoscape, neo4j

# Write the graph to a GraphML file in the working directory
neo4j.save_graph_to_graphml(pygraph, output_path="graph_to-test.graphml")
# Load the graph into Cytoscape as a network named "test_graph".
# NOTE(review): presumably needs a running Cytoscape instance - confirm.
cytoscape.load_graph(pygraph, network_name="test_graph")

Applying default style...
Applying preferred layout
