## Example: PCS use case

This notebook shows all the steps needed to generate the PCS knowledge graph (KG) and to run the downstream analysis.

#### Set up the environment

In [1]:
import os

# Root of the local pyBiodatafuse checkout; adjust this to your own machine.
# A raw string keeps the Windows backslashes literal instead of having them
# parsed as (invalid) escape sequences such as "\B" and "\p".
new_path = r"E:\BioDataFuse\pyBiodatafuse"

# Later cells build their file paths from os.getcwd(), so make the repository
# root the working directory.
os.chdir(new_path)

# Set the current working directory
current_dir = os.getcwd()

In [2]:
# Import modules
import pickle

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from dotenv import load_dotenv

from pyBiodatafuse import id_mapper
from pyBiodatafuse.annotators import disgenet, minerva, opentargets, stringdb, wikipathways
from pyBiodatafuse.constants import (
    DISGENET_DISEASE_COL,
    MINERVA,
    OPENTARGETS_DISEASE_COMPOUND_COL,
    OPENTARGETS_GENE_COMPOUND_COL,
    OPENTARGETS_GO_COL,
    OPENTARGETS_REACTOME_COL,
    STRING_PPI_COL,
    WIKIPATHWAYS,
)
from pyBiodatafuse.graph import generator
from pyBiodatafuse.utils import (
    combine_sources,
    create_harmonized_input_file,
    create_or_append_to_metadata,
)

### Load the input list and convert it to a dataframe

In [3]:
# Load the PCS gene list. The path is assembled from separate components, like
# the other cells of this notebook, so it does not rely on the Windows "\" separator.
data_input = pd.read_csv(
    os.path.join(os.getcwd(), "examples", "usecases", "PCS", "PCS_gene_list.csv")
)
print("Total number of genes:", len(data_input.drop_duplicates()))
data_input.head()

Total number of genes: 2023


Unnamed: 0,identifier
0,LOC729609
1,LOC105374060
2,DMP1
3,PNLIP
4,OR4N3P


### Entity resolution using BridgeDB

In [4]:
# Reuse the cached BridgeDb mapping when both pickles exist; otherwise run the
# mapping on the input list and cache the result so later runs can skip the query.
pcs_dir = os.path.join(os.getcwd(), "examples", "usecases", "PCS")
bridgedb_df_path = os.path.join(pcs_dir, "bridgedb_df.pkl")
bridgedb_metadata_path = os.path.join(pcs_dir, "bridgedb_metadata.pkl")

if os.path.exists(bridgedb_df_path) and os.path.exists(bridgedb_metadata_path):
    with open(bridgedb_df_path, "rb") as file:
        bridgedb_df = pickle.load(file)
    with open(bridgedb_metadata_path, "rb") as file:
        bridgedb_metadata = pickle.load(file)
else:
    bridgedb_df, bridgedb_metadata = id_mapper.bridgedb_xref(
        identifiers=data_input,
        input_species="Human",
        input_datasource="HGNC",
        output_datasource="All",
    )
    os.makedirs(pcs_dir, exist_ok=True)
    bridgedb_df.to_pickle(bridgedb_df_path)
    with open(bridgedb_metadata_path, "wb") as file:
        pickle.dump(bridgedb_metadata, file)

print("Number of genes with mapping in BridgeDb:", len(bridgedb_df["identifier"].unique()))
bridgedb_df.head()

Number of genes with mapping in BridgeDb: 1667


Unnamed: 0,identifier,identifier.source,target,target.source
0,DMP1,HGNC,Q13316,Uniprot-TrEMBL
1,DMP1,HGNC,HGNC:2932,HGNC Accession Number
2,DMP1,HGNC,DMP1,HGNC
3,DMP1,HGNC,ENSG00000152592,Ensembl
4,DMP1,HGNC,1758,NCBI Gene


### Gene to Disease annotation from DisGeNet


**ADD your DISGENET API KEY in the main folder**

  **1)** Create a ``.env`` file and add DISGENET_API_KEY to it:

      DISGENET_API_KEY="your-API-key-value"

  **2)** Install *python-dotenv*:
  
      ```
      pip install python-dotenv
      ```

In [5]:
# Read the .env file
load_dotenv()
# Look up the DISGENET key in the environment; the result is None when it is not set
disgenet_api_key = os.environ.get("DISGENET_API_KEY")

In [6]:
# Reuse the cached DISGENET annotations when both pickles exist; otherwise query
# DISGENET with the API key loaded above and cache the result for later runs.
datasources_dir = os.path.join(os.getcwd(), "examples", "usecases", "PCS", "datasources")
disgenet_df_path = os.path.join(datasources_dir, "disgenet_df.pkl")
disgenet_metadata_path = os.path.join(datasources_dir, "disgenet_metadata.pkl")

if os.path.exists(disgenet_df_path) and os.path.exists(disgenet_metadata_path):
    with open(disgenet_df_path, "rb") as file:
        disgenet_df = pickle.load(file)
    with open(disgenet_metadata_path, "rb") as file:
        disgenet_metadata = pickle.load(file)
else:
    disgenet_df, disgenet_metadata = disgenet.get_gene_disease(
        api_key=disgenet_api_key, bridgedb_df=bridgedb_df
    )
    os.makedirs(datasources_dir, exist_ok=True)
    disgenet_df.to_pickle(disgenet_df_path)
    with open(disgenet_metadata_path, "wb") as file:
        pickle.dump(disgenet_metadata, file)

disgenet_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,DISGENET_diseases
0,A2ML1,HGNC,144568,NCBI Gene,"[{'disease_name': 'Noonan Syndrome', 'HPO': ''..."
1,AAMDC,HGNC,28971,NCBI Gene,"[{'disease_name': nan, 'HPO': nan, 'NCI': nan,..."
2,ABCA1,HGNC,19,NCBI Gene,"[{'disease_name': 'Tangier Disease', 'HPO': ''..."
3,ABCB1,HGNC,5243,NCBI Gene,"[{'disease_name': 'Epilepsy', 'HPO': 'HPO_HP:0..."
4,ABCC6P1,HGNC,653190,NCBI Gene,"[{'disease_name': nan, 'HPO': nan, 'NCI': nan,..."


In [7]:
# Inspect the complete DISGENET annotation list stored at row index 0
disgenet_df[DISGENET_DISEASE_COL][0]

[{'disease_name': 'Noonan Syndrome',
  'HPO': '',
  'NCI': 'NCI_C34854',
  'OMIM': 'OMIM_163950, OMIM_176876',
  'MONDO': 'MONDO_0018997',
  'ORDO': 'ORDO_648',
  'EFO': '',
  'DO': 'DO_0060254, DO_11983, DO_11725, DO_2962, DO_14681, DO_3490, DO_14796, DO_6683',
  'MESH': 'MESH_D009634',
  'UMLS': 'UMLS_C0028326',
  'disease_type': 'disease',
  'disease_umlscui': 'C0028326',
  'score': 0.7,
  'ei': 0.8333333333333334,
  'el': 'Disputed'},
 {'disease_name': 'Otitis Media',
  'HPO': 'HPO_HP:0000388',
  'NCI': 'NCI_C34885',
  'OMIM': '',
  'MONDO': 'MONDO_0005441',
  'ORDO': '',
  'EFO': 'EFO_0004992',
  'DO': 'DO_10754',
  'MESH': 'MESH_D010033',
  'UMLS': 'UMLS_C0029882',
  'disease_type': 'disease',
  'disease_umlscui': 'C0029882',
  'score': 0.65,
  'ei': 1.0,
  'el': None},
 {'disease_name': 'Noonan Syndrome 1',
  'HPO': '',
  'NCI': 'NCI_C75459',
  'OMIM': 'OMIM_176876, OMIM_163950',
  'MONDO': 'MONDO_0008104, MONDO_0018997',
  'ORDO': 'ORDO_648',
  'EFO': '',
  'DO': 'DO_0060578, D

### Add literature-based data
Genes found to be associated with Post-COVID-19

In [8]:
# Spreadsheet with the genes found to be associated with Post-COVID-19
pcs_genes_path = os.path.join(
    os.getcwd(), "examples", "usecases", "PCS", "pcs_associated_genes.xlsx"
)
pcs_associated_genes = pd.read_excel(pcs_genes_path)
pcs_associated_genes.head()

Unnamed: 0,Gene
0,CTLA4
1,PTPN22
2,KIT
3,KRAS
4,NF1


#### Define the literature based info

In [9]:
from pyBiodatafuse.constants import LITERATURE_DISEASE_COL, LITERATURE_DISEASE_OUTPUT_DICT

# Start from a copy of the package template and fill in the disease name, id and
# source that describe the literature-based Post-COVID-19 association.
literature_disease_attrs = LITERATURE_DISEASE_OUTPUT_DICT.copy()
literature_disease_attrs.update(
    {
        "disease_name": "Post-COVID-19",
        "id": "C00000",
        "source": "PMID: 37675861",
    }
)


def get_literature_based_info(gene):
    """Return the literature-based disease annotation for one gene.

    :param gene: gene identifier, looked up in the ``Gene`` column of
        ``pcs_associated_genes``.
    :returns: a one-element list. It holds a fresh copy of
        ``literature_disease_attrs`` when the gene is listed; otherwise it holds
        a dict whose ``disease_name``, ``id`` and ``source`` are all NaN.
    """
    if gene in pcs_associated_genes["Gene"].values:
        # Hand out a copy: returning the shared dict itself would make every
        # matching row reference one object, so editing one row's annotation
        # would silently change all the others.
        return [literature_disease_attrs.copy()]
    else:
        return [{"disease_name": np.nan, "id": np.nan, "source": np.nan}]


# Compute the literature-based annotation for each gene identifier and store it
# as a new column. Note that this adds the column to disgenet_df in place.
disgenet_df[LITERATURE_DISEASE_COL] = disgenet_df["identifier"].apply(get_literature_based_info)

disgenet_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,DISGENET_diseases,literature_based_info
0,A2ML1,HGNC,144568,NCBI Gene,"[{'disease_name': 'Noonan Syndrome', 'HPO': ''...","[{'disease_name': nan, 'id': nan, 'source': nan}]"
1,AAMDC,HGNC,28971,NCBI Gene,"[{'disease_name': nan, 'HPO': nan, 'NCI': nan,...","[{'disease_name': nan, 'id': nan, 'source': nan}]"
2,ABCA1,HGNC,19,NCBI Gene,"[{'disease_name': 'Tangier Disease', 'HPO': ''...","[{'disease_name': nan, 'id': nan, 'source': nan}]"
3,ABCB1,HGNC,5243,NCBI Gene,"[{'disease_name': 'Epilepsy', 'HPO': 'HPO_HP:0...","[{'disease_name': nan, 'id': nan, 'source': nan}]"
4,ABCC6P1,HGNC,653190,NCBI Gene,"[{'disease_name': nan, 'HPO': nan, 'NCI': nan,...","[{'disease_name': nan, 'id': nan, 'source': nan}]"


In [10]:
# Literature-based annotation of the rows whose identifier is DMP1
is_dmp1 = disgenet_df["identifier"] == "DMP1"
disgenet_df.loc[is_dmp1, LITERATURE_DISEASE_COL]

362    [{'disease_name': 'Post-COVID-19', 'id': 'C000...
Name: literature_based_info, dtype: object

In [11]:
# Number of literature genes that also occur as an identifier in disgenet_df
pcs_genes_found = pcs_associated_genes["Gene"].isin(disgenet_df["identifier"])
print(pcs_genes_found.sum())

29


### Disease to Compound annotation from OpenTargets

##### Prepare the input to use DISGENET output as seed for OpenTargets


In [12]:
# Derive the disease identifier table used as input for OpenTargets from the
# DISGENET disease column (in the output below, UMLS ids map to EFO targets).
disease_mapping_df = create_harmonized_input_file(disgenet_df, DISGENET_DISEASE_COL, "EFO", "UMLS")
disease_mapping_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source
0,UMLS_C0029882,UMLS,EFO_0004992,EFO
1,UMLS_C0004153,UMLS,EFO_0003914,EFO
2,UMLS_C0004153,UMLS,EFO_1000819,EFO
3,UMLS_C0342898,UMLS,EFO_0700136,EFO
4,UMLS_C0010054,UMLS,EFO_0001645,EFO


##### Disease to Compound annotation

**TODO:** re-run this query; the results shown below were loaded from a previously cached run.

In [13]:
# Reuse the cached OpenTargets disease-compound annotations when both pickles
# exist; otherwise run the query on the disease mapping table and cache the result.
datasources_dir = os.path.join(os.getcwd(), "examples", "usecases", "PCS", "datasources")
opentargets_disease_compound_df_path = os.path.join(
    datasources_dir, "opentargets_disease_compound_df.pkl"
)
opentargets_disease_compound_metadata_path = os.path.join(
    datasources_dir, "opentargets_disease_compound_metadata.pkl"
)

if os.path.exists(opentargets_disease_compound_df_path) and os.path.exists(
    opentargets_disease_compound_metadata_path
):
    with open(opentargets_disease_compound_df_path, "rb") as file:
        opentargets_disease_compound_df = pickle.load(file)
    with open(opentargets_disease_compound_metadata_path, "rb") as file:
        opentargets_disease_compound_metadata = pickle.load(file)
else:
    (
        opentargets_disease_compound_df,
        opentargets_disease_compound_metadata,
    ) = opentargets.get_disease_compound_interactions(disease_mapping_df)
    os.makedirs(datasources_dir, exist_ok=True)
    opentargets_disease_compound_df.to_pickle(opentargets_disease_compound_df_path)
    with open(opentargets_disease_compound_metadata_path, "wb") as file:
        pickle.dump(opentargets_disease_compound_metadata, file)

opentargets_disease_compound_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_disease_compounds
0,UMLS_C0000786,UMLS,EFO_1001255,EFO,"[{'chembl_id': 'CHEMBL1276308', 'drugbank_id':..."
1,UMLS_C0000889,UMLS,EFO_1000660,EFO,"[{'chembl_id': 'CHEMBL1431', 'drugbank_id': 'D..."
2,UMLS_C0001125,UMLS,EFO_1000036,EFO,"[{'chembl_id': 'CHEMBL306823', 'drugbank_id': ..."
3,UMLS_C0001175,UMLS,EFO_0000765,EFO,"[{'chembl_id': 'CHEMBL704', 'drugbank_id': 'DB..."
4,UMLS_C0001306,UMLS,EFO_1001345,EFO,"[{'chembl_id': 'CHEMBL628', 'drugbank_id': 'DB..."


In [14]:
# Inspect the complete compound annotation list stored at row index 0
opentargets_disease_compound_df[OPENTARGETS_DISEASE_COMPOUND_COL][0]

[{'chembl_id': 'CHEMBL1276308',
  'drugbank_id': 'DB00834',
  'compound_cid': nan,
  'compound_name': 'MIFEPRISTONE',
  'clincal_trial_phase': 4.0,
  'is_approved': True,
  'relation': 'treats',
  'adverse_effect_count': 35.0,
  'adverse_effect': [{'name': 'abortion incomplete'},
   {'name': 'haemorrhage'},
   {'name': 'pregnancy'},
   {'name': 'endometritis'},
   {'name': 'induced abortion failed'},
   {'name': 'vaginal haemorrhage'},
   {'name': 'anaemia'},
   {'name': 'muscle spasms'},
   {'name': 'metrorrhagia'},
   {'name': 'abortion induced incomplete'},
   {'name': 'menorrhagia'},
   {'name': 'pain'},
   {'name': 'uterine haemorrhage'},
   {'name': 'post abortion infection'},
   {'name': 'uterine rupture'},
   {'name': 'ectopic pregnancy'},
   {'name': 'blood potassium decreased'},
   {'name': 'syncope'},
   {'name': 'endometritis bacterial'},
   {'name': 'pelvic inflammatory disease'},
   {'name': 'uterine dilation and curettage'},
   {'name': 'haemorrhagic anaemia'},
   {'name

### Gene to Compound annotation from OpenTargets

In [15]:
# Reuse the cached OpenTargets gene-compound annotations when both pickles exist;
# otherwise run the query on the BridgeDb mapping and cache the result.
datasources_dir = os.path.join(os.getcwd(), "examples", "usecases", "PCS", "datasources")
opentargets_compound_df_path = os.path.join(datasources_dir, "opentargets_compound_df.pkl")
opentargets_compound_metadata_path = os.path.join(
    datasources_dir, "opentargets_compound_metadata.pkl"
)

if os.path.exists(opentargets_compound_df_path) and os.path.exists(
    opentargets_compound_metadata_path
):
    with open(opentargets_compound_df_path, "rb") as file:
        opentargets_compound_df = pickle.load(file)
    with open(opentargets_compound_metadata_path, "rb") as file:
        opentargets_compound_metadata = pickle.load(file)
else:
    (
        opentargets_compound_df,
        opentargets_compound_metadata,
    ) = opentargets.get_gene_compound_interactions(bridgedb_df=bridgedb_df)
    os.makedirs(datasources_dir, exist_ok=True)
    opentargets_compound_df.to_pickle(opentargets_compound_df_path)
    with open(opentargets_compound_metadata_path, "wb") as file:
        pickle.dump(opentargets_compound_metadata, file)

opentargets_compound_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_gene_compounds
0,A2ML1,HGNC,ENSG00000166535,Ensembl,"[{'chembl_id': nan, 'drugbank_id': nan, 'compo..."
1,AAMDC,HGNC,ENSG00000087884,Ensembl,"[{'chembl_id': nan, 'drugbank_id': nan, 'compo..."
2,ABCA1,HGNC,ENSG00000165029,Ensembl,"[{'chembl_id': 'CHEMBL608', 'drugbank_id': 'DB..."
3,ABCB1,HGNC,ENSG00000085563,Ensembl,"[{'chembl_id': 'CHEMBL1086218', 'drugbank_id':..."
4,ABCC13,HGNC,ENSG00000243064,Ensembl,"[{'chembl_id': nan, 'drugbank_id': nan, 'compo..."


In [16]:
# Inspect the complete compound annotation list stored at row index 3 (ABCB1 in the preview above)
opentargets_compound_df[OPENTARGETS_GENE_COMPOUND_COL][3]

[{'chembl_id': 'CHEMBL1086218',
  'drugbank_id': 'DB11869',
  'compound_cid': '5281884',
  'compound_name': 'VALSPODAR',
  'clincal_trial_phase': 3.0,
  'is_approved': False,
  'relation': 'activates',
  'adverse_effect_count': nan,
  'adverse_effect': None},
 {'chembl_id': 'CHEMBL444172',
  'drugbank_id': 'DB06191',
  'compound_cid': '3036703',
  'compound_name': 'ZOSUQUIDAR',
  'clincal_trial_phase': 3.0,
  'is_approved': False,
  'relation': 'activates',
  'adverse_effect_count': nan,
  'adverse_effect': None},
 {'chembl_id': 'CHEMBL348475',
  'drugbank_id': 'DB06240',
  'compound_cid': '148201',
  'compound_name': 'TARIQUIDAR',
  'clincal_trial_phase': 3.0,
  'is_approved': False,
  'relation': 'activates',
  'adverse_effect_count': 2.0,
  'adverse_effect': [{'name': 'breast cancer female'},
   {'name': 'malignant neoplasm progression'}]},
 {'chembl_id': 'CHEMBL4594298',
  'drugbank_id': None,
  'compound_cid': '11399764',
  'compound_name': 'ENCEQUIDAR',
  'clincal_trial_phase': 3

### Gene to Pathway annotation from MINERVA

In [17]:
# Reuse the cached MINERVA pathway annotations when both pickles exist; otherwise
# query the "COVID19 Disease Map" with the BridgeDb mapping and cache the result.
datasources_dir = os.path.join(os.getcwd(), "examples", "usecases", "PCS", "datasources")
minerva_df_path = os.path.join(datasources_dir, "minerva_df.pkl")
minerva_metadata_path = os.path.join(datasources_dir, "minerva_metadata.pkl")

if os.path.exists(minerva_df_path) and os.path.exists(minerva_metadata_path):
    with open(minerva_df_path, "rb") as file:
        minerva_df = pickle.load(file)
    with open(minerva_metadata_path, "rb") as file:
        minerva_metadata = pickle.load(file)
else:
    minerva_df, minerva_metadata = minerva.get_gene_minerva_pathways(
        bridgedb_df, map_name="COVID19 Disease Map"
    )
    os.makedirs(datasources_dir, exist_ok=True)
    minerva_df.to_pickle(minerva_df_path)
    with open(minerva_metadata_path, "wb") as file:
        pickle.dump(minerva_metadata, file)

minerva_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,MINERVA
0,A2ML1,HGNC,ENSG00000166535,Ensembl,"[{'pathway_id': nan, 'pathway_label': nan, 'pa..."
1,AAMDC,HGNC,ENSG00000087884,Ensembl,"[{'pathway_id': nan, 'pathway_label': nan, 'pa..."
2,ABCA1,HGNC,ENSG00000165029,Ensembl,"[{'pathway_id': nan, 'pathway_label': nan, 'pa..."
3,ABCB1,HGNC,ENSG00000085563,Ensembl,"[{'pathway_id': nan, 'pathway_label': nan, 'pa..."
4,ABCC13,HGNC,ENSG00000243064,Ensembl,"[{'pathway_id': nan, 'pathway_label': nan, 'pa..."


In [18]:
# Inspect the MINERVA annotation stored at row index 33, a row with a pathway hit
minerva_df[MINERVA][33]

[{'pathway_id': 953.0,
  'pathway_label': 'Kynurenine synthesis pathway',
  'pathway_gene_count': 45.0}]

### Gene to Pathway annotation from WikiPathways

In [19]:
# Reuse the cached WikiPathways annotations when both pickles exist; otherwise
# run the query on the BridgeDb mapping and cache the result.
datasources_dir = os.path.join(os.getcwd(), "examples", "usecases", "PCS", "datasources")
wikipathways_df_path = os.path.join(datasources_dir, "wikipathways_df.pkl")
wikipathways_metadata_path = os.path.join(datasources_dir, "wikipathways_metadata.pkl")

if os.path.exists(wikipathways_df_path) and os.path.exists(wikipathways_metadata_path):
    with open(wikipathways_df_path, "rb") as file:
        wikipathways_df = pickle.load(file)
    with open(wikipathways_metadata_path, "rb") as file:
        wikipathways_metadata = pickle.load(file)
else:
    wikipathways_df, wikipathways_metadata = wikipathways.get_gene_wikipathways(
        bridgedb_df=bridgedb_df
    )
    os.makedirs(datasources_dir, exist_ok=True)
    wikipathways_df.to_pickle(wikipathways_df_path)
    with open(wikipathways_metadata_path, "wb") as file:
        pickle.dump(wikipathways_metadata, file)

wikipathways_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,WikiPathways
0,A2ML1,HGNC,144568,NCBI Gene,"[{'pathway_id': nan, 'pathway_label': nan, 'pa..."
1,AAMDC,HGNC,28971,NCBI Gene,"[{'pathway_id': nan, 'pathway_label': nan, 'pa..."
2,ABCA1,HGNC,19,NCBI Gene,"[{'pathway_id': 'WP4718', 'pathway_label': 'Ch..."
3,ABCB1,HGNC,5243,NCBI Gene,"[{'pathway_id': 'WP3672', 'pathway_label': 'ln..."
4,ABCC6P1,HGNC,653190,NCBI Gene,"[{'pathway_id': nan, 'pathway_label': nan, 'pa..."


In [20]:
# Inspect the complete WikiPathways annotation list stored at row index 3 (ABCB1 in the preview above)
wikipathways_df[WIKIPATHWAYS][3]

[{'pathway_id': 'WP3672',
  'pathway_label': 'lncRNA-mediated mechanisms of therapeutic resistance',
  'pathway_gene_count': 7.0},
 {'pathway_id': 'WP2876',
  'pathway_label': 'Pregnane X receptor pathway',
  'pathway_gene_count': 33.0},
 {'pathway_id': 'WP4917',
  'pathway_label': 'Proximal tubule transport',
  'pathway_gene_count': 57.0},
 {'pathway_id': 'WP4673',
  'pathway_label': 'Male infertility',
  'pathway_gene_count': 145.0},
 {'pathway_id': 'WP2328',
  'pathway_label': 'Allograft rejection',
  'pathway_gene_count': 102.0},
 {'pathway_id': 'WP3640',
  'pathway_label': 'Imatinib and chronic myeloid leukemia',
  'pathway_gene_count': 20.0},
 {'pathway_id': 'WP2882',
  'pathway_label': 'Nuclear receptors meta-pathway',
  'pathway_gene_count': 318.0},
 {'pathway_id': 'WP1604',
  'pathway_label': 'Codeine and morphine metabolism',
  'pathway_gene_count': 17.0},
 {'pathway_id': 'WP2289',
  'pathway_label': 'Drug induction of bile acid pathway',
  'pathway_gene_count': 17.0},
 {'pat

### Gene to Reactome Pathway from OpenTargets

In [21]:
# Reuse the cached OpenTargets Reactome annotations when both pickles exist;
# otherwise run the query on the BridgeDb mapping and cache the result.
datasources_dir = os.path.join(os.getcwd(), "examples", "usecases", "PCS", "datasources")
opentargets_reactome_df_path = os.path.join(datasources_dir, "opentargets_reactome_df.pkl")
opentargets_reactome_metadata_path = os.path.join(
    datasources_dir, "opentargets_reactome_metadata.pkl"
)

if os.path.exists(opentargets_reactome_df_path) and os.path.exists(
    opentargets_reactome_metadata_path
):
    with open(opentargets_reactome_df_path, "rb") as file:
        opentargets_reactome_df = pickle.load(file)
    with open(opentargets_reactome_metadata_path, "rb") as file:
        opentargets_reactome_metadata = pickle.load(file)
else:
    (
        opentargets_reactome_df,
        opentargets_reactome_metadata,
    ) = opentargets.get_gene_reactome_pathways(bridgedb_df=bridgedb_df)
    os.makedirs(datasources_dir, exist_ok=True)
    opentargets_reactome_df.to_pickle(opentargets_reactome_df_path)
    with open(opentargets_reactome_metadata_path, "wb") as file:
        pickle.dump(opentargets_reactome_metadata, file)

opentargets_reactome_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_reactome
0,A2ML1,HGNC,ENSG00000166535,Ensembl,"[{'pathway_label': nan, 'pathway_id': nan}]"
1,AAMDC,HGNC,ENSG00000087884,Ensembl,"[{'pathway_label': nan, 'pathway_id': nan}]"
2,ABCA1,HGNC,ENSG00000165029,Ensembl,[{'pathway_label': 'PPARA activates gene expre...
3,ABCB1,HGNC,ENSG00000085563,Ensembl,[{'pathway_label': 'Abacavir transmembrane tra...
4,ABCC13,HGNC,ENSG00000243064,Ensembl,"[{'pathway_label': nan, 'pathway_id': nan}]"


In [22]:
# Inspect the complete Reactome annotation list stored at row index 2 (ABCA1 in the preview above)
opentargets_reactome_df[OPENTARGETS_REACTOME_COL][2]

[{'pathway_label': 'PPARA activates gene expression',
  'pathway_id': 'R-HSA-1989781'},
 {'pathway_label': 'Defective ABCA1 causes TGD',
  'pathway_id': 'R-HSA-5682113'},
 {'pathway_label': 'NR1H3 & NR1H2 regulate gene expression linked to cholesterol transport and efflux',
  'pathway_id': 'R-HSA-9029569'},
 {'pathway_label': 'HDL assembly', 'pathway_id': 'R-HSA-8963896'}]

### Gene Ontology annotation from OpenTargets

In [23]:
# Reuse the cached OpenTargets Gene Ontology annotations when both pickles exist;
# otherwise run the query on the BridgeDb mapping and cache the result.
datasources_dir = os.path.join(os.getcwd(), "examples", "usecases", "PCS", "datasources")
opentargets_go_df_path = os.path.join(datasources_dir, "opentargets_go_df.pkl")
opentargets_go_metadata_path = os.path.join(datasources_dir, "opentargets_go_metadata.pkl")

if os.path.exists(opentargets_go_df_path) and os.path.exists(opentargets_go_metadata_path):
    with open(opentargets_go_df_path, "rb") as file:
        opentargets_go_df = pickle.load(file)
    with open(opentargets_go_metadata_path, "rb") as file:
        opentargets_go_metadata = pickle.load(file)
else:
    opentargets_go_df, opentargets_go_metadata = opentargets.get_gene_go_process(
        bridgedb_df=bridgedb_df
    )
    os.makedirs(datasources_dir, exist_ok=True)
    opentargets_go_df.to_pickle(opentargets_go_df_path)
    with open(opentargets_go_metadata_path, "wb") as file:
        pickle.dump(opentargets_go_metadata, file)

opentargets_go_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,OpenTargets_go
0,A2ML1,HGNC,ENSG00000166535,Ensembl,"[{'go_id': 'GO:0052548', 'go_name': 'regulatio..."
1,AAMDC,HGNC,ENSG00000087884,Ensembl,"[{'go_id': 'GO:0005737', 'go_name': 'cytoplasm..."
2,ABCA1,HGNC,ENSG00000165029,Ensembl,"[{'go_id': 'GO:0005524', 'go_name': 'ATP bindi..."
3,ABCB1,HGNC,ENSG00000085563,Ensembl,"[{'go_id': 'GO:0008559', 'go_name': 'ABC-type ..."
4,ABCC13,HGNC,ENSG00000243064,Ensembl,"[{'go_id': nan, 'go_name': nan, 'go_type': nan}]"


In [24]:
# Inspect the complete Gene Ontology annotation list stored at row index 0
opentargets_go_df[OPENTARGETS_GO_COL][0]

[{'go_id': 'GO:0052548',
  'go_name': 'regulation of endopeptidase activity',
  'go_type': 'P'},
 {'go_id': 'GO:0070062', 'go_name': 'extracellular exosome', 'go_type': 'C'},
 {'go_id': 'GO:0030414',
  'go_name': 'peptidase inhibitor activity',
  'go_type': 'F'},
 {'go_id': 'GO:0005615', 'go_name': 'extracellular space', 'go_type': 'C'},
 {'go_id': 'GO:0004867',
  'go_name': 'serine-type endopeptidase inhibitor activity',
  'go_type': 'F'}]

### Protein-Protein interaction from STRING

In [25]:
# Reuse the cached STRING protein-protein interactions when both pickles exist;
# otherwise run the query on the BridgeDb mapping and cache the result.
datasources_dir = os.path.join(os.getcwd(), "examples", "usecases", "PCS", "datasources")
string_ppi_df_path = os.path.join(datasources_dir, "string_ppi_df.pkl")
string_ppi_metadata_path = os.path.join(datasources_dir, "string_ppi_metadata.pkl")

if os.path.exists(string_ppi_df_path) and os.path.exists(string_ppi_metadata_path):
    with open(string_ppi_df_path, "rb") as file:
        string_ppi_df = pickle.load(file)
    with open(string_ppi_metadata_path, "rb") as file:
        string_ppi_metadata = pickle.load(file)
else:
    string_ppi_df, string_ppi_metadata = stringdb.get_ppi(bridgedb_df=bridgedb_df)
    os.makedirs(datasources_dir, exist_ok=True)
    string_ppi_df.to_pickle(string_ppi_df_path)
    with open(string_ppi_metadata_path, "wb") as file:
        pickle.dump(string_ppi_metadata, file)

string_ppi_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,StringDB_ppi
0,DMP1,HGNC,ENSG00000152592,Ensembl,"[{'stringdb_link_to': 'TNFRSF11B', 'Ensembl': ..."
1,PNLIP,HGNC,ENSG00000175535,Ensembl,"[{'stringdb_link_to': 'LIPE', 'Ensembl': 'ENSP..."
2,OR4N3P,HGNC,ENSG00000259435,Ensembl,"[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."
3,SLC6A14,HGNC,ENSG00000268104,Ensembl,"[{'stringdb_link_to': 'SLC7A11', 'Ensembl': 'E..."
4,DEFB105A,HGNC,ENSG00000186562,Ensembl,"[{'stringdb_link_to': 'DEFB118', 'Ensembl': 'E..."


In [26]:
# Inspect the complete interaction list stored at row index 0 (DMP1 in the preview above)
string_ppi_df[STRING_PPI_COL][0]

[{'stringdb_link_to': 'TNFRSF11B',
  'Ensembl': 'ENSP00000297350',
  'score': 0.409},
 {'stringdb_link_to': 'HSPA5', 'Ensembl': 'ENSP00000324173', 'score': 0.504},
 {'stringdb_link_to': 'GAPDH', 'Ensembl': 'ENSP00000380070', 'score': 0.449},
 {'stringdb_link_to': 'CD44', 'Ensembl': 'ENSP00000398632', 'score': 0.601},
 {'stringdb_link_to': 'ENPP1', 'Ensembl': 'ENSP00000498074', 'score': 0.625},
 {'stringdb_link_to': 'RUNX2', 'Ensembl': 'ENSP00000360493', 'score': 0.713}]

### Combining all the results into a single dataframe

In [27]:
# Annotation tables passed to combine_sources together with the BridgeDb mapping
annotation_dfs = [
    disgenet_df,
    opentargets_compound_df,
    minerva_df,
    wikipathways_df,
    opentargets_reactome_df,
    opentargets_go_df,
    string_ppi_df,
]
# Metadata records added to the BridgeDb metadata. This list also carries the
# disease-compound metadata, although that table is not in annotation_dfs.
annotation_metadata = [
    disgenet_metadata,
    opentargets_disease_compound_metadata,
    opentargets_compound_metadata,
    minerva_metadata,
    wikipathways_metadata,
    opentargets_reactome_metadata,
    opentargets_go_metadata,
    string_ppi_metadata,
]

combined_df = combine_sources(bridgedb_df, annotation_dfs)
combined_metadata = create_or_append_to_metadata(bridgedb_metadata, annotation_metadata)

In [28]:
# Preview the first four rows of the combined table
combined_df.head(4)

Unnamed: 0,identifier,identifier.source,target,target.source,DISGENET_diseases,literature_based_info,OpenTargets_gene_compounds,MINERVA,WikiPathways,OpenTargets_reactome,OpenTargets_go,StringDB_ppi
0,DMP1,HGNC,ENSG00000152592,Ensembl,"[{'disease_name': 'Hypophosphatemic Rickets', ...","[{'disease_name': 'Post-COVID-19', 'id': 'C000...","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': 'WP3971', 'pathway_label': 'OS...","[{'pathway_label': 'ECM proteoglycans', 'pathw...","[{'go_id': 'GO:0005788', 'go_name': 'endoplasm...","[{'stringdb_link_to': 'TNFRSF11B', 'Ensembl': ..."
1,PNLIP,HGNC,ENSG00000175535,Ensembl,[{'disease_name': 'Pancreatic Lipase Deficienc...,"[{'disease_name': nan, 'id': nan, 'source': nan}]","[{'chembl_id': 'CHEMBL175247', 'drugbank_id': ...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...",[{'pathway_label': 'Retinoid metabolism and tr...,"[{'go_id': 'GO:0004806', 'go_name': 'triglycer...","[{'stringdb_link_to': 'LIPE', 'Ensembl': 'ENSP..."
2,OR4N3P,HGNC,ENSG00000259435,Ensembl,"[{'disease_name': nan, 'HPO': nan, 'NCI': nan,...","[{'disease_name': nan, 'id': nan, 'source': nan}]","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_label': nan, 'pathway_id': nan}]","[{'go_id': nan, 'go_name': nan, 'go_type': nan}]","[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."
3,SLC6A14,HGNC,ENSG00000268104,Ensembl,"[{'disease_name': 'Cystic Fibrosis', 'HPO': ''...","[{'disease_name': nan, 'id': nan, 'source': nan}]","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': 'WP2882', 'pathway_label': 'Nu...",[{'pathway_label': 'Amino acid transport acros...,"[{'go_id': 'GO:0015657', 'go_name': 'branched-...","[{'stringdb_link_to': 'SLC7A11', 'Ensembl': 'E..."


In [29]:
# Check that the literature-based annotation is present in the combined table (row index 0)
combined_df[LITERATURE_DISEASE_COL][0]

[{'disease_name': 'Post-COVID-19', 'id': 'C00000', 'source': 'PMID: 37675861'}]

In [30]:
# Display the combined metadata records
combined_metadata

[{'datasource': 'DISGENET',
  'metadata': {'lastUpdate': '10 Jul 2024', 'version': 'DISGENET v24.2'},
  'query': {'size': 1590,
   'input_type': 'NCBI Gene',
   'time': '0:31:18.977092',
   'date': '2024-09-11 14:58:51',
   'url': 'https://api.disgenet.com/api/v1/gda/summary',
   'number_of_added_nodes': 2913,
   'number_of_added_edges': 7607}},
 {'datasource': 'Open Targets GraphQL & REST API Beta',
  'metadata': {'source_version': {'apiVersion': {'x': '24',
     'y': '1',
     'z': '4'}},
   'data_version': {'dataVersion': {'year': '24', 'month': '06'}}},
  'query': {'size': 1112,
   'input_type': 'EFO',
   'time': '0:00:00.897231',
   'date': '2024-08-27 10:35:49',
   'url': 'https://api.platform.opentargets.org/api/v4/graphql',
   'number_of_added_nodes': 1299,
   'number_of_added_edges': 5710}},
 {'datasource': 'Open Targets GraphQL & REST API Beta',
  'metadata': {'source_version': {'apiVersion': {'x': '24',
     'y': '1',
     'z': '4'}},
   'data_version': {'dataVersion': {'yea

In [31]:
# (rows, columns) of the combined table
combined_df.shape

(2421, 12)

In [32]:
# Preview the last rows of the combined table
combined_df.tail()

Unnamed: 0,identifier,identifier.source,target,target.source,DISGENET_diseases,literature_based_info,OpenTargets_gene_compounds,MINERVA,WikiPathways,OpenTargets_reactome,OpenTargets_go,StringDB_ppi
2416,PRDX3,HGNC,ENSG00000165672,Ensembl,"[{'disease_name': 'SPINOCEREBELLAR ATAXIA, AUT...","[{'disease_name': nan, 'id': nan, 'source': nan}]","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'pathway_id': 933.0, 'pathway_label': 'Elect...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...",[{'pathway_label': 'Detoxification of Reactive...,"[{'go_id': 'GO:0005515', 'go_name': 'protein b...","[{'stringdb_link_to': 'SIRT1', 'Ensembl': 'ENS..."
2417,FGB,HGNC,ENSG00000171564,Ensembl,"[{'disease_name': 'Cardiovascular Diseases', '...","[{'disease_name': nan, 'id': nan, 'source': nan}]","[{'chembl_id': 'CHEMBL2109072', 'drugbank_id':...","[{'pathway_id': 951.0, 'pathway_label': 'Coagu...","[{'pathway_id': 'WP5115', 'pathway_label': 'Ne...",[{'pathway_label': 'p130Cas linkage to MAPK si...,"[{'go_id': 'GO:0005576', 'go_name': 'extracell...","[{'stringdb_link_to': 'LBP', 'Ensembl': 'ENSP0..."
2418,TEX14,HGNC,ENSG00000121101,Ensembl,[{'disease_name': 'Non-obstructive azoospermia...,"[{'disease_name': nan, 'id': nan, 'source': nan}]","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_label': nan, 'pathway_id': nan}]","[{'go_id': 'GO:0032466', 'go_name': 'negative ...","[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."
2419,FBN1,HGNC,ENSG00000166147,Ensembl,"[{'disease_name': 'Marfan Syndrome', 'HPO': ''...","[{'disease_name': nan, 'id': nan, 'source': nan}]","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'pathway_id': 945.0, 'pathway_label': 'Nsp9 ...","[{'pathway_id': 'WP3668', 'pathway_label': 'Hy...",[{'pathway_label': 'TGF-beta receptor signalin...,"[{'go_id': 'GO:0005201', 'go_name': 'extracell...","[{'stringdb_link_to': 'SERPINE1', 'Ensembl': '..."
2420,EPHA3,HGNC,ENSG00000044524,Ensembl,[{'disease_name': 'Adenocarcinoma of lung (dis...,"[{'disease_name': nan, 'id': nan, 'source': nan}]","[{'chembl_id': 'CHEMBL24828', 'drugbank_id': '...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': 'WP2882', 'pathway_label': 'Nu...","[{'pathway_label': 'EPH-Ephrin signaling', 'pa...","[{'go_id': 'GO:0010717', 'go_name': 'regulatio...","[{'stringdb_link_to': 'EFNA2', 'Ensembl': 'ENS..."


##### Exporting the combined data in pickle format

In [33]:
# The export is left disabled. Un-comment the lines below to write combined_df
# and combined_metadata as pickles into the PCS use-case folder.
# with open(os.path.join(os.getcwd(), "examples", "usecases", "PCS", "combined_df.pkl"), "wb") as out:
#     pickle.dump(combined_df, out)
# with open(
#     os.path.join(os.getcwd(), "examples", "usecases", "PCS", "combined_metadata.pkl"), "wb"
# ) as file:
#     pickle.dump(combined_metadata, file)

### Creating a graph from the annotated data

In [34]:
# Build the graph from the combined gene annotations and the disease-compound
# table, then pickle it into the PCS use-case folder.
pcs_graph_path = os.path.join(
    os.getcwd(), "examples", "usecases", "PCS", "pcs_networkx_graph.pkl"
)
pygraph = generator.networkx_graph(combined_df, opentargets_disease_compound_df)
with open(pcs_graph_path, "wb") as graph_file:
    pickle.dump(pygraph, graph_file)

# To reuse a previously saved graph instead of rebuilding it:
# with open(pcs_graph_path, "rb") as graph_file:
#     pygraph = pickle.load(graph_file)

### Visualize the graph

In [30]:
# Optional matplotlib preview, left disabled. Un-comment the lines below to draw
# the graph with a circular layout.
# pos = nx.circular_layout(pygraph)

# plt.figure(3, figsize=(30, 30))
# nx.draw(pygraph, pos)
# plt.show()

#### Cytoscape

In [None]:
from pyBiodatafuse.graph import cytoscape

# Load the graph into Cytoscape as a network named "PCS network".
# NOTE(review): presumably requires a running Cytoscape session — confirm.
cytoscape.load_graph(pygraph, network_name="PCS network")

#### Neo4j

In [35]:
from pyBiodatafuse.graph import neo4j

# Write the graph as a GraphML file into the PCS use-case folder
graphml_path = os.path.join(
    os.getcwd(), "examples", "usecases", "PCS", "pcs_networkx_graph.graphml"
)
neo4j.save_graph_to_graphml(pygraph, output_path=graphml_path)

##### Steps to load the graph in Neo4j

- Add `.graphml` file in **import** subfolder of the DBMS folder
- Install apoc plugin
- Create `apoc.conf` file:
    ```
    apoc.trigger.enabled=true
    apoc.import.file.enabled=true
    apoc.export.file.enabled=true
    apoc.import.file.use_neo4j_config=true
    ```
- Add `apoc.conf` file to **conf** subfolder of the DBMS folder
- Open Neo4j Browser
- (Optional, only run if you have imported a graph before) Remove all the nodes before importing the `.graphml` file

    ```
    MATCH (n) DETACH DELETE n
    ```

- Import `.graphml` file

    ```
    call apoc.import.graphml('file:///pcs_networkx_graph.graphml',{readLabels:TRUE})
    ```

- Add indexes after importing the graph for improving the performance of queries

    ```
    create index Gene for (n:Gene) on (n.node_type)
    create index Pathway for (n:Pathway) on (n.node_type)
    create index `Biological Process` for (n:`Biological Process`) on (n.node_type)
    create index `Molecular Function` for (n:`Molecular Function`) on (n.node_type)
    create index `Cellular Component` for (n:`Cellular Component`) on (n.node_type)
    create index Disease for (n:Disease) on (n.node_type)
    create index Compound for (n:Compound) on (n.node_type)
    create index `Side Effect` for (n:`Side Effect`) on (n.node_type)
    ```

- Count the number of each node type
    - total (```MATCH (n) RETURN count(n)```) = 19859
        - Gene (```MATCH (n:Gene) RETURN count(n)```) = 1667
        - Pathway (```MATCH (n:Pathway) RETURN count(n)```) = 1847
            - WikiPathways (```MATCH (n:Pathway {source: "WikiPathways"}) RETURN count(n)```) = 678
            - OpenTargets, Reactome (```MATCH (n:Pathway {source: "OpenTargets"}) RETURN count(n)```) = 1154
            - MINERVA (```MATCH (n:Pathway {source: "MINERVA"}) RETURN count(n)```) = 15
        - Biological Process (```MATCH (n:`Biological Process`) RETURN count(n)```) = 4624
        - Molecular Function (```MATCH (n:`Molecular Function`) RETURN count(n)```) = 1327
        - Cellular Component (```MATCH (n:`Cellular Component`) RETURN count(n)```) = 736
        - Disease (```MATCH (n:Disease) RETURN count(n)```) = 2913
        - Compound (```MATCH (n:Compound) RETURN count(n)```) = 2244
        - Side Effect (```MATCH (n:`Side Effect`) RETURN count(n)```) = 4501
- Count the number of each edge type
    - total (```MATCH ()-[r]->() RETURN count(r)```) = 101630
        - interacts_with (```MATCH ()-[r:interacts_with]->() RETURN count(r)```) = 16844
        - part_of (```MATCH ()-[r:part_of]->() RETURN count(r)```) = 30066 
            - WikiPathways (```MATCH ()-[r:part_of {source: "WikiPathways"}]->() RETURN count(r)```) = 3174
            - OpenTargets, Reactome (```MATCH ()-[r:part_of {source: "OpenTargets"}]->() RETURN count(r)```) = 26784
            - MINERVA (```MATCH ()-[r:part_of {source: "MINERVA"}]->() RETURN count(r)```) = 108
        - activates (```MATCH ()-[r:activates]->() RETURN count(r)```) = 499
        - treats (```MATCH ()-[r:treats]->() RETURN count(r)```) = 8215
        - has_side_effect (```MATCH ()-[r:has_side_effect]->() RETURN count(r)```) = 38328
        - inhibits (```MATCH ()-[r:inhibits]->() RETURN count(r)```) = 71
        - associated_with (```MATCH ()-[r:associated_with]->() RETURN count(r)```) = 7607

- Export the graph as a `.csv` file

    ```call apoc.export.csv.all("pcs_networkx_graph.csv",{})```

### DREAMwalk algorithm

In [1]:
import os

# Move into the DREAMwalk checkout located under the current working directory.
# The guard keeps the cell idempotent: without it, re-running the cell would
# descend one level further (DREAMwalk/DREAMwalk — presumably the package
# directory, TODO confirm), which breaks the relative "../" paths used below.
if os.path.basename(os.getcwd()) == "DREAMwalk":
    new_path = os.getcwd()
else:
    new_path = os.path.join(os.getcwd(), "DREAMwalk")
    os.chdir(new_path)

# Record the resulting working directory and display it as the cell output
current_dir = os.getcwd()
current_dir

'e:\\BioDataFuse\\pyBiodatafuse\\examples\\usecases\\PCS\\DREAMwalk'

In [2]:
import DREAMwalk.generate_dis_sim as dis_gen
import DREAMwalk.generate_files as gen
import pandas as pd
from DREAMwalk.calculate_drug_scores import find_candidates
from DREAMwalk.generate_embeddings import save_embedding_files
from DREAMwalk.generate_similarity_net import save_sim_graph
from DREAMwalk.predict_associations import predict_dda

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Generate files: load the knowledge graph CSV exported from Neo4j
# (node columns such as `_id`/`_labels` and edge columns such as
# `_start`/`_end`/`_type` share the same table).
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the warning recorded for this read
# looks like pandas' mixed-dtype warning — confirm it disappears with this flag.
kg_data = pd.read_csv("../pcs_networkx_graph.csv", low_memory=False)
kg_data.head()

  kg_data= pd.read_csv("../pcs_networkx_graph.csv")


Unnamed: 0,_id,_labels,DO,EFO,Ensembl,HPO,MESH,MONDO,NCI,OMIM,...,is_approved,name,source,_start,_end,_type,ei,el,score,source.1
0,39718.0,:Gene,,,ENSG00000152592,,,,,,...,,DMP1,BridgeDB,,,,,,,
1,39719.0,:Disease,,,,HPO_HP:0004912,MESH_D063730,"MONDO_0000044, MONDO_0024300",NCI_C131449,,...,,Hypophosphatemic Rickets,DISGENET,,,,,,,
2,39720.0,:Disease,DO_0050949,,,,MESH_C562792,"MONDO_0009430, MONDO_0017324",NCI_C123187,OMIM_241520,...,,Autosomal recessive hypophosphatemic vitamin D...,DISGENET,,,,,,,
3,39721.0,:Disease,DO_0050949,,,,MESH_C562792,"MONDO_0009430, MONDO_0017324",,"OMIM_600980, OMIM_241520",...,,"Hypophosphatemic Rickets, Autosomal Recessive, 1",DISGENET,,,,,,,
4,39722.0,:Pathway,,,,,,,,,...,,OSX and miRNAs in tooth development,WikiPathways,,,,,,,


In [4]:
# Display the column names of the loaded knowledge graph table
kg_data.columns

Index(['_id', '_labels', 'DO', 'EFO', 'Ensembl', 'HPO', 'MESH', 'MONDO', 'NCI',
       'OMIM', 'ORDO', 'UMLS', 'adverse_effect_count', 'chembl_id',
       'clincal_trial_phase', 'compound_cid', 'disease_type',
       'disease_umlscui', 'drugbank_id', 'gene_count', 'id', 'is_approved',
       'name', 'source', '_start', '_end', '_type', 'ei', 'el', 'score',
       'source.1'],
      dtype='object')

In [4]:
# Pass the loaded knowledge graph table to DREAMwalk's file generator.
# NOTE(review): the recorded output prints "Graph file is saved!" and
# "Node types file is saved!" and then fails with
# AttributeError: 'DataFrame' object has no attribute 'colmuns'.
# That looks like a misspelling of `columns` inside the DREAMwalk code —
# confirm and fix it there before relying on this cell.
gen.generate_files(kg_data)

Graph file is saved!
Node types file is saved!


AttributeError: 'DataFrame' object has no attribute 'colmuns'

In [5]:
# Presumably computes disease similarities from the exported graph CSV and
# writes them to "dis_sim.tsv" — verify against DREAMwalk's generate_dis_sim.
# NOTE(review): the recorded output is KeyError: 'type'. The exported CSV has a
# `_type` column but no `type` column, so save_dis_sim presumably expects a
# different column naming than the APOC export provides — confirm and align.
dis_gen.save_dis_sim("../pcs_networkx_graph.csv", "dis_sim.tsv")

KeyError: 'type'