# Example: RDF workflow

This notebook demonstrates how to use the [rdf.py](../src/pyBiodatafuse/graph/rdf.py) module to generate a BioDataFuse (BDF) knowledge graph in RDF, and then runs a set of competency questions against it as SPARQL queries.

In [1]:
import os

# Switch the working directory to the `src` folder three levels above the current
# directory; the relative "../examples/..." paths used later in this notebook are
# resolved from there (presumably this also makes `pyBiodatafuse` importable from
# the source tree -- confirm).
# NOTE: the target is computed from the *current* working directory, so running
# this cell a second time in the same kernel resolves to a different location.
# Restart the kernel before re-running it.
os.chdir(os.path.abspath(os.path.join(os.getcwd(), "..", "..", "..", "src")))
import pandas as pd
from pyBiodatafuse.graph import rdf
from shexer.shaper import Shaper
from shexer.consts import NT, SHEXC, SHACL_TURTLE
import json
from IPython.display import display_markdown


### Load the sample property table

In [2]:
# Read the combined metadata and the combined data table from the PCS use-case folder.
# Unpickling can execute arbitrary code, so only load pickle files from a trusted source.
metadata = pd.read_pickle("../examples/usecases/PCS/combined_metadata.pkl")
data = pd.read_pickle("../examples/usecases/PCS/combined_df.pkl")

# Preview the first three rows of the data table.
data.head(n=3)

Unnamed: 0,identifier,identifier.source,target,target.source,DISGENET_diseases,literature_based_info,OpenTargets_gene_compounds,MINERVA,WikiPathways,OpenTargets_reactome,OpenTargets_go,StringDB_ppi
0,DMP1,HGNC,ENSG00000152592,Ensembl,"[{'disease_name': 'Hypophosphatemic Rickets', ...","[{'disease_name': 'Post-COVID-19', 'id': 'C000...","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': 'WP3971', 'pathway_label': 'OS...","[{'pathway_label': 'ECM proteoglycans', 'pathw...","[{'go_id': 'GO:0005788', 'go_name': 'endoplasm...","[{'stringdb_link_to': 'TNFRSF11B', 'Ensembl': ..."
1,PNLIP,HGNC,ENSG00000175535,Ensembl,[{'disease_name': 'Pancreatic Lipase Deficienc...,"[{'disease_name': nan, 'id': nan, 'source': nan}]","[{'chembl_id': 'CHEMBL175247', 'drugbank_id': ...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...",[{'pathway_label': 'Retinoid metabolism and tr...,"[{'go_id': 'GO:0004806', 'go_name': 'triglycer...","[{'stringdb_link_to': 'LIPE', 'Ensembl': 'ENSP..."
2,OR4N3P,HGNC,ENSG00000259435,Ensembl,"[{'disease_name': nan, 'HPO': nan, 'NCI': nan,...","[{'disease_name': nan, 'id': nan, 'source': nan}]","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_label': nan, 'pathway_id': nan}]","[{'go_id': nan, 'go_name': nan, 'go_type': nan}]","[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."


In [3]:
# Number of rows first, then the per-column summary (count / unique / top / freq).
print(data.shape[0])
data.describe()

2421


Unnamed: 0,identifier,identifier.source,target,target.source,DISGENET_diseases,literature_based_info,OpenTargets_gene_compounds,MINERVA,WikiPathways,OpenTargets_reactome,OpenTargets_go,StringDB_ppi
count,2421,2421,2421,2421,2329,2329,2421,2421,2329,2421,2421,2421
unique,1667,1,1675,1,1560,1566,1675,1596,1461,1447,1637,1667
top,TEKT4P2,HGNC,ENSG00000188681,Ensembl,"[{'disease_name': nan, 'HPO': nan, 'NCI': nan,...","[{'disease_name': nan, 'id': nan, 'source': nan}]","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_label': nan, 'pathway_id': nan}]","[{'go_id': nan, 'go_name': nan, 'go_type': nan}]","[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."
freq,256,2421,128,2421,128,128,128,128,128,128,128,128


### Generating RDF from table


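The `generate_rdf()` function builds an `rdflib` graph from the table. In this example it is called with the following arguments:

- `df`: the combined data table loaded above
- `base_uri`: the base URI to use for the graph
- `version_iri`: the version IRI of the graph
- `orcid` and `author`: the ORCID and name of the graph's author
- `metadata`: the metadata loaded above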


In [4]:
# Build the graph from the data table and its metadata; the remaining keyword
# arguments supply the URIs and author details passed to `generate_rdf`.
g = rdf.generate_rdf(
    df=data,
    metadata=metadata,
    base_uri="https://biodatafuse.org/example/",
    version_iri="https://biodatafuse.org/example/test.owl",
    author="Javier Millan Acosta",
    orcid="https://orcid.org/0000-0002-4166-7093",
)

### Serialize the result to a Turtle file

In [5]:
# Write the graph to `pcs_graph.ttl` in Turtle syntax.
# NOTE(review): because `destination` is given, `ttl` probably does not hold the
# Turtle text itself -- confirm against the installed rdflib version.
ttl = g.serialize(
    destination="pcs_graph.ttl",
    format="turtle",
)

## Querying the graph

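This section runs some competency questions around the PCS case. Each question is stored as a SPARQL `SELECT` query file and is sent to a GraphDB SPARQL endpoint (configured in the next cell), which is assumed to already contain the graph generated above.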

In [6]:
import requests
import pandas as pd
from io import StringIO

# GraphDB endpoint. The default points at the host the notebook was originally run
# against; set the BDF_SPARQL_ENDPOINT environment variable to use another
# repository without editing this cell.
endpoint_url = os.environ.get(
    "BDF_SPARQL_ENDPOINT", "http://pallascat:7200/repositories/BDF_PCS"
)
# Headers for SELECT queries: the query is sent form-encoded and results come back as CSV.
headers_sel = {
    "Accept": "text/csv",  # Request CSV format
    "Content-Type": "application/x-www-form-urlencoded",
}

def run_select_query(query_path, endpoint_url=endpoint_url, headers=headers_sel, timeout=300):
    """Run a SPARQL SELECT query stored on disk and return the result as a DataFrame.

    The query text is read from ``../examples/usecases/PCS/SPARQL/select/<query_path>``,
    echoed to stdout, and POSTed to the endpoint as a form-encoded ``query`` field.

    :param query_path: file name of the query, relative to the select-query directory.
    :param endpoint_url: URL of the SPARQL endpoint to query.
    :param headers: HTTP headers sent with the request; the default asks for CSV.
    :param timeout: seconds to wait for the endpoint before ``requests`` gives up;
        pass ``None`` to wait indefinitely.
    :returns: a pandas DataFrame parsed from the CSV response body.
    :raises Exception: if the endpoint answers with a status code other than 200.
    """
    base_dir = "../examples/usecases/PCS/SPARQL/select"
    query_file = os.path.join(base_dir, query_path)
    # Read with an explicit encoding so the result does not depend on the platform locale.
    with open(query_file, "r", encoding="utf-8") as f:
        query = f.read()
    print(query)

    # Without a timeout, an unreachable or stalled endpoint would block the kernel forever.
    response = requests.post(
        endpoint_url,
        headers=headers,
        data={"query": query},
        timeout=timeout,
    )
    if response.status_code != 200:
        raise Exception(f"Query failed with status {response.status_code}: {response.text}")

    # Convert the CSV content to a DataFrame.
    return pd.read_csv(StringIO(response.text))

# Load the natural-language competency questions (a mapping such as "q1" -> list of
# phrasings, as used by the cells below). The encoding is given explicitly so that
# reading the JSON file does not depend on the platform's default locale.
with open("../examples/usecases/PCS/SPARQL/questions.json", "r", encoding="utf-8") as file:
    qs = json.load(file)

### Q1

In [23]:
# Q1: list the phrasings of the question, then run the matching SELECT query.
print(
    "QUESTIONS\n- " + "\n- ".join(qs["q1"]),
    "\n",
)
q1 = run_select_query("q1.rq")
q1

QUESTIONS
- Which MINERVA pathways contain genes involved in COVID-related gene-disease interactions documented in the literature?
- What MINERVA pathways include genes linked to COVID gene-disease interactions based on existing studies?
- Identify MINERVA pathways that list genes with literature-supported interactions in COVID.
- Which pathways under MINERVA involve genes with documented COVID gene-disease interactions? 

PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX sio: <http://semanticscience.org/resource/>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX pr: <http://purl.obolibrary.org/obo/pr#>

SELECT ?gene ?geneLabel ?pw
WHERE {
    ?publication a obo:IAO_0000013 ;
                 sio:SIO_000628 ?gene ;
                 sio:SIO_000628 ?disease .
                 
                 
    ?disease a obo:NCIT_C7057 ;
            rdfs:label ?diseaseLabel .
    FILTER(CONTAINS(?diseaseLabel, "COVID"))
    
    ?gene a obo:NCIT_C16612 ;
          sio:SIO_000068 ?pw ;


Unnamed: 0,gene,geneLabel,pw
0,http://identifiers.org/ensembl/ENSG00000125356,NDUFA1,https://minerva-net.lcsb.uni.lu/api/933.0
1,http://identifiers.org/ensembl/ENSG00000125356,NDUFA1,https://minerva-net.lcsb.uni.lu/api/933.0
2,http://identifiers.org/ensembl/ENSG00000140990,NDUFB10,https://minerva-net.lcsb.uni.lu/api/933.0
3,http://identifiers.org/ensembl/ENSG00000140990,NDUFB10,https://minerva-net.lcsb.uni.lu/api/933.0
4,http://identifiers.org/ensembl/ENSG00000147123,NDUFB11,https://minerva-net.lcsb.uni.lu/api/933.0
5,http://identifiers.org/ensembl/ENSG00000147123,NDUFB11,https://minerva-net.lcsb.uni.lu/api/933.0
6,http://identifiers.org/ensembl/ENSG00000184983,NDUFA6,https://minerva-net.lcsb.uni.lu/api/933.0
7,http://identifiers.org/ensembl/ENSG00000184983,NDUFA6,https://minerva-net.lcsb.uni.lu/api/933.0
8,http://identifiers.org/ensembl/ENSG00000197594,ENPP1,https://minerva-net.lcsb.uni.lu/api/942.0
9,http://identifiers.org/ensembl/ENSG00000197594,ENPP1,https://minerva-net.lcsb.uni.lu/api/942.0


### Q2

In [25]:
# Q2: list the phrasings of the question, then run the matching SELECT query.
print(
    "QUESTIONS\n- " + "\n- ".join(qs["q2"]),
    "\n",
)
q2 = run_select_query("q2.rq")
q2

QUESTIONS
- Which genes are targeted by compounds in clinical stage 4 or approved, and what cellular components do these genes belong to?
- For genes targeted by clinically approved or stage 4 compounds, what are their associated cellular components?
- Identify genes targeted by clinical stage 4 or approved compounds and list their cellular component associations.
- Which cellular components are associated with genes targeted by stage 4 or approved compounds? 

PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX sio: <http://semanticscience.org/resource/>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX pr: <http://purl.obolibrary.org/obo/pr#>

SELECT DISTINCT ?gene ?geneLabel ?compartmentLabel ?drug ?agonist
WHERE {
    ?gene a obo:NCIT_C16612 ;
          rdfs:label ?geneLabel ;
          sio:SIO_000068 ?compartment .
    
    ?compartment rdfs:subClassOf obo:GO_0005575 ;
                 rdfs:label ?compartmentLabel .

    ?protein pr:has_gene_template ?gene .

    # Retr

Unnamed: 0,gene,geneLabel,compartmentLabel,drug,agonist
0,http://identifiers.org/ensembl/ENSG00000113231,PDE8B,cellular component,https://www.ebi.ac.uk/chembl/compound_report_c...,True
1,http://identifiers.org/ensembl/ENSG00000113231,PDE8B,cellular component,https://pubchem.ncbi.nlm.nih.gov/compound/3354,True
2,http://identifiers.org/ensembl/ENSG00000113231,PDE8B,cellular component,https://go.drugbank.com/drugs/DB01148,True
3,http://identifiers.org/ensembl/ENSG00000113231,PDE8B,cellular component,https://www.ebi.ac.uk/chembl/compound_report_c...,True
4,http://identifiers.org/ensembl/ENSG00000113231,PDE8B,cellular component,https://pubchem.ncbi.nlm.nih.gov/compound/3108,True
...,...,...,...,...,...
6059,http://identifiers.org/ensembl/ENSG00000115353,TACR1,plasma membrane,https://www.ebi.ac.uk/chembl/compound_report_c...,False
6060,http://identifiers.org/ensembl/ENSG00000115353,TACR1,cell body,https://www.ebi.ac.uk/chembl/compound_report_c...,False
6061,http://identifiers.org/ensembl/ENSG00000115353,TACR1,dendrite,https://www.ebi.ac.uk/chembl/compound_report_c...,False
6062,http://identifiers.org/ensembl/ENSG00000115353,TACR1,cell surface,https://www.ebi.ac.uk/chembl/compound_report_c...,False


### Q3

In [26]:
# Q3: list the phrasings of the question, then run the matching SELECT query.
print(
    "QUESTIONS\n- " + "\n- ".join(qs["q3"]),
    "\n",
)
q3 = run_select_query("q3.rq")
q3

QUESTIONS
- Which biological processes involve genes targeted by clinical stage 4 or approved compounds?
- What are the biological processes of genes targeted by approved or stage 4 compounds?
- Identify biological processes for genes targeted by compounds at stage 4 or approved for clinical use.
- Which genes targeted by stage 4 or approved compounds are involved in specific biological processes? 

PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX sio: <http://semanticscience.org/resource/>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX pr: <http://purl.obolibrary.org/obo/pr#>

SELECT ?gene ?geneLabel ?processLabel
WHERE {
    ?gene a obo:NCIT_C16612 ;
          rdfs:label ?geneLabel ;
          sio:SIO_000068 ?process .
    
    ?process rdfs:label ?processLabel ;
             rdfs:subClassOf obo:GO_0008150 .
    
    ?protein pr:has_gene_template ?gene .

    # Retrieve agonist drugs targeting the protein
    ?agonistDrug a sio:SIO_010038 ;
                 obo:RO_00

Unnamed: 0,gene,geneLabel,processLabel
0,http://identifiers.org/ensembl/ENSG00000196639,HRH1,regulation of vascular permeability
1,http://identifiers.org/ensembl/ENSG00000196639,HRH1,cellular response to histamine
2,http://identifiers.org/ensembl/ENSG00000196639,HRH1,G protein-coupled serotonin receptor signaling...
3,http://identifiers.org/ensembl/ENSG00000196639,HRH1,positive regulation of vasoconstriction
4,http://identifiers.org/ensembl/ENSG00000196639,HRH1,G protein-coupled receptor signaling pathway
...,...,...,...
13407,http://identifiers.org/ensembl/ENSG00000180720,CHRM4,G protein-coupled serotonin receptor signaling...
13408,http://identifiers.org/ensembl/ENSG00000180720,CHRM4,cell surface receptor signaling pathway
13409,http://identifiers.org/ensembl/ENSG00000180720,CHRM4,"G protein-coupled receptor signaling pathway, ..."
13410,http://identifiers.org/ensembl/ENSG00000180720,CHRM4,signal transduction


### Q4

In [27]:
# Q4: list the phrasings of the question, then run the matching SELECT query.
print(
    "QUESTIONS\n- " + "\n- ".join(qs["q4"]),
    "\n",
)
q4 = run_select_query("q4.rq")
q4

QUESTIONS
- What is the molecular function of genes targeted by clinical stage 4 or approved compounds?
- Which molecular functions are associated with genes targeted by approved or stage 4 compounds?
- Identify molecular functions for genes targeted by stage 4 or approved compounds.
- For genes targeted by stage 4 or approved compounds, what are their molecular functions? 

PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX sio: <http://semanticscience.org/resource/>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX pr: <http://purl.obolibrary.org/obo/pr#>

SELECT distinct ?gene ?geneLabel ?functionLabel
WHERE {
    ?gene a obo:NCIT_C16612 ;
          rdfs:label ?geneLabel ;
          sio:SIO_000068 ?function .
    
    ?function rdfs:label ?functionLabel ;
              rdfs:subClassOf obo:GO_0003674 .
    
    ?protein pr:has_gene_template ?gene .

    # Retrieve agonist drugs targeting the protein
    ?agonistDrug a sio:SIO_010038 ;
                 obo:RO_0018027 ?prot

Unnamed: 0,gene,geneLabel,functionLabel
0,http://identifiers.org/ensembl/ENSG00000196639,HRH1,histamine receptor activity
1,http://identifiers.org/ensembl/ENSG00000196639,HRH1,G protein-coupled serotonin receptor activity
2,http://identifiers.org/ensembl/ENSG00000196639,HRH1,neurotransmitter receptor activity
3,http://identifiers.org/ensembl/ENSG00000082175,PGR,estrogen response element binding
4,http://identifiers.org/ensembl/ENSG00000082175,PGR,protein binding
5,http://identifiers.org/ensembl/ENSG00000082175,PGR,nuclear steroid receptor activity
6,http://identifiers.org/ensembl/ENSG00000082175,PGR,DNA binding
7,http://identifiers.org/ensembl/ENSG00000082175,PGR,identical protein binding
8,http://identifiers.org/ensembl/ENSG00000082175,PGR,transcription coactivator binding
9,http://identifiers.org/ensembl/ENSG00000082175,PGR,steroid binding


### Q5

In [28]:
# Q5: list the phrasings of the question, then run the matching SELECT query.
print(
    "QUESTIONS\n- " + "\n- ".join(qs["q5"]),
    "\n",
)
q5 = run_select_query("q5.rq")
q5

QUESTIONS
- Which compounds target genes with protein products having a high PPI score (above 0.8) with other proteins?
- Identify compounds that target genes with protein interactions scoring above 0.8.
- Which compounds target genes whose protein products have a PPI score higher than 0.8?
- List compounds targeting genes where protein interactions exceed a PPI score of 0.8. 

PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX sio: <http://semanticscience.org/resource/>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX pr: <http://purl.obolibrary.org/obo/pr#>

SELECT DISTINCT ?drugLabel ?proteinLabel
WHERE {
    ?gene a obo:NCIT_C16612 ;
          rdfs:label ?geneLabel .
    ?protein pr:has_gene_template ?gene ;
             rdfs:label ?proteinLabel ;
             sio:SIO_000068 ?ppi .
    ?ppi sio:SIO_000216 ?score .
    ?score sio:has_value ?scoreValue .
    FILTER(?scoreValue > 0.8)

    ?drug ?predicate ?protein ;
          rdfs:label ?drugLabel .
    VALUES ?predicate

Unnamed: 0,drugLabel,proteinLabel
0,"FIBRINOLYSIN, HUMAN",ENSG00000171564xProtein
1,PLASMINOGEN,ENSG00000171564xProtein
2,ANCROD,ENSG00000171564xProtein
3,ALFIMEPRASE,ENSG00000171564xProtein
4,"FIBRINOGEN, HUMAN",ENSG00000171564xProtein
...,...,...
477,FOSAPREPITANT,ENSG00000115353xProtein
478,ROLAPITANT,ENSG00000115353xProtein
479,NETUPITANT,ENSG00000115353xProtein
480,FOSNETUPITANT,ENSG00000115353xProtein


### Q6 

In [29]:
# Q6: list the phrasings of the question, then run the matching SELECT query.
print(
    "QUESTIONS\n- " + "\n- ".join(qs["q6"]),
    "\n",
)
q6 = run_select_query("q6.rq")
q6

QUESTIONS
- Which pathways involve proteins targeted by drugs and have high PPI scores (above 0.8) with other proteins?
- Identify pathways containing proteins targeted by drugs that show PPIs above a score of 0.8.
- Which drug-targeted proteins involved in pathways show high PPI scores (above 0.8)?
- Pathways with proteins targeted by drugs and showing significant PPIs (score > 0.8) with other proteins. 

PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX sio: <http://semanticscience.org/resource/>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX pr: <http://purl.obolibrary.org/obo/pr#>

SELECT DISTINCT ?pw ?proteinLabel ?drugLabel
WHERE {
    ?gene a obo:NCIT_C16612 ;
          rdfs:label ?geneLabel .
    ?protein pr:has_gene_template ?gene ;
             rdfs:label ?proteinLabel ;
             sio:SIO_000068 ?ppi ;
             sio:SIO_000068 ?pw .
    ?pw a obo:PW_0000001 .

    ?ppi sio:SIO_000216 ?score .
    ?score sio:has_value ?scoreValue .
    FILTER(?scoreValue 

Unnamed: 0,pw,proteinLabel,drugLabel
0,https://minerva-net.lcsb.uni.lu/api/943.0,ENSG00000232810xProtein,ETANERCEPT
1,https://minerva-net.lcsb.uni.lu/api/943.0,ENSG00000232810xProtein,ADALIMUMAB
2,https://minerva-net.lcsb.uni.lu/api/943.0,ENSG00000232810xProtein,INFLIXIMAB
3,https://minerva-net.lcsb.uni.lu/api/943.0,ENSG00000232810xProtein,CERTOLIZUMAB PEGOL
4,https://minerva-net.lcsb.uni.lu/api/943.0,ENSG00000232810xProtein,GOLIMUMAB
...,...,...,...
7130,https://reactome.org/content/detail/R-HSA-112314,ENSG00000178084xProtein,AMISULPRIDE
7131,https://reactome.org/content/detail/R-HSA-112314,ENSG00000178084xProtein,CLOTHIAPINE
7132,https://reactome.org/content/detail/R-HSA-112314,ENSG00000178084xProtein,TROPISETRON
7133,https://reactome.org/content/detail/R-HSA-112314,ENSG00000178084xProtein,MOSAPRIDE


### Q7

In [30]:
# Q7: list the phrasings of the question, then run the matching SELECT query.
print(
    "QUESTIONS\n- " + "\n- ".join(qs["q7"]),
    "\n",
)
q7 = run_select_query("q7.rq")
q7

QUESTIONS
- Which compounds target genes with high PPI score protein products (0.8), and how do these compounds rank by adverse outcome count?
- Identify compounds targeting high-PPI-score genes (0.8) and sort by adverse outcome frequency.
- Which compounds interact with high-PPI genes (score 0.8) and what are their adverse outcome counts?
- Sort compounds targeting genes with high PPI scores (0.8) by the number of recorded adverse outcomes. 

PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX sio: <http://semanticscience.org/resource/>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX pr: <http://purl.obolibrary.org/obo/pr#>

SELECT DISTINCT ?drug ?drugLabel (COUNT(DISTINCT ?ae) AS ?adverseEventCount)
WHERE {
    ?gene a obo:NCIT_C16612 ;
          rdfs:label ?geneLabel .
    ?protein pr:has_gene_template ?gene ;
             rdfs:label ?proteinLabel ;
             sio:SIO_000068 ?ppi .

    ?ppi sio:SIO_000216 ?score .
    ?score sio:has_value ?scoreValue .
    FILTER(?sc

Unnamed: 0,drug,drugLabel,adverseEventCount
0,https://www.ebi.ac.uk/chembl/compound_report_c...,DICLOFENAC SODIUM,25
1,https://pubchem.ncbi.nlm.nih.gov/compound/5018304,DICLOFENAC SODIUM,25
2,https://www.ebi.ac.uk/chembl/compound_report_c...,CELECOXIB,25
3,https://pubchem.ncbi.nlm.nih.gov/compound/2662,CELECOXIB,25
4,https://go.drugbank.com/drugs/DB00482,CELECOXIB,25
...,...,...,...
459,https://pubchem.ncbi.nlm.nih.gov/compound/1134...,MARIZOMIB,1
460,https://go.drugbank.com/drugs/DB11762,MARIZOMIB,1
461,https://www.ebi.ac.uk/chembl/compound_report_c...,NIROGACESTAT,1
462,https://pubchem.ncbi.nlm.nih.gov/compound/4622...,NIROGACESTAT,1
