# Example: RDF workflow

This notebook demonstrates the usage of the [rdf.py](../src/pyBiodatafuse/graph/rdf.py) module.

In [1]:
import os

# Switch the working directory to the sibling ``src`` folder (one level up,
# then into ``src``). Presumably this is what lets the ``pyBiodatafuse``
# imports below resolve; relative paths used later in the notebook are
# interpreted from this folder as well.
os.chdir(os.path.abspath(os.path.join(os.getcwd(), "..", "src")))
import pandas as pd

from pyBiodatafuse import constants
from pyBiodatafuse.graph import rdf

from rdflib import Graph, RDFS, RDF, OWL, URIRef
from graphviz import Digraph

  from .autonotebook import tqdm as notebook_tqdm


### Load the sample property table

In [2]:
# The path is relative to the working directory chosen in the first cell.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
data = pd.read_pickle("../combined_df.pkl")
# Preview the first three rows of the table.
data.head(3)

Unnamed: 0,identifier,identifier.source,target,target.source,Bgee_gene_expression_levels,MINERVA,WikiPathways,OpenTargets_reactome,OpenTargets_go,DISGENET_diseases,OpenTargets_diseases,OpenTargets_compounds,PubChem_assays,StringDB_ppi
0,AHR,HGNC,ENSG00000106546,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a...","[{'pathway_id': 953.0, 'pathway_label': 'Kynur...","[{'pathway_id': 'WP5044', 'pathway_label': 'Ky...","[{'pathway_label': 'Endogenous sterols', 'path...","[{'go_id': 'GO:0005667', 'go_name': 'transcrip...","[{'disease_name': 'Mammary Neoplasms', 'HPO': ...",[{'disease_name': 'acute respiratory distress ...,"[{'chembl_id': 'CHEMBL259571', 'drugbank_id': ...","[{'pubchem_assay_id': nan, 'assay_type': nan, ...","[{'stringdb_link_to': 'SCN4A', 'Ensembl': 'ENS..."
1,CHRNG,HGNC,ENSG00000196811,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...",[{'pathway_label': 'Highly sodium permeable po...,"[{'go_id': 'GO:0015464', 'go_name': 'acetylcho...",[{'disease_name': 'Multiple pterygium syndrome...,[{'disease_name': 'acute respiratory distress ...,"[{'chembl_id': 'CHEMBL1200641', 'drugbank_id':...","[{'pubchem_assay_id': nan, 'assay_type': nan, ...","[{'stringdb_link_to': 'SCN4A', 'Ensembl': 'ENS..."
2,DMD,HGNC,ENSG00000198947,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': 'WP2858', 'pathway_label': 'Ec...",[{'pathway_label': 'Striated Muscle Contractio...,"[{'go_id': 'GO:0016010', 'go_name': 'dystrophi...","[{'disease_name': 'Muscular Dystrophy, Duchenn...",[{'disease_name': 'acute respiratory distress ...,"[{'chembl_id': 'CHEMBL2108278', 'drugbank_id':...","[{'pubchem_assay_id': nan, 'assay_type': nan, ...","[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."


In [3]:
# Total number of rows, followed by the per-column summary table.
print(data.shape[0])
data.describe()

932


Unnamed: 0,identifier,identifier.source,target,target.source,Bgee_gene_expression_levels,MINERVA,WikiPathways,OpenTargets_reactome,OpenTargets_go,DISGENET_diseases,OpenTargets_diseases,OpenTargets_compounds,PubChem_assays,StringDB_ppi
count,6,6,6,6,6,6,6,6,6,6,932,6,50,6
unique,6,1,6,1,6,6,6,6,6,6,72,6,50,6
top,AHR,HGNC,ENSG00000106546,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a...","[{'pathway_id': 953.0, 'pathway_label': 'Kynur...","[{'pathway_id': 'WP5044', 'pathway_label': 'Ky...","[{'pathway_label': 'Endogenous sterols', 'path...","[{'go_id': 'GO:0005667', 'go_name': 'transcrip...","[{'disease_name': 'Mammary Neoplasms', 'HPO': ...","[{'disease_name': 'muscular dystrophy', 'thera...","[{'chembl_id': 'CHEMBL259571', 'drugbank_id': ...","[{'pubchem_assay_id': nan, 'assay_type': nan, ...","[{'stringdb_link_to': 'SCN4A', 'Ensembl': 'ENS..."
freq,1,6,1,6,1,1,1,1,1,1,499,1,1,1


### Generating RDF from table
Rows without an `identifier` are skipped when the RDF is generated. The cell below counts them and shows a small sample:

In [4]:
# Rows lacking an identifier are the ones counted as skipped here.
nas = data[data["identifier"].isna()]
print(f"Skipped row count: {len(nas)}")
# A fixed seed keeps the preview identical across Restart & Run All, and the
# sample size is capped so the cell does not fail when fewer than three rows
# are missing an identifier.
nas.sample(n=min(3, len(nas)), random_state=0)

Skipped row count: 926


Unnamed: 0,identifier,identifier.source,target,target.source,Bgee_gene_expression_levels,MINERVA,WikiPathways,OpenTargets_reactome,OpenTargets_go,DISGENET_diseases,OpenTargets_diseases,OpenTargets_compounds,PubChem_assays,StringDB_ppi
884,,,,,,,,,,,"[{'disease_name': 'cancer', 'therapeutic_areas...",,,
522,,,,,,,,,,,"[{'disease_name': 'muscular dystrophy', 'thera...",,,
488,,,,,,,,,,,"[{'disease_name': 'muscular dystrophy', 'thera...",,,


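The function that generates an `rdflib` RDF graph, `generate_rdf()`, is called here with five keyword arguments: the property table (`df`), a base URI (`base_uri`), a version IRI (`version_iri`), and the author's ORCID (`orcid`) and name (`author`):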


In [5]:
# Build the graph from the property table, passing the example URIs and
# author metadata as keyword arguments.
g = rdf.generate_rdf(
    df=data,
    base_uri="https://biodatafuse.org/example/",
    version_iri="https://biodatafuse.org/example/test.owl",
    orcid="https://orcid.org/0000-0002-4166-7093",
    author="Javier Millan Acosta",
)

### Print out result

In [6]:
# Write the graph to ``test.ttl`` (relative to the current working directory)
# using Turtle syntax.
g.serialize(destination="test.ttl", format="turtle")

<Graph identifier=Nfe442ef79cef4b0ab366e790b9531389 (<class 'rdflib.graph.Graph'>)>

In [7]:
# Called without a destination, serialize() hands back the Turtle text, which
# is printed here in full (the output can be long for large graphs).
print(g.serialize(format="turtle"))

@prefix chebi: <http://purl.obolibrary.org/obo/chebi/> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix ns1: <https://w3id.org/reproduceme#> .
@prefix obo: <http://purl.obolibrary.org/obo/> .
@prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix sio: <http://semanticscience.org/resource/> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix sssom: <https://w3id.org/sssom/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

obo:IAO_0000112 a owl:AnnotationProperty .

obo:IAO_0000116 a owl:AnnotationProperty .

obo:IAO_0000117 a owl:AnnotationProperty .

obo:IAO_0000118 a owl:AnnotationProperty .

obo:IAO_0000119 a owl:AnnotationProperty .

obo:IAO_0000412 a owl:AnnotationProperty .

obo:NCIT_NHC0 a owl:AnnotationProperty .

obo:NCIT_P106 a owl:AnnotationProperty .

obo:NCIT_P108 a owl:Annotat