# Example: RDF workflow

This notebook demonstrates the usage of the [rdf.py](../src/pyBiodatafuse/graph/rdf.py) module.

In [1]:
import os
os.chdir(os.path.abspath(os.path.join(os.getcwd(), "..", "src")))
import pandas as pd
from pyBiodatafuse.graph import rdf


### Load the sample property table

In [2]:
data = pd.read_pickle("../examples/usecases/PCS/combined_df.pkl")
metadata = pd.read_pickle("../examples/usecases/PCS/combined_metadata.pkl")
data.head(3)

Unnamed: 0,identifier,identifier.source,target,target.source,DISGENET_diseases,literature_based_info,OpenTargets_gene_compounds,MINERVA,WikiPathways,OpenTargets_reactome,OpenTargets_go,StringDB_ppi
0,DMP1,HGNC,ENSG00000152592,Ensembl,"[{'disease_name': 'Hypophosphatemic Rickets', ...","[{'disease_name': 'Post-COVID-19', 'id': 'C000...","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': 'WP3971', 'pathway_label': 'OS...","[{'pathway_label': 'ECM proteoglycans', 'pathw...","[{'go_id': 'GO:0005788', 'go_name': 'endoplasm...","[{'stringdb_link_to': 'TNFRSF11B', 'Ensembl': ..."
1,PNLIP,HGNC,ENSG00000175535,Ensembl,[{'disease_name': 'Pancreatic Lipase Deficienc...,"[{'disease_name': nan, 'id': nan, 'source': nan}]","[{'chembl_id': 'CHEMBL175247', 'drugbank_id': ...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...",[{'pathway_label': 'Retinoid metabolism and tr...,"[{'go_id': 'GO:0004806', 'go_name': 'triglycer...","[{'stringdb_link_to': 'LIPE', 'Ensembl': 'ENSP..."
2,OR4N3P,HGNC,ENSG00000259435,Ensembl,"[{'disease_name': nan, 'HPO': nan, 'NCI': nan,...","[{'disease_name': nan, 'id': nan, 'source': nan}]","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_label': nan, 'pathway_id': nan}]","[{'go_id': nan, 'go_name': nan, 'go_type': nan}]","[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."


In [3]:
print(len(data))
data.describe()

2421


Unnamed: 0,identifier,identifier.source,target,target.source,DISGENET_diseases,literature_based_info,OpenTargets_gene_compounds,MINERVA,WikiPathways,OpenTargets_reactome,OpenTargets_go,StringDB_ppi
count,2421,2421,2421,2421,2329,2329,2421,2421,2329,2421,2421,2421
unique,1667,1,1675,1,1560,1566,1675,1596,1461,1447,1637,1667
top,TEKT4P2,HGNC,ENSG00000188681,Ensembl,"[{'disease_name': nan, 'HPO': nan, 'NCI': nan,...","[{'disease_name': nan, 'id': nan, 'source': nan}]","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_label': nan, 'pathway_id': nan}]","[{'go_id': nan, 'go_name': nan, 'go_type': nan}]","[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."
freq,256,2421,128,2421,128,128,128,128,128,128,128,128


### Generating RDF from table


The function to generate an RDF `rdflib` graph (`generate_rdf()`) takes arguments:


In [4]:
g = rdf.generate_rdf(df = data, 
                     base_uri = "https://biodatafuse.org/example/", 
                     version_iri = "https://biodatafuse.org/example/test.owl", 
                     orcid = "https://orcid.org/0000-0002-4166-7093", 
                     author="Javier Millan Acosta", 
                     metadata = metadata)

ENSG00000152592_protein
ENSG00000152592_protein
ENSG00000152592_protein
ENSG00000152592_protein
ENSG00000152592_protein
ENSG00000152592_protein
ENSG00000175535_protein
ENSG00000175535_protein
ENSG00000175535_protein
ENSG00000175535_protein
ENSG00000175535_protein
ENSG00000175535_protein
ENSG00000268104_protein
ENSG00000268104_protein
ENSG00000268104_protein
ENSG00000268104_protein
ENSG00000268104_protein
ENSG00000186562_protein
ENSG00000186562_protein
ENSG00000186562_protein
ENSG00000186562_protein
ENSG00000186562_protein
ENSG00000186562_protein
ENSG00000186562_protein
ENSG00000186562_protein
ENSG00000186562_protein
ENSG00000186562_protein
ENSG00000186562_protein
ENSG00000186562_protein
ENSG00000186599_protein
ENSG00000186599_protein
ENSG00000186599_protein
ENSG00000186599_protein
ENSG00000186599_protein
ENSG00000186599_protein
ENSG00000186599_protein
ENSG00000186599_protein
ENSG00000186599_protein
ENSG00000162992_protein
ENSG00000162992_protein
ENSG00000162992_protein
ENSG00000162992_

### Print out result

In [5]:
g.serialize(format="turtle", destination='pcs_graph.ttl')

<Graph identifier=N669f04f2f4004147bb9b294e4a6d7a0f (<class 'rdflib.graph.Graph'>)>

In [6]:
stringdb = data.StringDB_ppi
c = 0
for i in stringdb:
    c +=1
    for j in i:
        if j['Ensembl'] == "ENSP00000394794":
            print(j['stringdb_link_to'], j['Ensembl'], data.target[c]+'_protein')

PTPN13 ENSP00000394794 ENSG00000220758_protein
PTPN13 ENSP00000394794 ENSG00000138166_protein
PTPN13 ENSP00000394794 ENSG00000100473_protein
PTPN13 ENSP00000394794 ENSG00000184983_protein
PTPN13 ENSP00000394794 ENSG00000160678_protein
PTPN13 ENSP00000394794 ENSG00000160097_protein
PTPN13 ENSP00000394794 ENSG00000225564_protein
