# Example: RDF workflow

This notebook demonstrates the usage of the [rdf.py](../src/pyBiodatafuse/graph/rdf.py) module.

In [7]:
import os
# Switch the working directory to the sibling "src" folder of the current
# directory. Relative paths in later cells ("../combined_df_case.pkl",
# "test.ttl", ...) are resolved against this directory, and the chdir runs
# before the pyBiodatafuse import below.
# NOTE(review): presumably the chdir is what makes the local pyBiodatafuse
# package under src/ importable — confirm before reordering these lines.
os.chdir(os.path.abspath(os.path.join(os.getcwd(), "..", "src")))
import pandas as pd
from pyBiodatafuse.graph import rdf


### Load the sample property table

In [8]:
# Read the example table and its metadata; both pickles sit one level above
# the current working directory.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
data, metadata = (
    pd.read_pickle(pkl_path)
    for pkl_path in ("../combined_df_case.pkl", "../combined_metadata_case.pkl")
)
data.head(3)

Unnamed: 0,identifier,identifier.source,target,target.source,DISGENET_diseases,literature_based_info,OpenTargets_gene_compounds,MINERVA,WikiPathways,OpenTargets_reactome,OpenTargets_go,StringDB_ppi
0,DMP1,HGNC,ENSG00000152592,Ensembl,"[{'disease_name': 'Hypophosphatemic Rickets', ...","[{'disease_name': 'Post-COVID-19', 'id': 'C000...","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': 'WP3971', 'pathway_label': 'OS...","[{'pathway_label': 'ECM proteoglycans', 'pathw...","[{'go_id': 'GO:0005788', 'go_name': 'endoplasm...","[{'stringdb_link_to': 'TNFRSF11B', 'Ensembl': ..."
1,PNLIP,HGNC,ENSG00000175535,Ensembl,[{'disease_name': 'Pancreatic Lipase Deficienc...,"[{'disease_name': nan, 'id': nan, 'source': nan}]","[{'chembl_id': 'CHEMBL175247', 'drugbank_id': ...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...",[{'pathway_label': 'Retinoid metabolism and tr...,"[{'go_id': 'GO:0004806', 'go_name': 'triglycer...","[{'stringdb_link_to': 'LIPE', 'Ensembl': 'ENSP..."
2,OR4N3P,HGNC,ENSG00000259435,Ensembl,"[{'disease_name': nan, 'HPO': nan, 'NCI': nan,...","[{'disease_name': nan, 'id': nan, 'source': nan}]","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_label': nan, 'pathway_id': nan}]","[{'go_id': nan, 'go_name': nan, 'go_type': nan}]","[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."


In [9]:
# Print the number of rows, then show summary statistics as the cell's value.
print(data.shape[0])
data.describe()

2421


Unnamed: 0,identifier,identifier.source,target,target.source,DISGENET_diseases,literature_based_info,OpenTargets_gene_compounds,MINERVA,WikiPathways,OpenTargets_reactome,OpenTargets_go,StringDB_ppi
count,2421,2421,2421,2421,2329,2329,2421,2421,2329,2421,2421,2421
unique,1667,1,1675,1,1560,1566,1675,1596,1461,1447,1637,1667
top,TEKT4P2,HGNC,ENSG00000188681,Ensembl,"[{'disease_name': nan, 'HPO': nan, 'NCI': nan,...","[{'disease_name': nan, 'id': nan, 'source': nan}]","[{'chembl_id': nan, 'drugbank_id': nan, 'compo...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_id': nan, 'pathway_label': nan, 'pa...","[{'pathway_label': nan, 'pathway_id': nan}]","[{'go_id': nan, 'go_name': nan, 'go_type': nan}]","[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."
freq,256,2421,128,2421,128,128,128,128,128,128,128,128


### Generating RDF from table
Rows with no data will be skipped. The next cell counts the rows whose `identifier` is missing:

In [10]:
# Rows without an identifier are the ones reported as skipped.
nas = data[data["identifier"].isna()]
print(f"Skipped row count: {len(nas)}")
if not nas.empty:
    # A bare expression inside an `if` body is not rendered by the notebook,
    # so the sample is displayed explicitly. The sample size is capped because
    # DataFrame.sample raises ValueError when asked for more rows than exist.
    display(nas.sample(min(3, len(nas))))

Skipped row count: 0


In [11]:
# Write only the StringDB_ppi column to a CSV file named "ab" in the cwd.
# NOTE(review): looks like a debugging leftover — confirm whether this cell is still needed.
data["StringDB_ppi"].to_frame().to_csv("ab")

The function that generates an `rdflib` RDF graph, `generate_rdf()`, is called below with the arguments `df`, `base_uri`, `version_iri`, `orcid`, `author` and `metadata`:


In [12]:
# Build the graph from the table loaded earlier and its metadata.
g = rdf.generate_rdf(
    df=data,
    base_uri="https://biodatafuse.org/example/",
    version_iri="https://biodatafuse.org/example/test.owl",
    orcid="https://orcid.org/0000-0002-4166-7093",
    author="Javier Millan Acosta",
    metadata=metadata,
)

{'stringdb_link_to': 'TNFRSF11B', 'Ensembl': 'ENSP00000297350', 'score': 0.409}
{'stringdb_link_to': 'HSPA5', 'Ensembl': 'ENSP00000324173', 'score': 0.504}
{'stringdb_link_to': 'GAPDH', 'Ensembl': 'ENSP00000380070', 'score': 0.449}
{'stringdb_link_to': 'CD44', 'Ensembl': 'ENSP00000398632', 'score': 0.601}
{'stringdb_link_to': 'ENPP1', 'Ensembl': 'ENSP00000498074', 'score': 0.625}
{'stringdb_link_to': 'RUNX2', 'Ensembl': 'ENSP00000360493', 'score': 0.713}
{'stringdb_link_to': 'LIPE', 'Ensembl': 'ENSP00000244289', 'score': 0.592}
{'stringdb_link_to': 'CLPS', 'Ensembl': 'ENSP00000259938', 'score': 0.999}
{'stringdb_link_to': 'PLIN1', 'Ensembl': 'ENSP00000300055', 'score': 0.421}
{'stringdb_link_to': 'FASN', 'Ensembl': 'ENSP00000304592', 'score': 0.425}
{'stringdb_link_to': 'LEP', 'Ensembl': 'ENSP00000312652', 'score': 0.516}
{'stringdb_link_to': 'GCG', 'Ensembl': 'ENSP00000387662', 'score': 0.613}
{'stringdb_link_to': None, 'Ensembl': None, 'score': None}
{'stringdb_link_to': 'SLC7A11', '

KeyboardInterrupt: 

### Save and inspect the result

In [None]:
# Write the graph in Turtle syntax to test.ttl, relative to the cwd.
g.serialize(destination="test.ttl", format="turtle")

In [8]:
! grep  "NCIT_C18469" ../src/test.ttl