## An example of how to query OMA RDF data in two variants: 
1. from a local RDF dump
     - note: the code only uses a sample dataset, to use the full dump we would need to setup some sort of database copy on the cluster
2. from the remote OMA SPARQL endpoint
    - in this variant, you can experiment with any UniProt entry you want

### The sample data contains info for a few example UniProt URIs
(to be used just for demo purposes)

In [3]:
# NOTE: a list on entries can also be used in the SPARQL queries, e.g.

# VALUES ?protein2_uniprot {<URI1> <URI2> <URI3>...}

uniprot_entry = "<http://purl.uniprot.org/uniprot/A0A832T5P4>"

In [4]:
# The query that can be used both for local RDF or for remote SPARQL endpoint
# get HOG by UniProt ID(s)
query_OMA_HOGs = """
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX orth: <http://purl.org/net/orth#>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX lscr: <http://purl.org/lscr#>

SELECT DISTINCT *  {
	VALUES ?protein2_uniprot { """ + uniprot_entry + "}" + """
    ?cluster a orth:OrthologsCluster.
    ?cluster orth:hasHomologousMember ?node1.
    ?cluster orth:hasHomologousMember ?node2.
	?node1 orth:hasHomologousMember ?protein1.
	?node2 orth:hasHomologousMember ?protein2.
	?protein2  lscr:xrefUniprot ?protein2_uniprot. 
    FILTER(?node1 != ?node2)
}

"""

### Option 1. Get HOGs from local RDF dump

Prerequisites: a folder with the relevant .ttl files

In [5]:
pip install rdflib


Note: you may need to restart the kernel to use updated packages.


In [6]:
from rdflib import Graph

#note: since full dump is large, consider using it only on cluster
# it also seems like rdflib doesn't really scale...so it's better to use the SPARQL endpoint
# if we need, we can setup a copy of the public one

# https://stackoverflow.com/questions/5678623/how-to-parse-big-datasets-using-rdflib
path_to_OMA_dir = "oma_sample"

# get all ttl files in folder
from os import listdir
from os.path import isfile, join
ttl_files_list = [join(path_to_OMA_dir, f) for f in listdir(path_to_OMA_dir) if isfile(join(path_to_OMA_dir, f)) and f.endswith(".ttl")]

g = Graph()

for ttl_file in ttl_files_list:
    print(ttl_file)
    g.parse(ttl_file)

oma_sample/orthOntology_v2_forOMA.ttl
oma_sample/lscr.ttl
oma_sample/sample_xref.ttl
oma_sample/sample_oma_hogs.ttl


In [7]:
# number of triples parsed
print(len(g))

1315


In [8]:
import pandas as pds
qres = g.query(query_OMA_HOGs)

print(len(qres))

# print the variables in the header

print(qres.vars)
# the actual result is stored in the "bindings"
#print(qres.bindings)
    
# the result can be converted to e.g. a Pandas dataframe for later use
df = pds.DataFrame(qres.bindings)
print(df.head)

2
[rdflib.term.Variable('protein1'), rdflib.term.Variable('cluster'), rdflib.term.Variable('protein2_uniprot'), rdflib.term.Variable('node1'), rdflib.term.Variable('node2'), rdflib.term.Variable('protein2')]
<bound method NDFrame.head of                                              cluster  \
0  https://omabrowser.org/oma/hog/resolve/HOG:C00...   
1  https://omabrowser.org/oma/hog/resolve/HOG:C00...   

                                               node1  \
0  https://omabrowser.org/oma/hog/resolve/HOG:C00...   
1  https://omabrowser.org/oma/hog/resolve/HOG:C00...   

                                               node2  \
0  https://omabrowser.org/oma/hog/resolve/HOG:C00...   
1  https://omabrowser.org/oma/hog/resolve/HOG:C00...   

                                     protein1  \
0  https://omabrowser.org/oma/info/PYRAB00578   
1  https://omabrowser.org/oma/info/PYRYC00473   

                                     protein2  \
0  https://omabrowser.org/oma/info/PYRHO01642   
1  https:

### Option 2. Get HOGs from OMA SPARQL endpoint

Prerequisites: the address of the OMA SPARQL endpoint

See extended examples in F1000 [tutorial](https://github.com/biosoda/tutorial_orthology/blob/master/Orthology_SPARQL_Notebook.ipynb)

In [9]:
# first we need to define the SPARQL endpoints of each source, to use later in the protocols
import sys
!{sys.executable} -m pip install SPARQLWrapper
from SPARQLWrapper import SPARQLWrapper, JSON
import sys, os, time
import pandas as pd

# always display full column results (don't truncate output)
pd.set_option('display.max_colwidth', None)

# the endpoints must be defined as wrappers for executing SPARQL queries
sparql_OMA = SPARQLWrapper("https://sparql.omabrowser.org/sparql")


# function to print in a table results of a SPARQL query
def pretty_print(results):
    
    # how to transform SPARQL results into Pandas dataframes
    
    # get header (column names) from results
    header = results["results"]["bindings"][0].keys()

    # display table of results:
    table = []
    
    # the SPARQL JSON results to the query are available in the "results", "bindings" entry:
    for entry in results["results"]["bindings"]:
        # append entries from the results to a regular Python list of rows, which we can then transform to a Pandas DF
        row = [entry[column]["value"] if entry.get(column, None) != None else None for column in header]
        table.append(row)
    df = pd.DataFrame(table, columns=list(header))
    return df



In [10]:
# the following variable defines how many entries to print from the results:
# e.g. here, we only print 3 example entries (leave empty to show all results)

NUM_EXAMPLES=3

In [11]:
sparql_OMA.setQuery(query_OMA_HOGs)
sparql_OMA.setReturnFormat(JSON)

results_OMA = sparql_OMA.query().convert()

pretty_print(results_OMA).head(NUM_EXAMPLES)

Unnamed: 0,protein2_uniprot,cluster,node1,node2,protein1,protein2
0,http://purl.uniprot.org/uniprot/A0A832T5P4,https://omabrowser.org/oma/hog/resolve/HOG:C0017089.5f_2260,https://omabrowser.org/oma/hog/resolve/HOG:C0017089.5f_186497,https://omabrowser.org/oma/hog/resolve/HOG:C0017089.5f_70601,https://omabrowser.org/oma/info/PYRFU01582,https://omabrowser.org/oma/info/PYRHO01642
1,http://purl.uniprot.org/uniprot/A0A832T5P4,https://omabrowser.org/oma/hog/resolve/HOG:C0017089.5f_2260,https://omabrowser.org/oma/hog/resolve/HOG:C0017089.5f_272844,https://omabrowser.org/oma/hog/resolve/HOG:C0017089.5f_70601,https://omabrowser.org/oma/info/PYRAB00578,https://omabrowser.org/oma/info/PYRHO01642
2,http://purl.uniprot.org/uniprot/A0A832T5P4,https://omabrowser.org/oma/hog/resolve/HOG:C0017089.5f_2260,https://omabrowser.org/oma/hog/resolve/HOG:C0017089.5f_342949,https://omabrowser.org/oma/hog/resolve/HOG:C0017089.5f_70601,https://omabrowser.org/oma/info/PYRSN00170,https://omabrowser.org/oma/info/PYRHO01642


In [12]:
# a little bit more info than in the sample can be found in the full dump OR 
# in the SPARQL endpoint

# these queries can also be copy-pasted and executed at https://sparql.omabrowser.org/lode/sparql

query_OMA_HOGs_extended = """
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX orth: <http://purl.org/net/orth#>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX lscr: <http://purl.org/lscr#>

SELECT DISTINCT ?root_hog ?species_name ?protein1_uniprot (?protein1 as 
           ?protein1_OMA) ?taxLevel  {
	VALUES ?protein2_uniprot { """ + uniprot_entry + "}" + """
	?root_hog obo:CDAO_0000148 ?hog_cluster. #has_Root
	?hog_cluster orth:hasHomologousMember* ?node1.
	?node1 a orth:OrthologsCluster.
	?node1 orth:hasTaxonomicRange ?level.
	?level orth:taxRange ?taxLevel .
	?node1 orth:hasHomologousMember* ?protein1.
	?hog_cluster orth:hasHomologousMember* ?protein2.
	?protein1 a orth:Protein.
    	?protein1 orth:organism ?organism.
    	?organism obo:RO_0002162 ?taxon.
    	?taxon up:scientificName ?species_name.
	OPTIONAL {?protein1 lscr:xrefUniprot ?protein1_uniprot}. 
	?protein2 a orth:Protein. 
	?protein2  lscr:xrefUniprot ?protein2_uniprot. 
} ORDER BY ?taxLevel

"""
print(query_OMA_HOGs_extended)


PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX orth: <http://purl.org/net/orth#>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX lscr: <http://purl.org/lscr#>

SELECT DISTINCT ?root_hog ?species_name ?protein1_uniprot (?protein1 as 
           ?protein1_OMA) ?taxLevel  {
	VALUES ?protein2_uniprot { <http://purl.uniprot.org/uniprot/A0A832T5P4>}
	?root_hog obo:CDAO_0000148 ?hog_cluster. #has_Root
	?hog_cluster orth:hasHomologousMember* ?node1.
	?node1 a orth:OrthologsCluster.
	?node1 orth:hasTaxonomicRange ?level.
	?level orth:taxRange ?taxLevel .
	?node1 orth:hasHomologousMember* ?protein1.
	?hog_cluster orth:hasHomologousMember* ?protein2.
	?protein1 a orth:Protein.
    	?protein1 orth:organism ?organism.
    	?organism obo:RO_0002162 ?taxon.
    	?taxon up:scientificName ?species_name.
	OPTIONAL {?protein1 lscr:xrefUniprot ?protein1_uniprot}. 
	?protein2 a orth:Protein. 
	?protein2  lscr:xrefUniprot ?protein2_uniprot. 
} ORDER

In [13]:
sparql_OMA.setQuery(query_OMA_HOGs_extended)
sparql_OMA.setReturnFormat(JSON)

results_OMA = sparql_OMA.query().convert()

# print some sample results...
pretty_print(results_OMA).head(NUM_EXAMPLES)

Unnamed: 0,root_hog,species_name,protein1_uniprot,protein1_OMA,taxLevel
0,https://omabrowser.org/oma/hog/resolve/HOG:C0017089_28890#ROOT_HOG,Archaeoglobus profundus (strain DSM 5631 / JCM 9629 / NBRC 100127 / Av18),http://purl.uniprot.org/uniprot/D2RDS2,https://omabrowser.org/oma/info/ARCPA01188,Archaeoglobaceae
1,https://omabrowser.org/oma/hog/resolve/HOG:C0017089_28890#ROOT_HOG,Archaeoglobus veneficus (strain DSM 11195 / SNP6),http://purl.uniprot.org/uniprot/F2KNH1,https://omabrowser.org/oma/info/ARCVS01308,Archaeoglobaceae
2,https://omabrowser.org/oma/hog/resolve/HOG:C0017089_28890#ROOT_HOG,Ferroglobus placidus (strain DSM 10642 / AEDII12DO),http://purl.uniprot.org/uniprot/D3S1E4,https://omabrowser.org/oma/info/FERPA02200,Archaeoglobaceae


In [14]:
# total number of orthologs for this gene (ordered by tax level)
print(len(results_OMA["results"]["bindings"]))

1785


In [None]:
#load string data

#map OMA to string IDS

#check interaction in the string dataset 

