# Extraction of the "Signaling by EGFR" pathway (R-HSA-177929) from the BioPAX export (v65) of Reactome

#### Import libraries

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON, CSV
import IPython
import pandas as pd
import subprocess
import time

#### Define URL of SPARQL endpoint

In [None]:
# Query URL of the local SPARQL service for the "/reactome_v65" dataset;
# every query cell below builds its SPARQLWrapper from this value.
endpointURL_reactome_v65 = "http://localhost:3030/reactome_v65/query"
# NOTE(review): rdfFormat is not referenced by any cell visible in this notebook — confirm it is still needed.
rdfFormat = "turtle"

#### Define RDF prefixes

In [None]:
# Reactome release number; it is substituted into the "reactome:" namespace
# at the end of the prefix declarations below.
reactomeVersion = 65 
# SPARQL prologue that is prepended to every query sent to the endpoint.
# The lines starting with "#" inside the string are SPARQL comments; they keep
# track of the "reactome:" namespaces used by other BioPAX export files.
prefixes = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX dcterms: <http://purl.org/dc/terms/>

PREFIX chebi: <http://purl.obolibrary.org/obo/chebi/>
PREFIX chebidb: <http://purl.obolibrary.org/obo/CHEBI_>
PREFIX chebirel: <http://purl.obolibrary.org/obo/CHEBI#>
PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>

PREFIX bp3: <http://www.biopax.org/release/biopax-level3.owl#>

# Homo_sapiens-20170221.owl
#PREFIX reactome: <http://www.reactome.org/biopax/59/48887#> 
#
# Homo_sapiens-20210608.owl
#PREFIX reactome: <http://www.reactome.org/biopax/77/48887#>
#
# Homo_sapiens-20220614.owl
#PREFIX reactome: <http://www.reactome.org/biopax/81/48887#>
#
# Homo_sapiens-20221130.owl
#PREFIX reactome: <http://www.reactome.org/biopax/83/48887#>

PREFIX reactome: <http://www.reactome.org/biopax/{}/48887#>
""".format(reactomeVersion)

# Namespace IRI of the BioPAX level 3 ontology (same IRI as the "bp3:" prefix above).
# NOTE(review): biopaxURI is not referenced by any cell visible in this notebook — confirm it is still needed.
biopaxURI = "http://www.biopax.org/release/biopax-level3.owl#"

# Useful functions

In [None]:
def displaySparqlResults(results):
    '''
    Displays as HTML the result of a SPARQLWrapper query in a Jupyter notebook.

    Variable names and bound values are HTML-escaped, so that characters such as
    "<" or "&" occurring in an IRI or in a literal are shown verbatim instead of
    being interpreted as markup. A variable that is unbound in a solution is
    rendered as an empty cell.

        Parameters:
            results (dictionary): the result of a call to SPARQLWrapper.query().convert()
                with the JSON return format; results['head']['vars'] and
                results['results']['bindings'] are read
    '''
    import html  # local import: keeps the function self-contained

    variableNames = results['head']['vars']
    headerCells = '</th><th>'.join(html.escape(vName) for vName in variableNames)

    bodyRows = []
    for row in results["results"]["bindings"]:
        cells = [
            html.escape(row[vName]['value']) if vName in row else "&nbsp;"
            for vName in variableNames
        ]
        bodyRows.append('<td>{}</td>'.format('</td><td>'.join(cells)))

    tableCode = '<table><tr><th>{}</th></tr><tr>{}</tr></table>'.format(
        headerCells, '</tr><tr>'.join(bodyRows)
    )
    IPython.display.display(IPython.display.HTML(tableCode))

In [None]:
def getPathwayURIbyName(pathwayName):
    '''
    Looks up the pathway(s) whose bp3:name or bp3:displayName is exactly pathwayName.

    The name is escaped before being embedded in the query, so that a backslash,
    a double quote or a line break in it cannot terminate or corrupt the SPARQL
    string literal.

        Parameters:
            pathwayName (str): the pathway name to search for

        Returns:
            the converted JSON result of the query, binding ?pathwayURI and
            ?pathwayReactomeID (the id of a UnificationXref whose db is "Reactome")
    '''
    escapedName = (
        pathwayName.replace('\\', '\\\\')  # backslashes first, so later escapes are not doubled
        .replace('"', '\\"')
        .replace('\n', '\\n')
        .replace('\r', '\\r')
    )
    query="""
SELECT DISTINCT ?pathwayURI ?pathwayReactomeID
WHERE {
  ?pathwayURI bp3:name|bp3:displayName \"""" + escapedName + """\" .
  ?pathwayURI bp3:xref [ rdf:type bp3:UnificationXref ;
                      bp3:db "Reactome" ;
                      bp3:id ?pathwayReactomeID ] .
}
"""
    sparql = SPARQLWrapper(endpointURL_reactome_v65)
    sparql.setQuery(prefixes+query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    return results

# Launch SPARQL endpoint loading the standalone BioPAX export of Reactome (version 65)

In [None]:
# TODO: change path
# Start a Fuseki server that loads the Reactome BioPAX export and the BioPAX
# ontology, and serves them as the "/reactome_v65" dataset.
command = [
    '/home/cbeust/Softwares/JenaFuseki/apache-jena-fuseki-4.9.0/fuseki-server',
    '--file', '/home/cbeust/Projects/2024/BioPAX_Review/Compare_BioPAX_Files/ReactomeBioPAX/_00_Reactome_Data_v65/Homo_sapiens.owl',
    '--file', '/home/cbeust/Projects/2024/BioPAX_Review/BioPAX_Ontology/biopax-level3.owl',
    '/reactome_v65']
process = subprocess.Popen(command)
# Leave the server 60 seconds to start, but stop waiting (and say so) as soon as
# its process has exited: sleeping longer cannot help in that case.
for _ in range(60):
    if process.poll() is not None:
        print("Warning: fuseki-server exited during startup with return code {}".format(process.returncode))
        break
    time.sleep(1)

# Extraction of "Signaling by EGFR" (R-HSA-177929) pathway from Reactome BioPAX export v65

## 1 - Extract direct pathway components

In [None]:
# Query 1: direct components of the pathway, together with the property that
# links each of them to the pathway.
defaultPrefixValue = "http://www.reactome.org/biopax/{}/48887#".format(reactomeVersion)
pathwayIdent = "http://www.reactome.org/biopax/65/48887#Pathway2275"
# Write the pathway as a prefixed name when it belongs to the configured
# reactome namespace, otherwise as a full IRI between angle brackets.
if pathwayIdent.startswith(defaultPrefixValue):
    pathwayIdent = pathwayIdent.replace(defaultPrefixValue, "reactome" + ':')
if not(pathwayIdent.startswith("reactome" + ':')):
    pathwayIdent = "<" + pathwayIdent + ">"
# The start pathway comes from pathwayIdent, so the identifier is defined in one place only.
query = """
SELECT DISTINCT ?pathwayStart ?pathway_component ?direct_component
WHERE {
  VALUES ?pathwayStart { """ + pathwayIdent + """ } # "Signaling by EGFR" in Reactome BioPAX v65 (Pathway165 in Reactome BioPAX v38)

  # Pathway components
  ?pathwayStart bp3:pathwayComponent ?direct_component .
  ?pathwayStart ?pathway_component ?direct_component .
}
"""
sparql = SPARQLWrapper(endpointURL_reactome_v65)
sparql.setQuery(prefixes+query)
# First run: JSON, displayed as a table in the notebook.
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
displaySparqlResults(results)

# Second run: CSV, saved for the final merge of all query results.
sparql.setReturnFormat(CSV)
results = sparql.query().convert()
with open("../Results/1_direct_pathway_compo.csv", "wb") as f:
    f.write(results)

## 2 - Description of direct pathway components of type BiochemicalReaction

In [None]:
# Query 2: for each direct component of type BiochemicalReaction, its display
# name, left/right participants, conversion direction and stoichiometry.
defaultPrefixValue = "http://www.reactome.org/biopax/{}/48887#".format(reactomeVersion)
pathwayIdent = "http://www.reactome.org/biopax/65/48887#Pathway2275"
# Write the pathway as a prefixed name when it belongs to the configured
# reactome namespace, otherwise as a full IRI between angle brackets.
if pathwayIdent.startswith(defaultPrefixValue):
    pathwayIdent = pathwayIdent.replace(defaultPrefixValue, "reactome" + ':')
if not(pathwayIdent.startswith("reactome" + ':')):
    pathwayIdent = "<" + pathwayIdent + ">"
# The start pathway comes from pathwayIdent, so the identifier is defined in one place only.
query = """
SELECT DISTINCT ?direct_component ?relations_pathway_compo_br ?linked_pathway_compo_br
WHERE {
  VALUES ?pathwayStart { """ + pathwayIdent + """ } # "Signaling by EGFR" in Reactome BioPAX v65 (Pathway165 in Reactome BioPAX v38)

  # Pathway components
  ?pathwayStart bp3:pathwayComponent ?direct_component .
  ?pathwayStart ?pathway_component ?direct_component .
  ?direct_component rdf:type bp3:BiochemicalReaction .
  VALUES ?relations_pathway_compo_br { bp3:displayName bp3:left bp3:right bp3:conversionDirection bp3:participantStoichiometry}
  ?direct_component ?relations_pathway_compo_br ?linked_pathway_compo_br .
}
"""
sparql = SPARQLWrapper(endpointURL_reactome_v65)
sparql.setQuery(prefixes+query)
# First run: JSON, displayed as a table in the notebook.
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
displaySparqlResults(results)

# Second run: CSV, saved for the final merge of all query results.
sparql.setReturnFormat(CSV)
results = sparql.query().convert()
with open("../Results/2_direct_pathway_compo_br.csv", "wb") as f:
    f.write(results)

## 3 - Description of direct pathway components of type Pathway

In [None]:
# Query 3: for each direct component of type Pathway, its display name,
# its own pathway components and its pathway steps.
defaultPrefixValue = "http://www.reactome.org/biopax/{}/48887#".format(reactomeVersion)
pathwayIdent = "http://www.reactome.org/biopax/65/48887#Pathway2275"
# Write the pathway as a prefixed name when it belongs to the configured
# reactome namespace, otherwise as a full IRI between angle brackets.
if pathwayIdent.startswith(defaultPrefixValue):
    pathwayIdent = pathwayIdent.replace(defaultPrefixValue, "reactome" + ':')
if not(pathwayIdent.startswith("reactome" + ':')):
    pathwayIdent = "<" + pathwayIdent + ">"
# The start pathway comes from pathwayIdent, so the identifier is defined in one place only.
query = """
SELECT DISTINCT ?direct_component ?relations_pathway_compo_path ?linked_pathway_compo_path
WHERE {
  VALUES ?pathwayStart { """ + pathwayIdent + """ } # "Signaling by EGFR" in Reactome BioPAX v65 (Pathway165 in Reactome BioPAX v38)

  # Pathway components
  ?pathwayStart bp3:pathwayComponent ?direct_component .
  ?pathwayStart ?pathway_component ?direct_component .
  ?direct_component rdf:type bp3:Pathway .
  VALUES ?relations_pathway_compo_path { bp3:displayName bp3:pathwayComponent bp3:pathwayOrder }
  ?direct_component ?relations_pathway_compo_path ?linked_pathway_compo_path .
}
"""
sparql = SPARQLWrapper(endpointURL_reactome_v65)
sparql.setQuery(prefixes+query)
# First run: JSON, displayed as a table in the notebook.
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
displaySparqlResults(results)

# Second run: CSV, saved for the final merge of all query results.
sparql.setReturnFormat(CSV)
results = sparql.query().convert()
with open("../Results/3_direct_pathway_compo_path.csv", "wb") as f:
    f.write(results)

## 4 - Extraction of direct pathway steps

In [None]:
# Query 4: pathway steps attached directly to the pathway, together with the
# property that links each of them to the pathway.
defaultPrefixValue = "http://www.reactome.org/biopax/{}/48887#".format(reactomeVersion)
pathwayIdent = "http://www.reactome.org/biopax/65/48887#Pathway2275"
# Write the pathway as a prefixed name when it belongs to the configured
# reactome namespace, otherwise as a full IRI between angle brackets.
if pathwayIdent.startswith(defaultPrefixValue):
    pathwayIdent = pathwayIdent.replace(defaultPrefixValue, "reactome" + ':')
if not(pathwayIdent.startswith("reactome" + ':')):
    pathwayIdent = "<" + pathwayIdent + ">"
# The start pathway comes from pathwayIdent, so the identifier is defined in one place only.
query = """
SELECT DISTINCT ?pathwayStart ?pathway_order ?direct_ps
WHERE {
  VALUES ?pathwayStart { """ + pathwayIdent + """ } # "Signaling by EGFR" in Reactome BioPAX v65 (Pathway165 in Reactome BioPAX v38)

  # DIRECT PATHWAY STEPS
  ?pathwayStart bp3:pathwayOrder ?direct_ps .
  ?pathwayStart ?pathway_order ?direct_ps .
  ?direct_ps rdf:type bp3:PathwayStep .
}
"""
sparql = SPARQLWrapper(endpointURL_reactome_v65)
sparql.setQuery(prefixes+query)
# First run: JSON, displayed as a table in the notebook.
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
displaySparqlResults(results)

# Second run: CSV, saved for the final merge of all query results.
sparql.setReturnFormat(CSV)
results = sparql.query().convert()
with open("../Results/4_direct_pathway_steps.csv", "wb") as f:
    f.write(results)

## 5 - Description of direct pathway steps 

In [None]:
# Query 5: for each direct pathway step, its step processes and its next steps.
defaultPrefixValue = "http://www.reactome.org/biopax/{}/48887#".format(reactomeVersion)
pathwayIdent = "http://www.reactome.org/biopax/65/48887#Pathway2275"
# Write the pathway as a prefixed name when it belongs to the configured
# reactome namespace, otherwise as a full IRI between angle brackets.
if pathwayIdent.startswith(defaultPrefixValue):
    pathwayIdent = pathwayIdent.replace(defaultPrefixValue, "reactome" + ':')
if not(pathwayIdent.startswith("reactome" + ':')):
    pathwayIdent = "<" + pathwayIdent + ">"
# The start pathway comes from pathwayIdent, so the identifier is defined in one place only.
query = """
SELECT DISTINCT ?direct_ps ?relations_of_interest ?linked_to_ps
WHERE {
  VALUES ?pathwayStart { """ + pathwayIdent + """ } # "Signaling by EGFR" in Reactome BioPAX v65 (Pathway165 in Reactome BioPAX v38)

  # DIRECT PATHWAY STEPS
  ?pathwayStart bp3:pathwayOrder ?direct_ps .
  ?pathwayStart ?pathway_order ?direct_ps .
  ?direct_ps rdf:type bp3:PathwayStep .

  # RELATIONS OF PATHWAY STEPS
  VALUES ?relations_of_interest { bp3:stepProcess bp3:nextStep }
  ?direct_ps ?relations_of_interest ?linked_to_ps .
}
"""
sparql = SPARQLWrapper(endpointURL_reactome_v65)
sparql.setQuery(prefixes+query)
# First run: JSON, displayed as a table in the notebook.
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
displaySparqlResults(results)

# Second run: CSV, saved for the final merge of all query results.
sparql.setReturnFormat(CSV)
results = sparql.query().convert()
with open("../Results/5_direct_pathway_steps_next_steps_and_step_process.csv", "wb") as f:
    f.write(results)

## 6 - Description of step processes linked to pathway steps

In [None]:
# Query 6: description of the resources reached from the direct pathway steps
# through bp3:stepProcess or bp3:nextStep.
defaultPrefixValue = "http://www.reactome.org/biopax/{}/48887#".format(reactomeVersion)
pathwayIdent = "http://www.reactome.org/biopax/65/48887#Pathway2275"
# Write the pathway as a prefixed name when it belongs to the configured
# reactome namespace, otherwise as a full IRI between angle brackets.
if pathwayIdent.startswith(defaultPrefixValue):
    pathwayIdent = pathwayIdent.replace(defaultPrefixValue, "reactome" + ':')
if not(pathwayIdent.startswith("reactome" + ':')):
    pathwayIdent = "<" + pathwayIdent + ">"
# The start pathway comes from pathwayIdent, so the identifier is defined in one place only.
query = """
SELECT DISTINCT ?linked_to_ps ?relations_of_interest_starting_from_step_process ?step_process_relations
WHERE {
  VALUES ?pathwayStart { """ + pathwayIdent + """ } # "Signaling by EGFR" in Reactome BioPAX v65 (Pathway165 in Reactome BioPAX v38)

  # DIRECT PATHWAY STEPS
  ?pathwayStart bp3:pathwayOrder ?direct_ps .
  ?pathwayStart ?pathway_order ?direct_ps .
  ?direct_ps rdf:type bp3:PathwayStep .

  # RELATIONS OF PATHWAY STEPS
  VALUES ?relations_of_interest { bp3:stepProcess bp3:nextStep }
  ?direct_ps ?relations_of_interest ?linked_to_ps .
  VALUES ?relations_of_interest_starting_from_step_process { bp3:left bp3:pathwayOrder bp3:pathwayComponent bp3:right bp3:controller bp3:controlled bp3:displayName bp3:conversionDirection bp3:participantStoichiometry }
  ?linked_to_ps ?relations_of_interest_starting_from_step_process ?step_process_relations .
}
"""
sparql = SPARQLWrapper(endpointURL_reactome_v65)
sparql.setQuery(prefixes+query)
# First run: JSON, displayed as a table in the notebook.
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
displaySparqlResults(results)

# Second run: CSV, saved for the final merge of all query results.
sparql.setReturnFormat(CSV)
results = sparql.query().convert()
with open("../Results/6_pathway_steps_steps_processes.csv", "wb") as f:
    f.write(results)

## 7 - Description of biochemical reactions linked to direct subpathways

In [None]:
# Query 7: description of the biochemical reactions reached from the direct
# components (pathways or biochemical reactions) of the pathway.
defaultPrefixValue = "http://www.reactome.org/biopax/{}/48887#".format(reactomeVersion)
pathwayIdent = "http://www.reactome.org/biopax/65/48887#Pathway2275"
# Write the pathway as a prefixed name when it belongs to the configured
# reactome namespace, otherwise as a full IRI between angle brackets.
if pathwayIdent.startswith(defaultPrefixValue):
    pathwayIdent = pathwayIdent.replace(defaultPrefixValue, "reactome" + ':')
if not(pathwayIdent.startswith("reactome" + ':')):
    pathwayIdent = "<" + pathwayIdent + ">"
# The start pathway comes from pathwayIdent, so the identifier is defined in one place only.
query = """
SELECT DISTINCT ?linked_to_direct_compo ?relations_br ?description_br
WHERE {
  VALUES ?pathwayStart { """ + pathwayIdent + """ } # "Signaling by EGFR" in Reactome BioPAX v65 (Pathway165 in Reactome BioPAX v38)
  VALUES ?pathway_compo_types { bp3:Pathway bp3:BiochemicalReaction }

  # DIRECT COMPONENTS
  ?pathwayStart bp3:pathwayComponent ?direct_component .
  ?pathwayStart ?pathway_component ?direct_component .
  ?direct_component rdf:type ?pathway_compo_types .

  # RELATIONS OF PATHWAY COMPONENTS
  VALUES ?relations_of_interest { bp3:left bp3:right bp3:displayName bp3:conversionDirection bp3:pathwayOrder bp3:pathwayComponent bp3:participantStoichiometry }
  ?direct_component ?relations_of_interest ?linked_to_direct_compo .

  # DESCRIPTION OF THE LINKED BIOCHEMICAL REACTIONS
  ?linked_to_direct_compo rdf:type bp3:BiochemicalReaction .
  VALUES ?relations_br { bp3:left bp3:right bp3:displayName bp3:conversionDirection bp3:participantStoichiometry }
  ?linked_to_direct_compo ?relations_br ?description_br .
}
"""
sparql = SPARQLWrapper(endpointURL_reactome_v65)
sparql.setQuery(prefixes+query)
# First run: JSON, displayed as a table in the notebook.
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
displaySparqlResults(results)

# Second run: CSV, saved for the final merge of all query results.
sparql.setReturnFormat(CSV)
results = sparql.query().convert()
with open("../Results/7_linked_to_br_of_direct_pathway.csv", "wb") as f:
    f.write(results)

## 8 - Description of step processes of pathway steps linked to subpathways

In [None]:
# Query 8: step processes of the pathway steps that belong to the direct
# components of the pathway, with the property linking step and process.
defaultPrefixValue = "http://www.reactome.org/biopax/{}/48887#".format(reactomeVersion)
pathwayIdent = "http://www.reactome.org/biopax/65/48887#Pathway2275"
# Prefixed name inside the configured reactome namespace, bracketed IRI otherwise.
if pathwayIdent.startswith(defaultPrefixValue):
    pathwayIdent = pathwayIdent.replace(defaultPrefixValue, "reactome:")
elif not pathwayIdent.startswith("reactome:"):
    pathwayIdent = "<{}>".format(pathwayIdent)

query = "".join([
    """
SELECT DISTINCT ?pathwayStep ?relation ?stepProcess
WHERE {
  VALUES ?pathwayRoot { """,
    pathwayIdent,
    """ } # Signaling by EGFR

  ?pathwayRoot bp3:pathwayComponent ?pathwayCompo .
  ?pathwayCompo bp3:pathwayOrder ?pathwayStep .
  ?pathwayStep bp3:stepProcess ?stepProcess .
  ?pathwayStep ?relation ?stepProcess .

}
""",
])

sparql = SPARQLWrapper(endpointURL_reactome_v65)
sparql.setQuery(prefixes + query)

# JSON answer: rendered as a table in the notebook.
sparql.setReturnFormat(JSON)
jsonAnswer = sparql.query()
results = jsonAnswer.convert()
displaySparqlResults(results)

# CSV answer: stored on disk for the final merge.
sparql.setReturnFormat(CSV)
csvAnswer = sparql.query()
results = csvAnswer.convert()
with open("../Results/8_processes_of_pathway_steps_linked_to_subpathways.csv", "wb") as outputFile:
    outputFile.write(results)

## 9 - Description of the step processes of pathway steps linked to direct subpathways

In [None]:
# Query 9: description (participants, stoichiometry, control relations and
# display name) of the step processes of the pathway steps that belong to the
# direct components of the pathway.
defaultPrefixValue = "http://www.reactome.org/biopax/{}/48887#".format(reactomeVersion)
pathwayIdent = "http://www.reactome.org/biopax/65/48887#Pathway2275"
# Prefixed name inside the configured reactome namespace, bracketed IRI otherwise.
if pathwayIdent.startswith(defaultPrefixValue):
    pathwayIdent = pathwayIdent.replace(defaultPrefixValue, "reactome:")
elif not pathwayIdent.startswith("reactome:"):
    pathwayIdent = "<{}>".format(pathwayIdent)

query = "".join([
    """
SELECT DISTINCT ?stepProcess ?properties_of_interest ?description_step_process
WHERE {
  VALUES ?pathwayRoot { """,
    pathwayIdent,
    """ } # Signaling by EGFR
  VALUES ?properties_of_interest { bp3:right bp3:left bp3:participantStoichiometry bp3:controlled bp3:controller bp3:displayName}

  ?pathwayRoot bp3:pathwayComponent ?pathwayCompo .
  ?pathwayCompo bp3:pathwayOrder ?pathwayStep .
  ?pathwayStep bp3:stepProcess ?stepProcess .
  ?stepProcess ?properties_of_interest ?description_step_process .

}
""",
])

sparql = SPARQLWrapper(endpointURL_reactome_v65)
sparql.setQuery(prefixes + query)

# JSON answer: rendered as a table in the notebook.
sparql.setReturnFormat(JSON)
jsonAnswer = sparql.query()
results = jsonAnswer.convert()
displaySparqlResults(results)

# CSV answer: stored on disk for the final merge.
sparql.setReturnFormat(CSV)
csvAnswer = sparql.query()
results = csvAnswer.convert()
with open("../Results/9_description_processes_pf_ps_linked_to_subpathways.csv", "wb") as outputFile:
    outputFile.write(results)

## 10 - Get next steps between pathway steps linked to direct subpathways

In [None]:
# Query 10: next steps of the pathway steps that belong to the direct
# components of the pathway, with the property linking the two steps.
defaultPrefixValue = "http://www.reactome.org/biopax/{}/48887#".format(reactomeVersion)
pathwayIdent = "http://www.reactome.org/biopax/65/48887#Pathway2275"
# Write the pathway as a prefixed name when it belongs to the configured
# reactome namespace, otherwise as a full IRI between angle brackets.
if pathwayIdent.startswith(defaultPrefixValue):
    pathwayIdent = pathwayIdent.replace(defaultPrefixValue, "reactome" + ':')
if not(pathwayIdent.startswith("reactome" + ':')):
    pathwayIdent = "<" + pathwayIdent + ">"
# No VALUES list of properties here: this query only follows bp3:nextStep, and an
# unjoined VALUES clause would merely multiply the solutions before DISTINCT.
query = """
SELECT DISTINCT ?pathwayStep ?property ?nextStep
WHERE {
  VALUES ?pathwayRoot { """ + pathwayIdent + """ } # Signaling by EGFR

  ?pathwayRoot bp3:pathwayComponent ?pathwayCompo .
  ?pathwayCompo bp3:pathwayOrder ?pathwayStep .
  ?pathwayStep bp3:nextStep ?nextStep .
  ?pathwayStep ?property ?nextStep .

}
"""

sparql = SPARQLWrapper(endpointURL_reactome_v65)
sparql.setQuery(prefixes+query)
# First run: JSON, displayed as a table in the notebook.
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
displaySparqlResults(results)

# Second run: CSV, saved for the final merge of all query results.
sparql.setReturnFormat(CSV)
results = sparql.query().convert()
with open("../Results/10_next_steps_between_PS_linked_to_subpathways.csv", "wb") as f:
    f.write(results)

# Merge query results in a single file

In [None]:
import re  # local import: only this cell needs it

# CSV exports of queries 1 to 10, in the order in which they are stacked in the merged file.
resultFiles = [
    "../Results/1_direct_pathway_compo.csv",
    "../Results/2_direct_pathway_compo_br.csv",
    "../Results/3_direct_pathway_compo_path.csv",
    "../Results/4_direct_pathway_steps.csv",
    "../Results/5_direct_pathway_steps_next_steps_and_step_process.csv",
    "../Results/6_pathway_steps_steps_processes.csv",
    "../Results/7_linked_to_br_of_direct_pathway.csv",
    "../Results/8_processes_of_pathway_steps_linked_to_subpathways.csv",
    "../Results/9_description_processes_pf_ps_linked_to_subpathways.csv",
    "../Results/10_next_steps_between_PS_linked_to_subpathways.csv",
]

# Each file is read without a header and its first row (the SPARQL variable
# names) is then dropped, so that all the files line up by column position.
queryFrames = [
    pd.read_csv(path, header=None, sep=",").iloc[1:].reset_index(drop=True)
    for path in resultFiles
]
# Keep the per-query names available to any later cell.
q1, q2, q3, q4, q5, q6, q7, q8, q9, q10 = queryFrames

concat_df = pd.concat(queryFrames, ignore_index=True)

# Abbreviate the namespaces. The replacement is regex-based so that it applies
# inside cell values; the IRIs are escaped so that "." only matches a literal dot.
namespaceAbbreviations = {
    "http://www.reactome.org/biopax/65/48887#": "reactome:",
    "http://www.biopax.org/release/biopax-level3.owl#": "bp3:",
}
concat_df = concat_df.replace(
    {re.escape(namespace): abbreviation for namespace, abbreviation in namespaceAbbreviations.items()},
    regex=True,
)
print(concat_df)
concat_df.to_csv("../Results/Final_pathway_egf_completed.tsv", sep="\t", header=False, index=False)