Skip to content

Queries

LuisFranciscoHS edited this page May 18, 2019 · 3 revisions

Below are the Cypher queries used to obtain the numbers shown in the summary statistics. They can also be used to get the protein-pathway mapping used in PathwayMatcher.

To run these queries it is required to have Neo4j installed and running with the Reactome Graph database loaded.

Proteins

  • Count UniProt accessions related to human PhysicalEntity objects:
// Number of distinct UniProt accessions referenced by human physical entities.
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
RETURN count(DISTINCT ref.identifier) AS protein
  • Count number of human PhysicalEntity objects related to a UniProt accession distinguishing between isoform sequences:
// One row per distinct (accession, isoform identifier) pair referenced by a human
// physical entity; the number of rows returned is the figure of interest.
MATCH (:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(re:ReferenceEntity {databaseName: 'UniProt'})
RETURN DISTINCT
    re.identifier,
    re.variantIdentifier
  • Number of proteins with a specific isoform:
// Distinct (accession, isoform identifier) pairs, limited to reference entities
// that actually carry a variantIdentifier.
MATCH (:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(re:ReferenceEntity {databaseName: 'UniProt'})
WHERE re.variantIdentifier IS NOT NULL
RETURN DISTINCT
    re.identifier,
    re.variantIdentifier
  • Number of proteins with more than one isoform:
// Accessions annotated with two or more distinct isoform identifiers.
MATCH (:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
WHERE ref.variantIdentifier IS NOT NULL
// Aggregating by accession already yields exactly one row per accession.
WITH ref.identifier AS accession, collect(DISTINCT ref.variantIdentifier) AS isoforms
WHERE size(isoforms) > 1
RETURN accession, isoforms
  • Number of reactions per protein:
// Per UniProt accession, the number of distinct human reactions that reach any of
// its physical entities through one or more participation/containment hops.
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
MATCH (r:Reaction {speciesName: 'Homo sapiens'})-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(entity)
RETURN
    ref.identifier AS protein,
    count(DISTINCT r.stId) AS reactionCount
  • Number of pathways for each protein:
// Per UniProt accession, the number of distinct human pathways (at any level of
// the hasEvent hierarchy) containing a human reaction that reaches one of its
// physical entities.
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
MATCH (p:Pathway {speciesName: 'Homo sapiens'})-[:hasEvent*]->(:Reaction {speciesName: 'Homo sapiens'})-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(entity)
RETURN
    ref.identifier AS protein,
    count(DISTINCT p.stId) AS pathwayCount
  • Number of reactions and pathways for each protein:
// Pathway and reaction counts per UniProt accession. The pathway/reaction match is
// optional, so accessions without any annotated reaction are reported with zeros.
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
OPTIONAL MATCH (p:Pathway)-[:hasEvent*]->(r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(entity)
WHERE p.speciesName = 'Homo sapiens' AND r.speciesName = 'Homo sapiens'
RETURN
    ref.identifier AS protein,
    count(DISTINCT p.stId) AS pathwayCount,
    count(DISTINCT r.stId) AS reactionCount
  • Statistics of reactions for all proteins:
// Minimum, average and maximum number of distinct human reactions per UniProt
// accession, over the accessions that take part in at least one reaction.
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
MATCH (r:Reaction {speciesName: 'Homo sapiens'})-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(entity)
WITH ref.identifier AS protein, count(DISTINCT r.stId) AS reactionCount
RETURN
    min(reactionCount) AS minReactionCount,
    avg(reactionCount) AS avgReactionCount,
    max(reactionCount) AS maxReactionCount
  • Statistics of pathways for all proteins:
// Minimum, average and maximum number of distinct human pathways per UniProt
// accession. The pathway match is optional, so accessions without pathways
// contribute a count of 0.
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
OPTIONAL MATCH (p:Pathway)-[:hasEvent*]->(r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(entity)
WHERE p.speciesName = 'Homo sapiens' AND r.speciesName = 'Homo sapiens'
WITH ref.identifier AS protein, count(DISTINCT p.stId) AS pathwayCount
RETURN
    min(pathwayCount) AS minPathwayCount,
    avg(pathwayCount) AS avgPathwayCount,
    max(pathwayCount) AS maxPathwayCount
  • Number of reactions and pathways for proteins with at least one ptm in any of its proteoforms, and that participate in at least one reaction and pathway:
// Step 1: reference entities with at least one human physical entity that carries
// a PSI-MOD annotated modification.
MATCH (ref:ReferenceEntity {databaseName: 'UniProt'})<-[:referenceEntity]-(:PhysicalEntity {speciesName: 'Homo sapiens'})-[:hasModifiedResidue]->(:TranslationalModification)-[:psiMod]->(:PsiMod)
WITH DISTINCT ref
// Step 2: count reactions and pathways over ALL human physical entities of those
// reference entities (modified or not); only the ten highest are kept.
MATCH (p:Pathway {speciesName: 'Homo sapiens'})-[:hasEvent*]->(r:Reaction {speciesName: 'Homo sapiens'})-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref)
RETURN
    ref.identifier AS protein,
    ref.displayName AS name,
    ref.description AS description,
    count(DISTINCT r.stId) AS reactionCount,
    count(DISTINCT p.stId) AS pathwayCount
ORDER BY pathwayCount DESC, reactionCount DESC
LIMIT 10

Proteoforms

  • PTM sets per PhysicalEntity, including proteins without modifications (include empty sets)
// One row per (accession, physical entity): the pathways the entity takes part in
// and the coordinates of its modified residues. Both matches are optional, so
// entities without pathways or without modifications still appear, with empty lists.
MATCH (pe:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
OPTIONAL MATCH (p:Pathway)-[:hasEvent*]->(r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe)
WHERE p.speciesName = 'Homo sapiens' AND r.speciesName = 'Homo sapiens'
WITH ref.identifier AS protein, pe, collect(DISTINCT p.stId) AS pathwaySet
OPTIONAL MATCH (pe)-[:hasModifiedResidue]->(residue)
RETURN
    protein,
    pe.stId,
    pathwaySet,
    collect(residue.coordinate) AS ptmSet
ORDER BY protein
  • Count PTMSets for each protein:
// Minimum, average and maximum number of PTM sets per UniProt accession, where
// every physical entity contributes one set (the coordinates of its modified
// residues, possibly empty).
// NOTE(review): identical sets coming from different physical entities are counted
// separately - confirm that this, rather than the number of distinct sets, is intended.
MATCH (pe:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
OPTIONAL MATCH (pe)-[:hasModifiedResidue]->(residue)
WITH ref.identifier AS protein, pe, collect(residue.coordinate) AS ptmSet
WITH protein, count(ptmSet) AS ptmSetCount
RETURN min(ptmSetCount), avg(ptmSetCount), max(ptmSetCount)
  • Get all proteoforms:
// Every proteoform, i.e. (isoform identifier or accession, ordered PTM list),
// together with the physical entities that share it.
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
WITH DISTINCT entity, ref
OPTIONAL MATCH (entity)-[:hasModifiedResidue]->(residue:TranslationalModification)-[:psiMod]->(psi:PsiMod)
WITH DISTINCT entity, ref, residue.coordinate AS coordinate, psi.identifier AS type
ORDER BY type, coordinate
// A null type makes the concatenation null and collect() skips nulls, so an
// unmodified entity ends up with an empty PTM list.
WITH entity, ref, collect(type + ':' + coalesce(coordinate, 'null')) AS ptms
RETURN
    coalesce(ref.variantIdentifier, ref.identifier) AS protein,
    collect(DISTINCT entity.stId) AS equivalentPe,
    ptms
  • Get all proteoforms with at least one post translational modification: 3137 (22.44% of the proteoforms)
// Distinct proteoforms (isoform identifier or accession, ordered PTM list) that
// carry at least one PSI-MOD annotated modification.
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
WITH DISTINCT entity, ref
OPTIONAL MATCH (entity)-[:hasModifiedResidue]->(residue:TranslationalModification)-[:psiMod]->(psi:PsiMod)
WITH DISTINCT
    entity.stId AS physicalEntity,
    ref.identifier AS protein,
    ref.variantIdentifier AS isoform,
    residue.coordinate AS coordinate,
    psi.identifier AS type
ORDER BY type, coordinate
WITH
    physicalEntity,
    protein,
    coalesce(isoform, protein) AS isoform,
    collect(type + ':' + coalesce(coordinate, 'null')) AS ptms
// Unmodified entities collect an empty list and are dropped here.
WHERE size(ptms) > 0
RETURN DISTINCT isoform, ptms
ORDER BY isoform, ptms
  • Number of reactions of each proteoform:
// Number of distinct human reactions per proteoform (isoform identifier or
// accession, ordered PTM list). Proteoforms without reactions are reported with 0.
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
WITH DISTINCT entity, ref
OPTIONAL MATCH (entity)-[:hasModifiedResidue]->(residue:TranslationalModification)-[:psiMod]->(psi:PsiMod)
WITH DISTINCT entity, ref, residue.coordinate AS coordinate, psi.identifier AS type
ORDER BY type, coordinate
// Unmodified entities get an empty list: null type -> null string -> skipped by collect().
WITH entity, ref, collect(coalesce(coordinate, 'null') + ':' + type) AS ptms
OPTIONAL MATCH (r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(entity)
WHERE r.speciesName = 'Homo sapiens'
RETURN
    coalesce(ref.variantIdentifier, ref.identifier) AS protein,
    ptms,
    count(DISTINCT r.stId) AS reactionCount
ORDER BY protein
  • Statistics of reactions of each proteoform:
// Minimum, average and maximum number of distinct human reactions per proteoform
// (isoform identifier or accession, ordered PTM list), proteoforms without
// reactions included.
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
WITH DISTINCT entity, ref
OPTIONAL MATCH (entity)-[:hasModifiedResidue]->(residue:TranslationalModification)-[:psiMod]->(psi:PsiMod)
WITH DISTINCT entity, ref, residue.coordinate AS coordinate, psi.identifier AS type
ORDER BY type, coordinate
WITH entity, ref, collect(coalesce(coordinate, 'null') + ':' + type) AS ptms
OPTIONAL MATCH (r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(entity)
WHERE r.speciesName = 'Homo sapiens'
WITH
    coalesce(ref.variantIdentifier, ref.identifier) AS protein,
    ptms,
    count(DISTINCT r.stId) AS reactionCount
RETURN
    min(reactionCount) AS minReactionCount,
    avg(reactionCount) AS avgReactionCount,
    max(reactionCount) AS maxReactionCount
  • Number of pathways for each proteoform:
// Number of distinct human pathways per proteoform (isoform identifier or
// accession, ordered PTM list); only proteoforms that reach at least one pathway
// are returned.
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
WITH DISTINCT entity, ref
OPTIONAL MATCH (entity)-[:hasModifiedResidue]->(residue:TranslationalModification)-[:psiMod]->(psi:PsiMod)
WITH DISTINCT entity, ref, residue.coordinate AS coordinate, psi.identifier AS type
ORDER BY type, coordinate
WITH entity, ref, collect(coalesce(coordinate, 'null') + ':' + type) AS ptms
MATCH (p:Pathway {speciesName: 'Homo sapiens'})-[:hasEvent*]->(:Reaction {speciesName: 'Homo sapiens'})-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(entity)
RETURN
    coalesce(ref.variantIdentifier, ref.identifier) AS protein,
    ptms,
    count(DISTINCT p.stId) AS pathwayCount
  • Statistics of pathways for each proteoform:
// Minimum, average and maximum number of distinct human pathways per proteoform
// (isoform identifier or accession, ordered PTM list), proteoforms without
// pathways included.
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
WITH DISTINCT entity, ref
OPTIONAL MATCH (entity)-[:hasModifiedResidue]->(residue:TranslationalModification)-[:psiMod]->(psi:PsiMod)
WITH DISTINCT entity, ref, residue.coordinate AS coordinate, psi.identifier AS type
ORDER BY type, coordinate
WITH entity, ref, collect(coalesce(coordinate, 'null') + ':' + type) AS ptms
OPTIONAL MATCH (p:Pathway)-[:hasEvent*]->(r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(entity)
WHERE p.speciesName = 'Homo sapiens' AND r.speciesName = 'Homo sapiens'
WITH
    coalesce(ref.variantIdentifier, ref.identifier) AS protein,
    ptms,
    count(DISTINCT p.stId) AS pathwayCount
RETURN
    min(pathwayCount) AS minPathwayCount,
    avg(pathwayCount) AS avgPathwayCount,
    max(pathwayCount) AS maxPathwayCount
  • Note: The statistics queries cover all the proteoforms. To restrict them to the case of "proteins with at least one reaction and pathway annotated", remove the "OPTIONAL" keyword from "OPTIONAL MATCH (r:Reaction)" or "OPTIONAL MATCH (p:Pathway)".

  • Note: Finds all the pathways using the reactions, no matter the level in the pathway hierarchy.

  • Top proteins participating in most reactions:

// The ten UniProt reference entries (accession, name, description) that take part
// in the largest number of distinct human reactions.
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
MATCH (r:Reaction {speciesName: 'Homo sapiens'})-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(entity)
RETURN
    ref.identifier AS protein,
    ref.displayName AS name,
    ref.description AS description,
    count(DISTINCT r.stId) AS reactionCount
ORDER BY reactionCount DESC
LIMIT 10
  • Top proteins participating in most pathways and their containing reactions:
// The ten UniProt reference entries (accession, name, description) with the most
// distinct human pathways; ties are broken by reaction count, accession and name.
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
OPTIONAL MATCH (p:Pathway)-[:hasEvent*]->(r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(entity)
WHERE p.speciesName = 'Homo sapiens' AND r.speciesName = 'Homo sapiens'
RETURN
    ref.identifier AS protein,
    ref.displayName AS name,
    ref.description AS description,
    count(DISTINCT r.stId) AS reactionCount,
    count(DISTINCT p.stId) AS pathwayCount
ORDER BY pathwayCount DESC, reactionCount DESC, protein, name
LIMIT 10
  • Top proteoforms participating in most pathways and their containing reactions:
// The ten proteoforms (isoform identifier or accession, ordered PTM list) with the
// most distinct human pathways, with their physical entities and reaction counts.
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
WITH DISTINCT entity, ref
OPTIONAL MATCH (entity)-[:hasModifiedResidue]->(residue:TranslationalModification)-[:psiMod]->(psi:PsiMod)
WITH DISTINCT entity, ref, residue.coordinate AS coordinate, psi.identifier AS type
ORDER BY type, coordinate
WITH entity, ref, collect(coalesce(coordinate, 'null') + ':' + type) AS ptms
MATCH (p:Pathway {speciesName: 'Homo sapiens'})-[:hasEvent*]->(r:Reaction {speciesName: 'Homo sapiens'})-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(entity)
RETURN
    coalesce(ref.variantIdentifier, ref.identifier) AS protein,
    ref.displayName AS name,
    ref.description AS description,
    ptms,
    collect(DISTINCT entity.stId) AS peSet,
    count(DISTINCT r.stId) AS reactionCount,
    count(DISTINCT p.stId) AS pathwayCount
ORDER BY pathwayCount DESC, reactionCount DESC, protein, name
LIMIT 10
  • Number of reactions and pathways for proteoforms with at least one ptm, and that participate in at least one reaction and pathway. Note that proteoforms also differentiate between isoforms.
// Same ranking as for all proteoforms, but the modification match is mandatory, so
// only proteoforms with at least one PSI-MOD annotated modification are considered.
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
WITH DISTINCT entity, ref
MATCH (entity)-[:hasModifiedResidue]->(residue:TranslationalModification)-[:psiMod]->(psi:PsiMod)
WITH DISTINCT entity, ref, residue.coordinate AS coordinate, psi.identifier AS type
ORDER BY type, coordinate
WITH entity, ref, collect(type + ':' + coalesce(coordinate, 'null')) AS ptms
MATCH (p:Pathway {speciesName: 'Homo sapiens'})-[:hasEvent*]->(r:Reaction {speciesName: 'Homo sapiens'})-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(entity)
RETURN
    coalesce(ref.variantIdentifier, ref.identifier) AS protein,
    ref.displayName AS name,
    ref.description AS description,
    ptms,
    collect(DISTINCT entity.stId) AS peSet,
    count(DISTINCT r.stId) AS reactionCount,
    count(DISTINCT p.stId) AS pathwayCount
ORDER BY pathwayCount DESC, reactionCount DESC, protein, name
LIMIT 10
  • Number of proteoforms for each protein:
// Number of distinct proteoforms (isoform identifier or accession, ordered PTM
// list) per UniProt accession.
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
WITH DISTINCT entity, ref
OPTIONAL MATCH (entity)-[:hasModifiedResidue]->(residue:TranslationalModification)-[:psiMod]->(psi:PsiMod)
WITH DISTINCT entity, ref, residue.coordinate AS coordinate, psi.identifier AS type
ORDER BY type, coordinate
WITH entity, ref, collect(coalesce(coordinate, 'null') + ':' + type) AS ptms
// Physical entities that share isoform and PTM list collapse into one proteoform.
WITH DISTINCT
    ref.identifier AS accession,
    {isoform: coalesce(ref.variantIdentifier, ref.identifier), ptms: ptms} AS proteoform
RETURN accession, count(proteoform) AS num_proteoforms
  • Number of proteins with more than one proteoform:
// UniProt accessions that appear with more than one distinct proteoform (isoform
// identifier or accession, ordered PTM list), with the proteoform count.
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
WITH DISTINCT entity, ref
OPTIONAL MATCH (entity)-[:hasModifiedResidue]->(residue:TranslationalModification)-[:psiMod]->(psi:PsiMod)
WITH DISTINCT entity, ref, residue.coordinate AS coordinate, psi.identifier AS type
ORDER BY type, coordinate
WITH entity, ref, collect(coalesce(coordinate, 'null') + ':' + type) AS ptms
WITH DISTINCT
    ref.identifier AS accession,
    {isoform: coalesce(ref.variantIdentifier, ref.identifier), ptms: ptms} AS proteoform
WITH accession, count(proteoform) AS num_proteoforms
WHERE num_proteoforms > 1
RETURN accession, num_proteoforms

Post translational modifications

  • Prevalence of the different PTM annotations in Reactome. PTM labels are extracted from the Reactome database and the number of proteins annotated with the PTM is displayed for each label. If a protein is carrying multiple instances of the PTM, the PTM is counted only once:
// The 20 most frequent PTM labels, with the number of human proteins annotated
// with each label. A protein carrying several instances of a PTM counts once.
MATCH (re:ReferenceEntity {databaseName: 'UniProt'})<-[:referenceEntity]-(:PhysicalEntity {speciesName: 'Homo sapiens'})-[:hasModifiedResidue]->(:TranslationalModification)-[:psiMod]->(mod:PsiMod)
// Count accessions, not ReferenceEntity nodes: the isoform queries on this page
// group several reference entities under one identifier, so counting nodes would
// count a protein once per isoform that carries the PTM.
RETURN
    count(DISTINCT re.identifier) AS frequency,
    mod.displayName AS name
// The secondary key keeps the LIMIT cut-off deterministic when frequencies tie.
ORDER BY frequency DESC, name
LIMIT 20

Sensitivity analysis

  • Number of reactions and pathways mapped with each modified proteoform of proteins that appear with more than one proteoform.
// Stage 1: keep the reference entities seen with more than one proteoform
// (isoform identifier or accession, ordered PTM list).
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
WITH DISTINCT entity, ref
OPTIONAL MATCH (entity)-[:hasModifiedResidue]->(residue:TranslationalModification)-[:psiMod]->(psi:PsiMod)
WITH DISTINCT entity, ref, residue.coordinate AS coordinate, psi.identifier AS type
ORDER BY type, coordinate
WITH entity, ref, collect(coalesce(coordinate, 'null') + ':' + type) AS ptms
WITH DISTINCT ref, {isoform: coalesce(ref.variantIdentifier, ref.identifier), ptms: ptms} AS proteoform
WITH ref, count(proteoform) AS num_proteoforms
WHERE num_proteoforms > 1
// Stage 2: rebuild the PTM list of each modified physical entity of those
// reference entities (mandatory match, so unmodified entities are left out).
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref)
WITH DISTINCT entity, ref
MATCH (entity)-[:hasModifiedResidue]->(residue:TranslationalModification)-[:psiMod]->(psi:PsiMod)
WITH DISTINCT entity, ref, residue.coordinate AS coordinate, psi.identifier AS type
ORDER BY type, coordinate
WITH entity, ref, collect(type + ':' + coalesce(coordinate, 'null')) AS ptms
WITH entity, coalesce(ref.variantIdentifier, ref.identifier) AS isoform, ptms
// Stage 3: count the distinct reactions and pathways reached by each modified proteoform.
MATCH (p:Pathway {speciesName: 'Homo sapiens'})-[:hasEvent*]->(r:Reaction {speciesName: 'Homo sapiens'})-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(entity)
RETURN
    isoform,
    ptms,
    count(DISTINCT r.stId) AS reactionCount,
    count(DISTINCT p.stId) AS pathwayCount
ORDER BY pathwayCount DESC, reactionCount DESC, isoform, ptms
  • Similar to the previous, but aggregating the total for the modified proteoforms of each protein that has more than one proteoform.
// Stage 1: keep the reference entities seen with more than one proteoform
// (isoform identifier or accession, ordered PTM list).
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
WITH DISTINCT entity, ref
OPTIONAL MATCH (entity)-[:hasModifiedResidue]->(residue:TranslationalModification)-[:psiMod]->(psi:PsiMod)
WITH DISTINCT entity, ref, residue.coordinate AS coordinate, psi.identifier AS type
ORDER BY type, coordinate
WITH entity, ref, collect(coalesce(coordinate, 'null') + ':' + type) AS ptms
WITH DISTINCT ref, {isoform: coalesce(ref.variantIdentifier, ref.identifier), ptms: ptms} AS proteoform
WITH ref, count(proteoform) AS num_proteoforms
WHERE num_proteoforms > 1
// Stage 2: keep only their physical entities that carry a PSI-MOD annotated modification.
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref)
WITH DISTINCT entity, ref
MATCH (entity)-[:hasModifiedResidue]->(:TranslationalModification)-[:psiMod]->(:PsiMod)
WITH DISTINCT entity, ref
// Stage 3: total distinct reactions and pathways per accession over those modified entities.
MATCH (p:Pathway {speciesName: 'Homo sapiens'})-[:hasEvent*]->(r:Reaction {speciesName: 'Homo sapiens'})-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(entity)
RETURN
    ref.identifier AS protein,
    count(DISTINCT r.stId) AS reactionCount,
    count(DISTINCT p.stId) AS pathwayCount
ORDER BY pathwayCount DESC, reactionCount DESC, protein
  • Number of reactions and pathways mapped by each protein that has more than one proteoform:
// Stage 1: keep the reference entities seen with more than one proteoform
// (isoform identifier or accession, ordered PTM list).
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
WITH DISTINCT entity, ref
OPTIONAL MATCH (entity)-[:hasModifiedResidue]->(residue:TranslationalModification)-[:psiMod]->(psi:PsiMod)
WITH DISTINCT entity, ref, residue.coordinate AS coordinate, psi.identifier AS type
ORDER BY type, coordinate
WITH entity, ref, collect(coalesce(coordinate, 'null') + ':' + type) AS ptms
WITH DISTINCT ref, {isoform: coalesce(ref.variantIdentifier, ref.identifier), ptms: ptms} AS proteoform
WITH ref, count(proteoform) AS num_proteoforms
WHERE num_proteoforms > 1
// Stage 2: total distinct reactions and pathways per accession over all of their
// human physical entities, modified or not.
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref)
WITH DISTINCT entity, ref
MATCH (p:Pathway {speciesName: 'Homo sapiens'})-[:hasEvent*]->(r:Reaction {speciesName: 'Homo sapiens'})-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(entity)
RETURN
    ref.identifier AS protein,
    count(DISTINCT r.stId) AS reactionCount,
    count(DISTINCT p.stId) AS pathwayCount
ORDER BY pathwayCount DESC, reactionCount DESC, protein
  • Number of proteoforms for proteins with more than one proteoform:
// Stage 1: keep the reference entities seen with more than one proteoform
// (isoform identifier or accession, ordered PTM list).
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
WITH DISTINCT entity, ref
OPTIONAL MATCH (entity)-[:hasModifiedResidue]->(residue:TranslationalModification)-[:psiMod]->(psi:PsiMod)
WITH DISTINCT entity, ref, residue.coordinate AS coordinate, psi.identifier AS type
ORDER BY type, coordinate
WITH entity, ref, collect(coalesce(coordinate, 'null') + ':' + type) AS ptms
WITH DISTINCT ref, {isoform: coalesce(ref.variantIdentifier, ref.identifier), ptms: ptms} AS proteoform
WITH ref, count(proteoform) AS num_proteoforms
WHERE num_proteoforms > 1
// Stage 2: rebuild the proteoforms of those reference entities and count the
// distinct ones per accession.
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref)
WITH DISTINCT entity, ref
OPTIONAL MATCH (entity)-[:hasModifiedResidue]->(residue:TranslationalModification)-[:psiMod]->(psi:PsiMod)
WITH DISTINCT entity, ref, residue.coordinate AS coordinate, psi.identifier AS type
ORDER BY type, coordinate
WITH entity, ref, collect(coalesce(coordinate, 'null') + ':' + type) AS ptms
WITH DISTINCT
    ref.identifier AS accession,
    {isoform: coalesce(ref.variantIdentifier, ref.identifier), ptms: ptms} AS proteoform
RETURN accession, count(proteoform) AS num_proteoforms
  • Number of reactions and pathways for proteins with multiple proteoforms using only the unmodified proteoforms:
// Reactions and pathways reached through the UNMODIFIED physical entities of
// proteins that appear with more than one proteoform.
//
// Stage 1: keep the reference entities seen with more than one proteoform
// (isoform identifier or accession, ordered PTM list).
MATCH (pe:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(re:ReferenceEntity {databaseName: 'UniProt'})
WITH DISTINCT pe, re
OPTIONAL MATCH (pe)-[:hasModifiedResidue]->(tm:TranslationalModification)-[:psiMod]->(mod:PsiMod)
WITH DISTINCT pe, re, tm.coordinate AS coordinate, mod.identifier AS type
ORDER BY type, coordinate
WITH pe, re, collect(coalesce(coordinate, 'null') + ':' + type) AS ptms
WITH DISTINCT re, {isoform: coalesce(re.variantIdentifier, re.identifier), ptms: ptms} AS proteoform
WITH re, count(proteoform) AS num_proteoforms
WHERE num_proteoforms > 1
// Stage 2: keep the physical entities of those reference entities that have no
// modified residue at all.
MATCH (pe:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(re)
WITH DISTINCT pe, re
// OPTIONAL is essential: a mandatory MATCH only keeps entities that HAVE a
// modification, so the num_mods = 0 filter below could never be satisfied and
// the query would always come back empty.
OPTIONAL MATCH (pe)-[:hasModifiedResidue]->(tm:TranslationalModification)
// count() ignores the null produced when the optional match finds nothing.
WITH pe, re, count(tm) AS num_mods
WHERE num_mods = 0
// Stage 3: distinct reactions and pathways per accession over the unmodified entities.
MATCH (p:Pathway {speciesName: 'Homo sapiens'})-[:hasEvent*]->(r:Reaction {speciesName: 'Homo sapiens'})-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe)
RETURN
    re.identifier AS protein,
    count(DISTINCT r.stId) AS reactionCount,
    count(DISTINCT p.stId) AS pathwayCount
ORDER BY pathwayCount DESC, reactionCount DESC

Alternative version:

// Stage 1: keep the reference entities seen with more than one proteoform
// (isoform identifier or accession, ordered PTM list).
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
WITH DISTINCT entity, ref
OPTIONAL MATCH (entity)-[:hasModifiedResidue]->(residue:TranslationalModification)-[:psiMod]->(psi:PsiMod)
WITH DISTINCT entity, ref, residue.coordinate AS coordinate, psi.identifier AS type
ORDER BY type, coordinate
WITH entity, ref, collect(coalesce(coordinate, 'null') + ':' + type) AS ptms
WITH DISTINCT ref, {isoform: coalesce(ref.variantIdentifier, ref.identifier), ptms: ptms} AS proteoform
WITH ref, count(proteoform) AS num_proteoforms
WHERE num_proteoforms > 1
// Stage 2: rebuild the PTM list per physical entity and keep the entities whose
// list is empty, i.e. those without any PSI-MOD annotated modification.
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref)
WITH DISTINCT entity, ref
OPTIONAL MATCH (entity)-[:hasModifiedResidue]->(residue:TranslationalModification)-[:psiMod]->(psi:PsiMod)
WITH DISTINCT entity, ref, residue.coordinate AS coordinate, psi.identifier AS type
ORDER BY type, coordinate
WITH entity, ref, collect(coalesce(coordinate, 'null') + ':' + type) AS ptms
WHERE size(ptms) = 0
// Stage 3: distinct reactions and pathways per accession over those entities.
MATCH (p:Pathway {speciesName: 'Homo sapiens'})-[:hasEvent*]->(r:Reaction {speciesName: 'Homo sapiens'})-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(entity)
RETURN
    ref.identifier AS protein,
    count(DISTINCT r.stId) AS reactionCount,
    count(DISTINCT p.stId) AS pathwayCount
ORDER BY protein, pathwayCount DESC, reactionCount DESC
  • Number of reactions and pathways for each proteoform of a protein with multiple proteoforms:
// Stage 1: keep the reference entities seen with more than one proteoform
// (isoform identifier or accession, ordered PTM list).
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
WITH DISTINCT entity, ref
OPTIONAL MATCH (entity)-[:hasModifiedResidue]->(residue:TranslationalModification)-[:psiMod]->(psi:PsiMod)
WITH DISTINCT entity, ref, residue.coordinate AS coordinate, psi.identifier AS type
ORDER BY type, coordinate
WITH entity, ref, collect(coalesce(coordinate, 'null') + ':' + type) AS ptms
WITH DISTINCT ref, {isoform: coalesce(ref.variantIdentifier, ref.identifier), ptms: ptms} AS proteoform
WITH ref, count(proteoform) AS num_proteoforms
WHERE num_proteoforms > 1
// Stage 2: rebuild the PTM list of every physical entity of those reference entities.
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref)
WITH DISTINCT entity, ref
OPTIONAL MATCH (entity)-[:hasModifiedResidue]->(residue:TranslationalModification)-[:psiMod]->(psi:PsiMod)
WITH DISTINCT entity, ref, residue.coordinate AS coordinate, psi.identifier AS type
ORDER BY type, coordinate
WITH entity, ref, collect(coalesce(coordinate, 'null') + ':' + type) AS ptms
// Stage 3: distinct reactions and pathways for each of their proteoforms.
MATCH (p:Pathway {speciesName: 'Homo sapiens'})-[:hasEvent*]->(r:Reaction {speciesName: 'Homo sapiens'})-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(entity)
RETURN
    coalesce(ref.variantIdentifier, ref.identifier) AS protein,
    ptms,
    count(DISTINCT r.stId) AS reactionCount,
    count(DISTINCT p.stId) AS pathwayCount
ORDER BY protein, ptms, pathwayCount DESC, reactionCount DESC
  • Number of reactions and pathways for proteins with multiple proteoforms using only the unmodified proteoforms:
// Stage 1: keep the reference entities seen with more than one proteoform
// (isoform identifier or accession, ordered PTM list).
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref:ReferenceEntity {databaseName: 'UniProt'})
WITH DISTINCT entity, ref
OPTIONAL MATCH (entity)-[:hasModifiedResidue]->(residue:TranslationalModification)-[:psiMod]->(psi:PsiMod)
WITH DISTINCT entity, ref, residue.coordinate AS coordinate, psi.identifier AS type
ORDER BY type, coordinate
WITH entity, ref, collect(coalesce(coordinate, 'null') + ':' + type) AS ptms
WITH DISTINCT ref, {isoform: coalesce(ref.variantIdentifier, ref.identifier), ptms: ptms} AS proteoform
WITH ref, count(proteoform) AS num_proteoforms
WHERE num_proteoforms > 1
// Stage 2: rebuild the PTM list per physical entity and keep the entities whose
// list is empty, i.e. those without any PSI-MOD annotated modification.
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(ref)
WITH DISTINCT entity, ref
OPTIONAL MATCH (entity)-[:hasModifiedResidue]->(residue:TranslationalModification)-[:psiMod]->(psi:PsiMod)
WITH DISTINCT entity, ref, residue.coordinate AS coordinate, psi.identifier AS type
ORDER BY type, coordinate
WITH entity, ref, collect(coalesce(coordinate, 'null') + ':' + type) AS ptms
WHERE size(ptms) = 0
// Stage 3: distinct reactions and pathways per accession over those entities.
MATCH (p:Pathway {speciesName: 'Homo sapiens'})-[:hasEvent*]->(r:Reaction {speciesName: 'Homo sapiens'})-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(entity)
RETURN
    ref.identifier AS protein,
    count(DISTINCT r.stId) AS reactionCount,
    count(DISTINCT p.stId) AS pathwayCount
ORDER BY protein, pathwayCount DESC, reactionCount DESC
  • Proteins with one proteoform:
// Reference entities that appear with exactly one proteoform (isoform identifier
// or accession, ordered PTM list), together with that proteoform.
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(re:ReferenceEntity {databaseName: 'UniProt'})
WITH DISTINCT entity, re
OPTIONAL MATCH (entity)-[:hasModifiedResidue]->(residue:TranslationalModification)-[:psiMod]->(psi:PsiMod)
WITH DISTINCT entity, re, residue.coordinate AS coordinate, psi.identifier AS type
ORDER BY type, coordinate
WITH entity, re, collect(coalesce(coordinate, 'null') + ':' + type) AS ptms
WITH DISTINCT re, {isoform: coalesce(re.variantIdentifier, re.identifier), ptms: ptms} AS proteoform
WITH re, collect(proteoform) AS proteoforms, count(proteoform) AS num_proteoforms
WHERE num_proteoforms = 1
RETURN re.identifier, proteoforms, num_proteoforms
  • Proteins with a single proteoform, and that proteoform is modified:
// Reference entities that appear with exactly one proteoform, where that
// proteoform has a non-empty PTM list.
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(re:ReferenceEntity {databaseName: 'UniProt'})
WITH DISTINCT entity, re
OPTIONAL MATCH (entity)-[:hasModifiedResidue]->(residue:TranslationalModification)-[:psiMod]->(psi:PsiMod)
WITH DISTINCT entity, re, residue.coordinate AS coordinate, psi.identifier AS type
ORDER BY type, coordinate
WITH entity, re, collect(coalesce(coordinate, 'null') + ':' + type) AS ptms
WITH DISTINCT re, {isoform: coalesce(re.variantIdentifier, re.identifier), ptms: ptms} AS proteoform
WITH re, collect(proteoform) AS proteoforms, count(proteoform) AS num_proteoforms
WHERE num_proteoforms = 1
  AND any(form IN proteoforms WHERE size(form.ptms) > 0)
RETURN re.identifier, proteoforms, num_proteoforms
  • Proteins with a single proteoform, and that proteoform is not modified:
// Reference entities that appear with exactly one proteoform, where that
// proteoform has an empty PTM list.
MATCH (entity:PhysicalEntity {speciesName: 'Homo sapiens'})-[:referenceEntity]->(re:ReferenceEntity {databaseName: 'UniProt'})
WITH DISTINCT entity, re
OPTIONAL MATCH (entity)-[:hasModifiedResidue]->(residue:TranslationalModification)-[:psiMod]->(psi:PsiMod)
WITH DISTINCT entity, re, residue.coordinate AS coordinate, psi.identifier AS type
ORDER BY type, coordinate
WITH entity, re, collect(coalesce(coordinate, 'null') + ':' + type) AS ptms
WITH DISTINCT re, {isoform: coalesce(re.variantIdentifier, re.identifier), ptms: ptms} AS proteoform
WITH re, collect(proteoform) AS proteoforms, count(proteoform) AS num_proteoforms
WHERE num_proteoforms = 1
  AND any(form IN proteoforms WHERE size(form.ptms) = 0)
RETURN re.identifier, proteoforms, num_proteoforms