# DataNode Type Counts

In [2]:
# Dependency bootstrap: attempt each import and, on ImportError, install the
# package with pip and retry.
# NOTE(review): pip._internal is pip's private namespace, not a supported API;
# inside a notebook `%pip install <pkg>` is the supported route — confirm before
# relying on this entry point.
from pip._internal import main as pip
try:
    import csv
except ImportError:
    # NOTE(review): csv is part of the Python standard library, so this fallback
    # should never run; csv is also not used in the visible cells.
    pip(['install', 'csv'])
    import csv
try:
    from SPARQLWrapper import SPARQLWrapper, JSON
except  ImportError:
    # Install the SPARQL client on demand, then retry the import.
    pip(['install', 'sparqlwrapper'])
    from SPARQLWrapper import SPARQLWrapper, JSON
    
import pandas
    
# Count WP RDF DataNodes per rdf:type across pathways tagged as part of the
# curated AnalysisCollection, and load the result into a DataFrame.
sparql = SPARQLWrapper("http://sparql.wikipathways.org")

# substr(str(?o), 41) drops the 40-character
# "http://vocabularies.wikipathways.org/wp#" prefix so only the type name remains.
pathwayQuery = '''
      SELECT Distinct (substr(str(?o),41) as ?DNType) (COUNT(?o) as ?DNCount)
      WHERE {
        ?entity a ?o . 
        ?entity a wp:DataNode . 
        ?entity dcterms:isPartOf ?pathway .
        ?pathway    a wp:Pathway ;
            wp:ontologyTag <http://vocabularies.wikipathways.org/wp#Curation:AnalysisCollection> .
         
      } GROUP BY ?o
        ORDER BY DESC (?DNCount)
    '''
sparql.setQuery(pathwayQuery)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Build every row first and construct the frame once: DataFrame.append was
# removed in pandas 2.0, and appending inside a loop copies the frame each time.
DataNodeCount = pandas.DataFrame(
    [
        {
            'DataNode Type': result["DNType"]["value"],
            'DataNode Count': result["DNCount"]["value"],
        }
        for result in results["results"]["bindings"]
    ],
    columns=['DataNode Type', 'DataNode Count'],
)

# The JSON bindings carry counts as strings; convert strictly so a malformed
# value raises instead of being silently kept as text.
# NOTE(review): ?o can also bind wp:DataNode itself, so this total includes the
# generic "DataNode" row on top of the specific types — confirm that is intended.
DNCountInteger = pandas.to_numeric(DataNodeCount["DataNode Count"]).sum()

In [3]:
# Bare expression: renders the per-type DataNode count table as the cell output.
DataNodeCount

Unnamed: 0,DataNode Type,DataNode Count
0,DataNode,78362
1,GeneProduct,63884
2,Protein,15063
3,Metabolite,10415
4,Rna,2008
5,Complex,189


# GPML RDF datanodes WITHOUT a WP RDF datanode equivalent

In [4]:
# Count GPML RDF datanodes, per gpml:type, that no WP RDF resource is "about",
# restricted to pathways tagged as part of the curated AnalysisCollection.
sparql = SPARQLWrapper("http://sparql.wikipathways.org")

# The MINUS clause removes every GPML datanode that is the object of a
# wp:isAbout triple, leaving only those without a WP RDF counterpart.
pathwayQuery = '''
      SELECT (STR(?typeIRI) AS ?type) (COUNT(?datanode) AS ?count) WHERE {
          ?datanode a gpml:DataNode ; 
            gpml:type ?typeIRI .
          ?datanode dcterms:isPartOf ?pathway .
          ?wpPathway    a wp:Pathway ;
            wp:isAbout ?pathway ;
            wp:ontologyTag <http://vocabularies.wikipathways.org/wp#Curation:AnalysisCollection> .
          MINUS { ?wpDatanode wp:isAbout ?datanode } .
       } GROUP BY ?typeIRI
         ORDER BY DESC(?count)

    '''
sparql.setQuery(pathwayQuery)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Build every row first and construct the frame once: DataFrame.append was
# removed in pandas 2.0, and appending inside a loop copies the frame each time.
GpmlWoWP = pandas.DataFrame(
    [
        {
            'DataNode Type': result["type"]["value"],
            'DataNode Count': result["count"]["value"],
        }
        for result in results["results"]["bindings"]
    ],
    columns=['DataNode Type', 'DataNode Count'],
)

# The JSON bindings carry counts as strings; convert strictly so a malformed
# value raises instead of being silently kept as text.
GpmlWoWPInteger = pandas.to_numeric(GpmlWoWP["DataNode Count"]).sum()

In [5]:
# Bare expression: renders the per-type counts of GPML datanodes without a WP counterpart.
GpmlWoWP

Unnamed: 0,DataNode Type,DataNode Count
0,GeneProduct,7095
1,Metabolite,1738
2,Protein,697
3,Unknown,533
4,Pathway,485
5,Rna,156
6,Complex,25


# GPML RDF datanodes WITH a WP RDF datanode equivalent

In [6]:
# Count, per gpml:type, the WP RDF datanodes that are "about" a GPML RDF
# datanode, restricted to pathways tagged as part of the curated AnalysisCollection.
sparql = SPARQLWrapper("http://sparql.wikipathways.org")

# Note that COUNT runs over ?wpDatanode, i.e. over the WP side of each
# wp:isAbout link, grouped by the type of the GPML datanode it points to.
pathwayQuery = '''
      SELECT DISTINCT (STR(?typeIRI) AS ?type) (COUNT(?wpDatanode) AS ?count) WHERE {
          ?datanode a gpml:DataNode ; gpml:type ?typeIRI .
          ?datanode dcterms:isPartOf ?pathway . 
          ?wpDatanode wp:isAbout ?datanode .
          ?wpPathway    a wp:Pathway ;
            wp:isAbout ?pathway ;
            wp:ontologyTag <http://vocabularies.wikipathways.org/wp#Curation:AnalysisCollection> .
        } GROUP BY ?typeIRI
          ORDER BY DESC(?count)

    '''
sparql.setQuery(pathwayQuery)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Build every row first and construct the frame once: DataFrame.append was
# removed in pandas 2.0, and appending inside a loop copies the frame each time.
GpmlWWP = pandas.DataFrame(
    [
        {
            'DataNode Type': result["type"]["value"],
            'DataNode Count': result["count"]["value"],
        }
        for result in results["results"]["bindings"]
    ],
    columns=['DataNode Type', 'DataNode Count'],
)

# The JSON bindings carry counts as strings; convert strictly so a malformed
# value raises instead of being silently kept as text.
GpmlWWPInteger = pandas.to_numeric(GpmlWWP["DataNode Count"]).sum()

In [7]:
# Bare expression: renders the per-type counts of GPML datanodes that have a WP counterpart.
GpmlWWP

Unnamed: 0,DataNode Type,DataNode Count
0,GeneProduct,68065
1,Metabolite,13940
2,Protein,9188
3,Rna,984
4,Pathway,920
5,Complex,182


# Percentage of GPML RDF datanodes WITHOUT a WP RDF datanode equivalent

In [8]:
# Share of the "without" total relative to the combined "without" + "with"
# totals computed by the two preceding query cells, expressed as a percentage.
fractionGPWoWP = GpmlWoWPInteger / (GpmlWoWPInteger + GpmlWWPInteger)
percentGPWoWP = fractionGPWoWP * 100

# Human-readable summary, rounded to three decimal places.
roundedPercentGPWoWP = round(percentGPWoWP, 3)
formPercentGPWoWP = str(roundedPercentGPWoWP) + '% of GPML datanodes WITHOUT a WP datanode equivalent'

In [9]:
# Bare expression: shows the formatted percentage sentence as the cell output.
formPercentGPWoWP

'10.316% of GPML datanodes WITHOUT a WP datanode equivalent'

# The Complex GPML RDF datanodes without WP RDF equivalents

In [10]:
# List the GPML RDF datanodes of type "Complex" that no WP RDF resource is
# "about", together with their text label when one exists, restricted to
# pathways tagged as part of the curated AnalysisCollection.
sparql = SPARQLWrapper("http://sparql.wikipathways.org")

# substr(str(?datanode), 37) keeps the IRI from its 37th character onwards,
# i.e. it strips a fixed-length 36-character prefix.
pathwayQuery = '''
      SELECT  (substr(str(?datanode),37 ) as ?datanodes ) ?label
      WHERE {
          ?datanode a gpml:DataNode ; gpml:type "Complex"^^xsd:string .
          ?datanode dcterms:isPartOf ?pathway .
          ?wpPathway    a wp:Pathway ;
              wp:isAbout ?pathway ;
              wp:ontologyTag <http://vocabularies.wikipathways.org/wp#Curation:AnalysisCollection> .
          MINUS { ?wpDatanode wp:isAbout ?datanode }
          OPTIONAL {?datanode gpml:textlabel ?label . }
          
        }

    '''
sparql.setQuery(pathwayQuery)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Build every row first and construct the frame once: DataFrame.append was
# removed in pandas 2.0, and appending inside a loop copies the frame each time.
# ?label comes from an OPTIONAL pattern, so a binding may have no "label" key;
# read it defensively and store None rather than raising KeyError.
CompGpmlWoWp = pandas.DataFrame(
    [
        {
            'Complex DataNodes': result["datanodes"]["value"],
            'Complex Labels': result.get("label", {}).get("value"),
        }
        for result in results["results"]["bindings"]
    ],
    # Declare both columns so an empty result still has the full schema.
    columns=['Complex DataNodes', 'Complex Labels'],
)

In [11]:
# Bare expression: renders the Complex datanodes that lack a WP counterpart, with their labels.
CompGpmlWoWp

Unnamed: 0,Complex DataNodes,Complex Labels
0,WP391_r71373/DataNode/fc7a3,PKA
1,WP1493_r79802/DataNode/ec1f0,Rubisco activase
2,WP3134_r80692/DataNode/fc7a3,PKA
3,WP1263_r71748/DataNode/fc7a3,PKA
4,WP2583_r97638/DataNode/c951f,PD-L1
5,WP3226_r89410/DataNode/b328d,DRE region
6,WP3226_r89410/DataNode/c3d69,TATA
7,WP2586_r91687/DataNode/b328d,DRE region
8,WP2586_r91687/DataNode/c3d69,TATA
9,WP2583_r97638/DataNode/a8ece,B7-1/ B7-2
