# DataNode Type Counts

In [3]:
import csv  # standard library: always importable, so no install fallback is needed
import subprocess
import sys


def pip(args):
    """Run ``pip`` with ``args`` in the interpreter that backs this kernel.

    pip does not offer a supported in-process API (``pip._internal`` is
    private and changes between releases), so the command is executed as a
    subprocess of the current interpreter instead.

    Args:
        args: list of pip command-line arguments, e.g. ``['install', 'pkg']``.

    Returns:
        The exit code of the pip process (0 on success).
    """
    return subprocess.call([sys.executable, "-m", "pip", *args])


try:
    from SPARQLWrapper import SPARQLWrapper, JSON
except ImportError:
    # Fresh environment: install the SPARQL client once, then retry the import.
    pip(['install', 'sparqlwrapper'])
    from SPARQLWrapper import SPARQLWrapper, JSON

import pandas
    
# Count, per rdf:type, the entities that are typed wp:DataNode on the
# WikiPathways SPARQL endpoint, most frequent type first.
sparql = SPARQLWrapper("http://sparql.wikipathways.org")

# substr(str(?o), 41) keeps only the tail of the type IRI
# (presumably dropping the WP vocabulary namespace - TODO confirm the offset).
pathwayQuery = '''
      SELECT (substr(str(?o),41) as ?DNType) (COUNT(?o) as ?DNCount)
      WHERE {
        ?entity a ?o . 
        ?entity a wp:DataNode . 
         
      } GROUP BY ?o
        ORDER BY DESC (?DNCount)
    '''
sparql.setQuery(pathwayQuery)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Build the table in a single construction: DataFrame.append was removed in
# pandas 2.0, and appending row by row copies the whole frame on every pass.
# Passing `columns` keeps the expected header even when no rows come back.
DataNodeCount = pandas.DataFrame(
    [
        {
            'DataNode Type': binding["DNType"]["value"],
            'DataNode Count': binding["DNCount"]["value"],
        }
        for binding in results["results"]["bindings"]
    ],
    columns=['DataNode Type', 'DataNode Count'],
)

# The JSON bindings carry every value as a string, so convert before summing.
# The default errors='raise' surfaces a malformed count instead of silently
# leaving strings in the column.
DNCountInteger = pandas.to_numeric(DataNodeCount["DataNode Count"]).sum()

In [4]:
# Display the per-type DataNode count table built in the previous cell.
DataNodeCount

Unnamed: 0,DataNode Type,DataNode Count
0,DataNode,65010
1,GeneProduct,34840
2,Protein,14896
3,Complex,11671
4,Metabolite,4992
5,Rna,674


# GPMLRDF datanodes WITHOUT a WPRDF datanode equivalent 

In [5]:
# Count, per gpml:type, the gpml:DataNode resources that no subject points to
# via wp:isAbout (i.e. GPML datanodes without a WP counterpart).
sparql = SPARQLWrapper("http://sparql.wikipathways.org")

pathwayQuery = '''
      SELECT (STR(?typeIRI) AS ?type) (COUNT(?datanode) AS ?count) WHERE {
          ?datanode a gpml:DataNode ; gpml:type ?typeIRI .
          MINUS { ?wpDatanode wp:isAbout ?datanode }
       } GROUP BY ?typeIRI
         ORDER BY DESC(?count)

    '''
sparql.setQuery(pathwayQuery)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Build the table in a single construction: DataFrame.append was removed in
# pandas 2.0, and appending row by row copies the whole frame on every pass.
# Passing `columns` keeps the expected header even when no rows come back.
GpmlWoWP = pandas.DataFrame(
    [
        {
            'DataNode Type': binding["type"]["value"],
            'DataNode Count': binding["count"]["value"],
        }
        for binding in results["results"]["bindings"]
    ],
    columns=['DataNode Type', 'DataNode Count'],
)

# The JSON bindings carry every value as a string, so convert before summing.
# The default errors='raise' surfaces a malformed count instead of silently
# leaving strings in the column.
GpmlWoWPInteger = pandas.to_numeric(GpmlWoWP["DataNode Count"]).sum()

In [6]:
# Display the per-type counts of GPML datanodes lacking a WP counterpart.
GpmlWoWP

Unnamed: 0,DataNode Type,DataNode Count
0,GeneProduct,6788
1,Unknown,5316
2,Metabolite,1734
3,Protein,709
4,Pathway,484
5,Rna,160
6,Complex,25


# GPMLRDF datanodes WITH a WPRDF datanode equivalent

In [7]:
# Count, per gpml:type, the gpml:DataNode resources that some subject points
# to via wp:isAbout (i.e. GPML datanodes with a WP counterpart).
# NOTE(review): COUNT(?datanode) counts one row per (?wpDatanode, ?datanode)
# match, so a datanode referenced by several subjects would be counted more
# than once - confirm whether COUNT(DISTINCT ?datanode) is intended.
sparql = SPARQLWrapper("http://sparql.wikipathways.org")

pathwayQuery = '''
      SELECT (STR(?typeIRI) AS ?type) (COUNT(?datanode) AS ?count) WHERE {
          ?datanode a gpml:DataNode ; gpml:type ?typeIRI .
          ?wpDatanode wp:isAbout ?datanode .
        } GROUP BY ?typeIRI
          ORDER BY DESC(?count)

    '''
sparql.setQuery(pathwayQuery)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Build the table in a single construction: DataFrame.append was removed in
# pandas 2.0, and appending row by row copies the whole frame on every pass.
# Passing `columns` keeps the expected header even when no rows come back.
GpmlWWP = pandas.DataFrame(
    [
        {
            'DataNode Type': binding["type"]["value"],
            'DataNode Count': binding["count"]["value"],
        }
        for binding in results["results"]["bindings"]
    ],
    columns=['DataNode Type', 'DataNode Count'],
)

# The JSON bindings carry every value as a string, so convert before summing.
# The default errors='raise' surfaces a malformed count instead of silently
# leaving strings in the column.
GpmlWWPInteger = pandas.to_numeric(GpmlWWP["DataNode Count"]).sum()

In [8]:
# Display the per-type counts of GPML datanodes that have a WP counterpart.
GpmlWWP

Unnamed: 0,DataNode Type,DataNode Count
0,Protein,126354
1,GeneProduct,68261
2,Metabolite,32144
3,Complex,14594
4,Pathway,1350
5,Rna,1124


# Percentage of GPMLRDF datanodes WITHOUT a WPRDF datanode equivalent

In [9]:
# Share of GPML datanodes lacking a WP counterpart, expressed as a percentage
# of all GPML datanodes (those without plus those with a counterpart).
gpmlDatanodeTotal = GpmlWoWPInteger + GpmlWWPInteger
percentGPWoWP = (GpmlWoWPInteger / gpmlDatanodeTotal) * 100

# Human-readable summary, rounded to three decimal places.
formPercentGPWoWP = "".join([
    str(round(percentGPWoWP, 3)),
    '% of GPML datanodes WITHOUT a WP datanode equivalent',
])

In [10]:
# Display the formatted percentage summary string.
formPercentGPWoWP

'5.874% of GPML datanodes WITHOUT a WP datanode equivalent'

# Complex GPMLRDF datanodes WITHOUT a WPRDF datanode equivalent

In [11]:
# List the gpml:DataNode resources of type "Complex" that no subject points to
# via wp:isAbout, together with their text label when one exists.
sparql = SPARQLWrapper("http://sparql.wikipathways.org")

# substr(str(?datanode), 37) keeps only the tail of the datanode IRI
# (presumably dropping a fixed-length IRI prefix - TODO confirm the offset).
pathwayQuery = '''
      SELECT  (substr(str(?datanode),37 ) as ?datanodes ) ?label
      WHERE {
          ?datanode a gpml:DataNode ; gpml:type "Complex"^^xsd:string .
          MINUS { ?wpDatanode wp:isAbout ?datanode }
          OPTIONAL {?datanode gpml:textlabel ?label . }
        }

    '''
sparql.setQuery(pathwayQuery)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Build the table in a single construction: DataFrame.append was removed in
# pandas 2.0, and appending row by row copies the whole frame on every pass.
# ?label is OPTIONAL, so a binding may not contain it at all; fall back to
# None instead of raising KeyError, and store the literal's "value" rather
# than the whole binding dict so both columns hold plain strings.
CompGpmlWoWp = pandas.DataFrame(
    [
        {
            'Complex DataNodes': binding["datanodes"]["value"],
            'Complex Labels': binding.get("label", {}).get("value"),
        }
        for binding in results["results"]["bindings"]
    ],
    columns=['Complex DataNodes', 'Complex Labels'],
)

In [12]:
# Display the Complex GPML datanodes that lack a WP counterpart.
CompGpmlWoWp

Unnamed: 0,Complex DataNodes
0,WP1493_r79802/DataNode/ec1f0
1,WP3226_r89410/DataNode/b328d
2,WP3226_r89410/DataNode/c3d69
3,WP3601_r89202/DataNode/c8eb7
4,WP3601_r89202/DataNode/e3358
5,WP2586_r91687/DataNode/b328d
6,WP2586_r91687/DataNode/c3d69
7,WP3134_r80692/DataNode/fc7a3
8,WP3282_r96324/DataNode/a8ece
9,WP3282_r96324/DataNode/c951f
