# DataNode Type Counts

In [1]:
# Standard-library imports. `csv` ships with Python, so it never needs an
# install fallback (and there is no installable "csv" distribution to fall
# back to).
import csv
import subprocess
import sys

import pandas

try:
    from SPARQLWrapper import SPARQLWrapper, JSON
except ImportError:
    # Install into the interpreter that is running this kernel. Invoking
    # `python -m pip` in a subprocess is the supported way to drive pip from
    # code; `pip._internal` is private and changes between pip releases.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sparqlwrapper'])
    from SPARQLWrapper import SPARQLWrapper, JSON
    
# Count, per rdf:type, the entities that are also typed wp:DataNode on the
# WikiPathways SPARQL endpoint.
sparql = SPARQLWrapper("http://sparql.wikipathways.org")

# substr(str(?o), 41) drops the first 40 characters of the type IRI
# (presumably the wp vocabulary namespace -- confirm against the endpoint's
# prefix definitions) so only the short type name is returned.
pathwayQuery = '''
      SELECT (substr(str(?o),41) as ?DNType) (COUNT(?o) as ?DNCount)
      WHERE {
        ?entity a ?o . 
        ?entity a wp:DataNode . 
         
      } GROUP BY ?o
        ORDER BY DESC (?DNCount)
    '''
sparql.setQuery(pathwayQuery)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Build the frame once from all bindings. DataFrame.append was removed in
# pandas 2.0, and calling it per row copied the whole frame every iteration.
# Passing `columns` keeps both columns present even when the query returns
# no rows.
DataNodeCount = pandas.DataFrame(
    [
        {
            'DataNode Type': result["DNType"]["value"],
            'DataNode Count': result["DNCount"]["value"],
        }
        for result in results["results"]["bindings"]
    ],
    columns=['DataNode Type', 'DataNode Count'],
)

# The JSON bindings carry the counts as strings; convert strictly so that a
# malformed value raises instead of being silently passed through.
# NOTE(review): the result set includes the wp:DataNode row itself, so this
# total is larger than the number of distinct DataNodes -- confirm intent.
DNCountInteger = pandas.to_numeric(DataNodeCount["DataNode Count"]).sum()

In [2]:
# Display the DataNode type/count table built in the previous cell.
DataNodeCount

Unnamed: 0,DataNode Type,DataNode Count
0,DataNode,63005
1,GeneProduct,34273
2,Protein,14455
3,Complex,10951
4,Metabolite,4574
5,Rna,693


# GPMLRDF datanodes WITHOUT a WPRDF datanode equivalent 

In [3]:
# Count, per gpml:type, the gpml:DataNode resources that no WP datanode
# points at through wp:isAbout.
sparql = SPARQLWrapper("http://sparql.wikipathways.org")

pathwayQuery = '''
      SELECT (STR(?typeIRI) AS ?type) (COUNT(?datanode) AS ?count) WHERE {
          ?datanode a gpml:DataNode ; gpml:type ?typeIRI .
          MINUS { ?wpDatanode wp:isAbout ?datanode }
       } GROUP BY ?typeIRI
         ORDER BY DESC(?count)

    '''
sparql.setQuery(pathwayQuery)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Build the frame once from all bindings. DataFrame.append was removed in
# pandas 2.0, and calling it per row copied the whole frame every iteration.
# Passing `columns` keeps both columns present even when the query returns
# no rows.
GpmlWoWP = pandas.DataFrame(
    [
        {
            'DataNode Type': result["type"]["value"],
            'DataNode Count': result["count"]["value"],
        }
        for result in results["results"]["bindings"]
    ],
    columns=['DataNode Type', 'DataNode Count'],
)

# The JSON bindings carry the counts as strings; convert strictly so that a
# malformed value raises instead of being silently passed through.
GpmlWoWPInteger = pandas.to_numeric(GpmlWoWP["DataNode Count"]).sum()

In [4]:
# Display the per-type counts of GPML datanodes without a WP equivalent.
GpmlWoWP

Unnamed: 0,DataNode Type,DataNode Count
0,GeneProduct,6528
1,Unknown,4935
2,Metabolite,1693
3,Protein,664
4,Pathway,453
5,Rna,153
6,Complex,27


# GPMLRDF datanodes WITH a WPRDF datanode equivalent

In [5]:
# Count, per gpml:type, the gpml:DataNode resources that some WP datanode
# points at through wp:isAbout.
sparql = SPARQLWrapper("http://sparql.wikipathways.org")

pathwayQuery = '''
      SELECT (STR(?typeIRI) AS ?type) (COUNT(?datanode) AS ?count) WHERE {
          ?datanode a gpml:DataNode ; gpml:type ?typeIRI .
          ?wpDatanode wp:isAbout ?datanode .
        } GROUP BY ?typeIRI
          ORDER BY DESC(?count)

    '''
sparql.setQuery(pathwayQuery)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Build the frame once from all bindings. DataFrame.append was removed in
# pandas 2.0, and calling it per row copied the whole frame every iteration.
# Passing `columns` keeps both columns present even when the query returns
# no rows.
GpmlWWP = pandas.DataFrame(
    [
        {
            'DataNode Type': result["type"]["value"],
            'DataNode Count': result["count"]["value"],
        }
        for result in results["results"]["bindings"]
    ],
    columns=['DataNode Type', 'DataNode Count'],
)

# The JSON bindings carry the counts as strings; convert strictly so that a
# malformed value raises instead of being silently passed through.
GpmlWWPInteger = pandas.to_numeric(GpmlWWP["DataNode Count"]).sum()

In [6]:
# Display the per-type counts of GPML datanodes that have a WP equivalent.
GpmlWWP

Unnamed: 0,DataNode Type,DataNode Count
0,Protein,123897
1,GeneProduct,66880
2,Metabolite,32378
3,Complex,14386
4,Pathway,1272
5,Rna,1251


# Percentage of GPMLRDF datanodes WITHOUT a WPRDF datanode equivalent

In [7]:
# Share of GPML datanodes that lack a WP counterpart, expressed as a
# percentage of all GPML datanodes counted by the two queries (with and
# without a wp:isAbout link).
gpmlTotalInteger = GpmlWoWPInteger + GpmlWWPInteger
fractionGPWoWP = GpmlWoWPInteger / gpmlTotalInteger
percentGPWoWP = fractionGPWoWP * 100

# Round to three decimals for display and attach the explanatory label.
roundedPercentGPWoWP = round(percentGPWoWP, 3)
formPercentGPWoWP = str(roundedPercentGPWoWP) + '% of GPML datanodes WITHOUT a WP datanode equivalent'

In [8]:
# Display the formatted percentage string.
formPercentGPWoWP

'5.679% of GPML datanodes WITHOUT a WP datanode equivalent'

# Complex GPMLRDF datanodes WITHOUT a WPRDF datanode equivalent

In [9]:
# List the gpml:DataNode resources of gpml:type "Complex" that no WP datanode
# points at through wp:isAbout.
sparql = SPARQLWrapper("http://sparql.wikipathways.org")

# substr(str(?datanode), 37) drops the first 36 characters of the datanode
# IRI (presumably a common base IRI -- confirm against the endpoint), leaving
# the pathway-relative identifier.
pathwayQuery = '''
      SELECT  (substr(str(?datanode),37 ) as ?datanodes ) WHERE {
          ?datanode a gpml:DataNode ; gpml:type "Complex"^^xsd:string .
          MINUS { ?wpDatanode wp:isAbout ?datanode }
        }

    '''
sparql.setQuery(pathwayQuery)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Build the frame once from all bindings. DataFrame.append was removed in
# pandas 2.0, and calling it per row copied the whole frame every iteration.
# Passing `columns` keeps the column present even when the query returns no
# rows.
CompGpmlWoWp = pandas.DataFrame(
    [
        {'Complex DataNodes': result["datanodes"]["value"]}
        for result in results["results"]["bindings"]
    ],
    columns=['Complex DataNodes'],
)

In [10]:
# Display the Complex GPML datanodes that have no WP equivalent.
CompGpmlWoWp

Unnamed: 0,Complex DataNodes
0,WP2509_r92315/DataNode/c040f
1,WP391_r71373/DataNode/fc7a3
2,WP1493_r79802/DataNode/ec1f0
3,WP3134_r80692/DataNode/fc7a3
4,WP3601_r89202/DataNode/c8eb7
5,WP3601_r89202/DataNode/e3358
6,WP1263_r71748/DataNode/fc7a3
7,WP2583_r97638/DataNode/c951f
8,WP3226_r89410/DataNode/b328d
9,WP3226_r89410/DataNode/c3d69
