# Explore the SQA Data

In this notebook, we explore the SPARQL queries. Especially, we extract all the entities and relations present in the SQA dataset. Mainly to seed the Radom Walks thorughhe DBpedia graph.

In [1]:
import json 

idata = open('lcquad_qaldformat.json', 'rt', encoding='utf-8')
idata = json.load(idata)

iquestions = idata['questions']

In [2]:
#extract the questions, the SPARQL queries and the answers. 
#The answers are either a list of entities, number (for How many .. questions) or a boolean.
question_strings = []
spqrql_queries = []
answers = []

qlens = []
alens = []
for iquestion in iquestions:
    question_string = iquestion['question'][0]['string']
    sparql_query = iquestion['query']['sparql']
    answer = iquestion['answers']
    qlens.append(len(iquestion['question']))
    alens.append(len(answer))
    question_strings.append(question_string)
    spqrql_queries.append(sparql_query)
    answers.append(answer)


In [3]:
#parse sparql
#import sys
#!{sys.executable} -m pip install rdflib
import rdflib
from rdflib.plugins.sparql.parser import parseQuery
from pprint import pprint

parsed_queries = []
for query in spqrql_queries:
    try:
        query = query.replace('COUNT(?uri)', '(COUNT(*) AS ?callret)') #make the count queries compatible with rdflib 
        q = parseQuery(query)[1]
        parsed_queries.append(q)
    except:
         print(query)

In [4]:
#example of parse query
q = parseQuery('SELECT DISTINCT (COUNT(*) AS ?callret) WHERE { ?x <http://dbpedia.org/ontology/builder> <http://dbpedia.org/resource/PCL_Construction> . ?x <http://dbpedia.org/ontology/tenant> ?uri  . }')[1]

q['where']['part'][0]['triples'][0][1]['part'][0]['part'][0]['part'] #extremely ugly

rdflib.term.URIRef('http://dbpedia.org/ontology/builder')

In [5]:
#how often is the DISTINCT modifier present in the dataset. 
from collections import Counter

modifiers = [q.modifier for q in parsed_queries]
Counter(modifiers)

Counter({'DISTINCT': 4632, None: 368})

In [6]:
#print the query types in the dataset. 
query_types = [q.name for q in parsed_queries]
Counter(query_types)

Counter({'SelectQuery': 4632, 'AskQuery': 368})

In [7]:
#Here, we look at how many triples there are in the SPARQL queries, where a triple is e.g. ?x rel subject
triplets = [q['where']['part'][0]['triples'] for q in parsed_queries]
Counter([len(triple) for triple in triplets])

Counter({2: 2103, 1: 1368, 3: 1529})

In [8]:
projection = [q.get('projection', None) for q in parsed_queries]
pprint(projection[15])
pprint(projection[16])
pprint(projection[0])

[{'evar': rdflib.term.Variable('callret'),
  'expr': {'expr': {'expr': {'expr': {'expr': {'expr': {'distinct': ([], {}),
                                                        'vars': '*'}}}}}}}]
'projection'
[vars_{'var': rdflib.term.Variable('uri')}]


In [9]:
parsed_triplets = []
for triples in triplets:
    parsed_triples = []
    for triple in triples:
        rel = triple[1]['part'][0]['part'][0]['part']
        parsed_triples.append((triple[0], rel, triple[2]))
    parsed_triplets.append(parsed_triples)

In [10]:
parsed_triplets[144]

[(rdflib.term.Variable('x'),
  rdflib.term.URIRef('http://dbpedia.org/ontology/ingredient'),
  rdflib.term.URIRef('http://dbpedia.org/resource/Shallot')),
 (rdflib.term.Variable('x'),
  rdflib.term.URIRef('http://dbpedia.org/ontology/country'),
  rdflib.term.Variable('uri')),
 (rdflib.term.Variable('uri'),
  rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
  rdflib.term.URIRef('http://dbpedia.org/ontology/Country'))]

In [11]:
parsed_triplets[144][0][1].n3()

'<http://dbpedia.org/ontology/ingredient>'

In [12]:
#get all the entities present in the SQA dataset - use them later as seeds for the random walks
entities = set()
relations = set()

for triples in parsed_triplets:
    for e1, r, e2 in triples:
        #add all entities which are not variables
        if not e1.n3() == '?uri' and not e1.n3() == '´?x':
            entities.add(e1.n3())
        if not e2.n3() == '?uri' and not e2.n3() == '´?x':
            entities.add(e2.n3())
        relations.add(r.n3())
print(len(entities), len(relations))

4156 597


In [13]:
import random

random.sample(list(entities), k=10)

['<http://dbpedia.org/resource/John_Smith_Griffin>',
 '<http://dbpedia.org/resource/Kimberly_Stewart>',
 '<http://dbpedia.org/resource/Drum_kit>',
 '<http://dbpedia.org/resource/Marvin_Bush>',
 '<http://dbpedia.org/resource/New_Jersey>',
 '<http://dbpedia.org/resource/Beşiktaş_JK_(wheelchair_basketball)>',
 '<http://dbpedia.org/resource/Martin_Pugh>',
 '<http://dbpedia.org/ontology/Disease>',
 '<http://dbpedia.org/resource/Boston_Bruins>',
 '<http://dbpedia.org/resource/Charles_LeMaire>']

# Creation of different dataset files

In [14]:
#store the list of entities in the SQA datafolder
ofile = open('sqa_entities.txt', 'wt', encoding='utf-8')
ofile.writelines([x + '\n' for x in list(entities)])
ofile.close()

In [15]:
#store the list of questions in the SQA datafolder
ofile = open('sqa_questions.txt', 'wt', encoding='utf-8')
ofile.writelines([x + '\n' for x in question_strings])
ofile.close()

In [16]:
#store the list of SPARQL queries in the SQA datafolder
ofile = open('sqa_sparql.txt', 'wt', encoding='utf-8')
ofile.writelines([x[0][1].n3() + '\n' for x in parsed_triplets])
ofile.close()

In [17]:
#extract the questions, the SPARQL queries and the answers, including index
question_strings = []
spqrql_queries = []
answers = []

qlens = []
alens = []

ofile = open('sqa_questions_index.txt', 'wt', encoding='utf-8')
ofile2 = open('sqa_sparql_index.txt', 'wt', encoding='utf-8')
for i, iquestion in enumerate(iquestions):
    question_string = iquestion['question'][0]['string']
    sparql_query = iquestion['query']['sparql']
    answer = iquestion['answers']
    qlens.append(len(iquestion['question']))
    alens.append(len(answer))
    question_strings.append(question_string)
    spqrql_queries.append(sparql_query)
    answers.append(answer)
    ofile.writelines(str(i) + ',' + question_string + '\n' )
    ofile2.writelines(str(i) + ',' + sparql_query + '\n' )
    
ofile.close()
ofile2.close()

In [18]:
#store the list of SPARQL queries with index
ofile = open('sqa_sparql_parsed.txt', 'wt', encoding='utf-8')
for i, x in enumerate(parsed_triplets):
    ofile.writelines(str(i) + ',' + str(x) + '\n')
    
ofile.close()

In [19]:
#get all the entities present in the SQA dataset - use them later as seeds for the random walks
entities = set()
relations = set()

ofile = open('sqa_entities_index.txt', 'wt', encoding='utf-8')
ofile2 = open('sqa_relations_index.txt', 'wt', encoding='utf-8')

for i, triples in enumerate(parsed_triplets):
    for e1, r, e2 in triples:
        #add all entities which are not variables
        if not e1.n3() == '?uri' and not e1.n3() == '?x':
            entities.add(e1.n3())
            ofile.writelines(str(i) + ',' + e1.n3() + '\n')
        if not e2.n3() == '?uri' and not e2.n3() == '?x':
            entities.add(e2.n3())
            ofile.writelines(str(i) + ',' + e2.n3() + '\n')
        relations.add(r.n3())
        ofile2.writelines(str(i) + ',' + r.n3() + '\n')
print(len(entities), len(relations))
ofile.close()
ofile2.close()

4155 597
