## Project Description
@todo


### Nomenclature
Property names are derived out of entities they join.

The ground/root entity is called ent0
Add the word out/in when required as shown in photo here: 
<img src="files/resources/nomenclature.png">
@todo


In [1]:
#Importing some external libraries
from pprint import pprint
import networkx as nx
import pickle
import json
import copy
import traceback

#Importing internal classes/libraries
import utils.dbpedia_interface as db_interface
import utils.natural_language_utilities as nlutils
import utils.subgraph as subgraph

#@TODO: put this class there


Airport
garrison
Mohnish Dubey


In [2]:
'''
    Initializing some stuff. Namely: DBpedia interface class.
    Reading the list of 'relevant' properties.
'''

dbp = None	#DBpedia interface object #To be instantiated when the code is run by main script/unit testing script
relevant_properties = open('resources/relation_whitelist.txt').read().split('\n')    #Contains the whitelisted props types
templates = json.load(open('templates.py'))   #Contains all the templates existing in templates.py
sparqls = {}   #Dict of the generated SPARQL Queries.

In [3]:
'''
    Some SPARQL Queries.
    Since this part of the code requires sending numerous convoluted queries to DBpedia, 
        we best not clutter the DBpedia interface class and rather simply declare them here.
        
    Note: The names here can be confusing. Refer to the diagram above to know what each SPARQL query tries to do.
'''

one_triple_right = '''
            SELECT DISTINCT ?p ?e 
            WHERE { 
                <%(e)s> ?p ?e 
            }'''

one_triple_left = '''
            SELECT DISTINCT ?e ?p
            WHERE {
                ?e ?p <%(e)s>
            }'''

In [4]:
'''
    This cell houses the script which will build a subgraph as shown in picture above for each a given URI.
    @TODO: do something in cases where certain nodes of the local subgraph are not found. 
            Will the code throw errors? How to you take care of them?
'''

def insert_triple_in_subgraph(G, _results, _labels, _direction, _origin_node, _filter_properties = True, _filter_literals = True):
    '''
        Function used to push the results of different queries into the subgraph.
        USAGE: only within the get_local_subgraph function.
        
        INPUTS:
        _subgraph: the subgraph object within which the triples are to be pushed
        _results: a result list which contains the sparql variables 'e' and 'p'. 
                They can be of either left or right queries as the cell above
        _labels: a tuple with three strings, which depict the nomenclature of the resources to be pushed
        _direction: True -> one triple right; False -> one triple left
        _origin_node: the results variable only gives us one p and one e. 
                Depending on the direction, this node will act as the other e to complete the triple  
        _filter_properties: if True, only properties existing in properties whitelist will be pushed in.
    '''
    
    
    for result in _results[u'results'][u'bindings']:
        #Parse the results into local variables (for readibility)
        prop = result[u'p'][u'value']
        ent = result[u'e'][u'value']
        
        if _filter_literals:
        	if nlutils.has_literal(ent):
        		continue

        if _filter_properties:
            
            #Filter results based on important properties
            if not prop.split('/')[-1] in relevant_properties:
                continue
        
        #Finally, insert, based on direction
        if _direction == True:
            #Right
            subgraph.insert(G=G, data=[ (_labels[0],_origin_node), (_labels[1],prop), (_labels[2],ent) ])
            
        elif _direction == False:
            #Left
            subgraph.insert(G=G, data=[(_labels[0],ent), (_labels[1],prop), (_labels[2],_origin_node) ])       

            
            
def get_local_subgraph(_uri):
    #Collecting required variables: DBpedia interface, and a new subgraph
    global dbp
    
    #Create a new graph
    G = nx.DiGraph()
    access = subgraph.accessGraph(G)
    
    
    ########### e ?p ?e (e_to_e_out and e_out) ###########
    
    results = dbp.shoot_custom_query(one_triple_right % {'e': _uri})
    labels = ('e','e_to_e_out','e_out')

    #Insert results in subgraph
    insert_triple_in_subgraph(G, _results=results, 
                             _labels=labels, _direction=True, 
                             _origin_node=_uri, _filter_properties=True)
    
    ########### ?e ?p e (e_in and e_in_to_e) ###########
    
    results = dbp.shoot_custom_query(one_triple_left % {'e':_uri} )
    labels = ('e_in', 'e_in_to_e','e')
    
    #Insert results in subgraph
    insert_triple_in_subgraph(G, _results=results, 
                             _labels=labels, _direction=False, 
                             _origin_node=_uri, _filter_properties=True)
                        
    ########### e p eout . eout ?p ?e (e_out_to_e_out_out and e_out_out) ###########
    
    #Get all the eout nodes back from the subgraph.
    e_outs = []
    op = access.return_outnodes('e')
    for x in op: 
        for tup in x:
            e_outs.append(tup[1].getUri())
            
    labels = ('e_out','e_out_to_e_out_out','e_out_out')
    
    for e_out in e_outs:
        results = dbp.shoot_custom_query(one_triple_right % {'e' : e_out})
        
        #Insert results in subgraph
        insert_triple_in_subgraph(G, _results=results, 
                                 _labels=labels, _direction=True, 
                                 _origin_node=e_out, _filter_properties=True)
    
    ########### e p eout . ?e ?p eout  (e_out_in and e_out_in_to_e_out) ###########
    
    #Use the old e_outs variable
    labels = ('e_out_in','e_out_in_to_e_out','e_out')
    
    for e_out in e_outs:
        results = dbp.shoot_custom_query(one_triple_left % {'e' : e_out})
        
        #Insert results in subgraph
        insert_triple_in_subgraph(G, _results=results, 
                                 _labels=labels, _direction=False, 
                                 _origin_node=e_out, _filter_properties=True)
        
    ########### ?e ?p ein . ein p e  (e_in_in and e_in_in_to_e_in) ###########
    
    #Get all the ein nodes back from subgraph
    e_ins = []
    op = access.return_innodes('e')
    for x in op:
        for tup in x:
            e_ins.append(tup[0].getUri())
    
    
    labels = ('e_in_in','e_in_in_to_e_in','e_in')
    
    for e_in in e_ins:
        results = dbp.shoot_custom_query(one_triple_left % {'e': e_in})
        
        #Insert results in subgraph
        insert_triple_in_subgraph(G, _results=results, 
                                 _labels=labels, _direction=False, 
                                 _origin_node=e_in, _filter_properties=True)
        
    ########### ein ?p ?e . ein p e  (e_in_to_e_in_out and e_in_out) ###########
    
    #Use the old e_ins variable
    labels = ('e_in','e_in_to_e_in_out','e_in_out')
    
    for e_in in e_ins:
        results = dbp.shoot_custom_query(one_triple_right % {'e': e_in })
        
        #Insert results in subgraph
        insert_triple_in_subgraph(G, _results=results, 
                                 _labels=labels, _direction=True, 
                                 _origin_node=e_in, _filter_properties=True)
    
    
    #Pushed all the six kind of nodes in the subgraph. Done!
    return G

In [5]:
def fill_specific_template(_template_id, _mapping, _debug = False):
    '''
        Function to fill a specific template.
        Given the template ID, it is expected to fetch the template from the set 
            and juxtapose the mapping on the template.
            
        Moreover, it also has certain functionalities that help the future generation of verbalizings.
             -> Returns the answer of the query, and the answer type
             -> In some templates, it also fetches the intermediate hidden variable and it's types too.
    
        -> create copy of template from the list
        -> get the needed metadata
        -> push it in the list
    '''
    
    global sparql, templates, outputfile
    
    #Create a copy of the template
    template = [x for x in templates if x['id'] == _template_id][0]
    template = copy.copy(template)
    
    #From the template, make a rigid query using mappings
    try:
        template['query'] = template['template'] % _mapping
    except KeyError:
        print "fill_specific_template: ERROR. Mapping does not match."
        return False
    
    #Include the mapping within the template object
    template['mapping'] = _mapping
    
    #Get the Answer of the query
    #get_answer now returns a dictionary with appropriate variable bindings. 
    template['answer'] = dbp.get_answer(template['query'])
    
    #Get the most specific type of the answers.
    '''
        ATTENTION: This can create major problems in the future. 
        We are assuming that the most specific type of one 'answer' would be the most specific type of all answers.
        In cases where answers are like Bareilly (City), Uttar Pradesh (State) and India (Country), 
            the SPARQL and NLQuestion would not be the same. 
            (We might expect all in the answers, but the question would put a domain restriction on answer.)
            
        @TODO: attend to this!
    '''
    template['answer_type'] = {}
    for variable in template['answer']:
        template['answer_type'][variable] = dbp.get_most_specific_class(template['answer'][variable][0])
    
    
    if _debug:
        pprint(template)
    
    #Push it onto the SPARQL List
    try:
        sparqls[_template_id].append(template)
        print len(sparqls[_template_id])
        if len(sparqls[_template_id]) > 100:
            print "in if condition"
            print "tempalte id is " , str(_template_id)
            with open('output/template%s.txt' % str(_template_id), "a+") as out:
                pprint(sparqls[_template_id], stream=out)
            with open('output/template%s.json' % str(_template_id), "a+") as out:
                json.dump(sparqls[_template_id],out)
            sparqls[_template_id] = []
    except:
        print traceback.print_exc()
        sparqls[_template_id] = [ template ]

    #@TODO: Periodic writes to disk
    
    return True

In [None]:
def fill_templates(_graph,_uri):
    '''
        This function is programmed to traverse through the Subgraph and create mappings for templates

        Per template traverse the graph, and pick out the needed stuff in local variables
    '''
    
    global dbp
    
    access = subgraph.accessGraph(_graph)
    
    ''' 
        Template #1: 
            SELECT DISTINCT ?uri WHERE {?uri <%(e_to_e_out)s> <%(e_out)s> } 
        Find e_out and e_to_e_out.
    '''
    counter_template1 = 0
    
    #Query the graph for outnodes from e
    op = access.return_outnodes('e')
    
    for triple in op[0]:
        
        #Making the variables explicit (for the sake of readability)
        e_out = triple[1].getUri()
        e_to_e_out = triple[2]['object'].getUri()
    
        #Create a mapping (in keeping with the templates' placeholder names)
        mapping = {'e_out': e_out, 'e_to_e_out': e_to_e_out }
        
        #Throw it to a function who will put it in the list with appropriate bookkeeping
        try:
            fill_specific_template(_template_id=1, _mapping=mapping)
            print str(counter_template1) , "tempalte1"
            counter_template1 = counter_template1 + 1
        except:
            print "check error stack"
            continue
        if counter_template1 > 10:
            pass
#             break
    
    ''' 
        Template #2: 
            SELECT DISTINCT ?uri WHERE { <%(e_in)s> <%(e_in_to_e)s> ?uri }
        Find e_in and e_in_to_e.
    '''
    
    #Query the graph for innodes to e
    op = access.return_innodes('e')
    counter_template2 = 0
    for triple in op[0]:
    
        #Making the variables explicit (for the sake of readability)
        e_in = triple[0].getUri()
        e_in_to_e = triple[2]['object'].getUri()
        
        #Create a mapping (in keeping with the templates' placeholder names)
        mapping = {'e_in':e_in, 'e_in_to_e': e_in_to_e}
        
        #Throw it to a function who will put it in the list with appropriate bookkeeping
        try:
            fill_specific_template( _template_id=2, _mapping=mapping)
            counter_template2 = counter_template2 + 1
            print str(counter_template2) , "tempalte2"
        except:
            print traceback.print_exc()
            continue
        if counter_template2 > 10:
            pass
#             break
        
    ''' 
        Template #3: 
            SELECT DISTINCT ?uri WHERE { <%(e_in_in)s> <%(e_in_in_to_e_in)s> ?x . ?x <%(e_in_to_e)s> ?uri }
        Find e_in and e_in_to_e.
    '''
    
    #Query the graph for innode to e and relevant properties
    op = access.return_innodes('e')
    counter_template3 = 0
    #Create a list of all these (e_in, e_in_to_e)
    one_triple_left_map = { triple[0].getUri(): triple[2]['object'].getUri()  for triple in op[0] }
    pprint(one_triple_left)
        
    #Collect all e_in_in and e_in_in_to_e_in 
    op = access.return_innodes('e_in')
        
    #This 'op' has the e_in_in and the prop for all e_in's. We now need to map one to the other.
    for list_of_triples in op:

        #Some triple are simply empty. Ignore them.
        if len(list_of_triples) == 0:
            continue

        ### Mapping e_in_in's to relevant e_in's ###
        
        #Pick one triple from the list.
        e_in = list_of_triples[0][1].getUri()
        e_in_to_e = one_triple_left_map[e_in]
        #Find the relevant property from the map
        
        #Given this information, lets create mappings of template three 
        for triple in list_of_triples:
            
            #Making the variables explicit (for the sake of readability)
            e_in_in = triple[0].getUri()
            e_in_in_to_e_in = triple[2]['object'].getUri()
            
            #Create a mapping (in keeping with the templates' placeholder names)
            mapping = { 'e_in_in':e_in_in, 'e_in_in_to_e_in': e_in_in_to_e_in, 'e_in_to_e':e_in_to_e, 'e_in': e_in }
        
            #Throw it to a function who will put it in the list with appropriate bookkeeping
            try:
                fill_specific_template( _template_id=3, _mapping=mapping)
                counter_template3 = counter_template3 + 1
                print str(counter_template3) , "tempalte3"
                if counter_template1 > 10:
                    pass
#                     break
            except:
                print "check error stack"
                traceback.print_exc()
                continue
    ''' 
        Template #4: 
            SELECT DISTINCT ?uri WHERE { <%(e_out_in)s> <%(e_out_in_to_e_out)s> ?x . ?uri <%(e_to_e_out)s> ?x }
        Find e_in and e_in_to_e.
    '''       
    
    #Query the graph for outnodes from e and relevant properties
    op = access.return_outnodes('e')
    counter_template4 = 0
    #Create a list of all these (e_to_e_out, e_out)
    one_triple_right_map = { triple[1].getUri(): triple[2]['object'].getUri() for triple in op[0] }
    pprint(one_triple_right_map)
    
    #Collect all e_out_in and e_out_in_to_e_out 
    op = access.return_innodes('e_out')
    
    #This 'op' has the e_out_in and the prop for all e_out's. We now need to map one to the other.
    for list_of_triples in op:

        #Some triple are simply empty. Ignore them.
        if len(list_of_triples) == 0:
            continue

        ### Mapping e_out_in's to relevant e_out's ###
    
        #Pick one triple from the list.
        e_out = list_of_triples[0][1].getUri()
        e_to_e_out = one_triple_right_map[e_out]   #Find the relevant property from the map
        
         #Given this information, lets create mappings of template four 
        for triple in list_of_triples:
            
            #Making the variables explicit (for the sake of readability)
            e_out_in = triple[0].getUri()
            e_out_in_to_e_out = triple[2]['object'].getUri()
            
            #Create a mapping (in keeping with the templates' placeholder names)
            mapping = { 'e_out_in':e_out_in, 'e_out_in_to_e_out': e_out_in_to_e_out, 'e_to_e_out':e_to_e_out, 'e_out': e_out }
        
            #Throw it to a function who will put it in the list with appropriate bookkeeping
            try:
                fill_specific_template( _template_id=4, _mapping=mapping, _debug=False)
                counter_template4 = counter_template4 + 1
                print str(counter_template4) , "tempalte4"
            except:
                print "check error stack"
                continue    
            if counter_template4 > 10:
                pass
#                 break

In [None]:
'''
    Testing the ability to create subgraph given a URI
    Testing the ability to generate sparql templates
'''
sparqls = {}
dbp =  db_interface.DBPedia(_verbose = True)
uri = 'http://dbpedia.org/resource/Bareilly'

#Generate the local subgraph
graph = get_local_subgraph(uri)

#Generate SPARQLS based on subgraph
fill_templates(graph,_uri=uri)

#Write the SPARQLs to disk in Pretty Print format
for i in range(1,5):
    with open('output/template%d.txt' % i, 'wt') as out:
        pprint(sparqls[i], stream=out)
for i in range(1,5):
    f = open('output/template%s.json' % i, 'wt')
    json.dump(sparqls[i],f)
    f.close()
print "DONE"

Traceback (most recent call last):
  File "<ipython-input-5-589e19919285>", line 56, in fill_specific_template
    sparqls[_template_id].append(template)
KeyError: 1


None
0 tempalte1
2
1 tempalte1
3
2 tempalte1
4
3 tempalte1
5
4 tempalte1
None
1 tempalte2
2
2 tempalte2
3
3 tempalte2
4
4 tempalte2
5
5 tempalte2


Traceback (most recent call last):
  File "<ipython-input-5-589e19919285>", line 56, in fill_specific_template
    sparqls[_template_id].append(template)
KeyError: 2


6
6 tempalte2
7
7 tempalte2
8
8 tempalte2
9
9 tempalte2
10
10 tempalte2
11
11 tempalte2
12
12 tempalte2
13
13 tempalte2
14
14 tempalte2
15
15 tempalte2
16
16 tempalte2
17
17 tempalte2
18
18 tempalte2
19
19 tempalte2
20
20 tempalte2
21
21 tempalte2
22
22 tempalte2
23
23 tempalte2
24
24 tempalte2
25
25 tempalte2
26
26 tempalte2
27
27 tempalte2
28
28 tempalte2
29
29 tempalte2
30
30 tempalte2
31
31 tempalte2
32
32 tempalte2
33
33 tempalte2
34
34 tempalte2
35
35 tempalte2
36
36 tempalte2
37
37 tempalte2
38
38 tempalte2
39
39 tempalte2
40
40 tempalte2
41
41 tempalte2
42
42 tempalte2
43
43 tempalte2
44
44 tempalte2
45
45 tempalte2
46
46 tempalte2
47
47 tempalte2
48
48 tempalte2
49
49 tempalte2
50
50 tempalte2
51
51 tempalte2
52
52 tempalte2
53
53 tempalte2
54
54 tempalte2
55
55 tempalte2
56
56 tempalte2
'\n            SELECT DISTINCT ?e ?p\n            WHERE {\n                ?e ?p <%(e)s>\n            }'


Traceback (most recent call last):
  File "<ipython-input-5-589e19919285>", line 56, in fill_specific_template
    sparqls[_template_id].append(template)
KeyError: 3


None
1 tempalte3
2
2 tempalte3
3
3 tempalte3
4
4 tempalte3
5
5 tempalte3
6
6 tempalte3
7
7 tempalte3
8
8 tempalte3
9
9 tempalte3
10
10 tempalte3
11
11 tempalte3
12
12 tempalte3
13
13 tempalte3
14
14 tempalte3
15
15 tempalte3
16
16 tempalte3
17
17 tempalte3
18
18 tempalte3
19
19 tempalte3
20
20 tempalte3
21
21 tempalte3
22
22 tempalte3
23
23 tempalte3
24
24 tempalte3
25
25 tempalte3
26
26 tempalte3
27
27 tempalte3
28
28 tempalte3
29
29 tempalte3
30
30 tempalte3
31
31 tempalte3
32
32 tempalte3
33
33 tempalte3
34
34 tempalte3
35
35 tempalte3
36
36 tempalte3
37
37 tempalte3
38
38 tempalte3
39
39 tempalte3
40
40 tempalte3
41
41 tempalte3
42
42 tempalte3
43
43 tempalte3
44
44 tempalte3
45
45 tempalte3
46
46 tempalte3
47
47 tempalte3
48
48 tempalte3
49
49 tempalte3
50
50 tempalte3
51
51 tempalte3
52
52 tempalte3
53
53 tempalte3
54
54 tempalte3
55
55 tempalte3
56
56 tempalte3
57
57 tempalte3
58
58 tempalte3
59
59 tempalte3
60
60 tempalte3
61
61 tempalte3
62
62 tempalte3
63
63 tempalte3
64
64 t

Traceback (most recent call last):
  File "<ipython-input-5-589e19919285>", line 56, in fill_specific_template
    sparqls[_template_id].append(template)
KeyError: 4



None
1 tempalte4
2
2 tempalte4
3
3 tempalte4
4
4 tempalte4
5
5 tempalte4
6
6 tempalte4
7
7 tempalte4
8
8 tempalte4
9
9 tempalte4
10
10 tempalte4
11
11 tempalte4
12
12 tempalte4
13
13 tempalte4
14
14 tempalte4
15
15 tempalte4
16
16 tempalte4
17
17 tempalte4
18
18 tempalte4
19
19 tempalte4
20
20 tempalte4
21
21 tempalte4
22
22 tempalte4
23
23 tempalte4
24
24 tempalte4
25
25 tempalte4
26
26 tempalte4
27
27 tempalte4
28
28 tempalte4
29
29 tempalte4
30
30 tempalte4
31
31 tempalte4
32
32 tempalte4
33
33 tempalte4
34
34 tempalte4
35
35 tempalte4
36
36 tempalte4
37
37 tempalte4
38
38 tempalte4
39
39 tempalte4
40
40 tempalte4
41
41 tempalte4
42
42 tempalte4
43
43 tempalte4
44
44 tempalte4
45
45 tempalte4
46
46 tempalte4
47
47 tempalte4
48
48 tempalte4
49
49 tempalte4
50
50 tempalte4
51
51 tempalte4
52
52 tempalte4
53
53 tempalte4
54
54 tempalte4
55
55 tempalte4
56
56 tempalte4
57
57 tempalte4
58
58 tempalte4
59
59 tempalte4
60
60 tempalte4
61
61 tempalte4
62
62 tempalte4
63
63 tempalte4
64
64 