This file outlines the method for creating an **embedding based** question answering system.
The major steps are:
* Find the entity
* Create a set of core chain candidates
* Rank the core chain candidates

In [16]:
# imports (external library)
import json
import numpy as np
import requests
from pprint import pprint
import create_data_node as cdn
from utils import natural_language_utilities as nlutils

#### Entity linking

We employ [EARL](http://sda.cs.uni-bonn.de/projects/earl/), which returns a set of candidate entities for each span in the question; we keep only the top-ranked candidate.

In [17]:
def get_entities(question,show_internals=False,timeout=60):
    """
        Uses EARL to find all the entities present in the question.

        :param question: a natural language question.
        :param show_internals: when True, pretty-print the raw EARL response.
        :param timeout: seconds to wait for the EARL service before giving up.
        :return: list holding, for every span EARL typed as 'entity', the
                 second field of its top-ranked candidate (a DBpedia URI in
                 the sample run of this notebook).
        :raises requests.HTTPError: if EARL answers with an error status.
    """

    headers = {
        'Content-Type': 'application/json',
    }

    # Serialise with json.dumps so quotes/backslashes inside the question are
    # escaped; hand-formatting the body produced invalid JSON for such input.
    data = json.dumps({"nlquery": question})
    response = requests.post('http://asknow02.sda.tech/earl/api/processQuery',
                             headers=headers, data=data, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    earl_result = json.loads(response.content)
    if show_internals:
        pprint(earl_result)

    entity_list = []
    for position, er_type in enumerate(earl_result['ertypes']):
        if er_type != 'entity':
            continue
        # 'rerankedlists' is keyed by the span position rendered as a string.
        candidates = earl_result['rerankedlists'][str(position)]
        if candidates:
            entity_list.append(candidates[0][1])  # keep only the top most one.
    return entity_list

# Smoke test of the entity linker; needs network access to the EARL service.
print(get_entities('Who is the president of India ?',False))

['http://dbpedia.org/resource/India']


#### Sub Graph creation.

In [18]:
# predicate blacklist to exclude some of the meta data information
# The file holds one predicate per line. Only the line terminator is stripped,
# so a file with or without a trailing newline yields the same list, and the
# handle is closed as soon as the list is built.
with open('resources/predicate.blacklist') as blacklist_file:
    pb = [line.rstrip('\n') for line in blacklist_file]


cd_node = cdn.CreateDataNode(_predicate_blacklist=pb, _relation_file={}, _qald=False)

Label Cache not found. Creating a new one


Traceback (most recent call last):
  File "/home/gaurav/codes/QA-Tutorial/utils/dbpedia_interface.py", line 137, in __init__
    self.labels = pickle.load(open('resources/labels.pickle'))
TypeError: a bytes-like object is required, not 'str'


In [19]:
# Build the one-hop and two-hop core chain candidates around a sample entity.
hop1, hop2 = cd_node.create_subgraph.subgraph(
    ['http://dbpedia.org/resource/Michael_Crichton'],
    [],
    _use_blacklist=True,
    _qald=False,
)

# Debug preview of the candidates; disabled by default, flip to True to inspect.
if False:
    print('few examples of hop1 candidates')
    pprint(hop1[:5])
    print('few examples of hop2 candidates')
    pprint(hop2[:5])

#### Scoring function.

In [33]:
def get_vector(query, timeout=30):
    """
        Fetch the embedding of a piece of text from the local vector service.

        Sends the text to http://localhost:3500/vec and averages the returned
        array over its first axis (presumably one row per token -- confirm
        against the service).

        :param query: text to embed (a question or a predicate label).
        :param timeout: seconds to wait for the service before giving up.
        :return: numpy array, the mean over axis 0 of the service response.
        :raises requests.HTTPError: if the service answers with an error status.
    """
    query_json = {'question':query}
    response = requests.get("http://localhost:3500/vec", json=query_json, timeout=timeout)
    # Surface HTTP failures here rather than as a confusing JSON/shape error.
    response.raise_for_status()
    vectors = np.asarray(response.json())
    # np.float was only an alias of the builtin float and was removed in
    # NumPy 1.24; np.float64 is the dtype it always resolved to.
    return np.mean(vectors.astype(np.float64), axis=0)


def assign_score(core_chain,question):
    """
        Score a core chain candidate against a question by the cosine
        similarity of their embeddings.

        :param core_chain: ['+', 'http:.../abc'] for a single hop, or
                           ['+', 'http:../abc', '-', 'http:../pqr'] for two hops.
        :param question: the natural language question.
        :return: cosine similarity as a numpy float; 0.0 when either embedding
                 has zero length (cosine similarity is undefined there).
    """
    if len(core_chain) == 2:
        # corechain looks like ['+', 'http:.../abc']
        predicate = nlutils.get_label_via_parsing(core_chain[1],lower=True)
    else:
        # corechain looks like ['+' 'http:../abc', '-', 'http:../pqr']
        # predicate becomes a string separated by space -- 'abc pqr'
        predicate = " ".join([nlutils.get_label_via_parsing(core_chain[1],lower=True),
                              nlutils.get_label_via_parsing(core_chain[3],lower=True)])

    question_vector = get_vector(query=question)
    predicate_vector = get_vector(query=predicate)

    # Test the norms, not the sum of components: a vector such as [1, -1]
    # sums to zero yet is a perfectly valid non-zero embedding.
    denominator = np.linalg.norm(predicate_vector) * np.linalg.norm(question_vector)
    if denominator == 0:
        return np.float64(0.0)
    return np.dot(predicate_vector, question_vector) / denominator
    

#### Loading dataset. 

The dataset consists of 2000 questions whose queries contain no RDF constraints and are neither COUNT nor ASK queries; each one is a simple single-hop or two-hop query.

In [34]:
# Use a context manager so the file handle is closed once the JSON is parsed.
with open('resources/dataset_with_paths.json') as dataset_file:
    dataset = json.load(dataset_file)


# query = {'question':'Who is the president of India ?'}
# v = requests.get("http://localhost:3500/vec", json=query)
# v = np.asarray(v.json())

#### Evaluation

Code snippets for evaluating a generated SPARQL query against the ground-truth SPARQL query.

In [35]:
def sparql_answer(sparql,dbi=None):
    '''
        Executes the sparql on dbpedia and returns answer as a list.

        :param sparql: SPARQL which will be executed.
        :param dbi: The dbpedia interface object which can be used for accessing dbpedia.
                    When omitted (or falsy) the module-level `dbp` is used, so `dbp`
                    must already be defined before calling without this argument.
        :return: list of distinct answers gathered from every key of the result
                 returned by `dbi.get_answer`; the order is not meaningful.
    '''
    if not dbi:
        dbi = dbp
    interface_test_answer = dbi.get_answer(sparql)
    # Collect straight into a set: same distinct values as concatenating the
    # per-key lists and de-duplicating, without re-copying the list each time.
    distinct_answers = set()
    for key in interface_test_answer:
        distinct_answers.update(interface_test_answer[key])
    return list(distinct_answers)


def _evaluate_sparqls_(test_sparql, true_sparql, type, ground_type,dbp):
    # @TODO: If the type of test and true are differnt code would return an error.
    """
        Fmeasure for ask and count are 0/1.
        Also assumes the variable to be always uri.
        :param test_sparql: SPARQL generated by the pipeline
        :param true_sparql: True SPARQL
        :param type: COUNT/ASK/LIST
        :return: f1,precision,recall
    """
    test_answer = sparql_answer(test_sparql,dbp)
    true_answer = sparql_answer(true_sparql,dbp)
    total_retrived_resutls = len(test_answer)
    total_relevant_resutls = len(true_answer)
    common_results = total_retrived_resutls - len(list(set(test_answer ) -set(true_answer)))
    if total_retrived_resutls == 0:
        precision = 0
    else:
        precision = common_results *1.0 /total_retrived_resutls
    if total_relevant_resutls == 0:
        recall = 0
    else:
        recall = common_results *1.0 /total_relevant_resutls
    if precision == 0 and recall == 0:
        f1 = 0
    else:
        f1 = (2.0 * (precision * recall)) / (precision + recall)
    return f1 ,precision ,recall

In [37]:
# Score one candidate (index 1 of the 'hop1' list of the first dataset record)
# against a sample question. get_vector contacts http://localhost:3500/vec, so
# the embedding service must be running for this cell to work.
core_chain = dataset[0]['hop1'][1]
question = 'Who is the president of India ?'
assign_score(core_chain,question)

here


0.4739299003444137

In [32]:
# Inspect the candidate core chain.
core_chain

['+', 'http://dbpedia.org/ontology/office']