# Answer Ranking
After you have extracted a set of entities from your sentence, you will rank them to choose the best answer. The ranking should be based on three factors. First, answers whose content words all appear in the question should be ranked lowest. Second, answers which match the question type should be ranked higher than those that don't; for this, you should build a simple rule-based question type classifier based on key words (e.g. questions which contain "who" are about people). Third, among entities of the same type, the preferred entity should be the one which is closer in the sentence to an open-class word from the question.

In [332]:
import operator
import nltk

# A simple rule-based question type classifier based on key words 
def get_question_type(question):
    """Classify a question by the first key word it contains.

    The question is lower-cased and split into alphabetic words, which are
    scanned left to right. The first word that is a known key word decides
    the type, so "When did the man who left return?" is a NUMBER question
    and "Who told you?" is a PERSON question ("told" no longer matches the
    key word "old", because only whole words are compared).

    Args:
        question: the question text.

    Returns:
        'PERSON', 'LOCATION' or 'NUMBER' when a key word is found,
        otherwise 'OTHER'.
    """
    # Local import so this cell does not depend on another cell's imports.
    import re

    # TODO: HAND-CODED, NEED TO BE REFINED!!
    type_rules = (
        ('PERSON', ("who", "whose", "whom")),
        ('LOCATION', ("where",)),
        ('NUMBER', ("when", "few", "little", "much", "many",
                    "young", "old", "long", "year", "years")),
    )
    keyword_to_type = {}
    for question_type, key_words in type_rules:
        for key_word in key_words:
            keyword_to_type[key_word] = question_type

    # Whole-word, case-insensitive comparison; the earliest key word wins.
    for token in re.findall(r"[a-z]+", question.lower()):
        if token in keyword_to_type:
            return keyword_to_type[token]
    return 'OTHER'

In [333]:
from nltk import word_tokenize

# Among entities of the same type, the preferred entity is the one that is
# closest in the sentence to an open-class word from the question.
# ----> open-class words: nouns, verbs, adjectives, and adverbs.
def get_preferred_entity(entity_list, sentence, question):
    """Pick the entity nearest to an open-class question word in the sentence.

    Args:
        entity_list: candidate entity strings; an entity may span several
            tokens (e.g. 'Elon Musk').
        sentence: the sentence the entities were extracted from.
        question: the question being answered.

    Returns:
        The entity whose first occurrence in the tokenized sentence has the
        smallest token distance to a sentence token that also appears in the
        question tagged ADJ, NOUN, VERB or ADV. Ties go to the entity listed
        first. Entities that cannot be located in the sentence, or sentences
        without any matching open-class word, get a distance larger than any
        real one. Returns None when entity_list is empty.
    """
    if not entity_list:
        return None

    sentence_text = word_tokenize(sentence)
    question_tag = nltk.pos_tag(word_tokenize(question), tagset='universal')

    # Open-class words of the question, and where they occur in the sentence.
    open_words = set(word for word, tag in question_tag
                     if tag in ('ADJ', 'NOUN', 'VERB', 'ADV'))
    open_positions = [i for i, token in enumerate(sentence_text)
                      if token in open_words]

    # Sentinel distance: larger than any distance inside the sentence.
    not_found = len(sentence_text) + 1

    def get_span(entity):
        # Locate the entity as a run of consecutive sentence tokens, so that
        # multi-word entities are found too. Returns (first, last) or None.
        entity_tokens = word_tokenize(entity)
        width = len(entity_tokens)
        if width == 0:
            return None
        for start in range(len(sentence_text) - width + 1):
            if sentence_text[start:start + width] == entity_tokens:
                return start, start + width - 1
        return None

    def get_distance(entity):
        # Distance from the entity's edges to the nearest open-class word on
        # either side; the entity's own tokens are not counted.
        span = get_span(entity)
        if span is None:
            return not_found
        first, last = span
        gaps = [first - pos for pos in open_positions if pos < first]
        gaps += [pos - last for pos in open_positions if pos > last]
        return min(gaps) if gaps else not_found

    # min() keeps the first entity among equally distant ones.
    return min(entity_list, key=get_distance)
    

In [334]:
def get_ranked_ans(entities_dic, question, sentence):
    """Rank the extracted entities and return the best answer.

    Every entity starts with a score of 0, then:
      * -1 if the entity text appears in the question,
      * +1 if its type matches the question type,
      * +1 more for the entity of the matching type that
        get_preferred_entity selects (closest to an open-class question word).

    Args:
        entities_dic: mapping of entity type (e.g. 'PERSON') to a list of
            entity strings.
        question: the question text.
        sentence: the sentence the entities were extracted from.

    Returns:
        The highest-scoring entity, or '' when no entities were extracted.
    """
    # Nothing was extracted at all: there is no answer to rank.
    if not any(entities_dic.values()):
        return ''

    q_type = get_question_type(question)
    tmp_rank = {}
    for ent_type, entities in entities_dic.items():
        for entity in entities:
            # Register every entity so that the ranking is never empty.
            tmp_rank.setdefault(entity, 0)
            # Answers that already appear in the question are ranked lowest.
            if entity in question:
                tmp_rank[entity] -= 1

        # Answers which match the question type are ranked higher.
        if ent_type == q_type and entities:
            for entity in entities:
                tmp_rank[entity] += 1
            ######## TODO: Apply this to all types?
            # The entity closest to an open-class question word gets a bonus.
            preferred = get_preferred_entity(entities, sentence, question)
            if preferred is not None:
                tmp_rank[preferred] += 1

    # Sort by score, best first; the sort is stable, so ties keep dict order.
    sorted_ans = sorted(tmp_rank.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_ans[0][0]

In [335]:
# Demo: rank the entities of one sentence against several questions.
####### TODO: 'Jiawei Zhang' is not in the list
# (presumably the ValueError raised when a multi-word entity is looked up as a
# single token of the sentence; the "Who" question stays disabled until PERSON
# questions are confirmed to work)

demo_dict = {'PERSON':['Jiawei Zhang','Elon Musk'],
             'LOCATION':['Melbourne'],
             'NUMBER':['2020','2017'],
             'OTHER':['What The Hell']
            }
demo_sentence = 'Jiawei Zhang and Elon Musk went to Melbourne in early 2017 and will be back in 2020 What The Hell!'
demo_question_1 = 'Where did Jiawei Zhang and Elon Musk go to?'
demo_question_2 = 'When did Jiawei Zhang and Elon Musk go to Melbourne?'
demo_question_3 = 'When will Jiawei Zhang and Elon Musk be back from Melbourne?'
# demo_question_4 = 'Who did Jiawei Zhang go to Melbourne with?'

# Single-argument print(...) behaves the same on Python 2 and Python 3.
for demo_question in (demo_question_1, demo_question_2, demo_question_3):
    print('Q: ' + str(demo_question))
    print('A: ' + str(get_ranked_ans(demo_dict, demo_question, demo_sentence)) + '\n')

Q: Where did Jiawei Zhang and Elon Musk go to?
A: Melbourne

Q: When did Jiawei Zhang and Elon Musk go to Melbourne?
A: 2017

Q: When will Jiawei Zhang and Elon Musk be back from Melbourne?
A: 2020

