# Sentence Retrieval

In [1]:
import json
import math
import nltk
from time import sleep
from tqdm import tqdm

def _load_json(path):
    """Parse the JSON file at `path` and close the file handle afterwards."""
    with open(path, 'r') as handle:
        return json.load(handle)


# The rest of the notebook runs on `dev`.
# NOTE(review): the original comment here read "dev ----> test", presumably a
# reminder to swap in `test` for the final run -- confirm.
train = _load_json("data/train.json")
dev = _load_json("data/dev.json")
test = _load_json("data/test.json")

In [2]:
from nltk.corpus import stopwords
# TODO: maybe use stemming instead of lemmatization?

# Shared NLP helpers used by every cell below.
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
word_tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()
# Stored as a set: the only use of `stopword` is the per-token `in` test,
# which is a linear scan on the list returned by stopwords.words().
# NOTE(review): words() is called without a language, so this presumably
# covers every language in the corpus -- pass 'english' if that is not intended.
stopword = set(stopwords.words())

def lemmatize(word):
    """Return the WordNet lemma of `word`.

    The word is first lemmatized as a verb; only if that leaves it
    unchanged is it lemmatized as a noun instead.
    """
    as_verb = lemmatizer.lemmatize(word, 'v')
    if as_verb != word:
        return as_verb
    return lemmatizer.lemmatize(word, 'n')

def doc_word_dict(doc):
    """Collect the set of lemmatized, lower-cased, non-stopword tokens
    found in all of `doc['sentences']`."""
    vocabulary = set()
    for sentence in doc['sentences']:
        lemmas = (lemmatize(token.lower())
                  for token in word_tokenizer.tokenize(sentence))
        vocabulary.update(lemma for lemma in lemmas if lemma not in stopword)
    return vocabulary

In [4]:
def get_BOW(sent):
    """Return a binary bag-of-words dict for `sent`.

    Keys are the lemmatized, lower-cased tokens that are not stopwords;
    every value is 1.
    """
    lemmas = (lemmatize(token.lower())
              for token in word_tokenizer.tokenize(sent))
    # Term frequency is deliberately dropped: each term counts once,
    # however often it occurs in the sentence.
    return dict((lemma, 1) for lemma in lemmas if lemma not in stopword)

def cal_BOW(doc):
    """Return one bag-of-words dict (see get_BOW) per sentence of `doc`,
    in sentence order."""
    return [get_BOW(sentence) for sentence in doc['sentences']]

In [5]:
def get_best_doc_num2(query):
    """Return the row indices of the two sentences closest to `query`.

    The query is vectorized with the module-level `vectorizer` and
    `transformer` (which must already be fitted on the current document)
    and compared by cosine distance against every row of the module-level
    `term_matrix`.

    Returns a tuple (first, second). Only distances strictly below 1 are
    eligible; when no row qualifies (e.g. the query shares no term with
    the document, or a distance is NaN) the corresponding index falls
    back to 0, so `second` may equal `first` in that case.
    """
    # Densify and flatten the query once instead of once per row.
    # 1-D inputs are what scipy's cosine() is specified for.
    query_vec = transformer.transform(
        vectorizer.transform(get_BOW(query))).toarray().ravel()

    distances = [
        cos_distance(query_vec, term_matrix[row].toarray().ravel())
        for row in range(term_matrix.shape[0])
    ]

    def closest_row(excluded):
        # First row (in index order) with the smallest distance below 1,
        # ignoring `excluded`; 0 when there is none.
        best_row, best_distance = 0, 1
        for row, distance in enumerate(distances):
            if row != excluded and best_distance > distance:
                best_row, best_distance = row, distance
        return best_row

    first = closest_row(None)
    second = closest_row(first)
    return first, second

In [97]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.spatial.distance import cosine as cos_distance

vectorizer = DictVectorizer()
transformer = TfidfTransformer(smooth_idf=False,norm='l2')

# Guessed answer sentences, one list per document:
# match_sent[doc][question] == (best_sentence_no, second_best_sentence_no)
match_sent = []
count = 0

for dev_doc in tqdm(dev, desc='Extracting sentences from documents'):
    count += 1
    # Both models are re-fitted on each document's sentences; term_matrix is
    # read as a global by get_best_doc_num2, so it must keep this name.
    sentence_counts = vectorizer.fit_transform(cal_BOW(dev_doc))
    term_matrix = transformer.fit_transform(sentence_counts)
    match_sent.append(
        [get_best_doc_num2(qa['question']) for qa in dev_doc['qa']])

Extracting sentences from documents: 100%|██████████| 40/40 [14:45<00:00, 16.74s/it]


In [98]:
# One set per document holding every sentence number that was guessed
# (as best or second-best match) for any of that document's questions.
mentioned_sent = []

for doc_pairs in match_sent:
    guessed_numbers = set()
    for pair in doc_pairs:
        # pair is (first, second); add both sentence numbers
        guessed_numbers.update(pair)
    mentioned_sent.append(guessed_numbers)

# Entity Extraction

In [117]:
from nltk.tag.stanford import StanfordNERTagger
# st = StanfordNERTagger('/Users/ZhangJiaWei/Downloads/stanford-ner-2016-10-31/classifiers/english.muc.7class.distsim.crf.ser.gz',
#                '/Users/ZhangJiaWei/Downloads/stanford-ner-2016-10-31/stanford-ner.jar') 
st = StanfordNERTagger('/usr/share/stanford-ner/classifiers/english.muc.7class.distsim.crf.ser.gz',
               '/usr/share/stanford-ner/stanford-ner.jar') 

# Tokenize and NER-tag only the sentences that were guessed for some question.
# Every other sentence is represented by an empty token list so that the
# sentence indices of ner_tag_1[doc] still line up with dev[doc]['sentences'].
ner_tag_1 = []
for doc_no in range(len(dev)):
    doc_tokens = []
    for sent_no, sentence in enumerate(dev[doc_no]['sentences']):
        if sent_no in mentioned_sent[doc_no]:
            sentence = sentence.replace('"', "")
            # Drop the sentence-final period. The previous test
            # `sentence[:-1] == "."` compared everything *except* the last
            # character with ".", so the period was never removed.
            if sentence.endswith("."):
                sentence = sentence[:-1]
            doc_tokens.append(word_tokenizer.tokenize(sentence))
        else:
            doc_tokens.append([])

    # One tagger call per document
    ner_tag_1.append(st.tag_sents(doc_tokens))

In [118]:
symbol_list = ["(",")","[","]"]

def hasSymbol(inputString):
    """Tell whether `inputString` contains a "joining" symbol.

    Only the first character that is neither a letter nor a digit matters:
    the result is True when that character exists and is not one of the
    brackets in `symbol_list`, and False otherwise (including when the
    string is purely alphanumeric or empty).
    """
    first_special = next(
        (char for char in inputString
         if not (char.isalpha() or char.isdigit())),
        None)
    return first_special is not None and first_special not in symbol_list

def hasNumbers(inputString):
    """Return True if `inputString` contains a digit or, failing that, one of
    the lower-case English number words listed below as a separate token."""
    number_words = (
        'one', 'two', 'three', 'four', 'five', 'six', 'seven',
        'eight', 'nine', 'eleven', 'twelve', 'thirteen',
        'fourteen', 'fifteen', 'sixteen',
        'ten', 'twenty', 'thirty', 'forty', 'fifty', 'sixty',
        'seventy', 'eighty', 'ninety', 'seventeen', 'eighteen',
        'nineteen',
    )
    for char in inputString:
        if char.isdigit():
            return True
    # No digit found: fall back to spelled-out numbers (case-sensitive match)
    return any(token in number_words
               for token in word_tokenizer.tokenize(inputString))

In [119]:
def combine_same_word(tag_list):
    """Re-join, in place, token triples that the tokenizer split around a symbol.

    `tag_list[doc][sentence]` is a list of (term, tag) tuples. Whenever three
    consecutive terms, concatenated without spaces, satisfy hasSymbol() and
    occur verbatim in the matching sentence of the global `dev`, they are
    replaced by a single tuple. Its tag is the first non-'O' tag among the
    third, second and first token (in that order), or "OTHER" if all are 'O'.
    After a merge the same position is examined again.
    """
    for doc_no, doc_tags in enumerate(tag_list):
        for sent_no, tokens in enumerate(doc_tags):
            pos = 0
            while pos < len(tokens) - 2:
                term, tag = tokens[pos]
                term_n, tag_n = tokens[pos + 1]
                term_n2, tag_n2 = tokens[pos + 2]
                joined = term + term_n + term_n2

                if not (hasSymbol(joined)
                        and joined in dev[doc_no]["sentences"][sent_no]):
                    pos += 1
                    continue

                merged_tag = next(
                    (t for t in (tag_n2, tag_n, tag) if t != 'O'), "OTHER")
                # Replace the three tokens by the merged one; pos is not
                # advanced so the merged token can be extended further.
                tokens[pos:pos + 3] = [(joined, merged_tag)]

In [120]:
def combine_entity(tag_list):
    """Merge, in place, neighbouring tokens that carry the same entity tag.

    Two adjacent (term, tag) tuples are merged when their tags are equal and
    not "O" and the second term is neither "," nor ";". The merged term is
    the plain concatenation if that string occurs in the matching sentence of
    the global `dev`, otherwise the two terms joined by a space. After a
    merge the same position is examined again, so longer runs collapse into
    one token.
    """
    for doc_no, doc_tags in enumerate(tag_list):
        for sent_no, tokens in enumerate(doc_tags):
            pos = 0
            while pos < len(tokens) - 1:
                term, tag = tokens[pos]
                term_n, tag_n = tokens[pos + 1]

                same_entity = (tag == tag_n and tag != "O"
                               and term_n not in (",", ";"))
                if not same_entity:
                    pos += 1
                    continue

                glued = term + term_n
                if glued in dev[doc_no]["sentences"][sent_no]:
                    merged_term = glued
                else:
                    merged_term = term + " " + term_n
                tokens[pos:pos + 2] = [(merged_term, tag)]

In [121]:
# Retag in place: ORGANIZATION tokens and capitalised untagged ('O') tokens
# become OTHER; anything hasNumbers() accepts becomes NUMBER.
def tune_other_and_number(tag_list):
    """Rewrite the tags of `tag_list[doc][sentence][token]` in place.

    A non-empty term is retagged "OTHER" when its tag is "ORGANIZATION", or
    when it is tagged 'O', starts with an upper-case character and the
    (term, tag) pair differs from the sentence's current first token.
    Independently of that, every term for which hasNumbers() is true ends up
    tagged "NUMBER".
    """
    for doc_tags in tag_list:           # each document
        for tokens in doc_tags:         # each sentence
            for pos in range(len(tokens)):  # each token
                term, tag = tokens[pos]
                if term != '':
                    # tokens[0] is read on every iteration because it may
                    # itself have been retagged when pos was 0.
                    capitalised_untagged = ((term, tag) != tokens[0]
                                            and tag == 'O'
                                            and term[0].isupper())
                    if tag == "ORGANIZATION" or capitalised_untagged:
                        tokens[pos] = (term, "OTHER")
                if hasNumbers(term):
                    tokens[pos] = (term, "NUMBER")

In [122]:
import copy
# Post-process a deep copy of the raw tagger output: the functions below
# modify their argument in place, and ner_tag_1 is left as the tagger
# produced it.
ner_tag = []
ner_tag = copy.deepcopy(ner_tag_1)

# 1. re-join token triples that were split around a symbol
combine_same_word(ner_tag)
# 2. merge neighbouring tokens that share an entity tag
combine_entity(ner_tag)
# 3. map ORGANIZATION / capitalised untagged tokens to OTHER, numbers to NUMBER
tune_other_and_number(ner_tag)
# 4. merge again -- presumably because step 3 can give neighbouring tokens
#    the same tag (OTHER / NUMBER) that they did not share before
combine_entity(ner_tag)

In [123]:
# tags = ["PERSON","LOCATION","NUMBER","OTHER","ORGANIZATION","PERCENT"]
tags = ["PERSON","LOCATION","NUMBER","OTHER"]


def create_entity(first, second, doc_index=None):
    """Group the entities of one tagged sentence by answer type.

    Parameters:
        first: sentence number (within the document) whose tokens are read
            from the global `ner_tag`.
        second: second-best sentence number; accepted for call compatibility
            but not used.
        doc_index: document number in `ner_tag`. When omitted, the global
            loop variable `i` of the calling cell is used, which is what this
            function always relied on implicitly.

    Returns:
        A dict with one key per entry of `tags`, each mapping to the list of
        distinct terms of that type in order of first appearance. Terms
        tagged DATE, TIME or PERCENT, and terms accepted by hasNumbers(),
        are filed under "NUMBER"; empty terms and all other tags are skipped.
    """
    if doc_index is None:
        # Legacy behaviour: fall back to the caller's module-level loop index.
        doc_index = i

    # One independent list per tag (dict.fromkeys(tags, []) would make every
    # key share the same list object).
    sent_tag_dict = dict((tag, []) for tag in tags)

    for term, tag in ner_tag[doc_index][first]:
        if term == '':
            continue
        if tag in tags:
            bucket = tag
        elif tag == "DATE" or tag == "TIME" or tag == "PERCENT" or hasNumbers(term):
            bucket = "NUMBER"
        else:
            continue
        # De-duplicate while keeping sentence order, so that downstream
        # tie-breaking does not depend on set iteration order.
        if term not in sent_tag_dict[bucket]:
            sent_tag_dict[bucket].append(term)
    return sent_tag_dict

def remove_tag(list_tag):
    """Return the terms of a list of (term, tag) tuples, in order,
    leaving out empty-string terms."""
    return [term for term, _tag in list_tag if term != '']

In [124]:
# Per document, per question:
#   question_sent_list -> tokens (tags stripped) of the best-matching sentence
#   entity_pool        -> entities of that sentence grouped by type
question_sent_list = []
entity_pool = []

# NOTE: the loop variable must stay named `i` at module level, because
# create_entity() looks up the current document through the global `i`.
for i in range(len(match_sent)):
    question_sent_list.append(
        [remove_tag(ner_tag[i][first]) for first, second in match_sent[i]])
    entity_pool.append(
        [create_entity(first, second) for first, second in match_sent[i]])

# Answer Ranking

In [125]:
import re
import nltk
import operator
from nltk import word_tokenize

def get_ranked_ans(entities_dic, question, sentence_token):
    """Score the candidate entities of a sentence and return the best one.

    Parameters:
        entities_dic: dict mapping an entity type to a list of entity strings.
        question: the question text.
        sentence_token: tokens of the retrieved sentence, passed on to
            get_preferred_entity().

    Returns:
        The highest-scoring entity, or '' when `entities_dic` holds no
        entity at all.

    Side effects (only when there is at least one entity): increments the
    global OTHER_count for questions typed 'OTHER' and writes the question
    type and the ranked answers to the global `output_file`.
    """
    global OTHER_count

    # Nothing to rank: return before touching any global state.
    if not any(len(values) != 0 for values in entities_dic.values()):
        return ''

    q_type = get_question_type(question)
    # count the number of 'OTHER' type for analysis
    if q_type == 'OTHER':
        OTHER_count += 1

    question_lower = question.lower()  # loop invariant
    tmp_rank = {}
    for ent_type, entities in entities_dic.items():
        for entity in entities:
            tmp_rank.setdefault(entity, 0)

            #TODO: this should be removed later to be handled in second section
            if ('%' in entity) and ('percentage' in question):
                tmp_rank[entity] += 50
            # 'hat year' matches both "What year" and "what year".
            # Raw string: '\d' is not a valid escape in a normal string literal.
            if re.match(r'\d{4}', entity) and ('hat year' in question):
                tmp_rank[entity] += 50

            # Entities that already appear in the question are ranked lowest.
            entity_lower = entity.lower()
            if (entity_lower in question_lower) or (entity_lower.replace('-', ' ') in question_lower):
                tmp_rank[entity] -= 999

        # Answers which match the question type should be ranked higher than those that don't
        if ent_type == q_type and ent_type != 'OTHER':
            for entity in entities:
                tmp_rank[entity] += 1
            ######## TODO: Apply this to all types?

        # Within each type, the entity favoured by get_preferred_entity()
        # gets one extra point.
        preferred_entity = get_preferred_entity(entities, sentence_token, question)
        if preferred_entity != None:
            tmp_rank[preferred_entity] += 1

    # sort and choose the best answer
    sorted_ans = sorted(tmp_rank.items(), key=operator.itemgetter(1), reverse=True)

    # log for error analysis
    output_file.write('Q_type: ' + '\t' + q_type + '\n')
    output_file.write('Ranked Answers: ' + '\t' + str(sorted_ans).encode('utf-8') + '\n\n')

    if len(sorted_ans) != 0:
        return sorted_ans[0][0]
    return ''

In [126]:
# A simple rule-based question type classifier based on key words 

def get_question_type(question):
    """Classify `question` as 'PERSON', 'LOCATION', 'NUMBER' or 'OTHER'.

    The rules are tried in the order listed and the first type that has any
    of its key words occurring as a case-sensitive substring of `question`
    wins; 'OTHER' is returned when nothing matches. Because matching is by
    substring, a key word also fires inside a longer word (e.g. "era" in
    "camera").
    """
    # TODO: HAND-CODED, NEED TO BE REFINED!!
    # TODO: need to low-case to compare?
    type_rules = (
        ('PERSON', ("Who", "who", "Whose", "whose", "Whom", "whom")),
        ('LOCATION', ("Where", "where", "area", "city", "province", "located",
                      "location")),
        ('NUMBER', ("When", "when", "few", "little", "much", "many", "size",
                    "young", "old", "long", "year", "years", "day", "era",
                    "early", "century", "population", "cost", "How far",
                    "how far", "sizes", "time", "month", "century", "percentage")),
    )

    for question_type, key_words in type_rules:
        if any(key_word in question for key_word in key_words):
            return question_type
    return 'OTHER'

In [127]:
# Sanity check: tag one sample sentence with the 'universal' tag set and with
# NLTK's default tag set, to compare the two outputs.
test_sent = "Digital cameras often use infrared blockers."
print  nltk.pos_tag(word_tokenizer.tokenize(test_sent),tagset='universal')
print
print  nltk.pos_tag(word_tokenizer.tokenize(test_sent))

[('Digital', u'NOUN'), ('cameras', u'NOUN'), ('often', u'ADV'), ('use', u'VERB'), ('infrared', u'ADJ'), ('blockers', u'NOUN'), ('.', u'.')]

[('Digital', 'NNP'), ('cameras', 'NNS'), ('often', 'RB'), ('use', 'VBP'), ('infrared', 'JJ'), ('blockers', 'NNS'), ('.', '.')]


In [128]:
# Among entities of the same type, the preferred entity is the one
# that is closest in the sentence to an open-class word from the
# question.
# ----> open-class words: nouns, verbs, adjectives, and adverbs.

import sys

def get_preferred_entity(entity_list, sentence_token, question):
    """Pick the entity lying closest to the question's open-class words.

    Parameters:
        entity_list: candidate entities; each must be an element of
            `sentence_token`.
        sentence_token: tokens of the retrieved sentence.
        question: the question text.

    Returns:
        The entity with the smallest summed token distance to all sentence
        positions holding an open-class word (ADJ, NOUN, VERB, ADV in the
        universal tag set) of the question; the first such entity on ties.
        An entity whose sum is 0 is treated as infinitely far away. Returns
        None when `entity_list` is empty.

    Raises:
        ValueError: if an entity is not found in `sentence_token`.
    """
    # Nothing to choose from: skip the POS tagging of the question.
    if not entity_list:
        return None

    question_tag = nltk.pos_tag(word_tokenize(question), tagset='universal')
    open_words = set(word for word, tag in question_tag
                     if tag in ('ADJ', 'NOUN', 'VERB', 'ADV'))
    # Sentence positions holding an open-class word of the question
    open_positions = [idx for idx, token in enumerate(sentence_token)
                      if token in open_words]

    def get_distance(entity):
        # Sum of the distances from the entity to every open-class position
        position = sentence_token.index(entity)
        total = sum(abs(position - idx) for idx in open_positions)
        if total == 0:
            # No open-class word found (or only at the entity itself):
            # rank behind every entity with a real distance.
            return float('inf')
        return total

    all_distance = [get_distance(entity) for entity in entity_list]
    return entity_list[all_distance.index(min(all_distance))]

In [129]:

# import sys

# def get_preferred_entity_2(entity_list, sentence_token, question):
#     preferred_entity = None
#     question_text = word_tokenize(question)
#     sentence_tag = nltk.pos_tag(sentence_token,tagset='universal')
#     question_tag = nltk.pos_tag(question_text,tagset='universal')
    
#     # initialize a list for comparing, and set all elements as 0
#     is_open_word = [0] * len(sentence_token)
#     # find an open word in the question
#     for word, tag in question_tag:
#         if tag in ['ADJ','NOUN','VERB','ADV']:
#             # if the open word appears in the sentence, then mark as 1
#             for i in range(len(sentence_token)):
#                 if sentence_token[i] == word:
#                     is_open_word[i] = 1
                    
# ##############################################################################

#     # find the entity which is closest to open-class words
#     def get_distance(entity):
#         # get the position of entity, and find the open class words
#         covered_OCW = 0
#         position = sentence_token.index(entity)
        
#         # find number of covered open-class words in a given range
#         #TODO: find the best window parameter
#         length = len(sentence_token)/3
#         window_min = position - length
#         window_max = position + length
#         # when touch the start or the end of the sentence
#         if window_min < 0:
#             window_min = 0
#             window_max += (0-window_min)
#         if window_max > (len(sentence_token) - 1):
#             window_min -= (window_max - len(sentence_token) + 1)
#             window_max = (len(sentence_token) - 1)
        
#         # get the total number of covered open-class words
#         for i in range(window_min, window_max + 1):
#             if is_open_word[i] == 1:            # find an open-class word
#                 covered_OCW += 1
                
#         if covered_OCW == 0:             # didn't find open-class words
#             covered_OCW = -(sys.maxint)
        
#         return covered_OCW
    
#     # get distance for each entity and choose the best one
#     all_distance = []
#     for entity in entity_list:
#         all_distance.append(get_distance(entity))
#         all_best_index = [i for i, x in enumerate(li) if x == max(li)]
        
#         ??? preferred_entity = entity_list[all_distance.index()]
    
    
#     return preferred_entity

In [None]:
num = 0
count = 0                 # questions answered (previously started at 1, inflating the recall denominator)
correct_sum = 0           # exact-match answers
corr_sen_retr_count = 0   # questions whose retrieved sentence contains the gold sentence
OTHER_count = 0           # incremented inside get_ranked_ans

with open("result.txt",'w') as output_file:
    for i in tqdm(range(len(match_sent)), desc='Answering'):
        for j in range(len(match_sent[i])):
            qa_pair = dev[i]["qa"][j]
            Q = qa_pair['question']
            cor_answer = qa_pair['answer']

            # get_ranked_ans writes its own diagnostics to the global output_file
            result = get_ranked_ans(entity_pool[i][j], Q, question_sent_list[i][j])
            output_file.write('Retrieved Entities: ' + '\t' + str(entity_pool[i][j]) + '\n\n')
            count += 1

            A_sentence = dev[i]["sentences"][qa_pair['answer_sentence']]
            sent_1, sent_2 = match_sent[i][j]
            guessed_sentence = dev[i]['sentences'][sent_1]         # + ' ' + dev[i]['sentences'][sent_2]

            if result == cor_answer:
                correct_sum += 1

            # The log lines are built for every question. They used to be
            # built only for wrong answers but written unconditionally, so a
            # correct answer logged the previous question's text (or raised
            # NameError when the very first answer was correct).
            if A_sentence not in guessed_sentence:
                output_file.write('==== WRONG SENTENCES! ==== \n' + 'Guessed_Sentence: ' + '\t' + guessed_sentence.encode('utf-8')+"\n\n")
                output_file.write('CORRECT_Sentence: ' + '\t' + A_sentence.encode('utf-8')+"\n\n")
            else:
                corr_sen_retr_count += 1
                output_file.write('Retrieved Sentence: ' + '\t' + guessed_sentence.encode('utf-8')+"\n\n")
            output_file.write('Q: ' + '\t' + Q.encode('utf-8') + '\n\n')
            output_file.write('='*60 + '\n')
            output_file.write('CORRECT_ANSWER: ' + '\t' + cor_answer.encode('utf-8') + '\n')
            output_file.write('GUESSED_ANSWER: ' + '\t' + result.encode('utf-8')+"\n")
            output_file.write('='*60 + '\n\n')

print('correct sum: ' + str(correct_sum))
print('Sentence Recall: ' + str((corr_sen_retr_count+0.0)/count if count else 0.0))
print("'OTHER': " + str(OTHER_count))

# Accuracy over every question in dev
num = sum(len(dev_doc["qa"]) for dev_doc in dev)
print((correct_sum+0.0)/num if num else 0.0)

Answering:  68%|██████▊   | 27/40 [01:15<00:37,  2.89s/it]

In [268]:
# # run on test data

# with open("result_to_kaggle.txt",'w') as output_file:
#     output_file.write('id,answer'+'\n')
#     for i in tqdm(range(len(match_sent)), desc='Answering'):
#         for j in range(len(match_sent[i])):
#             result = get_ranked_ans(entity_pool[i][j], dev[i]["qa"][j]['question'], question_sent_list[i][j])
#             result = result.encode('utf-8')
#             reuslt = result.replace('" ','')
#             result = result.replace('"','')
#             result = result.replace(",","-COMMA-")
#             q_id = dev[i]["qa"][j]['id']
#             output_file.write(str(q_id) + ',' + str(result) + '\n')

In [None]:
# demo_entity_list = [u'pre-set']
# demo_sentence = 'For example, for any pre-set emissivity value, objects with higher emissivity will appear hotter, \
#                     and those with a lower emissivity will appear cooler.'
# demo_sentence_token = demo_sentence.split()
# question = 'How will the infrared image of an object with a higher emissivity appear in relation to one with lower emissivity?'
