# Sentence Retrieval

In [1]:
import json
import math
import nltk
from time import sleep
from tqdm import tqdm

train_file = open("data/train.json",'r')
dev_file = open("data/dev.json",'r')
test_file=open("data/test.json",'r')
train = json.loads(train_file.read())
dev = json.loads(dev_file.read())
test = json.loads(test_file.read())

In [2]:
import math
from six import iteritems
from six.moves import xrange


# BM25 parameters.
PARAM_K1 = 0.0
PARAM_B = 1.0
EPSILON = 0.0

class BM25(object):

    def __init__(self, corpus):
        self.corpus_size = len(corpus)
        self.avgdl = sum(map(lambda x: float(len(x)), corpus)) / self.corpus_size
        self.corpus = corpus
        self.f = []
        self.df = {}
        self.idf = {}
        self.initialize()

    def initialize(self):
        for document in self.corpus:
            frequencies = {}
            for word in document:
                if word not in frequencies:
                    frequencies[word] = 0
                frequencies[word] += 1
            self.f.append(frequencies)

            for word, freq in iteritems(frequencies):
                if word not in self.df:
                    self.df[word] = 0
                self.df[word] += 1

        for word, freq in iteritems(self.df):
            self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)

    def get_score(self, document, index, average_idf):
        score = 0
        for word in document:
            if word not in self.f[index]:
                continue
            idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf
            score += (idf * self.f[index][word] * (PARAM_K1 + 1)
                      / (self.f[index][word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.corpus_size / self.avgdl)))
        return score

    def get_scores(self, document, average_idf):
        scores = []
        for index in xrange(self.corpus_size):
            score = self.get_score(document, index, average_idf)
            scores.append(score)
        return scores


def get_bm25_weights(corpus):
    bm25 = BM25(corpus)
    average_idf = sum(map(lambda k: float(bm25.idf[k]), bm25.idf.keys())) / len(bm25.idf.keys())

    weights = []
    for doc in corpus:
        scores = bm25.get_scores(doc, average_idf)
        weights.append(scores)
    return weights

In [3]:
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
word_tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma

def tokenization(sent):
    word_dict_sent = []
    
    try:
        token = nltk.pos_tag(word_tokenizer.tokenize(sent.decode('utf-8')))
    except:
        token = nltk.pos_tag(word_tokenizer.tokenize(sent))
        
    for word,tag in token:
        if tag in ["CD","RP","TO","FW","JJ","JJR","JJS","NN","NNS","NNP","NNPS","PDT",
                   "RB","RBR","RBS","VB","VBD","VBG","VBN","VBP","VBZ"]:
            word = lemmatize(word.lower())
            word_dict_sent.append(word)
    return word_dict_sent


def get_corpus(doc):
    corpus = []
    for sent in doc:
        corpus.append(tokenization(sent))
    return corpus

In [4]:
question_tfidf = [] 

for doc in dev:
    bm25Model = BM25(get_corpus(doc["sentences"]))
    average_idf = sum(map(lambda k: float(bm25Model.idf[k]), bm25Model.idf.keys())) / len(bm25Model.idf.keys())
    question_tfidf_doc = []
    for ques in doc["qa"]:
        scores = bm25Model.get_scores(tokenization(ques["question"]), average_idf)
        question_tfidf_doc.append(scores)
    question_tfidf.append(question_tfidf_doc)

In [5]:
import operator
import copy
question_index_rank = copy.deepcopy(question_tfidf)

for i in range(len(question_index_rank)):
    for j in range(len(question_index_rank[i])):
        question = question_index_rank[i][j]
        rank = {}
        for index in range(len(question)):
            rank[index] = question[index]
        question = sorted(rank.items(), key=operator.itemgetter(1),reverse=True)
        question_index_rank[i][j] = []
        for index,value in question:
            question_index_rank[i][j].append(index)

In [6]:
# def get_best_doc_num2(query):
#     query =  transformer.transform(vectorizer.transform(get_BOW(query)))
#     result={}
#     for x in range(term_matrix.shape[0]):
#          result[x]=cos_distance(query.toarray(),term_matrix[x].toarray())
            
#     minvalue=1
#     first=0
#     for item in result:
#         if minvalue > result[item]:
#             minvalue=result[item]
#             first=item     
#     del result[first]
    
#     minvalue=1
#     second=0
#     for item in result:
#         if minvalue > result[item]:
#             minvalue=result[item]
#             second=item     
#     return first,second

In [7]:
# from sklearn.feature_extraction import DictVectorizer
# from sklearn.feature_extraction.text import TfidfTransformer
# from scipy.spatial.distance import cosine as cos_distance

# vectorizer = DictVectorizer()
# transformer = TfidfTransformer(smooth_idf=False,norm=None)

# # store guessed question sentence no
# match_sent= [] #[[(best_match_sent_no,second_match_sent_no),...][second doc]]
# count = 0

# for dev_doc in tqdm(dev, desc='Extracting sentences from documents'):
#     count += 1
#     doc_match_sent = []
#     term_matrix = transformer.fit_transform(vectorizer.fit_transform(cal_BOW(dev_doc)))
#     for qa in dev_doc['qa']:
#         doc_match_sent.append(get_best_doc_num2(qa['question']))
#     match_sent.append(doc_match_sent)

In [8]:
# cor_sent = []
# cor_num = 0
# total = 0
# for i in range(len(dev)):
#     cor_per_doc = []
#     for j in range(len(dev[i]["qa"])):
#         cor_per_doc.append(dev[i]["qa"][j]["answer_sentence"])
# #         total += 1
#         if dev[i]["qa"][j]["answer_sentence"] in question_index_rank[i][j][:1]:
#             cor_num += 1
#     cor_sent.append(cor_per_doc)
# print (cor_num+0.0)/total

In [9]:
def first_n(question_index_rank,n):
    match_sent = []
    for i in range(len(question_index_rank)):
        match_sent.append([])
        for j in range(len(question_index_rank[i])):
            match_sent[i].append(question_index_rank[i][j][:n])
    return match_sent

match_sent = first_n(question_index_rank,4)

# Entity Extraction

In [10]:
from nltk.tag.stanford import StanfordNERTagger
st = StanfordNERTagger('/Users/ZhangJiaWei/Downloads/stanford-ner-2016-10-31/classifiers/english.muc.7class.distsim.crf.ser.gz',
               '/Users/ZhangJiaWei/Downloads/stanford-ner-2016-10-31/stanford-ner.jar') 
# st = StanfordNERTagger('/Users/Luna/Downloads/stanford-ner/classifiers/english.muc.7class.distsim.crf.ser.gz',
#                '/Users/Luna/Downloads/stanford-ner/stanford-ner.jar') 

# tokenize sentence
ner_tag_1 = []
for i in range(len(dev)):
    doc_tag = []
    for j in range(len(dev[i]['sentences'])):
        sentence = dev[i]['sentences'][j]
        if '"' in sentence:
            sentence = sentence.replace('"',"")
        if sentence[:-1]==".":
            sentence = sentence[:-1]
        try:
            doc_tag.append(word_tokenizer.tokenize(sentence.decode("utf-8")))
        except:
            doc_tag.append(word_tokenizer.tokenize(sentence))
            
    doc_tag = st.tag_sents(doc_tag)
    ner_tag_1.append(doc_tag)

In [11]:
symbol_list_discard = ["(",")","[","]"]
symbol_list = [",","-","/",".",u'\u2013',u'\u002C',u'\u002E',u'\u2215']

def hasSymbol(inputString):
    
    for char in symbol_list_discard:
        if char in inputString:
            return False
    for char in symbol_list:
        if char in inputString:
            return True
    return False

def hasNumbers(inputString):
    if any((char.isdigit() or ('$' in char)) for char in inputString):
        return True
    else:
        for word in word_tokenizer.tokenize(inputString):
            if word in ['one', 'two', 'three', 'four', 'five', 'six', 'seven',
                  'eight', 'nine', 'eleven', 'twelve', 'thirteen',
                  'fourteen', 'fifteen', 'sixteen',
                  'ten', 'twenty', 'thirty', 'forty', 'fifty', 'sixty',
                  'seventy', 'eighty', 'ninety', 'seventeen', 'eighteen',
                  'nineteen',
                  'nm','Hz','millions','million','hundred',u'\xb0C',u'\xb0F']:
                return True
    return False

In [12]:
def combine_same_word(tag_list):
    for k in range(len(tag_list)):
        for i in range(len(tag_list[k])):
            j = 0
            while j < len(tag_list[k][i])-2:
                term,tag = tag_list[k][i][j]
                term_n,tag_n = tag_list[k][i][j+1]
                term_n2,tag_n2 = tag_list[k][i][j+2]
                tmp = term+term_n+term_n2
                
                if hasSymbol(tmp) and tmp in dev[k]["sentences"][i]:
                    if tag_n2 != 'O':
                        temp =  (tmp,tag_n2)
                    elif tag_n != 'O':
                        temp =  (tmp,tag_n)
                    elif tag != 'O':
                        temp = (tmp,tag)
                    else:
                        temp = (tmp,"OTHER")
                    tag_list[k][i][j] = temp
                    del tag_list[k][i][j+2]
                    del tag_list[k][i][j+1]
                    j -= 1
                j += 1

In [13]:
def combine_entity(tag_list):
    for k in range(len(tag_list)):
        for i in range(len(tag_list[k])):
            j = 0
            while j < len(tag_list[k][i])-1:
                term,tag = tag_list[k][i][j]
                term_n,tag_n = tag_list[k][i][j+1]
                tmp = term+term_n
                
                if tag == tag_n and tag != "O" and term_n!="," and term_n !=";":
                    if tmp in dev[k]["sentences"][i]:
                        temp =  (tmp,tag)
                    else:
                        temp =  (term + " " + term_n,tag)
                    tag_list[k][i][j] = temp
                    del tag_list[k][i][j+1]
                    j -= 1
                j += 1

In [14]:
# turn O and ORGANIZATION into OTHER; digit into NUMBER

def tune_other_and_number(tag_list):
    for i in range(len(tag_list)): # each document
        for j in range(len(tag_list[i])): # each sentence
            for k in range(len(tag_list[i][j])): # each question
                term,tag = tag_list[i][j][k] 
                if term!='' and len(term)>0 and (term,tag)!=tag_list[i][j][0] and tag == 'O' and term[0].isupper():
                    tag_list[i][j][k] = (term,"OTHER")
                if  hasNumbers(term) and (tag == "O" or tag =="OTHER"):
                    tag_list[i][j][k] = (term,"NUMBER")

In [15]:
stopword = ["of"]
tags = ["LOCATION","ORGANIZATION","OTHER"]

def link_stopword(tag_list):
    for k in range(len(tag_list)):
        for i in range(len(tag_list[k])):
            j = 0
            while j < len(tag_list[k][i])-2:
                term,tag = tag_list[k][i][j]
                term_n,tag_n = tag_list[k][i][j+1]
                term_n2,tag_n2 = tag_list[k][i][j+2]
                tmp = term+" "+term_n+" "+term_n2
                
                if term_n in stopword and (tag in tags and tag_n2 in tags):
                    if tag_n2 != 'OTHER':
                        temp =  (tmp,tag_n2)
                    elif tag != 'OTHER':
                        temp = (tmp,tag)
                    else:
                        temp = (tmp,"OTHER")
                    tag_list[k][i][j] = temp
                    del tag_list[k][i][j+2]
                    del tag_list[k][i][j+1]
                    j -= 1
                j += 1

In [16]:
import copy
ner_tag = []
ner_tag = copy.deepcopy(ner_tag_1)

combine_same_word(ner_tag)
combine_entity(ner_tag)
tune_other_and_number(ner_tag)
combine_entity(ner_tag)
link_stopword(ner_tag)

In [17]:
tags = ["PERSON","LOCATION","NUMBER","OTHER","ORGANIZATION","PERCENT","DATE"]

def create_entity(first,second):
    sent_tag_dict = dict.fromkeys(tags,[])
    for k in [first]:
        for j in ner_tag[i][k]:
            term,tag = j
            if term =='' or len(term) == 1:
                continue
            if tag in tags:
                sent_tag_dict[tag] = sent_tag_dict[tag]+ [term]
            elif tag == "TIME":
                sent_tag_dict["NUMBER"] = sent_tag_dict["NUMBER"]+ [term]
    for tag in tags:
        sent_tag_dict[tag] = list(set(sent_tag_dict[tag]))
    return sent_tag_dict

def remove_tag(list_tag):
    list_tmp = []
    for tup in list_tag:
        term,tag = tup
        if term != '':
            list_tmp.append(term)
    return list_tmp
def remove_tag(list_tag):
    list_tmp = []
    for tup in list_tag:
        term,tag = tup
        if term != '':
            list_tmp.append(term)
    return list_tmp

In [18]:
question_sent_list = []
entity_pool = []

for i in range(len(match_sent)):
    doc = dev[i]
    sent_pool = doc['sentences']
    test_list_doc = []
    entity_pool_doc = []
    
    for ques in match_sent[i]:
        test_list_sent = []
        entity_pool_sent = []
        for index in ques:
            test_list_sent.append(remove_tag(ner_tag[i][index]))
            entity_pool_sent.append(create_entity(index,i))
        test_list_doc.append(test_list_sent)
        entity_pool_doc.append(entity_pool_sent)
        
    question_sent_list.append(test_list_doc)
    entity_pool.append(entity_pool_doc)

# Answer Ranking

In [26]:
import csv
import nltk
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import SVC
word_tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()
vectorizer = DictVectorizer()
svm = SVC(kernel='linear', C=1)

lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma

import ast

def train():
    f1 = open('output.csv')
    csv1 = csv.DictReader(f1)
    texts = []
    target = []

    for item in csv1:

        BOW = {}
        try:
            question = item["new"]
            questionx = ast.literal_eval(question)
            # print x[0]
            # print type(x)
            if questionx == [] or questionx == None:
                continue
            for word in questionx:
                BOW[word] = BOW.get(word, 0) + 1
            texts.append(BOW)
            target.append(item['Type'])
        except:
            continue

    brown_matrix = vectorizer.fit_transform(texts).toarray()
    svm.fit(brown_matrix, target)
train()
def predict(sentenct):
    BOW = {}
    for word in word_tokenizer.tokenize(sentenct):
        word=lemmatize(word)
        BOW[word] = BOW.get(word, 0) + 1
#     print vectorizer.transform([BOW])
    sentenctlist=vectorizer.transform([BOW]).toarray()
    return svm.predict(sentenctlist)

In [66]:
import re
import nltk
import operator
from nltk import word_tokenize

def get_ranked_ans(entities_dic, question, sentence_token):
    # identify if the entity set is empty. If True, return nothing
    tmp_rank = {}
    is_empty = True
   
    for values in entities_dic.values():
        if len(values) != 0:
            is_empty = False
            
    q_type = predict(question)[0]
    if is_empty == False:
        resultTpye={"PERSON":{},"LOCATION":{},"NUMBER":{},"OTHER":{},"ORGANIZATION":{},"PERCENT":{},"DATE":{}}
        entitiesList=[]
        for ent_type,entities in entities_dic.items():
            for entity in entities:
                resultTpye[ent_type][entity]=0
                if (entity.lower() in question.lower()) or (entity.lower().replace('-', ' ') in question.lower()):
                    resultTpye[ent_type][entity] -= 999
                entitiesList.append(entity)
                
        preferred_entity = get_preferred_entity_list(entitiesList, sentence_token, question)
        if not preferred_entity==None and not preferred_entity== {}:
          
            maxvalue=max([i for i in preferred_entity.values()]) 
            if maxvalue!=0:
                for ent_type in resultTpye:
                    for word in resultTpye[ent_type]:
                        try:
                            resultTpye[ent_type][word] += round(preferred_entity[word]*1.00,2)/maxvalue
                        except:
                            continue

        if q_type =='PERSON':
            resultTpye=setWeight(resultTpye,[1,0.4,0.2,0.4,0.8,0.2,0.2])
                
        elif q_type =='ORGANIZATION':
            resultTpye=setWeight(resultTpye,[0.9,0.5,0.2,0.5,1,0.2,0.2])
        
        elif q_type =='LOCATION':
            resultTpye=setWeight(resultTpye,[0.8,1,0.2,0.5,0.7,0.2,0.2])
                
        elif q_type =='PERCENT':
            resultTpye=setWeight(resultTpye,[0.8,0.5,0.8,0.4,0.7,1,0.6])
                
        elif q_type =='DATE':
            
            resultTpye=setWeight(resultTpye,[0.8,0.5,0.5,0.5,0.7,0.7,1])
            for item in resultTpye['DATE']:
                if bool(re.match('(?<!\d)\d{4}[s]?(?!\d)', item)) and ('year' in question):
                    resultTpye['DATE'][item]+=1
                    
        elif q_type =='NUMBER':
            resultTpye=setWeight(resultTpye,[0.8,0.5,1,0.5,0.7,0.5,0.5])
            if ('percentage' in question) or ('percent' in question):
                 for item in resultTpye['PERCENT']:
                    resultTpye['PERCENT'][item]+=1
        
        # sort and choose the best answer
        sorted_ans = sorted(tmp_rank.items(), key=operator.itemgetter(1), reverse=True)
        
        # log for error analysis
#         output_file.write('Q_type: ' + '\t' + q_type + '\n')
#         output_file.write('Ranked Answers: ' + '\t' + str(resultTpye) + '\n\n')
      
        best=0
        best_ans=''
        for ent_type in resultTpye:
            for word in resultTpye[ent_type]:
                if resultTpye[ent_type][word] > best:
                    best_ans=word
                    best=resultTpye[ent_type][word]
        
        if q_type =='DATE' and ('year' in question):
            regex = re.compile(r'(?<!\d)\d{4}[s]?(?!\d)')
            year = regex.findall(best_ans)
            if year != []:
                best_ans = year[0]
                
        print entitiesList
        
        return best_ans, best, entitiesList
    else:
        return '', 0, []

In [67]:
def setWeight(resultTpye,weight):
            for item in resultTpye['PERSON']:
                resultTpye['PERSON'][item]+=weight[0]
            for item in resultTpye['LOCATION']:
                resultTpye['LOCATION'][item]+=weight[1]
            for item in resultTpye['NUMBER']:
                resultTpye['NUMBER'][item]+=weight[2]
            for item in resultTpye['OTHER']:
                resultTpye['OTHER'][item]+=weight[3]
            for item in resultTpye['ORGANIZATION']:
                resultTpye['ORGANIZATION'][item]+=weight[4]
            for item in resultTpye['PERCENT']:
                resultTpye['PERCENT'][item]+=weight[5]
            for item in resultTpye['DATE']:
                resultTpye['DATE'][item]+=weight[6]
            return resultTpye

In [68]:
def get_question_type(question):
    # TODO: HAND-CODED, NEED TO BE REFINED!!
    # TODO: need to low-case to compare?

    result = predict(question)[0]
    q_type=''
    if result=='TIME':
        q_type=='NUMBER'
    else:
        q_type == result
#     print q_type
    return q_type

In [None]:
def get_preferred_entity_list(entity_list, sentence_token, question):
    preferred_entity_list = []
    question_text = word_tokenize(question)
    sentence_tag = nltk.pos_tag(sentence_token)
    question_tag = nltk.pos_tag(question_text)
    
    # initialize a list for comparing, and set all elements as 0
    is_open_word = [0] * len(sentence_token)
    # find an open word in the question
    for word, tag in question_tag:
        if tag in ['JJ','JJR','JJS','FW','NN','NNS','NNPS','NNP','VBP','VB',
                   'VBG','VBN','VBZ','VBP','RB','RBR','RBS']:
            # if the open word appears in the sentence, then mark as 1
            for i in range(len(sentence_token)):
                if sentence_token[i] == word:
                    is_open_word[i] = 1
                    
# --------------------------------------------------------------------

    # find the entity which is closest to open-class words
    def get_distance(entity):
        # get the position of entity, and find the open class words
        covered_OCW = 0
        position = sentence_token.index(entity)
        
        # find number of covered open-class words in a given range
        #TODO: find the best window parameter
        length = len(sentence_token)/5
        window_min = position - length
        window_max = position + length 
        # when touch the start or the end of the sentence
        if window_min < 0:
            window_min = 0
            window_max += (0-window_min)
        if window_max > (len(sentence_token) - 1):
            window_min -= (window_max - len(sentence_token) + 1)
            window_max = (len(sentence_token) - 1)
        
        # get the total number of covered open-class words
        for i in range(window_min, window_max + 1):
            if is_open_word[i] == 1:            # find an open-class word
                covered_OCW += 1
        return covered_OCW
    
    # get distance for each entity and choose the best one
    all_distance = {}
    for entity in entity_list:
        try:
            all_distance[entity]=get_distance(entity)
        except:
            continue

    return all_distance

In [None]:
# run on development data

with open("result.txt",'w') as output_file:
#     limit = 0
    all_count = 0
    correct_sum = 0
    corr_sen_retr_count = 0
    wrong_but_in_pool = 0
#     no_entity_in_first_sent = 0

#     print '='*78
#     print 'Use top %d sentences: ' % (limit + 1)
    for i in tqdm(range(len(match_sent)), desc='Answering'):
        for j in range(len(match_sent[i])):
            result = ''
            backoff = 0

            all_count += 1
            cor_answer = dev[i]["qa"][j]['answer']
            Q = dev[i]["qa"][j]['question']
            A_sentence = dev[i]["sentences"][dev[i]["qa"][j]['answer_sentence']]

            result = get_ranked_ans(entity_pool[i][j][0], Q, question_sent_list[i][j][0])
            guessed_sentence = dev[i]['sentences'][match_sent[i][j][0]]
            guessed_entities = entity_pool[i][j][0]

            
#             if result == '':
#                 no_entity_in_first_sent += 1
            
#             # If didn't got answer in current sentence, then backoff to next candidate sentence
#             # until getting the correct answer
            
#             while (result == '') and (backoff < limit):
#                 backoff += 1
#                 result = get_ranked_ans(entity_pool[i][j][backoff], Q, question_sent_list[i][j][backoff])
#                 guessed_sentence = dev[i]['sentences'][match_sent[i][j][backoff]]
#                 guessed_entities = entity_pool[i][j][backoff]

            result1, score1, pool1 = get_ranked_ans(entity_pool[i][j][0], Q, question_sent_list[i][j][0])
            result2, score2, pool2 = get_ranked_ans(entity_pool[i][j][1], Q, question_sent_list[i][j][1])
            
            if score2 * 0.2 > score1:
                result = result2
                if (result != cor_answer) and (result in pool2):
                    wrong_but_in_pool += 1
            else:
                result = result1
                if (result != cor_answer) and (result in pool1):
                    wrong_but_in_pool += 1
#             output_file.write('Retrieved Entities: ' + '\t' + str(guessed_entities) + '\n\n')

            if result == cor_answer:
                correct_sum += 1
                if A_sentence == guessed_sentence:
                    corr_sen_retr_count += 1
            else:
                guess_print = 'guessed sentence: ' + '\t' + guessed_sentence.encode('utf-8')+"\n\n"
                wrong_1 = '======== WRONG SENTENCES! ======== \n' + 'guessed sentence: ' + '\t' + guessed_sentence.encode('utf-8')+"\n\n"
                wrong_2 = 'CORRECT SENTENCE: ' + '\t' + A_sentence.encode('utf-8')+"\n\n"
                Q_print = 'Q: ' + '\t' + Q.encode('utf-8') + '\n\n'
                corr_ans_print = 'CORRECT ANSWER: ' + '\t' + cor_answer.encode('utf-8') + '\n'
                guessed_ans_print = 'guessed answer: ' + '\t' + result.encode('utf-8')+"\n"

                if A_sentence != guessed_sentence:
                    output_file.write(wrong_1)
                    output_file.write(wrong_2)
                else:
                    corr_sen_retr_count += 1
                    output_file.write(guess_print)
                output_file.write(Q_print)
                output_file.write('='*60 + '\n')
                output_file.write(corr_ans_print)
                output_file.write(guessed_ans_print)
                output_file.write('='*60 + '\n\n')

    print 'correct sum: ' + str(correct_sum)
    print 'Sentence Recall: ' + str((corr_sen_retr_count + 0.0) / all_count)
#     print "%d No.1 ranked sentences has no entity extracted at all." % no_entity_in_first_sent
    print 'Wrong but in pool: ', wrong_but_in_pool
    print "【 SCORE : " + str((correct_sum + 0.0) / all_count) + ' 】\n\n'



Answering:   0%|          | 0/40 [00:00<?, ?it/s][A[A

[u'near-infrared', u'Night-vision']
[u'near-infrared', u'Night-vision']
[u'in-the-dark']
[u'sensor-equipped', u'red-shifted']
[u'sensor-equipped', u'red-shifted']
[u'sensor-equipped', u'red-shifted']
[u'sensor-equipped', u'red-shifted']
[u'William Herschel', u'early 19th century']
[u'thermal-imaging']
[u'thermal-imaging']
[u'near-infrared', u'Night-vision']
[u'4000\u2013400', u'\u22121,', u'mid-infrared']
[u'700 nm']
[u'700 nm']
[u'Near-infrared', u'far-infrared']
[u'780 nm', u'near-IR', u'IR', u'IR LED', u'e.g']
[u'780 nm', u'near-IR', u'IR', u'IR LED', u'e.g']
[u'780 nm', u'near-IR', u'IR', u'IR LED', u'e.g']
[u'780 nm', u'near-IR', u'IR', u'IR LED', u'e.g']
[u'700 nm']
[u'780 nm', u'near-IR', u'IR', u'IR LED', u'e.g']
[u'780 nm', u'near-IR', u'IR', u'IR LED', u'e.g']
[u'1050 nm', u'near-IR']
[u'780 nm', u'near-IR', u'IR', u'IR LED', u'e.g']
[u'780 nm', u'near-IR', u'IR', u'IR LED', u'e.g']
[u'IR-glowing', u'IR', u'Wood', u'IR-filter', u'IR-passing']
[u'25', u'Wien']
[u'pre-set']
[u'



Answering:   2%|▎         | 1/40 [00:05<03:48,  5.85s/it]

[u'IR-glowing', u'IR', u'Wood', u'IR-filter', u'IR-passing']
[u'Gu\xe5h\xe5n', u'Chamorro', u'Territory of Guam', u'United States', u'IPA']
[u'Gu\xe5h\xe5n', u'Chamorro', u'Territory of Guam', u'United States', u'IPA']
[u'five', u'one', u'American', u'Pacific Ocean', u'Guam']
[u':20\u201321', u'The']
[u':20\u201321', u'The']
[u'one', u'subscriber-based', u'CareJet', u'Guam']
[u'161,785', u'Guam', u'2015']
[u'161,785', u'Guam', u'2015']
[u'129', u'three', u'Ladrones', u'Islas', u'Those', u'Islands']


[A[A

[u'Hag\xe5t\xf1a', u'Dededo']
[u'Hag\xe5t\xf1a', u'Dededo']
[u'2016', u'Guam Regional Medical City']
[u'Proas', u'Islands', u'Velas Latinas', u'Guam Islas', u'Lateen', u'Magellan']
[u'Proas', u'Islands', u'Velas Latinas', u'Guam Islas', u'Lateen', u'Magellan']
[u'18', u'one', u'Antonio Pigafetta', u'Island of Sails', u'Magellan']
[u'4,000', u'Chamorros', u'Guam']
[u'4,000', u'Chamorros', u'Guam']
[u'18', u'one', u'Antonio Pigafetta', u'Island of Sails', u'Magellan']
[u'4,000', u'Chamorros', u'Guam']
[u'4,000', u'Chamorros', u'Guam']
[u'three', u'Chamorro', u'Mexican', u'Spanish', u'Guam', u'Northern Marianas']
[u'European', u'March 6, 1521', u'Ferdinand Magellan']
[u'European', u'March 6, 1521', u'Ferdinand Magellan']
[u'Portuguese', u'European', u'Guam', u'King of Spain', u'March 6, 1521', u'Ferdinand Magellan']
[u'Diego Luis de San Vitores', u'Catholic', u'1668']
[u'Diego Luis de San Vitores', u'Catholic', u'1668']
[u'4,000', u'Chamorros', u'Guam']
[u'Spanish\u2013American War', u'Un


[A

[u'Apolinario Mabini', u'Emilio Aguinaldo', u'Following', u'Sumay', u'Philippine\u2013American War', u'1901.:vi', u'Guam', u'1899', u'1901.:13', u'Navy', u'Piti', u'U.S']
[u'three', u'American', u'World War II', u'Pacific Ocean', u'Guam', u'Samoa', u'Hawaii', u'the Philippines']
[u'Chamorros', u'Japanese', u'Guam', u'Northern Marianas']
[u'Chamorros', u'Japanese', u'Guam', u'Northern Marianas']
[u':77\u201378', u'Quiroga', u'Guam']
[u'Guamanian Chamorros', u'Japanese']
[u'Guamanian Chamorros', u'Japanese']
[u'Chamorros', u'Guamanian Chamorros', u'Northern Marianas']
[u'30', u'Chamorros', u'Japan', u'Northern Mariana Chamorros']
[u'30', u'Chamorros', u'Japan', u'Northern Mariana Chamorros']
[u'108-A', u'23', u'Executive Order', u'American', u'Japan', u'Philippines', u'Germany', u'Northern Mariana Islands', u'December 1898', u'Navy', u'U.S']
[u'Guam Organic Act', u'U.S', u'World War II', u'United States', u'Guam', u'1950']
[u'Guam Organic Act', u'U.S', u'World War II', u'United States', 

[u'Republic', u'United States', u'Federated States of Micronesia', u'Pacific Islands', u'The Compacts of Free Association', u'Marshall Islands', u'Trust Territory', u'Republic of Palau']
[u'Republic', u'United States', u'Federated States of Micronesia', u'Pacific Islands', u'The Compacts of Free Association', u'Marshall Islands', u'Trust Territory', u'Republic of Palau']
[u'Guam']
[u'U.S', u'Spanish', u'Guam', u'1899']
[u'U.S', u'Spanish', u'Guam', u'1899']
[u'Guam']
[u'Guam']
[u'Democratic', u'Republican', u'Guam']
[u'96910\u201396932', u'Postal System', u'GU', u'U.S', u'ZIP', u'Guam']
[u'96910\u201396932', u'Postal System', u'GU', u'U.S', u'ZIP', u'Guam']
[u'5.0', u'8.7', u'Guam']
[u'FedEx', u'Guam', u'DHL', u'UPS']
[u'FedEx', u'Guam', u'DHL', u'UPS']
[u'96910\u201396932', u'Postal System', u'GU', u'U.S', u'ZIP', u'Guam']
[u'Guam']
[u'Guam']
[u'Fourth-class', u'Hawaii']
[u'two', u'Priority', u'first-class']
[u'two', u'Priority', u'first-class']
[u'Guam']
[u'U.S', u'Spanish', u'Guam',



Answering:   5%|▌         | 2/40 [00:13<04:08,  6.53s/it]

[u'Tamuning', u'Government of Guam', u'Guam Memorial Hospital']
[u'Tamuning', u'Government of Guam', u'Guam Memorial Hospital']
[u'December', u'June']
[u'Tamuning', u'Government of Guam', u'Guam Memorial Hospital']
[u'Tamuning', u'Government of Guam', u'Guam Memorial Hospital']
[u'Proas', u'Islands', u'Velas Latinas', u'Guam Islas', u'Lateen', u'Magellan']
[u'2016', u'Guam Regional Medical City']
[u'2016', u'Guam Regional Medical City']
[u'United States', u'Guam']
[u'405-line', u'4:3.', u'240-line']


[A[A

[u'405-line', u'4:3.', u'240-line']
[u'30', u'August 1936']
[u'30', u'August 1936']
[u'30', u'August 1936']
[u'WHD-TV', u'HDTV', u'HD Model Station', u'D.C', u'United States', u'Washington', u'Raleigh', u'North Carolina', u'July 31, 1996', u'July 23, 1996', u'WRAL-TV', u'NBC', u'WRC-TV', u'WRAL-HD']
[u'8K', u'4k', u'5k', u'last.In', u'HDTV', u'HD', u'20th century', u'21st century']
[u'8K', u'4k', u'5k', u'last.In', u'HDTV', u'HD', u'20th century', u'21st century']
[u'720p', u'1080i', u'720', u'750-line', u'SMPTE', u'HDTV', u'According', u'US', u'1920', u'1280', u'ITU']
[u'8K', u'4k', u'5k', u'last.In', u'HDTV', u'HD', u'20th century', u'21st century']
[u'8K', u'4k', u'5k', u'last.In', u'HDTV', u'HD', u'20th century', u'21st century']
[u'819', u'higher-resolution', u'France', u'1949']
[u'8K', u'4k', u'5k', u'last.In', u'HDTV', u'HD', u'20th century', u'21st century']
[u'8K', u'4k', u'5k', u'last.In', u'HDTV', u'HD', u'20th century', u'21st century']
[u'1080p 24, 1080i 30, 1080i 25,', u'

[u'Japanese', u'high-definition', u'1979', u'NHK']
[u'60 Hz', u'Color', u'1972', u'1125', u'NHK']
[u'Japanese', u'high-definition', u'1979', u'NHK']
[u'Japanese', u'high-definition', u'1979', u'NHK']
[u'Japanese', u'MUSE', u'United States', u'1981']
[u'BS-9ch', u'1991', u'November 25, 1994', u'1989', u'NHK']
[u'BS-9ch', u'1991', u'November 25, 1994', u'1989', u'NHK']
[u'four', u'1080i', u'sub-Nyquist', u'NTSC', u'Multiple', u'MUSE', u'Hi-Vision', u'1125']
[u'BS-9ch', u'1991', u'November 25, 1994', u'1989', u'NHK']
[u'BS-9ch', u'1991', u'November 25, 1994', u'1989', u'NHK']
[u'Euro1080', u'HD1', u'Concert', u'New Year', u'Belgian', u'Vienna', u'January 1, 2004']
[u'four', u'1080i', u'sub-Nyquist', u'NTSC', u'Multiple', u'MUSE', u'Hi-Vision', u'1125']
[u'four', u'1080i', u'sub-Nyquist', u'NTSC', u'Multiple', u'MUSE', u'Hi-Vision', u'1125']
[u'BS-9ch', u'1991', u'November 25, 1994', u'1989', u'NHK']
[u'Japanese', u'MUSE', u'United States', u'1981']
[u'Japanese', u'MUSE', u'United States',

[u'30', u'August 1936']
[u'Ronald Reagan', u'President', u'HDTV', u'MUSE', u'Washington', u'US']
[u'Ronald Reagan', u'President', u'HDTV', u'MUSE', u'Washington', u'US']
[u'United States', u'Sarnoff', u'1980s', u'1993', u'Philips', u'General Instrument', u'AT&T Bell Labs', u'Digital HDTV Grand Alliance', u'Zenith', u'Massachusetts Institute of Technology', u'Thomson']
[u'United States', u'Sarnoff', u'1980s', u'1993', u'Philips', u'General Instrument', u'AT&T Bell Labs', u'Digital HDTV Grand Alliance', u'Zenith', u'Massachusetts Institute of Technology', u'Thomson']
[u'United States', u'Sarnoff', u'1980s', u'1993', u'Philips', u'General Instrument', u'AT&T Bell Labs', u'Digital HDTV Grand Alliance', u'Zenith', u'Massachusetts Institute of Technology', u'Thomson']
[u'HD1', u'direct-to-home HDTV', u'New Year', u'Day', u'Europe', u'September 2003', u'IBC']
[u'United States', u'Sarnoff', u'1980s', u'1993', u'Philips', u'General Instrument', u'AT&T Bell Labs', u'Digital HDTV Grand Alliance',

[u'Euro1080', u'kick-start HDTV', u'HDTV', u'Alfacam', u'Belgian TV', u'pan-European', u'HD', u'HD TVs', u'Europe']
[u'D-book', u'DTG', u'European', u'United Kingdom', u'December 2009', u'DVB-T2', u'Digital TV Group']
[u'D-book', u'DTG', u'European', u'United Kingdom', u'December 2009', u'DVB-T2', u'Digital TV Group']
[u'27%', u'20 million', u'200', u'185 million', u'60 million', u'16 million', u'TV', u'TVs', u'Satellite Monitor', u'HDTV', u'European HD', u'HD', u'European', u'Europe', u'2010', u'SES', u'Astra']
[u'D-book', u'DTG', u'European', u'United Kingdom', u'December 2009', u'DVB-T2', u'Digital TV Group']
[u'D-book', u'DTG', u'European', u'United Kingdom', u'December 2009', u'DVB-T2', u'Digital TV Group']
[u'360', u'Windows Media Center HTPC', u'PlayStation', u'Zune', u'HD', u'Xbox', u'Blu-ray', u'Sony', u'Microsoft', u'Netflix']
[u'D-book', u'DTG', u'European', u'United Kingdom', u'December 2009', u'DVB-T2', u'Digital TV Group']
[u'D-book', u'DTG', u'European', u'United Kingdom

[u'1080p 24, 1080i 30, 1080i 25,', u'720p', u'1080p', u'1080i', u'30', u'Blu-ray Disc', u'progressive-scan', u'Non-cinematic HDTV', u'high-definition', u'Internet']
[u'1080p 24, 1080i 30, 1080i 25,', u'720p', u'1080p', u'1080i', u'30', u'Blu-ray Disc', u'progressive-scan', u'Non-cinematic HDTV', u'high-definition', u'Internet']
[u'27%', u'20 million', u'200', u'185 million', u'60 million', u'16 million', u'TV', u'TVs', u'Satellite Monitor', u'HDTV', u'European HD', u'HD', u'European', u'Europe', u'2010', u'SES', u'Astra']
[u'1080p 24, 1080i 30, 1080i 25,', u'720p', u'1080p', u'1080i', u'30', u'Blu-ray Disc', u'progressive-scan', u'Non-cinematic HDTV', u'high-definition', u'Internet']
[u'1080p 24, 1080i 30, 1080i 25,', u'720p', u'1080p', u'1080i', u'30', u'Blu-ray Disc', u'progressive-scan', u'Non-cinematic HDTV', u'high-definition', u'Internet']
[u'1,080', u'1080p', u'709', u'1080i', u'16', u'BT.709-2', u'ITU-R Recommendation ITU-R', u'( Rec']
[u'ATSC', u'US', u'2009']
[u'ATSC', u'US',



Answering:   8%|▊         | 3/40 [00:30<05:55,  9.62s/it][A[A


[u'United States', u'Boston', u'Commonwealth of Massachusetts']
[u'United States', u'Boston', u'Commonwealth of Massachusetts']
[u'Boston', u'Massachusetts']
[u'24th', u'124 km2', u'48', u'655,884', u'United States', u'New England', u'2014']
[u'24th', u'124 km2', u'48', u'655,884', u'United States', u'New England', u'2014']
[u'29%', u'57%', u'25%', u'Protestant', u'Roman Catholic', u'Christians', u'2014', u'Pew Research Center']
[u'24th', u'124 km2', u'48', u'655,884', u'United States', u'New England', u'2014']
[u'24th', u'124 km2', u'48', u'655,884', u'United States', u'New England', u'2014']
[u'46,226', u'4.7', u'12 km2', u'Boston']
[u'24th', u'124 km2', u'48', u'655,884', u'United States', u'New England', u'2014']
[u'24th', u'124 km2', u'48', u'655,884', u'United States', u'New England', u'2014']
[u'29%', u'57%', u'25%', u'Protestant', u'Roman Catholic', u'Christians', u'2014', u'Pew Research Center']
[u'24th', u'124 km2', u'48', u'655,884', u'United States', u'New England', u'2014

[u'Paul Revere', u'Bunker Hill', u'American Revolution', u'Siege of Boston', u'Boston', u'Lexington', u'Concord', u'Boston Massacre', u'Boston Tea Party']
[u'1.2 million', u'2 million', u'Boston']
[u'Embargo Act', u'War', u'Boston', u'1807', u'1812', u'Napoleonic Wars']
[u'Embargo Act', u'War', u'Boston', u'1807', u'1812', u'Napoleonic Wars']
[u'four', u'130', u'British', u'Indian Wars', u'French', u'North America']
[u'Embargo Act', u'War', u'Boston', u'1807', u'1812', u'Napoleonic Wars']
[u'Embargo Act', u'War', u'Boston', u'1807', u'1812', u'Napoleonic Wars']
[u'Boston']
[u'Embargo Act', u'War', u'Boston', u'1807', u'1812', u'Napoleonic Wars']
[u'Embargo Act', u'War', u'Boston', u'1807', u'1812', u'Napoleonic Wars']
[u'mid-19th century']
[u'mid-19th century']
[u'mid-19th century']
[u'American', u'Great Britain']
[u'one', u'leather-goods', u'Boston', u'20th century']
[u'one', u'leather-goods', u'Boston', u'20th century']
[u'Northeast Corridor', u'Back Bay', u'South Station', u'Chicago

[u'264', u'two', u'three', u'Chechen Islamist', u'Boston Marathon', u'April 15, 2013']
[u'264', u'two', u'three', u'Chechen Islamist', u'Boston Marathon', u'April 15, 2013']
[u'42.2', u'26.2-mile', u'Patriots', u'Boston Marathon', u'April', u"' Day"]
[u'two', u'three', u'April 15, 2013']
[u'two', u'three', u'April 15, 2013']
[u'264', u'two', u'three', u'Chechen Islamist', u'Boston Marathon', u'April 15, 2013']
[u'20.4%', u'28.8%', u'18', u'65']
[u'20.4%', u'28.8%', u'18', u'65']
[u'0.7%', u'4.9%', u'1.8%', u'4.2%', u'1.1%', u'1.0%', u'25,648', u'10,850', u'30,506', u'5,961', u'6,649', u'4,451', u'Colombian', u'Guatemalan', u'Mexican', u'Dominican', u'Salvadoran', u'Boston', u'Puerto Rican']
[u'46.0', u'89.6', u'106.7 km2', u'41.2', u'125.4 km2', u'232.1 km2', u'48.4', u'54.0', u'Boston']
[u'46.0', u'89.6', u'106.7 km2', u'41.2', u'125.4 km2', u'232.1 km2', u'48.4', u'54.0', u'Boston']
[u'24th', u'124 km2', u'48', u'655,884', u'United States', u'New England', u'2014']
[u'46.0', u'89.6',

[u'13 percent', u'American']
[u'21.9%', u'20.4%', u'10.1%', u'33.2%', u'14.3%', u'24', u'25', u'20', u'19', u'44', u'45', u'64', u'65']
[u'21.9%', u'20.4%', u'10.1%', u'33.2%', u'14.3%', u'24', u'25', u'20', u'19', u'44', u'45', u'64', u'65']
[u'13 percent', u'American']
[u'21.9%', u'20.4%', u'10.1%', u'33.2%', u'14.3%', u'24', u'25', u'20', u'19', u'44', u'45', u'64', u'65']
[u'21.9%', u'20.4%', u'10.1%', u'33.2%', u'14.3%', u'24', u'25', u'20', u'19', u'44', u'45', u'64', u'65']
[u'13 percent', u'American']
[u'2.1%', u'1%', u'Future', u'Five', u'Future Best City', u'Boston', u'2008', u'2009', u'2000']
[u'2.1%', u'1%', u'Future', u'Five', u'Future Best City', u'Boston', u'2008', u'2009', u'2000']
[u'million', u'U.S']
[u'Boston']
[u'Boston']
[u'United States']
[u'Boston']
[u'Boston']
[u'16.0%', u'21.4%']
[u'540', u'544', u'Full-time year-round', u'full-time year-round']
[u'540', u'544', u'Full-time year-round', u'full-time year-round']
[u'451-bed', u'full-service', u'Chinatown', u'Bost

[u'one', u'South Church', u'United States', u'Boston']
[u'Old South Church', u'Park Street Church', u'Basilica', u'1733', u'1723', u'1809', u'1874', u'Christ Church', u'Jubilee Christian Church', u'Old North Church', u'Trinity Church', u'Shrine of Our Lady of Perpetual Help on Mission Hill (1878).']
[u'30', u'Boston']
[u'30', u'Boston']
[u'one', u'United States']
[u'mid-19th century']
[u'mid-19th century']
[u'30', u'1970s']
[u'12th-largest', u'sixth-largest', u'Greater Boston']
[u'12th-largest', u'sixth-largest', u'Greater Boston']
[u'Boston']
[u'12th-largest', u'sixth-largest', u'Greater Boston']
[u'12th-largest', u'sixth-largest', u'Greater Boston']
[u'Boston']
[u'Boston College', u'Boston Conservatory', u'Boston University', u'Harvard Medical School', u'Wentworth Institute of Technology', u'Northeastern University', u'Berklee College of Music']
[u'Boston College', u'Boston Conservatory', u'Boston University', u'Harvard Medical School', u'Wentworth Institute of Technology', u'Northea

[u'School', u'Design', u'Museum of Fine Arts', u'United States', u'Boston', u'New England Institute of Art', u'Massachusetts College of Art', u'New England School of Art and Design (Suffolk University), Longy School of Music of Bard College', u'New England Conservatory', u'Lesley University College of Art']
[u'United States', u'Boston', u'US', u'1635', u'Boston Latin School']
[u'Boston', u'Boston Conservatory', u'Berklee College of Music']
[u'Boston', u'Boston Conservatory', u'Berklee College of Music']
[u"'Neill,", u'Catholics', u'Irish', u'Kennedys', u'Fitzgerald', u'Tip O', u'Boston', u'early 20th century', u'John F']
[u'Boston']
[u'Boston']
[u'11', u'20', u'North Atlantic', u'spring']
[u'Boston', u'Cambridge', u'Charles River', u'Harvard University']
[u'Boston', u'Cambridge', u'Charles River', u'Harvard University']
[u'School', u'Design', u'Museum of Fine Arts', u'United States', u'Boston', u'New England Institute of Art', u'Massachusetts College of Art', u'New England School of Ar

[u'The Graves', u'Lovells Island', u'Little Calf Island', u'Green Island', u'Rainsford Island', u'Spectacle Island', u'Little Brewster Island', u'Long Island', u'Thompson Island', u'Gallops Island', u'Boston Harbor', u'Outer Brewster Island', u'Middle Brewster Island', u'Great Brewster Island', u'Calf Island', u'Shag Rocks', u'Boston Harbor Islands National Recreation Area', u'Nixes Mate']
[u'Freedom Trail']
[u'Freedom Trail']
[u'multi-family', u'single-family']
[u'Diller Scofidio + Renfro', u'Seaport District', u'Institute of Contemporary Art']
[u'Diller Scofidio + Renfro', u'Seaport District', u'Institute of Contemporary Art']
[u'School', u'Design', u'Museum of Fine Arts', u'United States', u'Boston', u'New England Institute of Art', u'Massachusetts College of Art', u'New England School of Art and Design (Suffolk University), Longy School of Music of Bard College', u'New England Conservatory', u'Lesley University College of Art']
[u'one', u'Athenaeum', u'Museum', u'United States', u'

[u'Central Artery', u'Big Dig', u'Boston', u'2006', u'Neill Tunnel']
[u'two', u'FleetCenter', u'since-demolished', u'North Station', u'Boston Garden', u'Boston Bruins', u'TD Garden', u'Boston Celtics of the National Basketball Association', u'National Hockey League']
[u'two', u'FleetCenter', u'since-demolished', u'North Station', u'Boston Garden', u'Boston Bruins', u'TD Garden', u'Boston Celtics of the National Basketball Association', u'National Hockey League']
[u'Emerald Necklace', u'Frederick Law Olmsted', u'Boston Public Garden']
[u'two', u'FleetCenter', u'since-demolished', u'North Station', u'Boston Garden', u'Boston Bruins', u'TD Garden', u'Boston Celtics of the National Basketball Association', u'National Hockey League']
[u'two', u'FleetCenter', u'since-demolished', u'North Station', u'Boston Garden', u'Boston Bruins', u'TD Garden', u'Boston Celtics of the National Basketball Association', u'National Hockey League']
[u'17,565', u'18,624']
[u'17,565', u'18,624']
[u'17,565', u'18

[u'Thomas P', u'John F', u'District of Massachusetts', u'Kennedy Federal Office Building', u'Federal Reserve Bank of Boston', u'United States Court of Appeals', u'Neill Federal Building', u'First Circuit', u'United States District Court']
[u'Thomas P', u'John F', u'District of Massachusetts', u'Kennedy Federal Office Building', u'Federal Reserve Bank of Boston', u'United States Court of Appeals', u'Neill Federal Building', u'First Circuit', u'United States District Court']
[u"'Neill,", u'Catholics', u'Irish', u'Kennedys', u'Fitzgerald', u'Tip O', u'Boston', u'early 20th century', u'John F']
[u'Thomas P', u'John F', u'District of Massachusetts', u'Kennedy Federal Office Building', u'Federal Reserve Bank of Boston', u'United States Court of Appeals', u'Neill Federal Building', u'First Circuit', u'United States District Court']
[u'Thomas P', u'John F', u'District of Massachusetts', u'Kennedy Federal Office Building', u'Federal Reserve Bank of Boston', u'United States Court of Appeals', u'

[u'Medical Center', u'Elizabeth', u'Center', u'Brighton']
[u'Medical Center', u'Elizabeth', u'Center', u'Brighton']
[u'451-bed', u'full-service', u'Chinatown', u'Boston', u'Somerville', u'Medford', u'Floating Hospital for Children', u'Tufts Medical Center', u'Tufts University']
[u'Massachusetts', u'Boston Public Health Commission']
[u'Massachusetts', u'Boston Public Health Commission']
[u'Boston']
[u'Boston']
[u'Joslin Diabetes Center', u'Hospital', u'Academic Area', u'Children', u'Women', u'Fenway district', u'Boston', u'Dana-Farber Cancer Institute', u'Brigham', u'Massachusetts College of Pharmacy and Health Sciences', u'Harvard Medical School', u'Beth Israel Deaconess Medical Center', u'Longwood Medical']
[u'Academic Area', u'Massachusetts General Hospital', u'Longwood Medical', u'Harvard Medical School']
[u'Academic Area', u'Massachusetts General Hospital', u'Longwood Medical', u'Harvard Medical School']
[u'Joslin Diabetes Center', u'Hospital', u'Academic Area', u'Children', u'Wome



Answering:  10%|█         | 4/40 [00:58<09:03, 15.11s/it]

[u'Hubway', u'Brookline', u'Cambridge', u'Somerville', u'summer 2012']
[u'140,000', u'Hubway', u'July 2011']
[u'140,000', u'Hubway', u'July 2011']
[u'20 million', u'Faneuil Hall']
[u'Prohibition', u'1920s']
[u'Prohibition', u'1920s']
[u'Olmstead', u'Fourth Amendment', u'United States', u'1927', u'FBI', u'United States Supreme Court']
[u'Olmstead', u'Fourth Amendment', u'United States', u'1927', u'FBI', u'United States Supreme Court']
[u'Olmstead', u'Fourth Amendment', u'United States', u'1927', u'FBI', u'United States Supreme Court']
[u'U.S Supreme Court']


[A[A

[u'Communications Act', u'Prohibition', u'non-consensual', u'1934', u'Congress']
[u'Communications Act', u'Prohibition', u'non-consensual', u'1934', u'Congress']
[u'Boggs', u'Congress']
[u'Nardone', u'United States', u'1939', u'1934', u'FBI']
[u'Nardone', u'United States', u'1939', u'1934', u'FBI']
[u'Olmstead', u'Fourth Amendment', u'United States', u'1927', u'FBI', u'United States Supreme Court']
[u'Olmstead', u'Fourth Amendment', u'United States', u'1927', u'FBI', u'United States Supreme Court']
[u'Olmstead', u'Fourth Amendment', u'United States', u'1927', u'FBI', u'United States Supreme Court']
[u'Katz', u'Omnibus Crime Control Act', u'United States', u'1967', u'1927', u'Congress']
[u'Olmstead', u'Fourth Amendment', u'United States', u'1927', u'FBI', u'United States Supreme Court']
[u'Olmstead', u'Fourth Amendment', u'United States', u'1927', u'FBI', u'United States Supreme Court']
[u'Prohibition', u'1920s']
[u'Olmstead', u'Fourth Amendment', u'United States', u'1927', u'FBI', u'Un

[u'President', u'Director of National Intelligence (DNI']
[u'President', u'Director of National Intelligence (DNI']
[u'President', u'Director of National Intelligence (DNI']
[u'Terrorism Prevention Act', u'President', u'United States', u'September 11', u'Intelligence Reform', u'FBI']
[u'17,000', u'Uniform Crime Reports', u'UCR']
[u'17,000', u'Uniform Crime Reports', u'UCR']
[u'93%', u'FBI', u'UCR']
[u'UCR']
[u'UCR']
[u'56', u'400', u'United States', u'FBI', u'Central Intelligence Agency (CIA']
[u'1920s', u'UCR']
[u'1920s', u'UCR']
[u'2007', u'Virgil Griffith', u'Wikipedia', u'neural-systems', u'August', u'Caltech']
[u'85%', u'Martin Luther King', u'Orlando Bosch', u'Cuban', u'Weathermen', u'Students', u'COINTELPRO', u'Advancement of Colored People', u'American Indian Movement', u'Democratic Society', u'Jr', u'Cuban Power', u'War', u'New Left', u'United Ireland', u'Vietnam', u'Puerto Rico', u'Congress of Racial Equality', u'Civil Rights Movement', u'National Lawyers Guild', u'National A

[u'four', u'Joseph Barboza', u'Nancy Gertner', u'U.S', u'District Judge', u'Boston', u'July 2007']
[u'Cold War', u'well-established', u'FBI']
[u'four', u'32', u'30', u'two', u'36']
[u'four', u'32', u'30', u'two', u'36']
[u'four', u'U.S', u'Government']
[u'four', u'U.S', u'Government']
[u'Venona', u'Soviet', u'British', u'UK code-breaking', u'US', u'FBI']
[u'World Trade Center', u'counter-terrorism', u'New York City', u'Oklahoma City', u'New York', u'1995 Oklahoma City', u'Oklahoma', u'1993', u'1996', u'FBI', u'Unabomber']
[u'World Trade Center', u'counter-terrorism', u'New York City', u'Oklahoma City', u'New York', u'1995 Oklahoma City', u'Oklahoma', u'1993', u'1996', u'FBI', u'Unabomber']
[u'PATRIOT Act', u'1970s', u'FBI']
[u'World Trade Center', u'counter-terrorism', u'New York City', u'Oklahoma City', u'New York', u'1995 Oklahoma City', u'Oklahoma', u'1993', u'1996', u'FBI', u'Unabomber']
[u'World Trade Center', u'counter-terrorism', u'New York City', u'Oklahoma City', u'New York', 

[u'DNA']
[u'Trilogy', u'IT', u'2000', u'FBI']
[u'Trilogy', u'IT', u'2000', u'FBI']
[u'code-named Sentinel', u'2009', u'March 2005', u'FBI']
[u'Trilogy', u'IT', u'2000', u'FBI']
[u'Trilogy', u'IT', u'2000', u'FBI']
[u'CITAC', u'Internet-related', u'Infrastructure Threat Assessment Center', u'Investigations', u'US', u'National Infrastructure Protection Center (NIPC']
[u'three']
[u'three']
[u'Trilogy', u'IT', u'2000', u'FBI']
[u'two', u'January 2005', u'FBI']
[u'two', u'January 2005', u'FBI']
[u'Case File', u'VCF']
[u'40', u'22', u'23', u'Glock Model', u'Glock', u'FBI']
[u'40', u'22', u'23', u'Glock Model', u'Glock', u'FBI']
[u'45', u'Weapons', u'SWAT', u'Tactics Teams', u'FBI HRT (Hostage Rescue Team', u'Springfield Professional Model 1911A1', u'FBI', u'ACP']
[u'19', u'17', u'Glock']
[u'19', u'17', u'Glock']
[u'40', u'New Agent Class', u'Glock', u'98-1', u'October 1997', u'May 1997', u'FBI']
[u'40', u'New Agent Class', u'Glock', u'98-1', u'October 1997', u'May 1997', u'FBI']
[u'40', u'Ne

[u'non-violent', u'United States']
[u'President', u'Director of National Intelligence (DNI']
[u'President', u'Director of National Intelligence (DNI']
[u'U.S', u'Attorney General', u'the Director of National Intelligence', u'FBI', u'U.S', u'Intelligence Community', u'Department of Justice']
[u'200', u'U.S', u'FBI']
[u'200', u'U.S', u'FBI']
[u'46', u'22', u'Group A']
[u'48', u'Edgar Hoover', u'Director', u'DOI', u'1924', u'1972', u'BOI', u'FBI']
[u'48', u'Edgar Hoover', u'Director', u'DOI', u'1924', u'1972', u'BOI', u'FBI']
[u'Edgar Hoover', u'Calvin Coolidge', u'longest-serving', u'1924', u'1972']
[u'48', u'Edgar Hoover', u'Director', u'DOI', u'1924', u'1972', u'BOI', u'FBI']
[u'48', u'Edgar Hoover', u'Director', u'DOI', u'1924', u'1972', u'BOI', u'FBI']
[u'ten', u'Hoover', u'Directors', u'FBI', u'Congress']
[u'1932', u'FBI Laboratory', u'Scientific Crime Detection Laboratory']
[u'1932', u'FBI Laboratory', u'Scientific Crime Detection Laboratory']
[u'Edgar Hoover', u'Building', u'Labor

[u'Controlled Substances Act', u'1970', u'Drug Enforcement Administration (DEA', u'FBI']
[u'Controlled Substances Act', u'1970', u'Drug Enforcement Administration (DEA', u'FBI']
[u'200', u'U.S', u'FBI']
[u'Controlled Substances Act', u'1970', u'Drug Enforcement Administration (DEA', u'FBI']
[u'Controlled Substances Act', u'1970', u'Drug Enforcement Administration (DEA', u'FBI']
[u'Mann Act', u'White Slave Traffic Act', u'June 25, 1910']
[u'Federal', u'U.S', u'Customs', u'National Transportation Safety Board', u'FBI', u'Border Protection (CBP', u'Coast Guard (USCG']
[u'Federal', u'U.S', u'Customs', u'National Transportation Safety Board', u'FBI', u'Border Protection (CBP', u'Coast Guard (USCG']
[u'CITAC', u'Internet-related', u'Infrastructure Threat Assessment Center', u'Investigations', u'US', u'National Infrastructure Protection Center (NIPC']
[u'Federal', u'U.S', u'Customs', u'National Transportation Safety Board', u'FBI', u'Border Protection (CBP', u'Coast Guard (USCG']
[u'Federal',

[u'USA PATRIOT Act', u'Internet', u'FBI']
[u'1932', u'United States Bureau of Investigation']
[u'1932', u'United States Bureau of Investigation']
[u'Clinton', u'FBI']
[u'Carnivore', u'mid-January 2005', u'2001', u'FBI', u'NarusInsight', u'Associated Press']
[u'Carnivore', u'mid-January 2005', u'2001', u'FBI', u'NarusInsight', u'Associated Press']
[u'two', u'January 2005', u'FBI']
[u'Clinton', u'FBI']
[u'Clinton', u'FBI']
[u'Carnivore', u'mid-January 2005', u'2001', u'FBI', u'NarusInsight', u'Associated Press']
[u'President', u'Directors', u'United States']
[u'President', u'Directors', u'United States']
[u'Edgar Hoover', u'Calvin Coolidge', u'longest-serving', u'1924', u'1972']
[u'Soviet', u'Americans', u'United States']
[u'Soviet', u'Americans', u'United States']
[u'five', u'ten', u'President', u'United States Senate']
[u'82', u'90\u2013351', u'197', u'10-year', u'two 5-year', u'Stat', u'Omnibus Crime Control', u'Directors', u'Safe Streets Act Pub.L', u'1968', u'June 19, 1968', u'Senat



Answering:  12%|█▎        | 5/40 [01:17<09:25, 16.16s/it]

[u'Filiberto Ojeda R\xedos', u'2005', u'fugitive Puerto Rican Nationalist', u'FBI']
[u'Filiberto Ojeda R\xedos', u'2005', u'fugitive Puerto Rican Nationalist', u'FBI']
[u'Commission', u'U.S', u'Review of FBI Security Programs', u'US Department of Justice']
[u'FBI']
[u'FBI']
[u'four', u'32', u'30', u'two', u'36']
[u'Commonwealth', u'Puerto Rico Justice Department', u'FBI', u'US Attorney General']
[u'Commonwealth', u'Puerto Rico Justice Department', u'FBI', u'US Attorney General']
[u'Filiberto Ojeda R\xedos', u'2005', u'fugitive Puerto Rican Nationalist', u'FBI']
[u'one', u'Homo sapiens', u'Homo erectus', u'Italian', u'Eritrea', u'Buya']


[A[A

[u'one', u'Homo sapiens', u'Homo erectus', u'Italian', u'Eritrea', u'Buya']
[u'1 million']
[u'1 million']
[u'1 million']
[u'one', u'Homo sapiens', u'Homo erectus', u'Italian', u'Eritrea', u'Buya']
[u'Homo erectus', u'Eritrea', u'Danakil Depression']
[u'Homo erectus', u'Eritrea', u'Danakil Depression']
[u'one', u'Homo sapiens', u'Homo erectus', u'Italian', u'Eritrea', u'Buya']
[u'one', u'Homo sapiens', u'Homo erectus', u'Italian', u'Eritrea', u'Buya']
[u'1 million']
[u'one', u'Homo sapiens', u'Homo erectus', u'Italian', u'Eritrea', u'Buya']
[u'one', u'Homo sapiens', u'Homo erectus', u'Italian', u'Eritrea', u'Buya']
[u'1 million']
[u'two', u'Medri Bahri', u'James Bruce', u'Scottish', u'Abyssinia', u'1770']
[u'two', u'Medri Bahri', u'James Bruce', u'Scottish', u'Abyssinia', u'1770']
[u'one', u'Homo sapiens', u'Homo erectus', u'Italian', u'Eritrea', u'Buya']
[u'two', u'Medri Bahri', u'James Bruce', u'Scottish', u'Abyssinia', u'1770']
[u'two', u'Medri Bahri', u'James Bruce', u'Scottish', u'

[u'10', u'30', u'Gash River', u'December 2001']
[u'10', u'30', u'Gash River', u'December 2001']
[u'1955', u'2001']
[u'tree-top']
[u'tree-top']
[u'10', u'30', u'Gash River', u'December 2001']
[u'1955', u'2001']
[u'1955', u'2001']
[u'Eritrean', u'Eritrean War of Independence', u'Ethiopia', u'April 1993']
[u'three', u'Eritrea']
[u'three', u'Eritrea']
[u'3000m']
[u'3,018', u'9,902', u'three', u'one', u'Afar Triangle', u'another.The', u'Eritrea', u'Danakil Depression of Eritrea', u'Emba Soira']
[u'3,018', u'9,902', u'three', u'one', u'Afar Triangle', u'another.The', u'Eritrea', u'Danakil Depression of Eritrea', u'Emba Soira']
[u'Adam', u'Umar Din', u'Imam', u'Aussa', u'1672']
[u'3,018', u'9,902', u'three', u'one', u'Afar Triangle', u'another.The', u'Eritrea', u'Danakil Depression of Eritrea', u'Emba Soira']
[u'3,018', u'9,902', u'three', u'one', u'Afar Triangle', u'another.The', u'Eritrea', u'Danakil Depression of Eritrea', u'Emba Soira']
[u'one', u'Homo sapiens', u'Homo erectus', u'Italian

[u'Red Sea', u'Eritrea']
[u'Red Sea', u'Eritrea']
[u'ultra-nationalist', u'African', u'Eritrea', u'Reporters Without Borders', u'BBC']
[u'45,406', u'117,600 km2', u'Dahlak Archipelago', u'Hanish Islands']
[u'45,406', u'117,600 km2', u'Dahlak Archipelago', u'Hanish Islands']
[u'2,540,000', u'5,254,000', u'Tigrinya', u'Eritrea', u'2006']
[u'Tigrinya']
[u'Tigrinya']
[u'pre-Aksumite', u'Agordat', u'Eritrea', u'Gash Group']
[u'Eritrean']
[u'Eritrean']
[u'Eritrean', u'Eritrea']
[u'Helen Meles', u'Eritrean', u'Tigrinya']
[u'Helen Meles', u'Eritrean', u'Tigrinya']
[u'Eritrea']
[u'Helen Meles', u'Eritrean', u'Tigrinya']
[u'Helen Meles', u'Eritrean', u'Tigrinya']
[u'two', u'Medri Bahri', u'James Bruce', u'Scottish', u'Abyssinia', u'1770']
[u'Eritrea']
[u'Eritrea']
[u'Eritrea']
[u'Medri Bahri', u'sea-land', u'Middle Ages', u'Eritrea']
[u'Medri Bahri', u'sea-land', u'Middle Ages', u'Eritrea']
[u'seven', u'13', u'Eritrea']
[u'two', u'Medri Bahri', u'James Bruce', u'Scottish', u'Abyssinia', u'1770']

In [None]:
# # run on test data

# with open("result_to_kaggle.txt",'w') as output_file:
# #     limit = 3
#     output_file.write('id,answer'+'\n')
#     for i in tqdm(range(len(match_sent)), desc='Answering'):
#         for j in range(len(match_sent[i])):
#             result = ''
#             backoff = 0
            
#             Q = dev[i]["qa"][j]['question']
            
# #             result = get_ranked_ans(entity_pool[i][j][0], Q, question_sent_list[i][j][0])
            
#             result1,score1 = get_ranked_ans(entity_pool[i][j][0], Q, question_sent_list[i][j][0])
#             result2,score2 = get_ranked_ans(entity_pool[i][j][1], Q, question_sent_list[i][j][1])
            
#             if score2 * 0.2>score1:
#                 result=result2
#             else:
#                 result=result1
            
# #             while (result == '') and (backoff < limit):
# #                 backoff += 1
# #                 result = get_ranked_ans(entity_pool[i][j][backoff], Q, question_sent_list[i][j][backoff])
            
#             result = result.encode('utf-8')
#             reuslt = result.replace('" ','')
#             result = result.replace('"','')
#             result = result.replace(",","-COMMA-")
#             q_id = dev[i]["qa"][j]['id']
#             output_file.write(str(q_id) + ',' + str(result) + '\n')