# Sentence Retrieval

In [1]:
import json,math,nltk

train_file = open("train.json",'r')
dev_file = open("dev.json",'r')
test_file=open("dev.json",'r')
train = json.loads(train_file.read())
dev = json.loads(dev_file.read())
test = json.loads(test_file.read())
# train[0]['sentences']
# train[0]['qa']

In [2]:
from nltk.corpus import stopwords
# may be stem?

lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
word_tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()
stopword =  stopwords.words()

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma

def doc_word_dict(doc):
    word_dict = set()
    for sent in doc['sentences']:
        for word in  word_tokenizer.tokenize(sent):
            word = lemmatize(word.lower())
            if word not in stopword:
                word_dict.add(word)
    return word_dict

In [3]:
def get_BOW(sent):
    term_dict={}
    for word in word_tokenizer.tokenize(sent):
        word = lemmatize(word.lower())
        if word not in stopword:
            term_dict[word]=term_dict.setdefault(word,0)+1
    return term_dict

def cal_BOW(doc):
    doc_term_matrix = [] 
    for sent in doc['sentences']:
        temp = get_BOW(sent)
        doc_term_matrix.append(temp)
    return doc_term_matrix

In [4]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.spatial.distance import cosine as cos_distance

vectorizer = DictVectorizer()
transformer = TfidfTransformer(smooth_idf=False,norm=None)

term_matrix = transformer.fit_transform(vectorizer.fit_transform(cal_BOW(train[0])))
query =  transformer.transform(vectorizer.transform(get_BOW(train[0]['qa'][0]['question'])))

In [5]:
def get_best_doc_num(query):
    query =  transformer.transform(vectorizer.transform(get_BOW(query)))
    f = lambda x: cos_distance(query.toarray(),term_matrix[x].toarray())
    best_doc = min(range(term_matrix.shape[0]),key=f)
    return best_doc

In [14]:
vectorizer = DictVectorizer()
transformer = TfidfTransformer(smooth_idf=False,norm=None)

correct_rate = []
# for doc in train[0]:
doc_correct_rate = []
term_matrix = transformer.fit_transform(vectorizer.fit_transform(cal_BOW(train[0])))
for qa in train[0]['qa']:
    doc_correct_rate.append(get_best_doc_num(qa['question']))
correct_rate.append(doc_correct_rate)

In [None]:
correct_rate

# Entity Extraction

In [35]:
from nltk.tag.stanford import StanfordNERTagger
st = StanfordNERTagger('/usr/share/stanford-ner/classifiers/english.muc.7class.distsim.crf.ser.gz',
               '/usr/share/stanford-ner/stanford-ner.jar') 

train_sent_list = []
for i in range(len(correct_rate)):
    for j in correct_rate[i]:
        train_sent_list.append(word_tokenizer.tokenize(train[i]['sentences'][j]))
train_sent_list = st.tag_sents(train_sent_list) 

In [38]:
def tune_other(train_sent_list):
    for i in range(len(train_sent_list)):
        for j in range(len(train_sent_list[i])):
            term,tag = train_sent_list[i][j]
            if tag == "ORGANIZATION"  or (len(term)>0 and (term,tag)!=train_sent_list[i][0] and tag == 'O' and term[0].isupper()):
                train_sent_list[i][j] = (term,"OTHER")

tune_other(train_sent_list)

In [39]:
def combine_entity(train_sent_list):
    for i in range(len(train_sent_list)):
        j = 0
        while j < len(train_sent_list[i])-2:
            term,tag = train_sent_list[i][j]
            term_n,tag_n = train_sent_list[i][j+1]
            if tag == tag_n and tag != "O":
                temp =  (term + " " + term_n,tag)
                train_sent_list[i][j] = temp
                del train_sent_list[i][j+1]
            j += 1

combine_entity(train_sent_list)

In [48]:
train_tuple = train_sent_list
train_list = []
for i in train_tuple:
    train_ = []
    for term,tag in i:
        train_.append(term)
    train_list.append(train_)

In [49]:
train_list

[[u'Standard',
  u'practice',
  u'for',
  u'LPs',
  u'was',
  u'to',
  u'place',
  u'the',
  u'LP',
  u'in',
  u'a',
  u'paper',
  u'or',
  u'plastic',
  u'inner',
  u'cover',
  u'.'],
 [u'Breakage',
  u'was',
  u'very',
  u'common',
  u'in',
  u'the',
  u'shellac',
  u'era',
  u'.'],
 [u'Abandoning',
  u'Berliner',
  u"'",
  u's',
  u'"',
  u'Gramophone',
  u'"',
  u'trademark',
  u'for',
  u'legal',
  u'reasons',
  u',',
  u'in',
  u'1901',
  u'Johnson',
  u"'",
  u's',
  u'and',
  u'Berliner',
  u"'",
  u's',
  u'separate',
  u'companies',
  u'reorganized',
  u'to',
  u'form',
  u'the',
  u'Victor Talking',
  u'Machine Company',
  u',',
  u'whose',
  u'products',
  u'would',
  u'come',
  u'to',
  u'dominate',
  u'the',
  u'market',
  u'for',
  u'many',
  u'years',
  u'.'],
 [u'In',
  u'the',
  u'mid',
  u'-',
  u'1940s',
  u',',
  u'special',
  u'DJ',
  u'copies',
  u'of',
  u'records',
  u'started',
  u'being',
  u'made',
  u'of',
  u'vinyl',
  u'also',
  u',',
  u'for',
  u'the',


In [41]:
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

tags = ["PERSON","LOCATION","NUMBER","OTHER"]
entity_pool = []
for i in train_sent_list:
    sent_tag_dict = dict.fromkeys(tags,[])
    for j in range(len(i)):
        term,tag = i[j]
        if tag == "PERSON" or tag == "LOCATION" or tag == "OTHER":
            sent_tag_dict[tag] = sent_tag_dict[tag]+ [term]
        elif tag == "DATE" or tag == "TIME" or tag == "PERCENT" or hasNumbers(term):
            sent_tag_dict["NUMBER"] = sent_tag_dict["NUMBER"]+ [term]
    entity_pool.append(sent_tag_dict)

In [42]:
entity_pool

[{'LOCATION': [], 'NUMBER': [], 'OTHER': [u'LPs', u'LP'], 'PERSON': []},
 {'LOCATION': [], 'NUMBER': [], 'OTHER': [], 'PERSON': []},
 {'LOCATION': [],
  'NUMBER': [u'1901'],
  'OTHER': [u'Berliner',
   u'Gramophone',
   u'Johnson',
   u'Victor Talking',
   u'Machine Company'],
  'PERSON': [u'Berliner']},
 {'LOCATION': [], 'NUMBER': [u'1940s'], 'OTHER': [u'DJ'], 'PERSON': []},
 {'LOCATION': [], 'NUMBER': [], 'OTHER': [], 'PERSON': []},
 {'LOCATION': [],
  'NUMBER': [u'20th century', u'1880s', u'1920s', u'1920s'],
  'OTHER': [],
  'PERSON': []},
 {'LOCATION': [], 'NUMBER': [u'1980s', u'1991'], 'OTHER': [], 'PERSON': []},
 {'LOCATION': [],
  'NUMBER': [u'1990s', u'2010s'],
  'OTHER': [u'DJ'],
  'PERSON': []},
 {'LOCATION': [],
  'NUMBER': [u'early 21st',
   u'century \u2013',
   u'9',
   u'2',
   u'2014',
   u'260 %',
   u'2009'],
  'OTHER': [u'U', u'S'],
  'PERSON': []},
 {'LOCATION': [], 'NUMBER': [u'1970s'], 'OTHER': [], 'PERSON': []},
 {'LOCATION': [], 'NUMBER': [], 'OTHER': [], 'PERS

# Answer Ranking

In [50]:
import operator
import nltk


# A simple rule-based question type classifier based on key words 
def get_question_type(question):
    # TODO: HAND-CODED, NEED TO BE REFINED!!
    # TODO: need to low-case to compare?
    
    type_rules = {'PERSON':["Who", "Whose", "Whom"],
                  'LOCATION':["Where"],
                  'NUMBER':["When", "few", "little", "much", "many",
                            "young", "old", "long", "year", "years"]
                 }
    q_type = None
    for question_type, key_words in type_rules.items():
        for key_word in key_words:
            if key_word in question:
                q_type = question_type
                break
        if q_type == None:
            q_type = 'OTHER'
    return q_type

In [56]:
from nltk import word_tokenize

# among entities of the same type, the prefered entity should be 
# the one which is closer in the sentence to a open-class word
# from the question.
# ----> nouns, verbs, adjectives, and adverbs.
def get_preferred_entity(entity_list, sentence, question):
    sentence_text = word_tokenize(sentence)
    question_text = word_tokenize(question)
    
    sentence_tag = nltk.pos_tag(sentence_text,tagset='universal')
    question_tag = nltk.pos_tag(question_text,tagset='universal')
    
    # initialize a list for comparing, and set all elements as 0
    is_open_word = [0] * len(sentence_text)
    # find an open word in the question
    for word, tag in question_tag:
        if tag in ['ADJ','NOUN','VERB','ADV']:
            # if the open word appears in the sentence, then mark as 1
            for i in range(len(sentence_text)):
                if sentence_text[i] == word:
                    is_open_word[i] = 1
#     print is_open_word
    
    # find the closest distance to an open-class word for an entity
    def get_distance(entity):
        # get the position of entity, and find the open class word 
        # from the nearest at both sides
        position = sentence_text.index(entity)
        for i in range(1, len(sentence_text)):
            distance = 0
            if position - i >= 0:
                if is_open_word[position - i] == 1:  # find an open-class word on the left
                    distance = i
                    break
                elif position + i < len(is_open_word):  # find an open-class word on the right
                    if is_open_word[position + i] == 1:
                        distance = i
                        break
                else:
                    distance = len(sentence_text) + 1  # didn't find open-class words
        return distance
    
    # get distance for each entity and choose the best one
    all_distance = []
    for entity in entity_list:
        all_distance.append(get_distance(entity))
        preferred_entity = entity_list[all_distance.index(min(all_distance))]

    return preferred_entity
    

In [57]:
def get_ranked_ans(entities_dic, demo_question, sentence):
    q_type = get_question_type(demo_question)
    tmp_rank = {}
    for ent_type,entities in entities_dic.items():
        # answers whose content words all appear in the question should be ranked lowest.
        for entity in entities:
            if entity in demo_question:
                tmp_rank[entity] = tmp_rank.setdefault(entity,0) - 1
        # Answers which match the question type should be ranked higher than those that don't
        if ent_type == q_type:
            for entity in entities:
                tmp_rank[entity] = tmp_rank.setdefault(entity,0) + 1
            # entity closer in the sentence to a closed-class word should be preferred
            tmp_rank[get_preferred_entity(entities, sentence, demo_question)] = tmp_rank.setdefault(entity,0) + 1
    
    # sort and choose the best answer
    sorted_ans = sorted(tmp_rank.items(), key=operator.itemgetter(1), reverse=True)
    best_ans = sorted_ans[0][0]
    
    return best_ans

In [61]:
print train[0]["qa"][3]['answer_sentence']
print correct_rate[0][3]
print entity_pool[3]
print train[0]["qa"][3]['question']
print train[0]["qa"][3]['answer']
print train[0]["sentences"][correct_rate[0][3]]
print get_ranked_ans(entity_pool[3], train[0]["qa"][3]['question'], train[0]["sentences"][correct_rate[0][3]])

1
137
{'PERSON': [], 'OTHER': [u'DJ'], 'LOCATION': [], 'NUMBER': [u'1940s']}
Where does the groove on a vinyl record typically start?
near the periphery
In the mid-1940s, special DJ copies of records started being made of vinyl also, for the same reason.


UnboundLocalError: local variable 'preferred_entity' referenced before assignment

In [59]:
for i in range(len(correct_rate[0])):
    print i
    print entity_pool[i],train[0]["qa"][i]['question'],train[0]["sentences"][correct_rate[0][i]]
    print get_ranked_ans(entity_pool[i], train[0]["qa"][i]['question'], train[0]["sentences"][correct_rate[0][i]])

0
{'PERSON': [], 'OTHER': [u'LPs', u'LP'], 'LOCATION': [], 'NUMBER': []} What does LP stand for when it comes to time capacity? Standard practice for LPs was to place the LP in a paper or plastic inner cover.
LPs
1
{'PERSON': [], 'OTHER': [], 'LOCATION': [], 'NUMBER': []} What are common diameters found in phonograph records? Breakage was very common in the shellac era.


UnboundLocalError: local variable 'entity' referenced before assignment