In [16]:
import spacy
import pickle
import json
import numpy as np
from pyemd import emd 
from nltk.tokenize import word_tokenize,sent_tokenize
from sklearn.metrics import pairwise_distances 

In [17]:
tags_weight_matrix = np.load("wmd_weight_matrix.npy")
with open("unique_list.txt", "rb") as fp:
    unique_ques_sent_tags = pickle.load(fp)
with open('word2vec.json') as json_data:
    word2vec_dict = json.load(json_data)

In [18]:
def get_emd_with_tags(qtag, atag):
    ques_tagged = {}
    sent_tagged = {}
    for word in unique_ques_sent_tags:
        if word in qtag.keys():
            ques_tagged[word] = 1
        else:
            ques_tagged[word] = 0

        if word in atag.keys():
            sent_tagged[word] = atag[word]
        else:
            sent_tagged[word] = 0
    tag_left_vector = [ques_tagged[key] for key in sorted(ques_tagged)]
    tag_right_vector = [sent_tagged[key] for key in sorted(sent_tagged)]
    tag_right_vector = tag_right_vector/np.sum(tag_right_vector)
    tag_left_vector = np.array(tag_left_vector, dtype=float)
    tag_right_vector = np.array(tag_right_vector, dtype=float)
    emd_score = emd(tag_left_vector,tag_right_vector,np.array(tags_weight_matrix))
    return emd_score

In [19]:
def getting_question_type_user_question(ques):
    question_type = []
    ques_ans_token = []
    ques_types = ["how", "which", "where", "what", "when", "who", "how long", "how big", "how heavy", "how much", \
         "how many", "how high", "what date", "what year", "what state", "what country", "how old"]
    quest_type = "others"
    for q_type in ques_types:
        if q_type in ques.lower():
            quest_type = q_type
    return {quest_type : 1}

In [20]:
def getting_ner_tags_answer(answer):
    ner = {}
    nlp = spacy.load('en')
    doc = nlp(answer)
    for each_word in doc.ents:
        if each_word.label_ == 'GPE' or each_word.label_ == 'FAC':
                ner['PLACE'] = ner.get('PLACE', 0) + 1
        elif each_word.label_ == 'CARDINAL' or each_word.label_ == 'ORDINAL':
            ner['NUMBER'] = ner.get('NUMBER', 0) + 1

        else:
            if each_word.label_ in unique_ques_sent_tags:
                ner[each_word.label_] = ner.get(each_word.label_, 0) + 1
    return ner            


In [21]:
def get_similarity(question, answer):
    distance_list = []
    for each_answer_word in word_tokenize(answer.lower()): 
        each_word_dist = []
        for each_ques_word in word_tokenize(question.lower()):
            if each_answer_word in word2vec_dict.keys() and each_ques_word in word2vec_dict.keys():
                word2vec_answer = word2vec_dict[each_answer_word]
                word2vec_question = word2vec_dict[each_ques_word]
                word2vec_answer = np.array(word2vec_answer).reshape((1,100))
                word2vec_question = np.array(word2vec_question).reshape((1,100))
            else:
                word2vec_answer = np.random.random((1,100))
                word2vec_question = np.random.random((1,100))
            dist = pairwise_distances(word2vec_answer, word2vec_question)
            each_word_dist.append(dist[0][0])
        distance_list.append(each_word_dist)    
    return distance_list

In [22]:
def get_emd(user_question, sent1):
    unique_words_in_ques_sent = list(set(word_tokenize(user_question) + word_tokenize(sent1)))
    left_vector_dict = {}
    right_vector_dict = {}
    
    for w in unique_words_in_ques_sent:
        
        if w in word_tokenize(user_question):
            left_vector_dict[w] = left_vector_dict.get(w, 0) + 1
            
        else:
            left_vector_dict[w] = 0
            
    for w in unique_words_in_ques_sent:
        
        if w in word_tokenize(sent1):
            right_vector_dict[w] = right_vector_dict.get(w, 0) + 1
        else:
            right_vector_dict[w] = 0
            
    left_vector = [left_vector_dict[key] for key in sorted(left_vector_dict)]
    right_vector = [right_vector_dict[key] for key in sorted(right_vector_dict)]  
    sent_words_in_qa = ' '.join([word for word in unique_words_in_ques_sent])
    qa_distance = np.array(get_similarity(sent_words_in_qa, sent_words_in_qa))
    emd_dist = emd(np.array(left_vector, dtype=np.float)/sum(np.array(left_vector, dtype=np.float)),
    np.array(right_vector, dtype=np.float)/sum(np.array(right_vector, dtype=np.float)), qa_distance)
    return emd_dist


In [34]:
def get_wmd_for_tags_test(question, answer): # MAIN METHOD THAT TAKES QUESTION AND ANSWER SENTENCE
    qtag = getting_question_type_user_question(question)
    atag = getting_ner_tags_answer(answer)
    emd_val = get_emd_with_tags(qtag, atag)
    return emd_val * 2.5

In [24]:
question = 'Who discovered that magnetic and electric could self-generate?'
answer = 'This led Maxwell to discover that electric and magnetic fields could be "self-generating" through a wave that traveled at a speed that he calculated to be the speed of light'
wmd_tags = get_wmd_for_tags_test(question, answer)
wmd_glove = get_emd(question, answer)
print('tags:', wmd_tags)
print('glove:', wmd_glove)

tags: 3.053748080848804
glove: 3.7172126367149225


In [37]:
con = "The Panthers finished the regular season with a 15–1 record, and quarterback Cam Newton was named the NFL Most Valuable Player (MVP). They defeated the Arizona Cardinals 49–15 in the NFC Championship Game and advanced to their second Super Bowl appearance since the franchise was founded in 1995. The Broncos finished the regular season with a 12–4 record, and denied the New England Patriots a chance to defend their title from Super Bowl XLIX by defeating them 20–18 in the AFC Championship Game. They joined the Patriots, Dallas Cowboys, and Pittsburgh Steelers as one of four teams that have made eight appearances in the Super Bowl."
sent = sent_tokenize(con)
ques = "What record did Panthers finished the regular season?"

In [38]:
sent

['The Panthers finished the regular season with a 15–1 record, and quarterback Cam Newton was named the NFL Most Valuable Player (MVP).',
 'They defeated the Arizona Cardinals 49–15 in the NFC Championship Game and advanced to their second Super Bowl appearance since the franchise was founded in 1995.',
 'The Broncos finished the regular season with a 12–4 record, and denied the New England Patriots a chance to defend their title from Super Bowl XLIX by defeating them 20–18 in the AFC Championship Game.',
 'They joined the Patriots, Dallas Cowboys, and Pittsburgh Steelers as one of four teams that have made eight appearances in the Super Bowl.']

In [39]:
for each_sent in sent:
    wmd_tags = get_wmd_for_tags_test(ques, each_sent)
    wmd_glove = get_emd(ques, each_sent)
    print(wmd_glove,wmd_tags)
    score = 0.8 * wmd_glove + 0.2 * wmd_tags
    print(score)

3.4867441507873513 2.842164403858565
3.357828201401594
5.128535162627934 3.57210255767044
4.817248641636436
3.997036878093651 3.7701861794093325
3.9516667383567876
4.049631195637022 4.403624415423046
4.120429839594227
