In [1]:
import pandas as pd
import numpy as np

import gensim
from gensim import corpora,models,similarities
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec

import nltk
from nltk.corpus import stopwords

import scipy

from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, classification_report



In [2]:
data = pd.read_json('squad_train_doc.json')

In [3]:
data.head()

Unnamed: 0,passages,title
0,"[{'context': 'Architecturally, the school has ...",University_of_Notre_Dame
1,[{'context': 'Beyoncé Giselle Knowles-Carter (...,Beyoncé
2,[{'context': 'Montana i/mɒnˈtænə/ is a state i...,Montana
3,"[{'context': 'The phrase ""in whole or in part""...",Genocide
4,[{'context': 'The emergence of resistance of b...,Antibiotics


In [4]:
context_list = []
question_list = []
for i in range(len(data)):
    context_temp = []
    question_temp = []
    for j in range(len(data['passages'][i])):
        context_temp.append(data['passages'][i][j]['context'])
        question_temp.append(data['passages'][i][j]['questions'])
    context_list.append(context_temp)
    question_list.append(question_temp)

In [5]:
dic = {'context':context_list,'questions':question_list,'title':data.title}

In [6]:
new_df = pd.DataFrame(data = dic)

In [7]:
new_df.head()

Unnamed: 0,context,questions,title
0,"[Architecturally, the school has a Catholic ch...",[[To whom did the Virgin Mary allegedly appear...,University_of_Notre_Dame
1,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,[[When did Beyoncé release her first solo albu...,Beyoncé
2,[Montana i/mɒnˈtænə/ is a state in the Western...,[[How many ranges are part of the Rocky Mounta...,Montana
3,"[The phrase ""in whole or in part"" has been sub...",[[Which phrase is especially contentious withi...,Genocide
4,[The emergence of resistance of bacteria to an...,[[What is resistance to antibiotics a cause of...,Antibiotics


In [8]:
stop_words = set(stopwords.words('english'))

In [9]:
context_listperdoc = [' '.join(context) for context in new_df.context]
question_listperdoc =  [' '.join(question) for questions in new_df.questions for question in questions]

In [10]:
#context_listperdoc

In [11]:
#question_listperdoc

In [12]:
tokenized_question_context =[]
untokenized_question_context = []
for question,context in zip(question_listperdoc,context_listperdoc):
    question_words = [word for word in nltk.word_tokenize(question) if word not in stop_words]
    context_words = [word for word in nltk.word_tokenize(context) if word not in stop_words]
    
    tokenized_question_context.append(question_words+context_words)
    untokenized_question_context.append(' '.join(question_words+context_words))

In [13]:
BM25_model = gensim.summarization.bm25.BM25(tokenized_question_context)

In [14]:
def bm25(query):
    scores = BM25_model.get_scores(query.split(),1)
    bm25_df = pd.DataFrame(data={'title':new_df.title,'bm25_score':scores}).sort_values(by=['bm25_score'],ascending=False)
    bm25_df['bm25_rank'] = [i for i in range(1, len(bm25_df.title)+1)]
    return bm25_df

In [126]:
bm25('What is Grotto at Notre Dame?').head(10)

Unnamed: 0,title,bm25_score,bm25_rank
0,University_of_Notre_Dame,13.112193,1
28,Prime_minister,10.379726,2
132,Hanover,6.471414,3
25,Genome,4.326248,4
35,Iranian_languages,4.301328,5
26,Comprehensive_school,4.266961,6
38,Architecture,4.154598,7
50,Sony_Music_Entertainment,4.154463,8
39,Human_Development_Index,4.14644,9
24,Saint_Barth%C3%A9lemy,4.003093,10


In [16]:
tfidf_dict = corpora.Dictionary(tokenized_question_context)
#tfidf_dict.save('./tmp/squad.dict') 
raw_corpus = [tfidf_dict.doc2bow(doc) for doc in tokenized_question_context]
corpora.MmCorpus.serialize('./tmp/squad.mm', raw_corpus)
corpus = corpora.MmCorpus('./tmp/squad.mm')
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
tfidf_model = similarities.MatrixSimilarity(corpus_tfidf)
#tfidf_model.save('./tmp/squad.TFIDF_model')

  if np.issubdtype(vec.dtype, np.int):


In [17]:
def tfidf(query):
    query_1 = []
    query_1.append(nltk.word_tokenize(query))
    query_raw_corpus = [tfidf_dict.doc2bow(i) for i in query_1]
    corpora.MmCorpus.serialize('./tmp/query.mm',query_raw_corpus)
    query_corpus = corpora.MmCorpus('./tmp/query.mm')
    similarity_table = tfidf_model[query_corpus]
    ranks = scipy.stats.rankdata(similarity_table, method = 'max')
    similarity_table = list(np.array(similarity_table).flatten())
    tfidf_df = pd.DataFrame({'title':new_df.title, 'tfidf_score':similarity_table}).sort_values(by=['tfidf_score'],ascending=False)
    tfidf_df['tfidf_rank'] = [i for i in range(1, len(new_df.title)+1)]
    return tfidf_df

In [128]:
tfidf('What is Grotto at Notre Dame?').head(10)

Unnamed: 0,title,tfidf_score,tfidf_rank
0,University_of_Notre_Dame,0.407653,1
40,Southern_Europe,0.081975,2
28,Prime_minister,0.053481,3
38,Architecture,0.053332,4
26,Comprehensive_school,0.047113,5
52,Hunter-gatherer,0.04191,6
31,Dutch_Republic,0.041179,7
39,Human_Development_Index,0.036986,8
35,Iranian_languages,0.03666,9
32,Symbiosis,0.036566,10


In [19]:
sentenceLabeled = []
for sentenceID, sentence in enumerate(untokenized_question_context):
    sentenceL = TaggedDocument(words=sentence.split(), tags = ['SENT_%s' %sentenceID])
    sentenceLabeled.append(sentenceL)

In [20]:
doc2vec_model = Doc2Vec(vector_size=300, window=10, min_count=0, workers=11, alpha=0.005, min_alpha=0.025)
doc2vec_model.build_vocab(sentenceLabeled)

In [21]:
# for epoch in range(50):
#     print(epoch ,"is running")
#     doc2vec_model.train(sentenceLabeled,total_examples=len(data),epochs=15000)
#     doc2vec_model.alpha -= 0.0002  # decrease the learning rate
#     doc2vec_model.min_alpha = doc2vec_model.alpha  # fix the learning rate, no decay
#     doc2vec_model.save("doc2vec-withoutTime-15000.model")

In [22]:
#doc2vec_model = Doc2Vec.load("doc2vec-withoutTime-15000.model")

In [23]:
len(new_df.title)

442

In [24]:
def doc2vec(query):
    similarity_score_matrix , list_doc_names, list_doc_scores, list_doc_ranks, rank = [], [], [], [], 1
    avg_sentence = np.zeros((300))
    count = 0
    for word in nltk.word_tokenize(query):
        if word in doc2vec_model.wv.vocab:
            avg_sentence += doc2vec_model[word]
            count += 1
    if count !=0:
        avg_sentence = avg_sentence/count
    similarity_score_matrix.append(doc2vec_model.docvecs.most_similar([avg_sentence],topn=len(new_df.title)))
    for each_compared_row in similarity_score_matrix[0]:
        list_doc_names.append(each_compared_row[0])
        list_doc_scores.append(each_compared_row[1])
        list_doc_ranks.append(rank)
        rank += 1
    doc2vec_df = pd.DataFrame({'title':new_df.title, 'doc2vec_score':list_doc_scores, 'doc2vec_rank':list_doc_ranks})
    return doc2vec_df

In [130]:
doc2vec('What is Grotto at Notre Dame?').head(10)

  if np.issubdtype(vec.dtype, np.int):


Unnamed: 0,title,doc2vec_score,doc2vec_rank
0,University_of_Notre_Dame,0.168956,1
1,Beyoncé,0.164374,2
2,Montana,0.153053,3
3,Genocide,0.138571,4
4,Antibiotics,0.129852,5
5,Frédéric_Chopin,0.121766,6
6,Sino-Tibetan_relations_during_the_Ming_dynasty,0.120051,7
7,IPod,0.118357,8
8,The_Legend_of_Zelda:_Twilight_Princess,0.114344,9
9,Spectre_(2015_film),0.113487,10


In [26]:
question_list =[]
for i in range(len(new_df)):
    question_list.append([question for questions in new_df.questions[i] for question in questions])

In [27]:
final_df = pd.DataFrame()

In [28]:
title_names_Sorted = list(new_df.title).copy()
title_names_Sorted.sort()

In [29]:
title_list = [new_df.title[i] for i in range(new_df.shape[0])]

In [45]:
frames = []
doc_number = 0

for all_questions_each_doc in question_list:
    
    one_hot_keys = []
    for each_doc in title_names_Sorted:
        if each_doc == title_list[doc_number]:
            one_hot_keys.append(1)
        else:
            one_hot_keys.append(0)

    for each_question in all_questions_each_doc:
        BM_25_Dataframe = bm25(each_question).sort_values(by=['title'],ascending=True)
        TFDIF_Dataframe = tfidf(each_question).sort_values(by=['title'],ascending=True)
        Doc2Vec_Dataframe = doc2vec(each_question).sort_values(by=['title'],ascending=True)
        
        #WMD
        #WMD_Dataframe = WMD(each_question).sort_values(by=['Document'],ascending=True)
        #each_question_score_all_docs = pd.merge(pd.merge(pd.merge(BM_25_Dataframe, TFDIF_Dataframe), Doc2Vec_Dataframe), WMD_Dataframe)
        
        each_question_score_all_docs = pd.merge(pd.merge(BM_25_Dataframe, TFDIF_Dataframe), Doc2Vec_Dataframe)
        list_each_question = [each_question for i in range(442)] 
        each_question_score_all_docs['question'] = list_each_question
        each_question_score_all_docs['Actual_Document'] = one_hot_keys
        frames.append(each_question_score_all_docs)
        
    doc_number += 1

  if np.issubdtype(vec.dtype, np.int):


In [46]:
result = pd.concat(frames, ignore_index=True)
result.head(3)

Unnamed: 0,title,bm25_score,bm25_rank,tfidf_score,tfidf_rank,doc2vec_score,doc2vec_rank,question,Actual_Document
0,2008_Sichuan_earthquake,0.209195,311,0.000451,347,0.100791,11,To whom did the Virgin Mary allegedly appear i...,0
1,2008_Summer_Olympics_torch_relay,3.655016,45,0.005532,79,0.090815,22,To whom did the Virgin Mary allegedly appear i...,0
2,51st_state,5.702542,13,0.012178,41,-0.066697,382,To whom did the Virgin Mary allegedly appear i...,0


In [47]:
#result.to_csv('Combined_Dataframe.csv', index=False)

## Logistic Regression

In [30]:
combined_df = pd.read_csv('Combined_Dataframe.csv')

In [31]:
normalized_combined_df = combined_df

normalized_combined_df['bm25_score'] = (normalized_combined_df.bm25_score-min(normalized_combined_df.bm25_score))/(max(normalized_combined_df.bm25_score)-min(normalized_combined_df.bm25_score))
normalized_combined_df['tfidf_score']=(normalized_combined_df.tfidf_score-min(normalized_combined_df.tfidf_score))/(max(normalized_combined_df.tfidf_score)-min(normalized_combined_df.tfidf_score))
normalized_combined_df['doc2vec_score']=(normalized_combined_df.doc2vec_score-min(normalized_combined_df.doc2vec_score))/(max(normalized_combined_df.doc2vec_score)-min(normalized_combined_df.doc2vec_score))

normalized_combined_df.head()

Unnamed: 0,title,bm25_score,bm25_rank,tfidf_score,tfidf_rank,doc2vec_score,doc2vec_rank,question,Actual_Document
0,2008_Sichuan_earthquake,0.002787,311,0.00069,347,0.683617,11,To whom did the Virgin Mary allegedly appear i...,0
1,2008_Summer_Olympics_torch_relay,0.0487,45,0.008462,79,0.667423,22,To whom did the Virgin Mary allegedly appear i...,0
2,51st_state,0.075982,13,0.018628,41,0.411718,382,To whom did the Virgin Mary allegedly appear i...,0
3,ASCII,0.006013,211,0.002844,216,0.564947,150,To whom did the Virgin Mary allegedly appear i...,0
4,A_cappella,0.01327,181,0.001741,277,0.485875,287,To whom did the Virgin Mary allegedly appear i...,0


In [32]:
X = normalized_combined_df[['bm25_score','tfidf_score','doc2vec_score']]
X.head()

Unnamed: 0,bm25_score,tfidf_score,doc2vec_score
0,0.002787,0.00069,0.683617
1,0.0487,0.008462,0.667423
2,0.075982,0.018628,0.411718
3,0.006013,0.002844,0.564947
4,0.01327,0.001741,0.485875


In [33]:
y = normalized_combined_df[['Actual_Document']]
y.head()

Unnamed: 0,Actual_Document
0,0
1,0
2,0
3,0
4,0


In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [35]:
model2 = LogisticRegression()
model2.fit(X,y)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [36]:
#y_pred2 = model2.predict(X_test)

In [37]:
model3 = LinearRegression()
model3.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [38]:
def final_doc(query,model):
        BM_25_Dataframe = bm25(query).head(20)
        TFDIF_Dataframe = tfidf(query).head(20)
        Doc2Vec_Dataframe = doc2vec(query).head(20)
        
        final_doc_df = pd.merge(pd.merge(BM_25_Dataframe, TFDIF_Dataframe,on=['title'],how='outer'), Doc2Vec_Dataframe,on=['title'],how='outer')
        final_doc_df = final_doc_df.fillna(0)
        final_doc_df['bm25_score'] = (final_doc_df.bm25_score-final_doc_df.bm25_score.min())/(final_doc_df.bm25_score.max()-final_doc_df.bm25_score.min())
        final_doc_df['tfidf_score'] = (final_doc_df.tfidf_score-final_doc_df.tfidf_score.min())/(final_doc_df.tfidf_score.max()-final_doc_df.tfidf_score.min())        
        final_doc_df['doc2vec_score'] = (final_doc_df.doc2vec_score-final_doc_df.doc2vec_score.min())/(final_doc_df.doc2vec_score.max()-final_doc_df.doc2vec_score.min()) 
        final_doc_X = final_doc_df[['bm25_score','tfidf_score','doc2vec_score']]
        
        final_doc_df['total_score'] = model.predict(final_doc_X)
        #final_doc_df['total_score'] = 0.01243557 * final_doc_df['bm25_score'] + 0.29682442 * final_doc_df['tfidf_score'] - 0.01673123 * final_doc_df['doc2vec_score']
        return final_doc_df.sort_values(by='total_score', ascending=False)

In [122]:
final_doc('Who is Beyonce?',model2)['title'].head(1)[0]

  if np.issubdtype(vec.dtype, np.int):


'Web_browser'

In [124]:
final_doc('Who is Beyonce?',model3)['title'].head(1)

  if np.issubdtype(vec.dtype, np.int):


5    Tristan_da_Cunha
Name: title, dtype: object

In [41]:
title_context_dict = {}

In [42]:
for row_index in range(len(data)):
    context = ''
    for context_index in range(len(data['passages'][row_index])):
        context = context + data['passages'][row_index][context_index]['context']
    title_context_dict[data['title'][row_index]]=context

In [80]:
wmd_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz',binary=True)

In [81]:
from nltk.corpus import stopwords

In [116]:
def wmd_distances(query,contexts):
    list_distances = []
    stop_words = set(stopwords.words('english'))
    sent1 = [word for word in nltk.word_tokenize(query) if word not in stop_words]
    tag = nltk.pos_tag(sent1)
    words = []
    for each_tag in tag:
        if each_tag[1] in ['NN','NNP','NNS','VBD','VB']:
            words.append(each_tag[0])
    sent1 = words        
    
    for cont in nltk.sent_tokenize(contexts):
        sent2 = [word for word in nltk.word_tokenize(cont) if word not in stop_words]
        wmd_distance = wmd_model.wmdistance(sent1,sent2)
        list_distances.append(wmd_distance)
        
    WMD_Dataframe = pd.DataFrame({'Sentence': nltk.sent_tokenize(contexts), 'WMD_Score': list_distances}).sort_values(by=['WMD_Score'],ascending=True)
    Top8_sentences = ' '.join([sent for sent in WMD_Dataframe[0:8].Sentence])
    return Top8_sentences

In [132]:
query = 'What is Grotto at Notre Dame?'

In [120]:
def query_context(query):
    doc = final_doc(query,model2)['title'].head(1)[0]
    print("DOC",doc)
    context = title_context_dict[doc]
    wmd_distance = wmd_distances(query,context)
    
    return wmd_distance

In [133]:
query_context(query)

  if np.issubdtype(vec.dtype, np.int):


DOC University_of_Notre_Dame


'Notre Dame\'s most recent[when?] Kelly\'s record in midway through his sixth season at Notre Dame is 52–21. Later that day, the trumpet section will play the Notre Dame Victory March and the Notre Dame Alma Mater under the dome. It was written by two brothers who were Notre Dame graduates. The Notre Dame Leprechaun is the mascot of the athletic teams. The 32 wins were the most by the Fighting Irish team since 1908-09.The "Notre Dame Victory March" is the fight song for the University of Notre Dame. Notre Dame moved its hockey team to Hockey East. What though the odds be great or small, old Notre Dame will win over all.'