## Vector Representations (BM_25) : Sentence-Document  Comparison

### Importing Packages

In [15]:
import numpy as np
import pandas as pd
from os import listdir

# --- NLTK PACKAGE ---
import nltk
# Tokenizers
from nltk.tokenize import word_tokenize, sent_tokenize, PunktSentenceTokenizer, RegexpTokenizer
# Stemming and Lemmatizing
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Stopwords
from nltk.corpus import stopwords, state_union, brown, movie_reviews, treebank

# --- GENSIM PACKAGE ---
import gensim, logging
from gensim.models import Word2Vec, doc2vec, Doc2Vec

### Loading Documents

In [17]:
data_train = pd.read_json('squad_train_doc.json')
data_train.rename(columns={'passages': 'documents'}, inplace=True)

### Segregating Data to form a Dataframe

In [19]:
def get_compact_dataframe():
    context_list = []
    question_list = []
    for doc in data_train.documents:
        context_list.append(get_each_context_each_questionSet(doc)[0])
        question_list.append(get_each_context_each_questionSet(doc)[1])
    return context_list, question_list

def get_each_context_each_questionSet(document):
    each_doc_context_list = [document[i]['context'] for i in range(len(document))]
    each_doc_question_list = [document[i]['questions'] for i in range(len(document))]
    return  each_doc_context_list, each_doc_question_list

context_list, question_list = get_compact_dataframe()[0], get_compact_dataframe()[1]
dataframe = pd.DataFrame({'title':data_train.title, 'context':context_list, 'questions': question_list})
list_all_context_per_doc = [' '.join(context_list[i]) for i in range(len(context_list))]
list_all_questions_per_doc = [sum(question_list[i],[]) for i in range(len(question_list))]

dataframe.head()

Unnamed: 0,context,questions,title
0,"[Architecturally, the school has a Catholic ch...","[[What is the Grotto at Notre Dame?, To whom d...",University_of_Notre_Dame
1,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,[[In what city and state did Beyonce grow up?...,Beyoncé
2,[Montana i/mɒnˈtænə/ is a state in the Western...,"[[What is its rank in popularion?, What is the...",Montana
3,"[The phrase ""in whole or in part"" has been sub...",[[Which phrase is especially contentious withi...,Genocide
4,[The emergence of resistance of bacteria to an...,[[What is the purpose of antibiotic treatment?...,Antibiotics


### Training Data

In [20]:
stop_words = set(stopwords.words("english"))
tokenized_context_and_questions = []

for context, question in zip(list_all_context_per_doc, list_all_questions_per_doc):
    context_words = [word for word in word_tokenize(context) if word not in stop_words]
    question_words = [word for word in word_tokenize(' '.join(question)) if word not in stop_words]
    tokenized_context_and_questions.append(context_words + question_words)

### Model

In [24]:
# BM25 MODEL
''' Accepts a list of tokenized words of context plus questions as its training data.
'''
BM_25_model = gensim.summarization.bm25.BM25(tokenized_context_and_questions)

##### Saving the model

In [9]:
#BM_25_model.save('bm25.model')

##### Loading the model

In [10]:
#model_loaded = gensim.models.Doc2Vec.load('bm25.model')

### Sentence - Document Comparison

In [25]:
query = 'What is Grotto at Notre Dame?'

In [29]:
BM25(query).head(10)

Unnamed: 0,Document,Score_BM25,Rank_BM25
0,University_of_Notre_Dame,21.450317,1
132,Hanover,9.429974,2
327,Paris,9.16379,3
237,Gothic_architecture,7.794399,4
287,Dominican_Order,7.794269,5
55,Universal_Studios,7.792825,6
12,To_Kill_a_Mockingbird,7.792561,7
435,Humanism,7.790938,8
340,Appalachian_Mountains,7.788856,9
21,2008_Summer_Olympics_torch_relay,6.652963,10


In [27]:
''' Accepts a question(query) to implement BM 25 Model.
    Takes a query and word tokenizes it. 
          'get_scores' - Calculates the similarity distance between the tokenized-query and the document.
    
    --> Returns a dataframe with Document name, Score and Rank
'''
def BM25(query):    
    scores = BM_25_model.get_scores(query.split(),1)
    BM25_dataframe = pd.DataFrame({'Document':data_train.title, 'Score_BM25':scores}).sort_values(by=['Score_BM25'],ascending=False)
    BM25_dataframe['Rank_BM25'] = [i for i in range(1, len(data_train.title)+1)]
    return BM25_dataframe