## Vector Representations (TF-IDF) : Sentence-Document  Comparison

### Importing Packages

In [11]:
# --- Importing Various packages ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as pl
import json
from pandas.io.json import json_normalize
from time import time
import operator
import scipy.stats as scipy


# --- NLTK PACKAGE ---
import nltk
# Tokenizers
from nltk.tokenize import word_tokenize, sent_tokenize
# Stopwords
from nltk.corpus import stopwords


# --- GENSIM PACKAGE ---
import gensim
from gensim.models import Word2Vec, doc2vec, Doc2Vec
from gensim.models.tfidfmodel import TfidfModel
from gensim import corpora, models, similarities
from gensim.models import KeyedVectors


%matplotlib inline

### Loading Documents

In [7]:
data_train = pd.read_json('squad_train_doc.json')
data_train.rename(columns={'passages': 'documents'}, inplace=True)

### Segregating Data to form a Dataframe

In [8]:
def get_compact_dataframe():
    context_list = []
    question_list = []
    for doc in data_train.documents:
        context_list.append(get_each_context_each_questionSet(doc)[0])
        question_list.append(get_each_context_each_questionSet(doc)[1])
    return context_list, question_list

def get_each_context_each_questionSet(document):
    each_doc_context_list = [document[i]['context'] for i in range(len(document))]
    each_doc_question_list = [document[i]['questions'] for i in range(len(document))]
    return  each_doc_context_list, each_doc_question_list

context_list, question_list = get_compact_dataframe()[0], get_compact_dataframe()[1]
dataframe = pd.DataFrame({'title':data_train.title, 'context':context_list, 'questions': question_list})
list_all_context_per_doc = [' '.join(context_list[i]) for i in range(len(context_list))]
list_all_questions_per_doc = [sum(question_list[i],[]) for i in range(len(question_list))]

dataframe.head()

Unnamed: 0,context,questions,title
0,"[Architecturally, the school has a Catholic ch...","[[What is the Grotto at Notre Dame?, To whom d...",University_of_Notre_Dame
1,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,[[In what city and state did Beyonce grow up?...,Beyoncé
2,[Montana i/mɒnˈtænə/ is a state in the Western...,"[[What is its rank in popularion?, What is the...",Montana
3,"[The phrase ""in whole or in part"" has been sub...",[[Which phrase is especially contentious withi...,Genocide
4,[The emergence of resistance of bacteria to an...,[[What is the purpose of antibiotic treatment?...,Antibiotics


### Training Data

In [9]:
stop_words = set(stopwords.words("english"))
tokenized_context_and_questions = []

for context, question in zip(list_all_context_per_doc, list_all_questions_per_doc):
    context_words = [word for word in word_tokenize(context) if word not in stop_words]
    question_words = [word for word in word_tokenize(' '.join(question)) if word not in stop_words]
    tokenized_context_and_questions.append(context_words + question_words)

### Model

In [12]:
# TFIDF MODEL
''' Accepts a list of tokenized words of context plus questions as its training data.
'''
dictionary = corpora.Dictionary(tokenized_context_and_questions)
dictionary.save('/tmp/squad.dict') 
raw_corpus = [dictionary.doc2bow(each_doc) for each_doc in tokenized_context_and_questions]
corpora.MmCorpus.serialize('/tmp/squad.mm', raw_corpus)
corpus = corpora.MmCorpus('/tmp/squad.mm')
tfidf = models.TfidfModel(corpus) 
corpus_tfidf = tfidf[corpus]
TFIDF_model = similarities.MatrixSimilarity(corpus_tfidf)
TFIDF_model.save('/tmp/squad.TFIDF_model')

##### Saving the model

In [13]:
#TFIDF_model.save('TFIDF.model')

##### Loading the model

In [14]:
#model_loaded = gensim.models.Doc2Vec.load('TFIDF.model')

### Sentence - Document Comparison

In [16]:
query = 'What is Grotto at Notre Dame?'

In [17]:
TFIDF(query).head(10)

Unnamed: 0,Document,Score_TFIDF,Rank_TFIDF
0,University_of_Notre_Dame,0.611371,1
166,Order_of_the_British_Empire,0.063304,2
327,Paris,0.046176,3
340,Appalachian_Mountains,0.02989,4
435,Humanism,0.016467,5
237,Gothic_architecture,0.012007,6
338,Neoclassical_architecture,0.011725,7
287,Dominican_Order,0.010206,8
55,Universal_Studios,0.01003,9
132,Hanover,0.006234,10


In [15]:
''' Accepts a question(query) to implement TF-IDF Model.
    Takes a query and word tokenizes it. 
    'raw_corpus_query' - The word-tokenized query is compared with the dictionary used to train the document. 
        'corpus_query' - The word-id and word is converted into a corpus.The corpus is then fed to the TF-IDF model.
    'similarity_table' - Stores the TF-IDF weights which are then used to get most similiar documents.
               'ranks' - Scipy method which compares the similarity weights and sorts is accordingly.
    
    --> Returns a dataframe with Document name, Score and Rank
'''

def TFIDF(query): 
    query_1 = []
    query_1.append(word_tokenize(query))
    raw_corpus_query = [dictionary.doc2bow(word) for word in query_1]
    corpora.MmCorpus.serialize('/tmp/query3.mm',raw_corpus_query)
    corpus_query = corpora.MmCorpus('/tmp/query3.mm')
    similarity_table = TFIDF_model[corpus_query]
    ranks = scipy.rankdata(similarity_table, method = 'max')
    similarity_table = list(np.array(similarity_table).flatten())
    TFIDF_dataframe = pd.DataFrame({'Document':data_train.title, 'Score_TFIDF':similarity_table}).sort_values(by=['Score_TFIDF'],ascending=False)
    TFIDF_dataframe['Rank_TFIDF'] = [i for i in range(1, len(data_train.title)+1)]
    return TFIDF_dataframe