# Document Similarity using different methods

## Importing packages

In [1]:
# --- Importing Various packages ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as pl
import json
from pandas.io.json import json_normalize
from time import time
import operator
import scipy.stats as scipy


# --- NLTK PACKAGE ---
import nltk
# Tokenizers
from nltk.tokenize import word_tokenize, sent_tokenize
# Stopwords
from nltk.corpus import stopwords


# --- GENSIM PACKAGE ---
import gensim
from gensim.models import Word2Vec, doc2vec, Doc2Vec
from gensim.models.tfidfmodel import TfidfModel
from gensim import corpora, models, similarities
from gensim.models import KeyedVectors


%matplotlib inline

## Loading Datasets

In [2]:
data_train = pd.read_json('../data/squad_train_doc.json')
data_train.rename(columns={'passages': 'documents'}, inplace=True)

#### Segregating Dataframe

In [3]:
''' Makes a list of list of all contexts per doc and a list of list of all questions per doc
'''
def get_compact_dataframe():
    context_list = []
    question_list = []
    for doc in data_train.documents:
        context_list.append(get_each_context_each_questionSet(doc)[0])
        question_list.append(get_each_context_each_questionSet(doc)[1])
    return context_list, question_list


''' Accepts one document at a time iteratively
    Returns list of context per doc and list of set of questions per doc
''' 
def get_each_context_each_questionSet(document):
    each_doc_context_list = [document[i]['context'] for i in range(len(document))]
    each_doc_question_list = [document[i]['questions'] for i in range(len(document))]
    return  each_doc_context_list, each_doc_question_list

context_list, question_list = get_compact_dataframe()[0], get_compact_dataframe()[1]
compact_dataframe = pd.DataFrame({'title':data_train.title, 'context':context_list, 'questions': question_list})


''' Dataframe contains all 442 documents.
    Each row contains:
        - 1D List of contexts per doc
        - 2D List of list of questions per doc
        - Title Name of one doc 
'''
compact_dataframe.head()

Unnamed: 0,context,questions,title
0,"[Architecturally, the school has a Catholic ch...","[[What is the Grotto at Notre Dame?, To whom d...",University_of_Notre_Dame
1,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,[[In what city and state did Beyonce grow up?...,Beyoncé
2,[Montana i/mɒnˈtænə/ is a state in the Western...,"[[What is its rank in popularion?, What is the...",Montana
3,"[The phrase ""in whole or in part"" has been sub...",[[Which phrase is especially contentious withi...,Genocide
4,[The emergence of resistance of bacteria to an...,[[What is the purpose of antibiotic treatment?...,Antibiotics


----

#### Connected Lists

In [4]:
# Contains the list all titles
title_list = [data_train.title[i] for i in range(data_train.shape[0])]

In [5]:
# Contains the list of all contexts per doc
list_all_context_per_doc = [' '.join(context_list[i]) for i in range(len(context_list))]

In [6]:
# Contains the list of list of all questions per doc
list_all_questions_per_doc = [sum(question_list[i], []) for i in range(len(question_list))]

In [7]:
# Contains the list of all questions combined
list_all_questions = sum(list_all_questions_per_doc, [])

.

.
## Pre-processing for the models

#### Removing Stopwords

In [8]:
''' tokenized_context_and_questions contains the list of word tokenized form of contexts plus questions per each doc
    untokenized_context_and_questions contains the list of string format of contexts plus questions per each doc
'''

stop_words = set(stopwords.words("english"))
tokenized_context_and_questions, untokenized_context_and_questions = [], []

for context, question in zip(list_all_context_per_doc, list_all_questions_per_doc):
    context_words = [word for word in word_tokenize(context) if word not in stop_words]
    question_words = [word for word in word_tokenize(' '.join(question)) if word not in stop_words]
    
    tokenized_context_and_questions.append(context_words + question_words)
    untokenized_context_and_questions.append(' '.join(context_words + question_words))

#### Model Training

In [9]:
# BM25 MODEL
''' Accepts a list of tokenized words of context plus questions as its training data.
'''
BM_25_model = gensim.summarization.bm25.BM25(tokenized_context_and_questions)

In [10]:
# TFIDF MODEL
''' Accepts a list of tokenized words of context plus questions as its training data.
'''
dictionary = corpora.Dictionary(tokenized_context_and_questions)
dictionary.save('/tmp/squad.dict') 
raw_corpus = [dictionary.doc2bow(each_doc) for each_doc in tokenized_context_and_questions]
corpora.MmCorpus.serialize('/tmp/squad.mm', raw_corpus)
corpus = corpora.MmCorpus('/tmp/squad.mm')
tfidf = models.TfidfModel(corpus) 
corpus_tfidf = tfidf[corpus]
TFIDF_model = similarities.MatrixSimilarity(corpus_tfidf)
TFIDF_model.save('/tmp/squad.TFIDF_model')

In [11]:
# Doc2Vec MODEL
''' Accepts a list of untokenized string format of context plus questions as its training data.
'''
size = 200
window = 50
min_count = 1

class DocIterator(object):
    ## Initailizes document's list(doc1,doc2...) and its label's list('Book_1','Book_2',...)
    def __init__(self, doc_list, labels_list):
        self.doc_list = doc_list
        self.labels_list = labels_list    
    ## Assigns label1 to a list of all words in doc1, label2 to all words in doc2, etc.
    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
            yield doc2vec.LabeledSentence(words=doc.split(), tags=[self.labels_list[idx]])

docLabels = title_list
iter_docs = DocIterator(untokenized_context_and_questions, docLabels)
Doc2Vec_model = Doc2Vec(iter_docs, size=size, window=window, min_count=min_count, workers=11, dbow_words = 1)

In [12]:
# # WMD
# ''' Uses a Google pretrained training model to train WMD's model.
# '''
# WMD_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

## Implementing MODEL Methods

In [13]:
''' Accepts a question(query) to implement BM 25 Model.
    Takes a query and word tokenizes it. 
          'get_scores' - Calculates the similarity distance between the tokenized-query and the document.
    
    --> Returns a dataframe with Document name, Score and Rank
'''

def BM25(query):    
    scores = BM_25_model.get_scores(query.split(),1)
    BM25_dataframe = pd.DataFrame({'Document':data_train.title, 'Score_BM25':scores}).sort_values(by=['Score_BM25'],ascending=False)
    BM25_dataframe['Rank_BM25'] = [i for i in range(1, len(data_train.title)+1)]
    return BM25_dataframe

In [14]:
''' Accepts a question(query) to implement TF-IDF Model.
    Takes a query and word tokenizes it. 
    'raw_corpus_query' - The word-tokenized query is compared with the dictionary used to train the document. 
        'corpus_query' - The word-id and word is converted into a corpus.The corpus is then fed to the TF-IDF model.
    'similarity_table' - Stores the TF-IDF weights which are then used to get most similiar documents.
               'ranks' - Scipy method which compares the similarity weights and sorts is accordingly.
    
    --> Returns a dataframe with Document name, Score and Rank
'''

def TFIDF(query): 
    query_1 = []
    query_1.append(word_tokenize(query))
    raw_corpus_query = [dictionary.doc2bow(word) for word in query_1]
    corpora.MmCorpus.serialize('/tmp/query3.mm',raw_corpus_query)
    corpus_query = corpora.MmCorpus('/tmp/query3.mm')
    similarity_table = TFIDF_model[corpus_query]
    ranks = scipy.rankdata(similarity_table, method = 'max')
    similarity_table = list(np.array(similarity_table).flatten())
    TFIDF_dataframe = pd.DataFrame({'Document':data_train.title, 'Score_TFIDF':similarity_table}).sort_values(by=['Score_TFIDF'],ascending=False)
    TFIDF_dataframe['Rank_TFIDF'] = [i for i in range(1, len(data_train.title)+1)]
    return TFIDF_dataframe

In [15]:
''' Accepts a question(query) to implement Doc2Vec Model.
    Takes a query and word tokenizes it. 
       'avg_sentence' - After that the average of the sentenced words are compared with every document.
       'most_similar' - Calculates the similarity distance between the avg of tokenized-sentence with every 
                        document iteratively.
    'list_doc_scores' - Returns the sorted list of comparison with each doc in ascending order.
    
    --> Returns a dataframe with Document name, Score and Rank(top_n, ascending order sorted)
'''

def Doc2Vec(query):
    similarity_score_matrix , list_doc_names, list_doc_scores, list_doc_ranks, rank = [], [], [], [], 1
    avg_sentence = np.zeros((200))
    count = 0
    for word in word_tokenize(query):
        if word in Doc2Vec_model.wv.vocab:
            avg_sentence +=  Doc2Vec_model[word]
            count+=1
    if count != 0:
        avg_sentence = avg_sentence / count
    similarity_score_matrix.append(Doc2Vec_model.docvecs.most_similar([avg_sentence], topn=len(title_list)))
    for each_compared_row in similarity_score_matrix[0]:
        list_doc_names.append(each_compared_row[0])
        list_doc_scores.append(each_compared_row[1])
        list_doc_ranks.append(rank)
        rank += 1
    query_comparison_dataframe = pd.DataFrame({'Document':list_doc_names, 'Score_Doc2Vec':list_doc_scores, 'Rank_Doc2Vec':list_doc_ranks})
    return query_comparison_dataframe

In [16]:
''' Accepts a question(query) to implement Word Mover's Distance Model.
    Takes a query and word tokenizes it. 
                  'sent1' - Stores the word-tokenized question.
                  'sent2' - Stores the word-tokenized context plus questions iteratively for all documents.
    'similarity_distance' - Calculates the similarity distance between sent1 and sent2.
    
    --> Returns a dataframe with Document name, Score(Similarity Distance) and Rank(Ascending order sorted)
'''

def WMD(query):
    list_doc_names, list_doc_scores, list_doc_ranks, rank = [], [], [], 1
    sent1 = word_tokenize(query)
    for index in range(len(data_train.title[0:442])):
        sent2 = tokenized_context_and_questions[index]
        similarity_distance = WMD_model.wmdistance(sent1, sent2)
        list_doc_scores.append(similarity_distance)        
    wmd_dataframe = pd.DataFrame({'Document':data_train.title[0:442], 'Score_WMD':list_doc_scores}).sort_values(by=['Score_WMD'],ascending=False)
    wmd_dataframe['Rank_WMD'] = [i for i in range(1, len(data_train.title[0:442])+1)]
    return wmd_dataframe

## COMBINING DATAFRAME

In [19]:
Document_names_Sorted = title_list.copy()
Document_names_Sorted.sort()

##### Combining all models and returning a list of dataframes for every question

In [24]:
''' Takes out one document at a time and stores all questions in that doc in 'all_questions_each_doc'
    'one_hot_keys' - Makes a list of one-hot-encoded values of 442 values. 
                     This list will be used for all the questions in one document. 
                     Thus for that one document, value will be 1 corresponding to the actual document 
                     and other values will be assigned 0.
                     
        'frames'  -  Contains the list of 87,433 dataframes: Each dataframe is one-question vs all documents(442)
    
    Takes out one question at a time, gets the dataframe from each model having document name, score and rank
    concats all dataframes into one and stores it in 'each_question_score_all_docs'.
    Adds the list of one-hot-keys to every question with actual document to the dataframe and appends in 'frames'.
    
    Total Rows    = (1 x 442) x 87,433times  = 87,433x442
    Total Columns = 9 
'''

frames = []
doc_number = 0

for all_questions_each_doc in list_all_questions_per_doc[0:5]:
    
    one_hot_keys = []
    for each_doc in Document_names_Sorted:
        if each_doc == title_list[doc_number]:
            one_hot_keys.append(1)
        else:
            one_hot_keys.append(0)

    for each_question in all_questions_each_doc[0:1]:
        BM_25_Dataframe = BM25(each_question).sort_values(by=['Document'],ascending=True)
        TFDIF_Dataframe = TFIDF(each_question).sort_values(by=['Document'],ascending=True)
        Doc2Vec_Dataframe = Doc2Vec(each_question).sort_values(by=['Document'],ascending=True)
        
        #WMD
        #WMD_Dataframe = WMD(each_question).sort_values(by=['Document'],ascending=True)
        #each_question_score_all_docs = pd.merge(pd.merge(pd.merge(BM_25_Dataframe, TFDIF_Dataframe), Doc2Vec_Dataframe), WMD_Dataframe)
        
        each_question_score_all_docs = pd.merge(pd.merge(BM_25_Dataframe, TFDIF_Dataframe), Doc2Vec_Dataframe)
        list_each_question = [each_question for i in range(442)] 
        each_question_score_all_docs['Question'] = list_each_question
        each_question_score_all_docs['Actual_Document'] = one_hot_keys
        frames.append(each_question_score_all_docs)
        
    doc_number += 1

##### Final Dataframe : Combining the list of dataframes and Questions List

In [30]:
result = pd.concat(frames, keys=list_all_questions[0:1026], ignore_index=True)
result.head(15)

Unnamed: 0,Document,Score_BM25,Rank_BM25,Score_TFIDF,Rank_TFIDF,Rank_Doc2Vec,Score_Doc2Vec,Question,Actual_Document
0,2008_Sichuan_earthquake,0.623699,23,0.001628,27,381,0.112189,What is the Grotto at Notre Dame?,0
1,2008_Summer_Olympics_torch_relay,6.652963,10,0.002257,17,266,0.193726,What is the Grotto at Notre Dame?,0
2,51st_state,0.621078,220,0.000945,103,353,0.1337,What is the Grotto at Notre Dame?,0
3,ASCII,0.620538,253,0.00056,302,366,0.121287,What is the Grotto at Notre Dame?,0
4,A_cappella,0.619212,328,0.00049,355,57,0.365747,What is the Grotto at Notre Dame?,0
5,Adolescence,0.621875,168,0.000635,246,190,0.247007,What is the Grotto at Notre Dame?,0
6,Adult_contemporary_music,0.622177,144,0.000693,209,114,0.308748,What is the Grotto at Notre Dame?,0
7,Affirmative_action_in_the_United_States,0.621735,180,0.000673,220,78,0.346206,What is the Grotto at Notre Dame?,0
8,Age_of_Enlightenment,0.620538,254,0.001019,87,150,0.284539,What is the Grotto at Notre Dame?,0
9,Aircraft_carrier,0.62194,162,0.000712,200,251,0.203036,What is the Grotto at Notre Dame?,0


------

### Forming a CSV File

In [None]:
result.to_csv('Combined_Dataframe', index=False)