# Sentence-Paragraph Similarity

In [1]:
# --- Importing Various packages ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as pl
import json
from pandas.io.json import json_normalize
import math
import scipy.stats as scipy


# --- NLTK PACKAGE ---
import nltk
# Tokenizers
from nltk.tokenize import word_tokenize, sent_tokenize, PunktSentenceTokenizer, RegexpTokenizer
# Lemmatizing
from nltk.stem import WordNetLemmatizer
# Stopwords
from nltk.corpus import stopwords, state_union


# --- GENSIM PACKAGE ---
import gensim
from gensim.models import Word2Vec, doc2vec, Doc2Vec
from gensim.models.tfidfmodel import TfidfModel
from gensim import corpora, models, similarities
from gensim.models import KeyedVectors

%matplotlib inline

## Loading Dataset

In [2]:
data_train = pd.read_json('squad_train_doc.json')
data_train.rename(columns={'passages': 'documents'}, inplace=True)

def get_compact_dataframe():
    context_list = []
    question_list = []
    for doc in data_train.documents:
        context_list.append(get_each_context_each_questionSet(doc)[0])
        question_list.append(get_each_context_each_questionSet(doc)[1])
    return context_list, question_list

def get_each_context_each_questionSet(document):
    each_doc_context_list = [document[i]['context'] for i in range(len(document))]
    each_doc_question_list = [document[i]['questions'] for i in range(len(document))]
    return  each_doc_context_list, each_doc_question_list

context_list, question_list = get_compact_dataframe()[0], get_compact_dataframe()[1]
dataframe = pd.DataFrame({'title':data_train.title, 'context':context_list, 'questions': question_list})
list_all_context_per_doc = [' '.join(context_list[i]) for i in range(len(context_list))]
list_all_questions_per_doc = [sum(question_list[i],[]) for i in range(len(question_list))]

dataframe.head()

Unnamed: 0,context,questions,title
0,"[Architecturally, the school has a Catholic ch...","[[What is the Grotto at Notre Dame?, To whom d...",University_of_Notre_Dame
1,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,[[In what city and state did Beyonce grow up?...,Beyoncé
2,[Montana i/mɒnˈtænə/ is a state in the Western...,"[[What is its rank in popularion?, What is the...",Montana
3,"[The phrase ""in whole or in part"" has been sub...",[[Which phrase is especially contentious withi...,Genocide
4,[The emergence of resistance of bacteria to an...,[[What is the purpose of antibiotic treatment?...,Antibiotics


------------

## QUERY

In [61]:
query = 'What is Grotto at Norte Dame?'

In [4]:
# Storing document[i] in document
document = list_all_context_per_doc[0]

In [11]:
stop_words = set(stopwords.words("english"))

## Loading WMD Pretrained Model

In [5]:
WMD_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

.


.


.

### Approach 1 : BM25 Filter  +  WMD

In [39]:
def chunking(document, step):
    sentences = sent_tokenize(document)
    chunks = []
    chunks = [sentences[i:i+step] for i in range(0, len(sentences), step)]
    #chunks = [sentences[i:i+10] for i in range(0, len(sentences)-10)]
    return chunks

In [40]:
chunks = chunking(document, 10)

In [41]:
len(chunks)

38

In [42]:
def remove_stopwords(chunks):
    chunked_words = [word_tokenize(' '.join(chunk)) for chunk in chunks]
    chunked_words_without_stopwords = []
    for chunk in chunked_words:
        chunked_words_without_stopwords.append([word for word in chunk if word not in stop_words])
    return chunked_words_without_stopwords

In [43]:
tokenized_chunked_words = remove_stopwords(chunks)

In [44]:
BM25_Paragraph_Model = gensim.summarization.bm25.BM25(tokenized_chunked_words)

In [45]:
def BM25(query):    
    scores = BM25_Paragraph_Model.get_scores(query.split() ,1) 
    chunk_names = [i for i in range(len(chunks))]
    BM25_dataframe = pd.DataFrame({'Chunk':chunk_names, 'Score':scores}).sort_values(by=['Score'],ascending=False)
    BM25_dataframe['Rank'] = [i for i in range(1, len(chunks)+1)]

    return BM25_dataframe.reset_index()

In [46]:
BM25_Dataframe = BM25(query)
BM25_Dataframe.head(5)

Unnamed: 0,index,Chunk,Score,Rank
0,36,36,4.951295,1
1,0,0,3.133145,2
2,12,12,3.133145,3
3,2,2,3.133145,4
4,20,20,3.133145,5


In [47]:
def chunk_selector(BM25_Dataframe, threshold):
    highest_score = BM25_Dataframe.Score[0]
    selected_chunk_numbers = []
    selected_chunk_numbers.append(BM25_Dataframe.Chunk[0])
    if BM25_Dataframe.Score[0] - BM25_Dataframe.Score[1] < threshold:
        for index in range(1, len(BM25_Dataframe.Score)):    
            if BM25_Dataframe.Score[0] - BM25_Dataframe.Score[index] < threshold:
                selected_chunk_numbers.append(BM25_Dataframe.Chunk[index])
    
    return selected_chunk_numbers

In [48]:
selected_chunk_numbers = chunk_selector(BM25_Dataframe, 9)

In [49]:
def WMD_in_chunks(query, chunks, selected_chunk_numbers):

    sent1 = word_tokenize(query)
    
    list_sentences, list_distances, list_chunkNumber, list_sentence_index = [], [], [], []  
    for index in selected_chunk_numbers:
        for each_sentence_index, each_sentence in zip(range(len(chunks[index])), chunks[index]):
            sent2 = word_tokenize(each_sentence)
            similarity_distance = WMD_model.wmdistance(sent1, sent2)
            list_sentences.append(each_sentence)
            list_distances.append(similarity_distance)
            list_sentence_index.append(each_sentence_index)
            list_chunkNumber.append(index)
    WMD_Dataframe = pd.DataFrame({'Sentence': list_sentences, 'WMD_Score': list_distances, 'Chunk': list_chunkNumber, 'Sentence_Index': list_sentence_index})

    return WMD_Dataframe 

In [50]:
WMD_Dataframe = WMD_in_chunks(query, chunks, selected_chunk_numbers).sort_values(by=['WMD_Score'], ascending=True).reset_index()
WMD_Dataframe.head(10)

Unnamed: 0,index,Chunk,Sentence,Sentence_Index,WMD_Score
0,184,35,"The ""Notre Dame Victory March"" is the fight so...",4,2.554218
1,160,33,Kelly's record in midway through his sixth sea...,0,2.628065
2,202,19,Notre Dame's most recent[when?],2,2.707056
3,47,20,"The Grotto of Our Lady of Lourdes, which was b...",7,2.718668
4,140,31,The Notre Dame Leprechaun is the mascot of the...,0,2.738287
5,1,36,"What though the odds be great or small, old No...",1,2.752606
6,14,0,"Immediately behind the basilica is the Grotto,...",4,2.811086
7,178,34,"The team is coached by Mike Brey, who, as of t...",8,2.88631
8,8,36,Notre Dame alumni work in various fields.,8,2.886595
9,170,34,"Later that day, the trumpet section will play ...",0,2.888879


In [51]:
def paragraph1(chunks, WMD_Dataframe_Top):
    
    chunk_index = WMD_Dataframe_Top.Chunk[0]
    sentence_index = WMD_Dataframe_Top.Sentence_Index[0]
    if sentence_index >=3 and sentence_index <=6:
        paragraph = chunks[chunk_index][sentence_index-3:sentence_index+5]
    elif sentence_index <3:
        if chunk_index!=0:
            previous_chunk = chunks[chunk_index-1][-3:]
            paragraph = previous_chunk[sentence_index:] +  chunks[chunk_index][:sentence_index+5]        
        else:
            paragraph = chunks[chunk_index][0:8]             
    else:
        if chunk_index != len(chunks)-1 :
            after_chunk = chunks[chunk_index+1][0:4]
            paragraph = chunks[chunk_index][sentence_index-3:] + after_chunk[0:4 - (9 -sentence_index)]
        else:
            paragraph = chunks[chunk_index][-8:]        

    return ' '.join(paragraph)

In [None]:
para =[]

for i in range(0,11):
    para.append(paragraph(chunks, WMD_Dataframe[i:i+1].reset_index()))

### Approach 2: Only WMD

In [58]:
data = paragraph2(query, document)
data.head(10)

Unnamed: 0,Sentence,Sentence_Index,WMD_Score
40,program also exists.,40,3.593895
59,"1,400 of the 3,577 (39.1%) were admitted under...",59,3.63665
372,John Jenkins.,372,3.661503
221,"A new engineering building, Stinson-Remick Hal...",221,3.663944
281,"He soon erected additional buildings, includin...",281,3.706028
175,"O'Shaughnessy, at the time the largest ever ma...",175,3.71905
285,"With each new president, new academic programs...",285,3.725277
195,Jenkins took over the position from Malloy on ...,195,3.727485
33,"This program has been recognized previously, b...",33,3.737364
87,Lobund was the first research organization to ...,87,3.740897


In [57]:
def paragraph2(query, document):
    sent1 = [word for word in word_tokenize(query) if word not in stop_words]
    tag = nltk.pos_tag(sent1)
    words = []
    for each_tag in tag:
        if each_tag[1] == 'NN' or each_tag[1] == 'NNP' or each_tag[1] == 'NNS' or each_tag[1] == 'VBD' or each_tag[1] == 'VB':
            words.append(each_tag[0])
    sent1 = words
    index = 0
    sentences = sent_tokenize(document)
    list_distances, list_sentence_index = [], []
    for each_sentence in sentences:
        sent2 = [word for word in word_tokenize(each_sentence) if word not in stop_words]
        similarity_distance = WMD_model.wmdistance(sent1, sent2)
        list_distances.append(similarity_distance)
        list_sentence_index.append(index)
        index+=1
    WMD_Dataframe = pd.DataFrame({'Sentence': sentences, 'Sentence_Index': list_sentence_index, 'WMD_Score': list_distances}).sort_values(by=['WMD_Score'],ascending=True) 
    Top8_sentences = ' '.join([sent for sent in WMD_Dataframe[0:8].Sentence])
   
    return WMD_Dataframe

### Approach 3: USing Named Entity Recognition, Search and then WMD

In [62]:
paragraph3(query, document).head(10)

[('What', 'WP'), ('is', 'VBZ'), ('Grotto', 'NNP'), ('at', 'IN'), ('Norte', 'NNP'), ('Dame', 'NNP'), ('?', '.')]

 ****** ['Grotto', 'Norte', 'Dame'] 
 ***** 



Unnamed: 0,Sentence,Sentence_Index,WMD_Score
64,"The ""Notre Dame Victory March"" is the fight so...",64,3.406189
70,Notre Dame alumni work in various fields.,70,3.525966
60,Kelly's record in midway through his sixth sea...,60,3.550963
48,Notre Dame teams are known as the Fighting Irish.,48,3.583133
62,"Later that day, the trumpet section will play ...",62,3.622741
65,It was written by two brothers who were Notre ...,65,3.671605
33,"Since 2005, Notre Dame has been led by John I....",33,3.688666
28,Holy Cross Father John Francis O'Hara was elec...,28,3.700507
24,The success of its football team made Notre Da...,24,3.723258
67,"What though the odds be great or small, old No...",67,3.726108


In [63]:
def paragraph3(query, document):
    index = 0
    
    tag = nltk.pos_tag(word_tokenize(query))
    print(tag)
    words = []
    for t in tag:
        if t[1] == 'NN' or t[1] == 'NNP' or t[1] == 'NNS':
            words.append(t[0])
    sent1 = words
    print("\n ******", sent1, "\n ***** \n")

    sentences = sent_tokenize(document)
    list_distances, list_sentence_index, list_sentences = [], [], []
    
    for each_sentence in sentences:
        if not set(sent1).isdisjoint(each_sentence.split()):
            sent2 = [word for word in word_tokenize(each_sentence) if word not in stop_words]
            similarity_distance = WMD_model.wmdistance(sent1, sent2)
            list_sentences.append(each_sentence)
            list_distances.append(similarity_distance)
            list_sentence_index.append(index)
            index+=1
    WMD_Dataframe = pd.DataFrame({'Sentence': list_sentences, 'Sentence_Index': list_sentence_index, 'WMD_Score': list_distances}).sort_values(by=['WMD_Score'],ascending=True) 
    Top8_sentences = ' '.join([sent for sent in WMD_Dataframe[0:8].Sentence])
   
    return WMD_Dataframe