# Filter dataset

In [1]:
import codecs
from collections import Counter
import re

import numpy as np
import pandas as pd
import nltk
import spacy
from tqdm import tqdm

In [2]:
nlp = spacy.load('en_core_web_md')

In [188]:
docs_index = pd.read_csv('../../data/documents.csv')
questions = pd.read_csv('../../data/qaps.csv')

In [5]:
docs_index = docs_index.loc[lambda df: df['kind'] == 'gutenberg']
questions = questions.loc[lambda df: df['question'].str.startswith('Who')]
questions = questions.merge(docs_index.loc[:, ['document_id']], how='inner', on='document_id')

In [7]:
docs_index.head()

Unnamed: 0,document_id,set,kind,story_url,story_file_size,wiki_url,wiki_title,story_word_count,story_start,story_end
0,0029bdbe75423337b551e42bb31f9a102785376f,train,gutenberg,http://www.gutenberg.org/ebooks/21572.txt.utf-8,814507,http://en.wikipedia.org/wiki/Percival_Keene,Percival Keene,173334,Produced by Nick,new eBooks .
1,00936497f5884881f1df23f4834f6739552cee8b,train,gutenberg,http://www.gutenberg.org/ebooks/3526.txt.utf-8,566874,http://en.wikipedia.org/wiki/Five_Weeks_in_a_B...,Five Weeks in a Balloon,112898,Produced by Judy,new eBooks .
2,00950a3641e6a28b04a6fabf6334140e2deaa9fd,train,gutenberg,http://www.gutenberg.org/ebooks/42188.txt.utf-8,90192,http://en.wikipedia.org/wiki/Shadows_in_the_Mo...,Shadows in the Moonlight (story),17670,Produced by Greg,new eBooks .
3,00fb61fa7bee266ad995e52190ebb73606b60b70,valid,gutenberg,http://www.gutenberg.org/ebooks/3771.txt.utf-8,372868,http://en.wikipedia.org/wiki/Cynthia's_Revels,Cynthia's Revels,74928,Produced by Sue,new eBooks .
4,014de1a8802c05ff64efa047e9290fb7fccea2b4,test,gutenberg,http://www.gutenberg.org/ebooks/1329.txt.utf-8,560685,http://en.wikipedia.org/wiki/A_Voyage_to_Arcturus,A Voyage to Arcturus,113790,Produced by An,new eBooks .


In [87]:
questions.head()

Unnamed: 0,document_id,set,question,answer1,answer2,question_tokenized,answer1_tokenized,answer2_tokenized
0,0029bdbe75423337b551e42bb31f9a102785376f,train,Who is Miss Delmer?,the elderly spinster aunt of the Earl de Verse...,She's Captail Delmar's aunt.,Who is Miss Delmer ?,the elderly spinster aunt of the Earl de Verse...,She s Captail Delmar s aunt .
1,0029bdbe75423337b551e42bb31f9a102785376f,train,Who does Arabella Mason wed?,"Ben Keene, Delmar's valet",Ben Keene,Who does Arabella Mason wed ?,"Ben Keene , Delmar s valet",Ben Keene
2,0029bdbe75423337b551e42bb31f9a102785376f,train,Who is the bully that steals Percival's lunch?,"his teacher, Mr. O'Gallagher",The schoolmaster,Who is the bully that steals Percival s lunch ?,"his teacher , Mr. O'Gallagher",The schoolmaster
3,0029bdbe75423337b551e42bb31f9a102785376f,train,Who does Percival convince the Pirates to spare?,a rich Dutch merchant and his daughter Minnie,A Dutch Merchant and his daughter,Who does Percival convince the Pirates to spare ?,a rich Dutch merchant and his daughter Minnie,A Dutch Merchant and his daughter
4,0029bdbe75423337b551e42bb31f9a102785376f,train,Who lives at Madeline Hall?,Miss Delmar,Miss Delmar,Who lives at Madeline Hall ?,Miss Delmar,Miss Delmar


In [9]:
docs_index.to_csv('../../data/documents_books.csv', index=False)
questions.to_csv('../../data/questions_books_who.csv', index=False)

In [3]:
docs_index = pd.read_csv('../../data/documents_books.csv')
questions = pd.read_csv('../../data/questions_books_who.csv')

# Vector entity similarity

In [4]:
def clean_ent_ws(ent):
    return re.sub('[\s]+', ' ', ent.text.strip())

def get_people_ents_and_mentions(doc):
    people_ents = []
    for ent in doc.ents:
        # Get person entities
        if ent.label_ is 'PERSON':
            people_ents.append(ent)
            
    ent_mentions = Counter([clean_ent_ws(ent) for ent in people_ents])
    #ent_mentions = [k for k,v in ent_mentions.items() if v >= 5 and len(k) > 2]
    return people_ents, ent_mentions

def get_ent_most_similar(people_ents, question):
    ent_similarity = {}
    
    for ent in people_ents:
        similarity = ent.sent.similarity(question)

        ent_str = clean_ent_ws(ent)
        if (ent_str not in ent_similarity or 
                ent_similarity[ent_str]['similarity'] < similarity):
            ent_similarity[ent_str] = {
                'question': question.text,
                'similarity': similarity,
                'sentence': ent.sent.text,
                'sent_start_char': ent.sent.start_char,
                'sent_end_char': ent.sent.end_char
            }
            
    return ent_similarity

In [8]:
previous_doc_id = ''
output_csv = ''
output_df = pd.DataFrame()

for i, row in tqdm(questions.iterrows(), total=len(questions)):
    if row['document_id'] != previous_doc_id:
        # process new doc
        with codecs.open('../../data/clean/'+ row['document_id'] +'-clean.content', 'r', 
                         encoding='utf-8', errors='ignore') as f:
            text = f.read()
            
        # Process doc in chunks of 500,000 characters
        chunk_size = 500000
        people_ents_total = []
        ent_mentions_total = {}
        chunk_num = 0
        for start, end in zip(range(0, len(text), chunk_size), range(chunk_size, len(text)+chunk_size, chunk_size)):
            doc = nlp(text[start:end])

            # processing on doc chunk
            people_ents, ent_mentions = get_people_ents_and_mentions(doc)

            # combine people_ents and ent_mentions from other chunks
            people_ents_total.extend(people_ents)
            for k,v in ent_mentions.items():
                if k not in ent_mentions_total:
                    ent_mentions_total.update({k: v})
                else:
                    ent_mentions_total[k] = ent_mentions_total[k] + v

        # Final data structures
        ent_mentions_total = [k for k,v in ent_mentions_total.items() if v >= 5 and len(k) > 2]
        people_ents_total = [ent for ent in people_ents_total if clean_ent_ws(ent) in ent_mentions_total]
        
        csv_output_id = previous_doc_id
        previous_doc_id = row['document_id']
        
    else:
        # use already processed doc
        pass
    
    # Question-sentence similarity
    question = nlp(row['question'])
    ent_similarity = get_ent_most_similar(people_ents_total, question)
    
    if csv_output_id != '' and len(output_df) != 0:
        # If we reach the end of a set of doc questions then output csv
        output_df.to_csv('../../data/ent_candidates/'+ csv_output_id +'-candidates.csv', index=False)
        output_df = pd.DataFrame()
        csv_output_id = ''
    elif i == len(questions) - 1:
        # Output csv of the last iteration
        df = pd.DataFrame.from_dict(ent_similarity, orient='index')
        df.reset_index(level=0, inplace=True)
        df.rename(columns={'index':'entity'}, inplace=True)
        df['document_id'] = row['document_id']
        output_df = pd.concat([output_df, df])
    
        output_df.to_csv('../../data/ent_candidates/'+ row['document_id'] +'-candidates.csv', index=False)
        output_df = pd.DataFrame()
        csv_output_id = ''
        
    # Append question record to datafram for this document
    df = pd.DataFrame.from_dict(ent_similarity, orient='index')
    df.reset_index(level=0, inplace=True)
    df.rename(columns={'index':'entity'}, inplace=True)
    df['document_id'] = row['document_id']
    output_df = pd.concat([output_df, df])

  0%|                                                 | 0/5707 [00:00<?, ?it/s]

NameError: name 'clean_ent_str' is not defined

# WordNet similarity

In [12]:
from nltk.corpus import wordnet as wn

In [80]:
def token_to_synset_id(sentence):
    syn_list = []
    for token in sentence:
        wn_tag = None
        if token.pos_.startswith('N'):
            wn_tag = 'n'
        if token.pos_.startswith('V'):
            wn_tag = 'v'
        if token.pos_.startswith('J'):
            wn_tag = 'a'
        if token.pos_.startswith('R'):
            wn_tag = 'r'
        
        if wn_tag is not None:
            try:
                syn_id = wn.synsets(token.lemma_, wn_tag)[0]
                if syn_id is not None:
                    syn_list.append(syn_id)
            except:
                syn_id = None
    
    return syn_list

def synset_similarity(sentence1, sentence2):
    """ compute the sentence similarity using Wordnet """
    # Get synsets
    syn_list1 = token_to_synset_id(sentence1)
    syn_list2 = token_to_synset_id(sentence2)
    
    score, count = 0.0, 0
    # For each word in the first sentence
    for synset in syn_list1:
        # Get the similarity value of the most similar word in the other sentence
        best_score = None
        sim_list = [synset.path_similarity(ss) or 0 for ss in syn_list2]
        if len(sim_list) != 0:
            best_score = max(sim_list)
        # Check that the similarity could have been computed
        if best_score is not None:
            score += best_score
            count += 1

    # Average the values
    if count != 0:
        score /= count
        return score
    else:
        return 0

In [82]:
sent1 = nlp('Shortly afterwards, Captain Delmar again came over to Madeline Hall, accompanied as usual, by Ben, and ' + 
    'the second day after their arrival it was made known to all whom it might concern, that Miss Arabella Mason' +
    ' had actually contracted a secret marriage with the handsome Benjamin Keene.')

sent2 = nlp('Who does Arabella Mason wed?')

synset_similarity(sent1, sent2)

0.2689123376623377

In [88]:
%%timeit -r1 -n1 -c

sim_list = []

for sent in doc.sents:
    sim_list.append(sent.similarity(sent2))
    
    last_sent = sent.text

2.71464431675264


In [20]:
wn.synset('write.v.01').path_similarity(wn.synset('write.v.01'))

1.0

In [27]:
nltk.edit_distance('Amelia', 'Arabella')

4