In [1]:
import codecs
import string
import sys
import numpy as np
import pandas as pd
import spacy

# Full spaCy English pipeline, loaded through the 'en' shortcut name.
# NOTE(review): the 'en' shortcut presumably only resolves on older spaCy
# releases that support shortcut links — confirm the installed version/model.
nlp = spacy.load('en')

In [2]:
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Load the document metadata table (document_id, set, kind, source URLs,
# word counts, ...) and preview its first rows.
docs_index = pd.read_csv('../../data/documents.csv')
docs_index.head()

Unnamed: 0,document_id,set,kind,story_url,story_file_size,wiki_url,wiki_title,story_word_count,story_start,story_end
0,0025577043f5090cd603c6aea60f26e236195594,test,movie,http://www.awesomefilm.com/script/pumpupthevol...,54078,http://en.wikipedia.org/wiki/Pump_Up_the_Volum...,Pump Up the Volume (film),11499,Happy Harry Hardon,by Martin Eaves
1,0029bdbe75423337b551e42bb31f9a102785376f,train,gutenberg,http://www.gutenberg.org/ebooks/21572.txt.utf-8,814507,http://en.wikipedia.org/wiki/Percival_Keene,Percival Keene,173334,Produced by Nick,new eBooks .
2,00936497f5884881f1df23f4834f6739552cee8b,train,gutenberg,http://www.gutenberg.org/ebooks/3526.txt.utf-8,566874,http://en.wikipedia.org/wiki/Five_Weeks_in_a_B...,Five Weeks in a Balloon,112898,Produced by Judy,new eBooks .
3,00950a3641e6a28b04a6fabf6334140e2deaa9fd,train,gutenberg,http://www.gutenberg.org/ebooks/42188.txt.utf-8,90192,http://en.wikipedia.org/wiki/Shadows_in_the_Mo...,Shadows in the Moonlight (story),17670,Produced by Greg,new eBooks .
4,00ee9e01a0e581e0d8cbf7e865a895147c480c5e,train,movie,http://www.imsdb.com/scripts/Crank.html,309143,http://en.wikipedia.org/wiki/Crank_(film),Crank (film),27546,CRANK Written by,TO SOUNDTRACK .


In [4]:
# Load the question/answer table: each row carries a document_id, a question
# and two reference answers (raw and tokenized). Preview the first ten rows.
questions = pd.read_csv('../../data/qaps.csv')
questions.head(10)

Unnamed: 0,document_id,set,question,answer1,answer2,question_tokenized,answer1_tokenized,answer2_tokenized
0,0025577043f5090cd603c6aea60f26e236195594,test,Who is Mark Hunter?,He is a high school student in Phoenix.,A loner and outsider student with a radio stat...,Who is Mark Hunter ?,He is a high school student in Phoenix .,A loner and outsider student with a radio stat...
1,0025577043f5090cd603c6aea60f26e236195594,test,Where does this radio station take place?,It takes place in Mark's parents basement.,"Phoenix, Arizona",Where does this radio station take place ?,It takes place in Mark s parents basement .,"Phoenix , Arizona"
2,0025577043f5090cd603c6aea60f26e236195594,test,Why do more students tune into Mark's show?,Mark talks about what goes on at school and in...,Because he has a thing to say about what is ha...,Why do more students tune into Mark s show ?,Mark talks about what goes on at school and in...,Because he has a thing to say about what is ha...
3,0025577043f5090cd603c6aea60f26e236195594,test,Who commits suicide?,Malcolm.,Malcolm.,Who commits suicide ?,Malcolm .,Malcolm .
4,0025577043f5090cd603c6aea60f26e236195594,test,What does Paige jam into her microwave?,She jams her medals and accolades.,Her award medals,What does Paige jam into her microwave ?,She jams her medals and accolades .,Her award medals
5,0025577043f5090cd603c6aea60f26e236195594,test,What does Mark do with his radio station?,He dismantles it and attaches it to his mother...,Dismantle it.,What does Mark do with his radio station ?,He dismantles it and attaches it to his mother...,Dismantle it .
6,0025577043f5090cd603c6aea60f26e236195594,test,What does Mark tell the protesting students?,He tells them to make their own future.,That they should make their own future because...,What does Mark tell the protesting students ?,He tells them to make their own future .,That they should make their own future because...
7,0025577043f5090cd603c6aea60f26e236195594,test,Who gets arrested?,Mark and Nora.,Mark and Nora.,Who gets arrested ?,Mark and Nora .,Mark and Nora .
8,0025577043f5090cd603c6aea60f26e236195594,test,What does the radio show cause?,It causes trouble.,It causes much trouble in the community.,What does the radio show cause ?,It causes trouble .,It causes much trouble in the community .
9,0025577043f5090cd603c6aea60f26e236195594,test,Where does Mark Broadcast his station from?,Parent's Basement,At the basement of his home,Where does Mark Broadcast his station from ?,Parent s Basement,At the basement of his home


In [5]:
# Load the Wikipedia summaries table; later cells look rows up by
# 'document_id' and read the 'summary' column.
summaries = pd.read_csv('../../data/third_party/wikipedia/summaries.csv')

# TF-IDF Similarity

In [16]:
# Parse the summary of one fixed document with the full spaCy pipeline.
doc = nlp(
    summaries.loc[
        summaries['document_id'] == '0029bdbe75423337b551e42bb31f9a102785376f',
        'summary',
    ].values[0]
)

In [61]:
def chunk_doc(doc, chunk_size):
    """Split a token sequence into text chunks of ``chunk_size`` tokens.

    Each token contributes its ``text_with_ws`` string. A chunk is emitted
    after every ``chunk_size`` tokens; any non-empty remainder becomes a final,
    shorter chunk.

    Args:
        doc: Iterable of tokens exposing a ``text_with_ws`` attribute.
        chunk_size: Number of tokens per chunk.

    Returns:
        List of chunk strings in document order.
    """
    pieces = []
    buffer = []

    for position, token in enumerate(doc, start=1):
        buffer.append(token.text_with_ws)
        # Close the current chunk every `chunk_size` tokens.
        if position % chunk_size == 0:
            pieces.append(''.join(buffer))
            buffer = []

    # Keep the trailing partial chunk only when it contains text.
    leftover = ''.join(buffer)
    if leftover:
        pieces.append(leftover)

    return pieces

def doc_ir(doc, question, vectorizer, top_n):
    """Rank the sentences of ``doc`` by TF-IDF cosine similarity to ``question``.

    The sentences and the question are vectorized together (the vectorizer is
    re-fitted on every call), so both share one vocabulary and weighting.

    Args:
        doc: Parsed document exposing ``.sents``; each sentence exposes ``.text``.
        question: Query string to compare every sentence against.
        vectorizer: Object with ``fit_transform(list_of_str)`` whose result
            supports ``.toarray()`` (e.g. a ``TfidfVectorizer``).
        top_n: Accepted for interface compatibility but currently unused;
            every sentence is returned.

    Returns:
        List of sentence texts, most similar first. Sentences with equal
        similarity keep their document order.
    """
    sentences = [sent.text for sent in doc.sents]

    # The question goes last so it can be split off again as the final row.
    tfidf_matrix = vectorizer.fit_transform(sentences + [question])
    # Use a plain ndarray instead of np.matrix so each row is a 1-D vector.
    dense = np.asarray(tfidf_matrix.toarray(), dtype=float)

    question_vector = dense[-1]
    sentence_vectors = dense[:-1]

    dot_products = sentence_vectors @ question_vector
    norm_products = (
        np.linalg.norm(sentence_vectors, axis=1) * np.linalg.norm(question_vector)
    )
    # An all-zero vector (no token in the vocabulary) has no direction; score
    # it 0.0 instead of letting 0/0 produce NaN and corrupt the sort order.
    similarities = np.divide(
        dot_products,
        norm_products,
        out=np.zeros_like(dot_products),
        where=norm_products > 0,
    )

    # sorted() is stable, so ties remain in document order.
    ranking = sorted(
        range(len(sentences)),
        key=lambda idx: similarities[idx],
        reverse=True,
    )
    return [sentences[idx] for idx in ranking]

In [62]:
# Pick one document and one of its questions, then rank the sentences of `doc`
# against the first reference answer with TF-IDF cosine similarity.
doc_id = '0029bdbe75423337b551e42bb31f9a102785376f'
i = 0  # position of the question among this document's rows

q = questions.loc[questions['document_id'] == doc_id, 'question'].values[i]
a1 = questions.loc[questions['document_id'] == doc_id, 'answer1'].values[i]
a2 = questions.loc[questions['document_id'] == doc_id, 'answer2'].values[i]
summary = summaries.loc[summaries['document_id'] == doc_id, 'summary'].values[0]

tfidf_vectorizer = TfidfVectorizer()
sent_rank = doc_ir(doc, a1, tfidf_vectorizer, 0)

print(q, a1, a2, sep='\n')

Who is Miss Delmer?
the elderly spinster aunt of the Earl de Verseley and Captain Delmar
She's Captail Delmar's aunt.


In [63]:
# First entry of the ranking returned by doc_ir, i.e. the most similar sentence.
sent_rank[0]

' At Madeline Hall, an old mansion-house near Southampton belonging to the wealthy de Versely family, lives an elderly spinster Miss Delmar, the aunt of the earl de Versely and Captain Delmar.'

# Vector Average

In [8]:
# Second pipeline with the named-entity recognizer disabled; the
# vector-similarity cells parse their texts with this one.
nlp_simple = spacy.load('en', disable=["ner"])

In [9]:
# Select the document and the question/answers used by the vector-similarity
# experiments, then echo them.
doc_id = '0029bdbe75423337b551e42bb31f9a102785376f'
q_num = 0

q = questions.loc[questions['document_id'].eq(doc_id), 'question'].values[q_num]
a1 = questions.loc[questions['document_id'].eq(doc_id), 'answer1'].values[q_num]
a2 = questions.loc[questions['document_id'].eq(doc_id), 'answer2'].values[q_num]
summary = summaries.loc[summaries['document_id'].eq(doc_id), 'summary'].values[0]

print(q, a1, a2, sep='\n')

Who is Miss Delmer?
the elderly spinster aunt of the Earl de Verseley and Captain Delmar
She's Captail Delmar's aunt.


In [10]:
# Parse the first reference answer and the summary with the NER-free pipeline.
a1_doc, summary_doc = map(nlp_simple, (a1, summary))

In [11]:
# Read the cleaned full text of the selected document and parse it.
# Built-in open() replaces the legacy codecs.open(); newline='' keeps line
# endings untranslated, matching what codecs.open() returned. Undecodable
# bytes are dropped (errors='ignore').
with open('../../data/clean/' + doc_id + '-clean.content', 'r',
          encoding='utf-8', errors='ignore', newline='') as f:
    book_text = f.read()

book_doc = nlp_simple(book_text)

## Summary

In [12]:
# Score every summary sentence against the reference answer and order the
# (sentence index, similarity) pairs from most to least similar.
summary_sents = [sentence.text for sentence in summary_doc.sents]
summary_rank = sorted(
    (
        (index, sentence.similarity(a1_doc))
        for index, sentence in enumerate(summary_doc.sents)
    ),
    key=lambda pair: pair[1],
    reverse=True,
)

In [13]:
# Five highest-scoring (sentence index, similarity) pairs for the summary.
summary_rank[:5]

[(0, 0.9353053003675217),
 (3, 0.8344173232254826),
 (5, 0.7910966787495401),
 (22, 0.7865234077050896),
 (13, 0.7821810007353073)]

In [18]:
# Print summary sentence 0, the top entry of the recorded summary_rank output.
print(summary_sents[0])

 At Madeline Hall, an old mansion-house near Southampton belonging to the wealthy de Versely family, lives an elderly spinster Miss Delmar, the aunt of the earl de Versely and Captain Delmar.


## Book

In [15]:
# Score every sentence of the book against the reference answer; book_sents
# keeps the sentence texts so ranked indices can be looked up afterwards.
book_sents = [span.text for span in book_doc.sents]
book_rank = sorted(
    enumerate(span.similarity(a1_doc) for span in book_doc.sents),
    key=lambda entry: entry[1],
    reverse=True,
)

In [16]:
# Five highest-scoring (sentence index, similarity) pairs for the book.
book_rank[:5]

[(817, 0.877449113798834),
 (16, 0.8693175194103513),
 (3935, 0.8466454051546873),
 (3932, 0.8453529605307817),
 (6486, 0.8296636715135427)]

In [19]:
# Print book sentence 16, the second entry of the recorded book_rank output.
print(book_sents[16])

At the period
in which I commence this history, there resided in this mansion an
elderly spinster of rank, named the Honourable Miss Delmar, sister of
the late Lord de Versely and aunt to the present earl, and an Honourable
Captain Delmar, who was the second son of the deceased nobleman.  


## Summary to book

In [20]:
# Rank every book sentence by similarity to a single summary sentence.
# NOTE(review): index 3 is hard-coded; in the recorded run summary_rank placed
# sentence 0 first and sentence 3 second — confirm 3 is the intended query.
best_summary_sentence = nlp_simple(summary_sents[3])
summary_book_rank = list()
for i,sent in enumerate(book_doc.sents):
    # book_sents is deliberately not appended to here: it was already filled
    # when book_rank was computed, and re-appending would grow it with
    # duplicates on every run of this cell.
    summary_book_rank.append((i, sent.similarity(best_summary_sentence)))

summary_book_rank.sort(key=lambda tup: tup[1], reverse=True)

In [21]:
# Five book sentences most similar to the chosen summary sentence.
summary_book_rank[:5]

[(705, 0.8843859428863184),
 (4503, 0.8835197247523201),
 (4589, 0.8689891202784701),
 (817, 0.8676940489653271),
 (46, 0.8673182453372974)]

In [23]:
# Print book sentence 817, which appears in both recorded top-5 lists
# (book_rank and summary_book_rank).
print(book_sents[817])

Raising the Wind_, would be performed on Friday evening, for the
benefit of Miss Mortimer under the patronage of the Honourable Captain
Delmar, and the officers of his Majesty's ship Calliope.  
