In [5]:
import codecs
import csv
import re

import numpy as np
import pandas as pd
import spacy
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [10]:
# Load the medium English model with the parser and NER components disabled and
# max_length raised to 10**7 (presumably so whole stories fit in one `nlp` call).
# "tagger" was considered for the disable list but is left enabled.
# Alternative tried earlier: spacy.load('en', disable=["ner"])
nlp = spacy.load(
    'en_core_web_md',
    disable=["parser", "ner"],
    max_length=10**7,
)
# Add the sentencizer component; a later cell iterates `doc.sents`.
nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [21]:
# Number of documents per value of the `set` column.
# Gutenberg-only variant:
#   docs_index.loc[lambda df: df['kind'] == 'gutenberg', 'set'].value_counts()
docs_index.loc[:, 'set'].value_counts()

train    1102
test      355
valid     115
Name: set, dtype: int64

In [7]:
# Load the per-document index and preview its first rows.
docs_index = pd.read_csv(filepath_or_buffer='../../data/documents.csv')
docs_index.head(n=5)

Unnamed: 0,document_id,set,kind,story_url,story_file_size,wiki_url,wiki_title,story_word_count,story_start,story_end
0,0025577043f5090cd603c6aea60f26e236195594,test,movie,http://www.awesomefilm.com/script/pumpupthevol...,54078,http://en.wikipedia.org/wiki/Pump_Up_the_Volum...,Pump Up the Volume (film),11499,Happy Harry Hardon,by Martin Eaves
1,0029bdbe75423337b551e42bb31f9a102785376f,train,gutenberg,http://www.gutenberg.org/ebooks/21572.txt.utf-8,814507,http://en.wikipedia.org/wiki/Percival_Keene,Percival Keene,173334,Produced by Nick,new eBooks .
2,00936497f5884881f1df23f4834f6739552cee8b,train,gutenberg,http://www.gutenberg.org/ebooks/3526.txt.utf-8,566874,http://en.wikipedia.org/wiki/Five_Weeks_in_a_B...,Five Weeks in a Balloon,112898,Produced by Judy,new eBooks .
3,00950a3641e6a28b04a6fabf6334140e2deaa9fd,train,gutenberg,http://www.gutenberg.org/ebooks/42188.txt.utf-8,90192,http://en.wikipedia.org/wiki/Shadows_in_the_Mo...,Shadows in the Moonlight (story),17670,Produced by Greg,new eBooks .
4,00ee9e01a0e581e0d8cbf7e865a895147c480c5e,train,movie,http://www.imsdb.com/scripts/Crank.html,309143,http://en.wikipedia.org/wiki/Crank_(film),Crank (film),27546,CRANK Written by,TO SOUNDTRACK .


In [8]:
# Load the question/answer pairs (one row per question) and preview the first rows.
questions = pd.read_csv(filepath_or_buffer='../../data/qaps.csv')
questions.head(n=5)

Unnamed: 0,document_id,set,question,answer1,answer2,question_tokenized,answer1_tokenized,answer2_tokenized
0,0025577043f5090cd603c6aea60f26e236195594,test,Who is Mark Hunter?,He is a high school student in Phoenix.,A loner and outsider student with a radio stat...,Who is Mark Hunter ?,He is a high school student in Phoenix .,A loner and outsider student with a radio stat...
1,0025577043f5090cd603c6aea60f26e236195594,test,Where does this radio station take place?,It takes place in Mark's parents basement.,"Phoenix, Arizona",Where does this radio station take place ?,It takes place in Mark s parents basement .,"Phoenix , Arizona"
2,0025577043f5090cd603c6aea60f26e236195594,test,Why do more students tune into Mark's show?,Mark talks about what goes on at school and in...,Because he has a thing to say about what is ha...,Why do more students tune into Mark s show ?,Mark talks about what goes on at school and in...,Because he has a thing to say about what is ha...
3,0025577043f5090cd603c6aea60f26e236195594,test,Who commits suicide?,Malcolm.,Malcolm.,Who commits suicide ?,Malcolm .,Malcolm .
4,0025577043f5090cd603c6aea60f26e236195594,test,What does Paige jam into her microwave?,She jams her medals and accolades.,Her award medals,What does Paige jam into her microwave ?,She jams her medals and accolades .,Her award medals


In [23]:
# Count questions whose first three characters are "who" (case-insensitive).
# This is a prefix test, so it also matches longer words that start with "who".
count = sum(
    1
    for question_text in questions['question']
    if question_text[:3].lower() == 'who'
)
print(count)

11389


In [24]:
from collections import Counter

# Lower-cased first whitespace-separated token of every pre-tokenized question,
# tallied by frequency.
first_words = [
    tokenized.split()[0].lower()
    for tokenized in questions['question_tokenized']
]
counts = Counter(first_words)

In [28]:
# NOTE(review): looks like leftover scratch. `register()` is called with no
# arguments, which argparse's registry method presumably rejects with a TypeError,
# stopping a Restart & Run All here. Nothing else in the visible notebook uses
# argparse — confirm and remove this cell.
import argparse
argparse.ArgumentParser().register()

In [27]:
# Ten most frequent question-opening words, most common first.
counts.most_common(10)

[('what', 17928),
 ('who', 11033),
 ('how', 4888),
 ('why', 4568),
 ('where', 3514),
 ('which', 1015),
 ('when', 768),
 ('in', 548),
 ('after', 281),
 ('whose', 260)]

In [6]:
def chunk_doc(doc, chunk_size):
    """Split ``doc`` into text chunks of ``chunk_size`` consecutive tokens.

    Each chunk is the concatenation of the tokens' ``text_with_ws`` values, so
    joining all chunks reproduces the concatenated token text. The final chunk
    holds whatever tokens remain and may be shorter.

    Args:
        doc: Iterable of tokens exposing a ``text_with_ws`` string attribute.
        chunk_size: Number of tokens per chunk.

    Returns:
        List of chunk strings in document order.
    """
    pieces = []
    buffer = []

    # Count tokens from 1 so a chunk closes exactly on every chunk_size-th token.
    for position, token in enumerate(doc, start=1):
        buffer.append(token.text_with_ws)
        if position % chunk_size == 0:
            pieces.append(''.join(buffer))
            buffer = []

    # Keep the trailing partial chunk only when it actually contains text.
    tail = ''.join(buffer)
    if tail != '':
        pieces.append(tail)

    return pieces

def doc_ir(chunks, question, vectorizer, top_n):
    """Return the ``top_n`` chunks most similar to ``question``, in document order.

    The chunks and the question are vectorized together with
    ``vectorizer.fit_transform`` and each chunk is scored by its cosine
    similarity to the question vector.

    Args:
        chunks: List of text chunks. The list is not modified.
        question: Question text to match against the chunks.
        vectorizer: Object whose ``fit_transform(list_of_str)`` returns a matrix
            exposing ``.todense()`` (the notebook passes a ``TfidfVectorizer``).
        top_n: Maximum number of chunks to return.

    Returns:
        The selected chunk strings, ordered by their position in ``chunks``.
    """
    # Vectorize a fresh list: appending the question to the caller's list would
    # leave it behind as a bogus "chunk" for every later call on the same list.
    corpus = list(chunks) + [question]
    tfidf_matrix = np.asarray(vectorizer.fit_transform(corpus).todense(), dtype=float)

    question_vector = tfidf_matrix[-1]
    chunk_vectors = tfidf_matrix[:-1]

    # Cosine similarity of every chunk against the question in one pass.
    dots = chunk_vectors.dot(question_vector)
    norms = np.linalg.norm(chunk_vectors, axis=1) * np.linalg.norm(question_vector)

    # A chunk with no features has norm 0; score it 0 instead of NaN so the
    # ranking below stays well-defined.
    similarities = np.zeros(len(chunk_vectors))
    has_features = norms > 0
    similarities[has_features] = dots[has_features] / norms[has_features]

    # Best-scoring chunks first (stable for ties), then restore document order.
    ranked = sorted(enumerate(similarities), key=lambda pair: pair[1], reverse=True)
    selected = sorted(position for position, _ in ranked[:top_n])

    return [corpus[position] for position in selected]

# Create training sets

In [None]:
# Build the TF-IDF retrieval dataset from the cleaned story texts: one record per
# question holding the 5 best-matching 20-token chunks, joined by '<del>' and
# terminated by '</c>'.

# Create the vectorizer here so this cell also runs on a fresh kernel; no
# earlier cell in the notebook defines `tfidf_vectorizer`.
tfidf_vectorizer = TfidfVectorizer()

with codecs.open('../../data/ir_chunk_dataset2.csv', 'w', encoding='utf-8', errors='ignore') as f:
    prev_doc_id = ''
    for index, row in tqdm(questions.iterrows(), total=len(questions)):
        doc_id, q = row['document_id'], row['question']

        # Re-parse and re-chunk the story only when the document changes
        # (assumes rows of the same document are adjacent — TODO confirm).
        if prev_doc_id != doc_id:
            with codecs.open('../../data/clean/'+doc_id+'-clean.content', 'r', encoding='utf-8', errors='ignore') as g:
                doc = nlp(g.read())
            prev_doc_id = doc_id
            chunks = chunk_doc(doc, 20)
            #chunks = [sent.text for sent in doc.sents]

        # Hand over a copy so repeated retrieval on the same document never
        # sees earlier questions among the candidate chunks.
        ir_chunks = doc_ir(list(chunks), q, tfidf_vectorizer, 5)
        ir_output = '<del>'.join(ir_chunks)
        ir_output += '</c>'
        f.write("{}\n".format(ir_output))

In [None]:
# Build the TF-IDF retrieval dataset from the entity-anonymized stories as a
# single-quoted CSV with the columns document_id, text, question, answer.
tfidf_vectorizer = TfidfVectorizer()

with codecs.open('../../data/ir_chunk_dataset.csv', 'w', encoding='utf-8', errors='ignore') as f:
    # Every field is wrapped in single quotes and a quote inside a field is
    # backslash-escaped, so apostrophes in stories, questions and answers can no
    # longer break the row structure. Read the file back with
    # quotechar="'" and escapechar="\\".
    writer = csv.writer(
        f,
        quotechar="'",
        quoting=csv.QUOTE_ALL,
        doublequote=False,
        escapechar='\\',
        lineterminator='\n',
    )
    writer.writerow(['document_id', 'text', 'question', 'answer'])

    prev_doc_id = ''
    for index, row in tqdm(questions.iterrows(), total=len(questions)):
        doc_id, q, a = row['document_id'], row['question'], row['answer1']

        # Re-parse and re-chunk the story only when the document changes
        # (assumes rows of the same document are adjacent — TODO confirm).
        if prev_doc_id != doc_id:
            with codecs.open('../../data/anonymized_entities/'+doc_id+'-clean.content', 'r', encoding='utf-8', errors='ignore') as g:
                doc = nlp(g.read())
            prev_doc_id = doc_id
            chunks = chunk_doc(doc, 20)
            #chunks = [sent.text for sent in doc.sents]

        # Hand over a copy so repeated retrieval on the same document never
        # sees earlier questions among the candidate chunks.
        ir_chunks = doc_ir(list(chunks), q, tfidf_vectorizer, 5)
        ir_output = '<del>'.join(ir_chunks)
        ir_output += '</c>'

        # Write the joined retrieval text, not the Python list repr of ir_chunks.
        writer.writerow([doc_id, ir_output, q, a])

In [None]:
# Sentence-level variant of the retrieval dataset: each sentence of the story is
# scored with `similarity` against the parsed question, and the 5 best sentences
# are written in document order, joined by '<del>' and terminated by '</c>'.
# NOTE(review): the output path is the same as the TF-IDF cell that reads from
# data/clean, so whichever of the two cells runs last overwrites the other.
with codecs.open('../../data/ir_chunk_dataset2.csv', 'w', encoding='utf-8', errors='ignore') as f:
    prev_doc_id = ''

    for index, row in tqdm(questions.iterrows(), total=len(questions)):
        doc_id, q, a = row['document_id'], row['question'], row['answer1']
        doc_q = nlp(q)

        # Parse the story only when the document id changes between rows.
        if prev_doc_id != doc_id:
            with codecs.open('../../data/clean/'+doc_id+'-clean.content', 'r', encoding='utf-8', errors='ignore') as g:
                doc = nlp(g.read())
            prev_doc_id = doc_id

        # (sentence index, similarity to the question, sentence text)
        sent_rank = [
            (position, sentence.similarity(doc_q), sentence.text)
            for position, sentence in enumerate(doc.sents)
        ]

        # Keep the five highest-scoring sentences, then restore document order.
        sent_rank = sorted(sent_rank, key=lambda entry: entry[1], reverse=True)[:5]
        sent_rank = sorted(sent_rank, key=lambda entry: entry[0])

        ir_chunks = [text for _, _, text in sent_rank]
        ir_output = '<del>'.join(ir_chunks) + '</c>'
        f.write("{}\n".format(ir_output))

# Other

In [8]:
# Parse a single entity-anonymized story for manual inspection.
with codecs.open(
    '../../data/anonymized_entities/0025577043f5090cd603c6aea60f26e236195594-clean.content',
    'r',
    encoding='utf-8',
    errors='ignore',
) as f:
    doc = nlp(f.read())

In [46]:
# Append the fifth question (position 4) to the chunk list so it is vectorized
# together with the chunks. Re-running this cell appends another copy.
# NOTE(review): `chunks` must already exist from an earlier cell — confirm which.
chunks.append(questions['question'].iloc[4])

In [47]:
# Fit TF-IDF on the chunks (the appended question is the last row) and densify.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(chunks).todense()
tfidf_matrix.shape

(63, 1493)

In [48]:
from scipy.spatial.distance import cosine

# The last row is the question; every row before it is a chunk.
question_vector = tfidf_matrix[-1, :]
chunk_vectors = tfidf_matrix[0:-1, :]

# (chunk index, cosine similarity to the question) for every chunk.
similarity_rank = [
    (position, 1 - cosine(question_vector, row_vector))
    for position, row_vector in enumerate(chunk_vectors)
]

# Keep the five most similar chunks, then put them back in document order.
similarity_rank = sorted(similarity_rank, key=lambda pair: pair[1], reverse=True)[:5]
similarity_rank = sorted(similarity_rank, key=lambda pair: pair[0])
similarity_rank

[(13, 0.05963392467389739),
 (14, 0.04539162697097321),
 (33, 0.03883041978135293),
 (38, 0.09886007582915535),
 (53, 0.03920798878490961)]

In [49]:
# Text of the selected chunks, in the order given by similarity_rank.
ir_chunks = [chunks[position] for position, _ in similarity_rank]

In [50]:
# Show the question (position 4) that was used for the retrieval above.
questions['question'].iloc[4]

'What does Paige jam into her microwave?'

In [51]:
# Inspect one of the retrieved chunks: index 13 is among the five entries of
# similarity_rank shown above.
print(chunks[13])

me a bit about what you do.

@entity72 - @entity116 run a comprehensive @entity158 values program, erm in which we discuss 
ethical situations, sex education and drug abuse.

Happy @entity158 @entity128 - What do you say to young people who look around at the world 
and see it's become, like you know, a sleazy country, a place you just can't trust. Like 
your school for example. Why is it, it wins all of these awards and students are dropping 
out like flies, why..why is that. Now my listeners are interested in the decision to expel 
@entity145 @entity69.

@entity72 - @entity116, erm, @entity116'm not aware of anything like that, @entity116 don't know what you're talking 
about.

Happy @entity158 @entity128 - That is not true sir. "@entity145 refuses to accept suggestions of a 
more positive mental attitude towards her health and her future. @entity116'm afraid @entity116 find no 
alternative, but to suggest suspension."

@entity72 - Who is this? How 


In [23]:
# Check which type a single-row slice of the dense TF-IDF matrix has.
type(tfidf_matrix[-1, :])

numpy.matrixlib.defmatrix.matrix