In [96]:
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate
from keras.layers import LSTM
from keras.utils.data_utils import get_file
from keras.preprocessing.sequence import pad_sequences
from functools import reduce
from nltk import word_tokenize
import numpy as np
import re
import nltk

In [100]:
def tokenize(sent):
    """Split a sentence into lower-cased tokens.

    The text is lower-cased first and then handed to NLTK's
    ``word_tokenize``; the tokenizer's result is returned unchanged.
    """
    lowered = sent.lower()
    return word_tokenize(lowered)

In [7]:
def remove_newlines(text):
    """Strip one leading and one trailing newline from ``text`` and lower-case it.

    At most a single ``'\\n'`` is removed from each end; newlines inside the
    string, and any further newlines at the edges, are kept.

    Args:
        text: The string to clean. May be empty.

    Returns:
        The cleaned, lower-cased string.
    """
    # startswith/endswith are safe on the empty string, unlike text[0]/text[-1].
    if text.startswith('\n'):
        text = text[1:]
    # Checked independently of the branch above so both ends are handled,
    # and so a lone '\n' collapses to ''.
    if text.endswith('\n'):
        text = text[:-1]
    return text.lower()

def get_cdata(context_file):
    """Read ``context_file`` and return its separator-delimited sections.

    Sections are delimited by a line consisting of exactly 30 dashes
    (a newline, 30 ``-`` characters, a newline). Whatever follows the final
    separator is discarded, so a file that ends with a separator loses only
    an empty tail.

    Args:
        context_file: An open, readable text file object.

    Returns:
        A list of section strings, in file order.
    """
    separator = '\n' + '-' * 30 + '\n'
    sections = re.split(separator, context_file.read())
    # re.split always yields at least one piece, so dropping the last is safe.
    sections.pop()
    return sections

def get_qdata(xfile):
    """Group the lines of ``xfile`` into separator-delimited lists.

    Every line is cleaned with ``remove_newlines``. A cleaned line equal to
    30 dashes closes the current group; all other lines are collected into
    the group. Lines that come after the last separator are not returned.

    Args:
        xfile: An iterable of text lines, e.g. an open file object.

    Returns:
        A list of groups, each group being a list of cleaned line strings.
    """
    delimiter = '-' * 30
    groups = []
    current = []
    for raw_line in xfile:
        cleaned = remove_newlines(raw_line)
        if cleaned == delimiter:
            # Separator reached: finish this group and start a new one.
            groups.append(current)
            current = []
            continue
        current.append(cleaned)
    return groups

In [8]:
# Load the three parallel training files. Each handle is opened in a `with`
# block so it is closed as soon as its contents have been parsed
# (get_cdata reads the whole file; get_qdata iterates over every line).
# The *_file names stay bound afterwards, but to closed file objects.
with open('./data/train_context', 'r') as context_file:
    context_data = get_cdata(context_file)
with open('./data/train_question', 'r') as questions_file:
    questions_data = get_qdata(questions_file)
with open('./data/train_answer', 'r') as answers_file:
    answers_data = get_qdata(answers_file)

In [83]:
# Flatten the parsed files into (context, question, answer) triples.
# questions_data[cid] lists the questions for context number cid, and
# answers_data[cid][qid] is the answer paired with question number qid.
data = [
    (context.strip(), question.strip(), answers_data[cid][qid].strip())
    for cid, context in enumerate(context_data)
    for qid, question in enumerate(questions_data[cid])
]

In [113]:
# Collect every distinct token that occurs in any context, question or answer.
vocab = set()
for story, q, answer in data:
    vocab.update(tokenize(story), tokenize(q), tokenize(answer))
# Sort so that the vocabulary order is deterministic.
vocab = sorted(vocab)
# One more than the number of distinct tokens; word indices start at 1
# (see word_idx), which leaves index 0 unused by any word.
vocab_size = 1 + len(vocab)

In [114]:
# Map each vocabulary word to a positive integer id. Ids start at 1 so that
# 0 never refers to a word (pad_sequences fills with 0 by default).
word_idx = {word: index for index, word in enumerate(vocab, start=1)}

# Sequence lengths must be counted in tokens, not characters: the entries of
# `data` are raw strings, so len() on them would return the character count
# and make pad_sequences pad far beyond the longest real token sequence.
story_maxlen = max(len(tokenize(story)) for story, _, _ in data)
query_maxlen = max(len(tokenize(query)) for _, query, _ in data)

In [115]:
def vectorize_stories(data):
    """Convert (story, query, answer) text triples into word-index arrays.

    Each text is tokenized with ``tokenize`` and every token is looked up in
    the module-level ``word_idx`` mapping. Stories and queries are padded
    with ``pad_sequences`` to ``story_maxlen`` and ``query_maxlen``.

    Args:
        data: An iterable of ``(story, query, answer)`` string triples.

    Returns:
        A 3-tuple ``(inputs, queries, answers)``:
        - ``inputs``: result of ``pad_sequences`` over the story indices.
        - ``queries``: result of ``pad_sequences`` over the query indices.
        - ``answers``: a 1-D NumPy object array with one list of word
          indices per triple (answers are not padded and may differ in
          length).

    Raises:
        KeyError: If a token is not present in ``word_idx``.
    """
    def encode(text):
        # Materialize a real list: a bare generator expression would be
        # stored as a one-shot generator object instead of the indices.
        return [word_idx[w] for w in tokenize(text)]

    inputs, queries, answers = [], [], []
    for story, query, answer in data:
        inputs.append(encode(story))
        queries.append(encode(query))
        answers.append(encode(answer))

    # Fill a 1-D object array element by element so the result has the same
    # shape whether or not all answers happen to have the same length.
    answer_array = np.empty(len(answers), dtype=object)
    for position, indices in enumerate(answers):
        answer_array[position] = indices

    return (pad_sequences(inputs, maxlen=story_maxlen),
            pad_sequences(queries, maxlen=query_maxlen),
            answer_array)

In [116]:
# Encode every triple in `data` as word indices; the first two results are
# padded inside vectorize_stories via pad_sequences.
inputs_train, queries_train, answers_train = vectorize_stories(data)

In [95]:
# Sanity check of how NLTK tokenizes a possessive: the recorded output shows
# "idiot's" being split into 'idiot' and "'s".
import nltk
nltk.word_tokenize('hello idiot\'s noob')

['hello', 'idiot', "'s", 'noob']