In [45]:
'''Trains a memory network on the bAbI dataset.

References:
- Jason Weston, Antoine Bordes, Sumit Chopra, Tomas Mikolov, Alexander M. Rush,
  "Towards AI-Complete Question Answering: A Set of Prerequisite Toy Tasks",
  http://arxiv.org/abs/1502.05698

- Sainbayar Sukhbaatar, Arthur Szlam, Jason Weston, Rob Fergus,
  "End-To-End Memory Networks",
  http://arxiv.org/abs/1503.08895

Reaches 98.6% accuracy on task 'single_supporting_fact_10k' after 120 epochs.
Time per epoch: 3s on CPU (core i7).
'''

from __future__ import print_function

import os
import re
from functools import reduce

import numpy as np
from keras.layers import Dense, Concatenate, Dropout, Reshape, Flatten, Input, concatenate
from keras.layers.embeddings import Embedding
from keras.models import Sequential, Model
from keras.preprocessing.sequence import pad_sequences

In [2]:
def tokenize(sent):
    '''Return the tokens of a sentence including punctuation.

    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    return [x.strip() for x in re.split('(\W+)?', sent) if x.strip()]

In [3]:
def parse_stories(lines, only_supporting=False):
    '''Parse stories provided in the bAbi tasks format

    If only_supporting is true, only the sentences that support the answer are kept.
    '''
    data = []
    story = []
    for line in lines:
#         line = line.decode('utf-8').strip()
        line = line.strip()
        nid, line = line.split(' ', 1)
        nid = int(nid)
        if nid == 1:
            story = []
        if '\t' in line:
            q, a, supporting = line.split('\t')
            q = tokenize(q)
            substory = None
            if only_supporting:
                # Only select the related substory
                supporting = map(int, supporting.split())
                substory = [story[i - 1] for i in supporting]
            else:
                # Provide all the substories
                substory = [x for x in story if x]
            data.append((substory, q, a))
            story.append('')
        else:
            sent = tokenize(line)
            story.append(sent)
    return data

In [4]:
def get_stories(f, only_supporting=False, max_length=None):
    '''Given a file name, read the file, retrieve the stories, and then convert the sentences into a single story.

    If max_length is supplied, any stories longer than max_length tokens will be discarded.
    '''
    data = parse_stories(f.readlines(), only_supporting=only_supporting)
    flatten = lambda data: reduce(lambda x, y: x + y, data)
    data = [(flatten(story), q, answer) for story, q, answer in data if
            not max_length or len(flatten(story)) < max_length]
    return data

In [5]:
def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    X = []
    Xq = []
    Y = []
    for story, query, answer in data:
        x = [word_idx[w] for w in story]
        xq = [word_idx[w] for w in query]
        y = np.zeros(len(word_idx) + 1)  # let's not forget that index 0 is reserved
        y[word_idx[answer]] = 1
        X.append(x)
        Xq.append(xq)
        Y.append(y)
    return (pad_sequences(X, maxlen=story_maxlen),
            pad_sequences(Xq, maxlen=query_maxlen), np.array(Y))

In [6]:
challenges = {
    # sample data to understand the code structure
    'single_supporting_fact_sample': './data/sample_{}.txt',
    # QA1 with 10,000 samples
    'single_supporting_fact': './data/qa1_single-supporting-fact_{}.txt',
}
challenge_type = 'single_supporting_fact'
challenge = challenges[challenge_type]

print('Extracting stories for the challenge:', challenge_type)
train_stories = get_stories(open(challenge.format('train')))
test_stories = get_stories(open(challenge.format('test')))

Extracting stories for the challenge: single_supporting_fact


  return _compile(pattern, flags).split(string, maxsplit)


In [7]:
vocab = sorted(
    reduce(lambda x, y: x | y, (set(story + q + [answer]) for story, q, answer in train_stories + test_stories)))
vocab_size = len(vocab) + 1
story_maxlen = max(map(len, (x for x, _, _ in train_stories + test_stories)))
query_maxlen = max(map(len, (x for _, x, _ in train_stories + test_stories)))

print('-')
print('Vocab size:', vocab_size, 'unique words')
print('Story max length:', story_maxlen, 'words')
print('Query max length:', query_maxlen, 'words')
print('Number of training stories:', len(train_stories))
print('Number of test stories:', len(test_stories))
print('-')
print('Here\'s what a "story" tuple looks like (input, query, answer):')
print(train_stories[0])
print('-')
print('Vectorizing the word sequences...')

-
Vocab size: 22 unique words
Story max length: 66 words
Query max length: 4 words
Number of training stories: 1000
Number of test stories: 1000
-
Here's what a "story" tuple looks like (input, query, answer):
(['Mary', 'moved', 'to', 'the', 'bathroom', '.', 'John', 'went', 'to', 'the', 'hallway', '.'], ['Where', 'is', 'Mary', '?'], 'bathroom')
-
Vectorizing the word sequences...


In [8]:
word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
inputs_train, queries_train, answers_train = vectorize_stories(train_stories, word_idx, story_maxlen, query_maxlen)
inputs_test, queries_test, answers_test = vectorize_stories(test_stories, word_idx, story_maxlen, query_maxlen)

print('-')
print('inputs: integer tensor of shape (samples, max_length)')
print('inputs_train shape:', inputs_train.shape)
print('inputs_test shape:', inputs_test.shape)
print('-')
print('queries: integer tensor of shape (samples, max_length)')
print('queries_train shape:', queries_train.shape)
print('queries_test shape:', queries_test.shape)
print('-')
print('answers: binary (1 or 0) tensor of shape (samples, vocab_size)')
print('answers_train shape:', answers_train.shape)
print('answers_test shape:', answers_test.shape)
print('-')
print('Compiling...')

-
inputs: integer tensor of shape (samples, max_length)
inputs_train shape: (1000, 66)
inputs_test shape: (1000, 66)
-
queries: integer tensor of shape (samples, max_length)
queries_train shape: (1000, 4)
queries_test shape: (1000, 4)
-
answers: binary (1 or 0) tensor of shape (samples, vocab_size)
answers_train shape: (1000, 22)
answers_test shape: (1000, 22)
-
Compiling...


In [42]:
input_encoder_c.outputs
# question_encoder.outputs

[<tf.Tensor 'in_dropout_9/cond/Merge:0' shape=(?, 66, 64) dtype=float32>]

In [43]:
type(inputs)

tensorflow.python.framework.ops.Tensor

In [46]:
# input_encoder_c = Sequential()
inputs = Input(shape=(story_maxlen,))
e = Embedding(input_dim=vocab_size, output_dim=64, input_length=story_maxlen)(inputs)
# input_encoder_c.add(Embedding(input_dim=vocab_size, output_dim=64, input_length=story_maxlen))
drop = Dropout(0.3, name="in_dropout")(e)
# input_encoder_c.add(Dropout(0.3))
input_encoder_c = Model(inputs=inputs, outputs=drop)
print(type(input_encoder_c.layers[0]),type(input_encoder_c.layers),type(input_encoder_c))

# question_encoder = Sequential()
inputs = Input(shape=(query_maxlen,))
e = Embedding(input_dim=vocab_size, output_dim=64, input_length=query_maxlen)(inputs)
# question_encoder.add(Embedding(input_dim=vocab_size, output_dim=64, input_length=query_maxlen))
drop = Dropout(0.3, name="ques_dropout")(e)
# question_encoder.add(Dropout(0.3))
question_encoder = Model(inputs=inputs, outputs=drop)

##########################################

# answer.add(Merge([input_encoder_c, question_encoder], mode='concat', concat_axis=1))
inputs = concatenate([input_encoder_c.outputs, question_encoder.outputs])
# inputs = Concatenate(axis=1)([input_encoder_c.outputs, question_encoder.outputs])
# concatenate([input_encoder_c, question_encoder])(answer)
flat = Flatten()(inputs)
# answer.add(Flatten())
# answer.add(Dense(100, activation='relu'))
dense1 = Dense(activation='relu')(flat)
drop1 = answer.Dropout(0.3)(dense1)
# answer.add(Dropout(0.3))
# answer.add(Dense(100, activation='relu'))
dense2 = answer.Dense(activation='relu')(drop1)
drop2 = answer.Dropout(0.3)(dense2)


# answer.add(Dropout(0.3))
# answer.add(Dense(vocab_size, activation='softmax'))
output_nodes = Dense(vocab_size, activation='softmax')(drop2)
answer = Model(inputs=inputs, outputs=output_nodes)

# input_nodes= Input(shape=(X_train.shape[1],))
# e = Embedding(input_dim=5000,
#               output_dim=32,
#               input_length=X_train.shape[1],
#               trainable=True)(input_nodes)
# lstm=LSTM(100)(e)
# output_nodes=Dense(1, activation='sigmoid')(lstm)

# #Build model
# model = Model(inputs=input_nodes, outputs=output_nodes)
# model.compile(optimizer='adam',
#               loss='binary_crossentropy',
#               metrics=['accuracy'])

<class 'keras.engine.input_layer.InputLayer'> <class 'list'> <class 'keras.engine.training.Model'>


ValueError: Layer concatenate_12 was called with an input that isn't a symbolic tensor. Received type: <class 'list'>. Full input: [[<tf.Tensor 'in_dropout_11/cond/Merge:0' shape=(?, 66, 64) dtype=float32>], [<tf.Tensor 'ques_dropout_11/cond/Merge:0' shape=(?, 4, 64) dtype=float32>]]. All inputs to the layer should be tensors.

In [26]:
answer.compile(optimizer='adam', loss='categorical_crossentropy',
               metrics=['accuracy'])

In [27]:
answer.layers

[<keras.layers.merge.Concatenate at 0x1245b2828>,
 <keras.layers.core.Flatten at 0x1245b2e10>,
 <keras.layers.core.Dense at 0x1245b2780>,
 <keras.layers.core.Dropout at 0x124c72f28>,
 <keras.layers.core.Dense at 0x124c72e48>,
 <keras.layers.core.Dropout at 0x1245b2898>,
 <keras.layers.core.Dense at 0x124c97f28>]

In [30]:
answer.fit([inputs_train, queries_train], answers_train,
           batch_size=32,
           nb_epoch=50,
           validation_data=([inputs_test, queries_test], answers_test))

  after removing the cwd from sys.path.


AssertionError: 