In [1]:
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, CuDNNLSTM, Permute, Dropout, BatchNormalization, add, dot, concatenate
from keras.utils.data_utils import get_file
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from functools import reduce
import tarfile
import numpy as np
import re

Using TensorFlow backend.


<div class="alert alert-block alert-info">
<b>Preprocessing Functions:</b> In the next few cells we define some basic text-preprocessing functions: tokenizing the text, parsing the stories, and vectorizing them.
</div>

In [0]:
def tokenize(sent):
    """Split a sentence into tokens on single spaces.

    Punctuation that sits directly before a space is returned as its own
    token; punctuation at the very end of the string (with no space after it)
    stays attached to the preceding word. Empty and whitespace-only pieces are
    dropped.

    >>> tokenize('Where is Mary? ')
    ['Where', 'is', 'Mary', '?']
    >>> tokenize('Mary went to the hallway.')
    ['Mary', 'went', 'to', 'the', 'hallway.']
    """
    # Raw string: a plain '\W' literal is an invalid escape sequence.
    # The optional capture group yields None when no punctuation precedes the space.
    pieces = re.split(r'(\W+)? ', sent)
    return [piece.strip() for piece in pieces if piece is not None and piece.strip()]

In [0]:
def parse_stories(lines, only_supporting=False):
    """Parse bAbI-format lines into (substory, question, answer) triples.

    Every line starts with an integer id followed by a space; an id of 1
    starts a new story. A line containing tab characters is a question with
    three tab-separated fields (question text, answer, supporting line ids);
    any other line is a story sentence.

    Args:
        lines: iterable of lines, either ``bytes`` (decoded as UTF-8) or ``str``.
            Blank lines are skipped.
        only_supporting: if True, each substory holds only the sentences whose
            line ids are listed as supporting facts; otherwise it holds every
            sentence seen so far in the current story.

    Returns:
        A list of ``(substory, question_tokens, answer)`` tuples, where
        ``substory`` is a list of tokenized sentences and ``answer`` is the
        raw answer string.
    """
    data = []
    story = []
    for raw_line in lines:
        if isinstance(raw_line, bytes):
            raw_line = raw_line.decode('utf-8')
        line = raw_line.strip()
        if not line:
            continue
        # `line_id` rather than `id`, so the builtin is not shadowed.
        line_id, line = line.split(' ', 1)
        if int(line_id) == 1:
            story = []
        if '\t' in line:
            q, a, supporting = line.split('\t')
            q = tokenize(q)
            if only_supporting:
                # Supporting ids are 1-based line ids within the story.
                substory = [story[int(i) - 1] for i in supporting.split()]
            else:
                # Drop the '' placeholders left by earlier question lines.
                substory = [x for x in story if x]
            data.append((substory, q, a))
            # Placeholder keeps list positions aligned with line ids.
            story.append('')
        else:
            story.append(tokenize(line))
    return data

In [0]:
def get_stories(file, only_supporting=False, max_length=None):
    """Parse stories and flatten each one into a single token list.

    Args:
        file: iterable of lines, passed straight to ``parse_stories``.
        only_supporting: forwarded to ``parse_stories``.
        max_length: if truthy, keep only stories whose flattened token count
            is strictly less than this value.

    Returns:
        A list of ``(story_tokens, question_tokens, answer)`` tuples.
    """
    stories = []
    for story, question, answer in parse_stories(file, only_supporting=only_supporting):
        # Flatten once per story; unlike reduce() without an initial value,
        # this also copes with a story that has no sentences yet.
        flat_story = [token for sentence in story for token in sentence]
        if not max_length or len(flat_story) < max_length:
            stories.append((flat_story, question, answer))
    return stories

In [0]:
def vectorize_stories(data, word_id, story_maxlen, question_maxlen):
    """Turn (story, question, answer) triples into padded id arrays and one-hot targets.

    Args:
        data: iterable of ``(story_tokens, question_tokens, answer)`` triples.
        word_id: mapping from token to integer id.
        story_maxlen: length the story id sequences are padded to.
        question_maxlen: length the question id sequences are padded to.

    Returns:
        A tuple ``(stories, questions, answers)``: the two padded id arrays
        produced by ``pad_sequences`` and an array of one-hot answer vectors
        of length ``len(word_id) + 1``.
    """
    # One extra slot because index 0 is reserved.
    n_classes = len(word_id) + 1
    story_ids, question_ids, targets = [], [], []
    for story_tokens, question_tokens, answer_word in data:
        story_ids.append([word_id[token] for token in story_tokens])
        question_ids.append([word_id[token] for token in question_tokens])
        one_hot = np.zeros(n_classes)
        one_hot[word_id[answer_word]] = 1
        targets.append(one_hot)
    padded_stories = pad_sequences(story_ids, maxlen=story_maxlen)
    padded_questions = pad_sequences(question_ids, maxlen=question_maxlen)
    return (padded_stories, padded_questions, np.array(targets))

<div class="alert alert-block alert-info">
<b>Data Load:</b> Because this notebook was developed in Colab, the archive could be fetched and opened directly from the URL given below; otherwise I would have had to download it to my local machine and extract it from there.
</div>

In [6]:
# Fetch the bAbI archive from `origin` with Keras' get_file and open it.
# tar_file is left open because a later cell calls tar_file.extractfile on it.
tar_file = tarfile.open(get_file('babi-tasks-v1-2.tar.gz',
                                 origin='https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz'))

Downloading data from https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz


<div class="alert alert-block alert-info">
<b>Data to pick:</b> From the different set of Stories, Questions and Answers, we select the ones we will be using for training and prediction 
</div>

In [0]:
# Path templates inside the archive; '{}' is filled with 'train' or 'test'.
challenges = {
    'single_supporting_fact_10k': 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt', # QA1 with 10,000 samples
    'two_supporting_facts_10k': 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt', # QA2 with 10,000 samples
    'list_sets':'tasks_1-20_v1-2/en-10k/qa8_lists-sets_{}.txt' # QA8 (lists/sets) from the en-10k folder
}
# Task used for the rest of the notebook
challenge_type = 'list_sets'
challenge = challenges[challenge_type]

<div class="alert alert-block alert-info">
<b>Train and test files:</b> We select the train and test files from the loaded files and then find the actual size of the train and test data
</div>

In [0]:
# Open the train/test members of the archive for the selected challenge
train_file    = tar_file.extractfile(challenge.format('train'))
test_file     = tar_file.extractfile(challenge.format('test'))
# Raw lines as read from the archive members (parse_stories decodes them as UTF-8)
vector_train  = train_file.readlines()
vector_test   = test_file.readlines()
# Parsed (story_tokens, question_tokens, answer) triples
train_stories = get_stories(vector_train)
test_stories  = get_stories(vector_test)

In [10]:
# Each sample is one (story, question, answer) triple
print('The number of samples in train stories is :- ',len(train_stories))
print('The number of samples in test stories is  :- ',len(test_stories))

The number of samples in train stories is :-  10000
The number of samples in test stories is  :-  1000


<div class="alert alert-block alert-info">
<b>Setting Up Vocabulary:</b> We set up the vocabulary for the train and test stories
</div>

In [11]:
# Collect every distinct token from stories and questions, plus each answer string
vocab = set()
for story, question, answer in train_stories + test_stories:
    vocab.update(story, question, [answer])
# Sort so that the word -> id assignment is deterministic
vocab = sorted(vocab)
print(vocab)

['?', 'Daniel', 'John', 'Mary', 'Sandra', 'What', 'apple', 'apple,football', 'apple,football,milk', 'apple,milk', 'apple,milk,football', 'apple.', 'back', 'bathroom.', 'bedroom.', 'carrying', 'discarded', 'down', 'dropped', 'football', 'football,apple', 'football,apple,milk', 'football,milk', 'football,milk,apple', 'football.', 'garden.', 'got', 'grabbed', 'hallway.', 'is', 'journeyed', 'kitchen.', 'left', 'milk', 'milk,apple', 'milk,apple,football', 'milk,football', 'milk,football,apple', 'milk.', 'moved', 'nothing', 'office.', 'picked', 'put', 'the', 'there.', 'to', 'took', 'travelled', 'up', 'went']


In [21]:
# Word ids start at 1, so one extra slot is needed for index 0, which is left
# for the padding added by pad_sequences. NOTE(review): no Embedding layer in
# this notebook sets mask_zero, so index 0 is padded but not masked.
vocab_size = len(vocab) + 1
vocab_size

52

In [0]:
# Longest story / question (in tokens) across both splits; used as padding lengths
story_maxlen = max(len(story_tokens) for story_tokens, _, _ in train_stories + test_stories)
question_maxlen = max(len(question_tokens) for _, question_tokens, _ in train_stories + test_stories)

In [13]:
# Lengths are counted in tokens
print('Story max length:', story_maxlen)
print('Question max length:', question_maxlen)

Story max length: 300
Question max length: 5


In [14]:
# Ids start at 1; index 0 is left unused by the vocabulary
word2id = {word: index for index, word in enumerate(vocab, start=1)}
print(word2id)

{'?': 1, 'Daniel': 2, 'John': 3, 'Mary': 4, 'Sandra': 5, 'What': 6, 'apple': 7, 'apple,football': 8, 'apple,football,milk': 9, 'apple,milk': 10, 'apple,milk,football': 11, 'apple.': 12, 'back': 13, 'bathroom.': 14, 'bedroom.': 15, 'carrying': 16, 'discarded': 17, 'down': 18, 'dropped': 19, 'football': 20, 'football,apple': 21, 'football,apple,milk': 22, 'football,milk': 23, 'football,milk,apple': 24, 'football.': 25, 'garden.': 26, 'got': 27, 'grabbed': 28, 'hallway.': 29, 'is': 30, 'journeyed': 31, 'kitchen.': 32, 'left': 33, 'milk': 34, 'milk,apple': 35, 'milk,apple,football': 36, 'milk,football': 37, 'milk,football,apple': 38, 'milk.': 39, 'moved': 40, 'nothing': 41, 'office.': 42, 'picked': 43, 'put': 44, 'the': 45, 'there.': 46, 'to': 47, 'took': 48, 'travelled': 49, 'up': 50, 'went': 51}


In [0]:
# Padded story ids, padded question ids and one-hot answers for the training split
inputs_train, questions_train, answers_train = vectorize_stories(train_stories, word2id, story_maxlen, question_maxlen)

In [0]:
# Same vectorization for the test split, using the same vocabulary and padding lengths
inputs_test, questions_test, answers_test = vectorize_stories(test_stories, word2id, story_maxlen, question_maxlen)

In [17]:
# Expected: (number of samples, story_maxlen)
print('inputs_train shape:', inputs_train.shape)
print('inputs_test shape:', inputs_test.shape)

inputs_train shape: (10000, 300)
inputs_test shape: (1000, 300)


In [18]:
# Expected: (number of samples, question_maxlen)
print('queries_train shape:', questions_train.shape)
print('queries_test shape:', questions_test.shape)

queries_train shape: (10000, 5)
queries_test shape: (1000, 5)


In [19]:
# Expected: (number of samples, len(word2id) + 1), one one-hot row per answer
print('answers_train shape:', answers_train.shape)
print('answers_test shape:', answers_test.shape)

answers_train shape: (10000, 52)
answers_test shape: (1000, 52)


<div class="alert alert-block alert-info">
<b>Memory Module:</b> We create the memory module here.
</div>

In [0]:
# Placeholders for the padded story and question id sequences
story_sequence = Input((story_maxlen,))
question = Input((question_maxlen,))

# Story encoder used for matching against the question
# output: (samples, story_maxlen, 64)
input_encoder_s = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64),
    Dropout(0.3),
])

# Second story encoder; its embedding width is question_maxlen so that its
# output can be added to the match matrix
# output: (samples, story_maxlen, question_maxlen)
input_encoder_q = Sequential([
    Embedding(input_dim=vocab_size, output_dim=question_maxlen),
    Dropout(0.3),
])

# Question encoder
# output: (samples, question_maxlen, 64)
question_encoder = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=question_maxlen),
    Dropout(0.3),
])

<div class="alert alert-block alert-info">
<b>Controller Module:</b> We create the controller module here.
</div>

In [0]:
# Run the story through both story encoders and the question through its encoder
input_encoded_s = input_encoder_s(story_sequence)
input_encoded_q = input_encoder_q(story_sequence)
question_encoded = question_encoder(question)

# Score every story position against every question position by taking the
# dot product over the 64-dim embedding axis, then normalize with softmax
# shape: `(samples, story_maxlen, question_maxlen)`
match = dot([input_encoded_s, question_encoded], axes=(2, 2))
match = Activation('softmax')(match)

# Add the match matrix to the second story encoding (same shape by construction)
response = add([match, input_encoded_q])  # (samples, story_maxlen, question_maxlen)
response = Permute((2, 1))(response)  # (samples, question_maxlen, story_maxlen)

# Concatenate the response with the question encoding so that each question
# position carries both the response and the question embedding
answer = concatenate([response, question_encoded])

# NOTE(review): CuDNNLSTM is the cuDNN-backed LSTM, so this cell presumably needs a GPU runtime
answer = CuDNNLSTM(32)(answer)  # (samples, 32)
answer = Dropout(0.3)(answer)
answer = BatchNormalization()(answer)

# Probability distribution over the vocabulary (index 0 included)
output = Dense(vocab_size)(answer)  # (samples, vocab_size)
output = Activation('softmax')(output)

In [0]:
# Two inputs (story ids, question ids) -> softmax over the vocabulary
model = Model([story_sequence, question], output)

In [26]:
# Print the layer-by-layer architecture and parameter counts
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 300)          0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 5)            0                                            
__________________________________________________________________________________________________
sequential_5 (Sequential)       multiple             3328        input_5[0][0]                    
__________________________________________________________________________________________________
sequential_7 (Sequential)       (None, 5, 64)        3328        input_6[0][0]                    
____________________________________________________________________________________________

In [27]:
# Targets are one-hot vectors, hence categorical cross-entropy; Adam is given 0.005 as its learning rate
model.compile(optimizer=Adam(0.005), loss='categorical_crossentropy', metrics=['accuracy'])





<div class="alert alert-block alert-info">
<b>Training the model:</b> We train the model with the hyperparameters given below. I have not tuned the hyperparameters because of limited system memory.
</div>

In [28]:
%%time
# NOTE(review): the test split is passed as validation data, so the reported
# validation metrics are test-set metrics; there is no separate held-out set.
model.fit([inputs_train, questions_train], answers_train,
          batch_size=128,
          epochs=120,
          validation_data=([inputs_test, questions_test], answers_test))

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 10000 samples, validate on 1000 samples
Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120
Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120
Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67

<keras.callbacks.History at 0x7f90fdbfd4a8>

In [0]:
# Persist the trained model one directory above the notebook's working directory
model.save('../chatbot_model.h5')

In [0]:
# Reload the saved weights, then predict a vocabulary distribution for every test sample
model.load_weights('../chatbot_model.h5')
pred = model.predict([inputs_test, questions_test])

<div class="alert alert-block alert-info">
<b>Answering a question:</b> We now use the trained model to answer a few questions
</div>

In [0]:
# Pick a random test example. The upper bound (exclusive) is the actual
# test-set size rather than a hard-coded 1000, so the index stays valid
# if a different challenge or a max_length filter changes the sample count.
n = np.random.randint(0, len(test_stories))

In [33]:
# Tokens of the selected story, joined back into readable text
story_list = test_stories[n][0]
story = ' '.join(story_list)
print("Story is:", story)

Story is: Daniel went to the hallway. Daniel journeyed to the kitchen. Mary went back to the kitchen. Mary took the football there.


In [34]:
# Tokens of the selected question, joined back into readable text.
# The text gets its own name: `question` is the Keras Input tensor the model
# was built from, and rebinding it to a string would break re-running the
# model-building cells.
question_list = test_stories[n][1]
question_text = ' '.join(question_list)
print("Question is: ", question_text)

Question is:  What is Mary carrying ?


In [35]:
# Ground-truth answer string for the selected test example
answer = test_stories[n][2]
print("Actual answer is: ", answer)

Actual answer is:  football


In [0]:
# Index of the highest-probability class for example n (an id in word2id's numbering)
max_value = np.argmax(pred[n])

In [37]:
# Map the predicted class index back to its vocabulary word. Index 0 has no
# word in word2id, so fall back to a placeholder instead of leaving `k`
# undefined (or stale from an earlier run of this cell).
k = next((key for key, val in word2id.items() if val == max_value), '<padding>')

print("Machine answer is: ", k)
print("Machine says: I am ", pred[n][max_value], "certain of it")

Machine answer is:  football
Machine says: I am  0.99996567 certain of it
