# End-to-end Network
1. Input memory representation
2. Output memory representation
3. Generating final prediction

<img src='qa_bot_network.png'>

In [1]:
import pickle
import numpy as np

In [2]:
# Load the training split. Despite the .txt name, the file is opened in binary
# mode and decoded with pickle.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('train_qa.txt','rb') as f:
    train_data = pickle.load(f)

In [3]:
# Load the test split. Despite the .txt name, the file is opened in binary
# mode and decoded with pickle.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('test_qa.txt','rb') as f:
    test_data = pickle.load(f)

In [4]:
# Sanity check: container type and number of examples for the test and train splits
display(type(test_data), len(test_data), type(train_data), len(train_data))

list

1000

list

10000

In [5]:
# The data is a list of 3-tuples: (story tokens, question tokens, answer string)
train_data[0]

(['Mary',
  'moved',
  'to',
  'the',
  'bathroom',
  '.',
  'Sandra',
  'journeyed',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Is', 'Sandra', 'in', 'the', 'hallway', '?'],
 'no')

In [6]:
# Print the first story and its question as plain sentences, then show the answer
print(*(' '.join(tokens) for tokens in train_data[0][:2]), sep='\n')
train_data[0][2]

Mary moved to the bathroom . Sandra journeyed to the bedroom .
Is Sandra in the hallway ?


'no'

In [7]:
# Build the vocabulary: every distinct token that occurs in a story or a
# question of either split, plus the two answer words.
all_data = train_data + test_data
display(len(all_data))

vocab = set()

for story, question, answer in all_data:
    # Update in place instead of building a brand-new set twice per example
    vocab.update(story, question)

# Make sure both answer words are part of the vocabulary
vocab.add('no')
vocab.add('yes')

vocab

11000

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [8]:
# Keras pad_sequences uses 0 as the padding placeholder, so one extra slot is
# reserved on top of the number of distinct words
vocab_len = len(vocab) + 1
vocab_len

38

In [9]:
# Token count of every story, and the longest one (used as the story padding length)
all_story_len = [len(story) for story, _question, _answer in all_data]
max_story_len = max(all_story_len)
# Token count of the longest question (used as the question padding length)
max_question_len = max(len(question) for _story, question, _answer in all_data)
display(max_story_len, max_question_len)

156

6

## Vectorizing the data manually with Keras

In [11]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [12]:
# An empty filter list is passed so that the '.' and '?' tokens are kept
# (both appear in word_index below).
# NOTE(review): vocab is a set, so the order in which words reach fit_on_texts -
# and therefore the index each word gets - is presumably not stable across
# kernel restarts. Confirm before reusing saved indices or pre-trained weights.
tokenizer = Tokenizer(filters=[])
tokenizer.fit_on_texts(vocab)
tokenizer.word_index

{'daniel': 1,
 'to': 2,
 'the': 3,
 'put': 4,
 'picked': 5,
 'discarded': 6,
 'hallway': 7,
 'garden': 8,
 'journeyed': 9,
 'john': 10,
 'apple': 11,
 'bathroom': 12,
 'there': 13,
 'back': 14,
 'yes': 15,
 'office': 16,
 'down': 17,
 'is': 18,
 'kitchen': 19,
 'sandra': 20,
 'in': 21,
 'football': 22,
 'travelled': 23,
 'bedroom': 24,
 'moved': 25,
 'no': 26,
 'mary': 27,
 'went': 28,
 'up': 29,
 'grabbed': 30,
 'dropped': 31,
 'left': 32,
 'took': 33,
 'got': 34,
 'milk': 35,
 '?': 36,
 '.': 37}

In [13]:
# Split the training triples into three parallel lists: stories, questions, answers
train_story_text = []
train_question_text = []
train_answers = []

for s, q, a in train_data:
    train_story_text.append(s)
    train_question_text.append(q)
    # Collect the answers as well; previously train_answers was left empty
    train_answers.append(a)

In [14]:
# Map every tokenised training story to its sequence of word indices
train_story_seq = tokenizer.texts_to_sequences(train_story_text)
display(len(train_story_seq))

# Show the first two encoded stories followed by their original tokens
display(train_story_seq[:2], train_story_text[:2])

10000

[[27, 25, 2, 3, 12, 37, 20, 9, 2, 3, 24, 37],
 [27,
  25,
  2,
  3,
  12,
  37,
  20,
  9,
  2,
  3,
  24,
  37,
  27,
  28,
  14,
  2,
  3,
  24,
  37,
  1,
  28,
  14,
  2,
  3,
  7,
  37]]

[['Mary',
  'moved',
  'to',
  'the',
  'bathroom',
  '.',
  'Sandra',
  'journeyed',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Mary',
  'moved',
  'to',
  'the',
  'bathroom',
  '.',
  'Sandra',
  'journeyed',
  'to',
  'the',
  'bedroom',
  '.',
  'Mary',
  'went',
  'back',
  'to',
  'the',
  'bedroom',
  '.',
  'Daniel',
  'went',
  'back',
  'to',
  'the',
  'hallway',
  '.']]

In [15]:
def vectorize_stories(data,word_index=tokenizer.word_index, max_story_len=max_story_len, max_question_len=max_question_len):
    '''
    Convert (story, question, answer) triples into padded index arrays.

    INPUT:

    data: iterable of (story, question, answer) triples; story and question are
          lists of word tokens, answer is a single word
    word_index: word -> integer index dictionary from the tokenizer
                (words are looked up in lower case)
    max_story_len: length every story is padded to (passed to pad_sequences)
    max_question_len: length every question is padded to (passed to pad_sequences)

    OUTPUT:

    Tuple (X, Xq, Y):
      X:  padded story index sequences, one row per example
      Xq: padded question index sequences, one row per example
      Y:  one-hot encoded answers, one row per example, each with
          len(word_index) + 1 entries (the extra entry leaves room for padding index 0)

    RAISES:

    KeyError if a story, question or answer word is not a key of word_index.
    '''
    # One extra slot so that index 0 stays free for padding
    num_classes = len(word_index) + 1

    X = []   # stories as lists of word indices
    Xq = []  # questions as lists of word indices
    Y = []   # one-hot encoded answers (yes/no)

    for s, q, a in data:
        # Replace every story word by its index
        X.append([word_index[word.lower()] for word in s])
        # Replace every question word by its index
        Xq.append([word_index[word.lower()] for word in q])

        # Lower-case the answer exactly like the story/question tokens, so an
        # answer such as 'Yes' resolves to the same key instead of raising KeyError
        y = np.zeros(num_classes)
        y[word_index[a.lower()]] = 1
        Y.append(y)

    # Pad the sequences to their max length so the RNN is trained on uniformly long inputs
    return (pad_sequences(X,maxlen=max_story_len), pad_sequences(Xq,maxlen=max_question_len), np.array(Y))


In [16]:
# Encode both splits, relying on the function's default word index and padding lengths
s_train, q_train, a_train = vectorize_stories(data=train_data)
s_test, q_test, a_test = vectorize_stories(data=test_data)

In [17]:
# Inspect the padded training stories and the one-hot training answers
display(s_train, a_train)

array([[ 0,  0,  0, ...,  3, 24, 37],
       [ 0,  0,  0, ...,  3,  7, 37],
       [ 0,  0,  0, ...,  3, 12, 37],
       ...,
       [ 0,  0,  0, ...,  3, 24, 37],
       [ 0,  0,  0, ..., 35, 13, 37],
       [ 0,  0,  0, ..., 11, 13, 37]], dtype=int32)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
# Look up which word indices (one-hot columns) belong to the two answers
display(tokenizer.word_index['yes'], tokenizer.word_index['no'])

15

26

In [19]:
# Class balance check: summing the one-hot rows counts the answers per column;
# the only non-zero columns are the word indices of 'yes' and 'no'.
# Original note: train has 5012 yes-es, 4988 no-es. The line below counts the TEST split.
a_test.sum(axis=0)

array([  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0., 497.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0., 503.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.])

## Creating the model with Keras

In [20]:
# Read the paper to understand the network and the encoders: https://arxiv.org/pdf/1503.08895.pdf

In [21]:
from keras.models import Sequential,Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate, LSTM

In [22]:
# Have two inputs: story and a question
# Need to be linked together for the answer yes/no

In [23]:
# Symbolic placeholders for the two model inputs. The shape tuple describes one
# sample only; the batch dimension is left unspecified, so the resulting tensors
# are (batch_size, max_story_len) and (batch_size, max_question_len).
story_input = Input(shape=(max_story_len,))
question_input = Input(shape=(max_question_len,))

In [24]:
# Create the three embedding encoders

# Vocabulary size - same value as vocab_len from earlier (one extra slot for padding index 0)
vocab_size = len(vocab) + 1

# Input Encoder M: embeds every story word into 64 dimensions
# Expected output: (samples, story_maxlen, embedding_dim)
input_encoder_m = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64),
    Dropout(rate=0.3),
])

# Input Encoder C: embeds every story word into max_question_len dimensions
# Expected output: (samples, story_maxlen, max_question_len)
input_encoder_c = Sequential([
    Embedding(input_dim=vocab_size, output_dim=max_question_len),
    Dropout(rate=0.3),
])

# Question Encoder: same embedding size (64) as Encoder M; input length fixed
# to the padded question length
# Expected output: (samples, question_maxlen, embedding_dim)
question_encoder = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_question_len),
    Dropout(rate=0.3),
])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [25]:
# Now, pass in the inputs into the encoders
# Encoded <-- Encoder(Input)
# Both story encoders read the same story_input; the question encoder reads question_input
input_encoded_m = input_encoder_m(story_input)
input_encoded_c = input_encoder_c(story_input)
question_encoded = question_encoder(question_input)

# Use the dot product to compute the match between the first input vector sequence and the question
# The product is taken over axis 2 of both tensors; the result should be
# (samples, story_maxlen, question_maxlen) - TODO confirm against model.summary()
match = dot([input_encoded_m,question_encoded], axes=(2,2))
match = Activation('softmax')(match)

# Add the match matrix with the second input matrix
# (Encoder C embeds into max_question_len dimensions so its output can be added to match)
response = add([match, input_encoded_c])
# Swap the two non-batch axes of the response
response = Permute((2,1))(response)

In [26]:
# Join the permuted response with the encoded question
# NOTE(review): no axis is passed, so this relies on concatenate's default axis - confirm
answer = concatenate([response,question_encoded])
# Now, we will reduce our answer tensor with an RNN
answer = LSTM(32)(answer)
answer = Dropout(rate=0.5)(answer)
# One output unit per vocabulary index (vocab_size includes the padding slot)
answer = Dense(vocab_size)(answer)
# Output: (samples, vocab_size) # Yes/No
# Softmax over the vocabulary; the training targets are one-hot vectors for 'yes' or 'no'
answer = Activation('softmax')(answer)

In [27]:
# Assemble the two-input model and configure it for one-hot (categorical) targets
model = Model(inputs=[story_input, question_input], outputs=answer)
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 156)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 6)            0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       multiple             2432        input_1[0][0]                    
__________________________________________________________________________________________________
sequential_3 (Sequential)       (None, 6, 64)        2432        input_2[0][0]                    
__________________________________________________________________________________________________
dot_1 (Dot

## Fitting and training the network
* Training takes a long time, so we will use transfer learning instead.

In [None]:
# Train on the vectorized training split; the returned History object is kept for later inspection
history = model.fit(
    x=[s_train, q_train],
    y=a_train,
    batch_size=32,
    epochs=3,
)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/3
