In [None]:
!pip install keras

In [2]:
import pickle
import numpy as np

In [3]:
with open('train_qa.txt','rb' ) as f:
    train_data = pickle.load(f)

In [4]:
with open('test_qa.txt','rb' ) as f:
    test_data = pickle.load(f)

In [5]:
type(test_data)

list

In [6]:
type(train_data)

list

In [7]:
len(train_data)

10000

In [8]:
len(test_data)

1000

In [9]:
train_data[0]

(['Mary',
  'moved',
  'to',
  'the',
  'bathroom',
  '.',
  'Sandra',
  'journeyed',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Is', 'Sandra', 'in', 'the', 'hallway', '?'],
 'no')

In [10]:
' '.join(train_data[0][0])

'Mary moved to the bathroom . Sandra journeyed to the bedroom .'

In [11]:
' '.join(train_data[0][1])

'Is Sandra in the hallway ?'

In [12]:
train_data[0][2]

'no'

In [13]:
all_data = test_data + train_data

In [14]:
len(all_data)

11000

In [15]:
set(train_data[0][0])

{'.',
 'Mary',
 'Sandra',
 'bathroom',
 'bedroom',
 'journeyed',
 'moved',
 'the',
 'to'}

In [16]:
vocab = set()
for story, question,answer in all_data:
    vocab = vocab.union(set(story))
    vocab = vocab.union(set(question))

In [17]:
vocab.add('no')

In [18]:
vocab.add('yes')

In [19]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [20]:
vocab_len = len(vocab) + 1

In [21]:
vocab_len

38

In [22]:
all_story_lens = [len(data[0]) for data in all_data]

In [23]:
max_story_len = max(all_story_lens)

In [24]:
max_question_len = max([len(data[1]) for data in all_data])

In [25]:
max_question_len

6

In [26]:
!pip install Keras-Preprocessing



In [27]:
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [28]:
tokenizer = Tokenizer(filters=[])
tokenizer.fit_on_texts(vocab)

In [29]:
tokenizer.word_index

{'grabbed': 1,
 'sandra': 2,
 'is': 3,
 'football': 4,
 'went': 5,
 'back': 6,
 'put': 7,
 'no': 8,
 'bathroom': 9,
 'the': 10,
 'down': 11,
 'took': 12,
 'daniel': 13,
 'got': 14,
 'journeyed': 15,
 'garden': 16,
 '?': 17,
 'hallway': 18,
 'travelled': 19,
 'there': 20,
 'picked': 21,
 'john': 22,
 'kitchen': 23,
 'milk': 24,
 'up': 25,
 'moved': 26,
 'mary': 27,
 'apple': 28,
 'yes': 29,
 'discarded': 30,
 'left': 31,
 'bedroom': 32,
 'dropped': 33,
 'in': 34,
 '.': 35,
 'to': 36,
 'office': 37}

In [30]:
train_story_text = []
train_question_text = []
train_answers = []

In [31]:
for story,question,answer in train_data:
    train_story_text.append(story)
    train_question_text.append(question)
    train_answers.append(answer)

In [32]:
#train_story_text

In [33]:
train_story_seq = tokenizer.texts_to_sequences(train_story_text)

In [34]:
len(train_story_seq)

10000

In [35]:
#train_story_text

In [36]:
#train_story_seq

In [37]:
def vectorize_stories(data, word_index=tokenizer.word_index, max_story_len=max_story_len, max_question_len=max_question_len):
    # stories = X
    X = []
    # questions Xa
    Xq = []
    # Y correct answer(yes/no)
    Y = []

    for story,query,answer in data:
        
      # for each story      
      x = [word_index[word.lower()] for word in story]
      xq = [word_index[word.lower()] for word in query]

      
      y = np.zeros(len(word_index)+1) 

      y[word_index[answer]] = 1

      X.append(x)
      Xq.append(xq)
      Y.append(y)

    return(pad_sequences(X,maxlen=max_story_len),pad_sequences(Xq, maxlen=max_question_len), np.array(Y))
    
    

In [38]:
inputs_train, queries_train, answers_train = vectorize_stories(train_data)

In [39]:
inputs_test, queries_test, answers_test = vectorize_stories(test_data)

In [40]:
inputs_test

array([[ 0,  0,  0, ..., 10, 32, 35],
       [ 0,  0,  0, ..., 10, 16, 35],
       [ 0,  0,  0, ..., 10, 16, 35],
       ...,
       [ 0,  0,  0, ..., 10, 28, 35],
       [ 0,  0,  0, ..., 10, 16, 35],
       [ 0,  0,  0, ..., 28, 20, 35]])

In [41]:
answers_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [42]:
tokenizer.word_index['yes']

29

In [43]:
tokenizer.word_index['no']

8

In [44]:
sum(answers_test)

array([  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0., 503.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0., 497.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.])

In [45]:
from keras.models import Sequential, Model

In [46]:
from keras.layers import Embedding

In [47]:
from keras.layers import Input,Activation,Dense,Permute,Dropout,add,dot,concatenate, LSTM

In [48]:
input_sequence = Input((max_story_len,))
question = Input((max_question_len,))

In [49]:
vocab_size = len(vocab) + 1

In [50]:
# Input Encoder M
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_size, output_dim=64))
input_encoder_m.add(Dropout(0.3))

# (samples, story_maxlen, embedding_dim)

In [51]:
# Input Encoder C
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size, output_dim=max_question_len))
input_encoder_c.add(Dropout(0.3))

# (samples, story_maxlen, embedding_dim)

In [52]:
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size, output_dim=64))
question_encoder.add(Dropout(0.3))

In [53]:
# Encoded <--- Encoder(input)
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

In [54]:
match = dot([input_encoded_m, question_encoded],axes=(2,2))
match = Activation('softmax')(match)

In [55]:
response = add([match, input_encoded_c])
response = Permute((2,1))(response)

In [56]:
answer = concatenate([response, question_encoded])

In [57]:
answer

<KerasTensor shape=(None, 6, 220), dtype=float32, sparse=False, name=keras_tensor_18>

In [58]:
answer = LSTM(32)(answer)

In [59]:
answer = Dropout(0.3)(answer)
answer = Dense(vocab_size)(answer)

In [60]:
answer = Activation('softmax')(answer)

In [61]:
model = Model([input_sequence,question],answer)

In [64]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['a'])

In [66]:
model.summary()

In [None]:
history = model.fit([inputs_train, queries_train],answers_train,batch_size=32,epochs=3,validation_data([inputs_test,queries_test],answers_test))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
print(history.history.keys())
#summarize history for accuracy
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [72]:
model.load_weights('chatbot_10.h5')

In [73]:
pred_results = model.predict(([inputs_test, queries_test]))

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step 


In [74]:
test_data[0][0]

['Mary',
 'got',
 'the',
 'milk',
 'there',
 '.',
 'John',
 'moved',
 'to',
 'the',
 'bedroom',
 '.']

In [78]:
test_data[0][1]

['Is', 'John', 'in', 'the', 'kitchen', '?']

In [79]:
test_data[0][2]

'no'

In [77]:
pred_results[0]

array([4.5721033e-16, 4.9493586e-16, 4.2633275e-16, 4.4071924e-16,
       5.2396627e-16, 4.1640913e-16, 9.9963391e-01, 5.3508792e-16,
       5.6247922e-16, 3.7630333e-16, 4.0649619e-16, 4.7646567e-16,
       5.6136240e-16, 5.0769445e-16, 4.1197247e-16, 4.7709861e-16,
       3.9820829e-16, 4.1817924e-16, 4.2135135e-16, 3.6608562e-04,
       4.3472484e-16, 4.4771081e-16, 4.6097211e-16, 5.0044533e-16,
       3.9895331e-16, 4.2959457e-16, 4.3958757e-16, 4.1003150e-16,
       4.2350263e-16, 4.3441652e-16, 4.4838076e-16, 4.5243217e-16,
       4.8347068e-16, 4.6983451e-16, 4.0570149e-16, 4.7622945e-16,
       4.4257585e-16, 4.7220266e-16], dtype=float32)

In [80]:
val_max = np.argmax(pred_results[0])

In [81]:
for key, val in tokenizer.word_index.items():
    if val == val_max:
        k = key

In [82]:
k

'back'

In [83]:
pred_results[0][val_max]

0.9996339