## Q & A bot with DeepLearning(LSTM)

In [1]:
import pickle
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
with open('train_qa.txt','rb') as f:
    train_data = pickle.load(f)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
with open('test_qa.txt','rb') as f:
    test_data = pickle.load(f)

#### Setting Up Vocabulary of Words

In [3]:
vocab = set()
all_data = train_data + test_data
for story,question,answer in all_data:
    vocab = vocab.union(set(story))
    vocab = vocab.union(set(question))
vocab.add('yes')
vocab.add('no')

#### Vectorising the Data

In [4]:
#adding 1 for padding sequences
vocab_len = len(vocab) + 1

In [5]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
# integer encode sequences of words
tokenizer = Tokenizer(filters=[])
tokenizer.fit_on_texts(vocab)
#finding length of longest story an longest questions
max_story_len = max([len(data[0]) for data in all_data])
max_question_len = max([len(data[1]) for data in all_data]) 
tokeniser = Tokenizer(filters =[])
tokeniser.fit_on_texts(vocab)
train_story_text =[] 
train_question_text = []
train_answers = []
for story,question,answer in train_data:
    train_story_text.append(story)
    train_question_text.append(question)
    train_answers.append(answer)
train_story_seq = tokeniser.texts_to_sequences(train_story_text)

#### Functionalise Vectorisation

In [6]:
def vectorise_stories(data,word_index = tokeniser.word_index,max_story_len = max_story_len,max_question_len = max_question_len):
        #Stories = X
        X=[]
        #Questions = Xq
        Xq = []
        #Y coorect answer (yes/no)
        Y=[]
        for story,question,answer in data:
            x = [word_index[word.lower()] for word in story]
            xq = [word_index[word.lower()] for word in question]
            y= np.zeros(len(word_index)+1)
            y[word_index[answer]] = 1
            X.append(x)
            Xq.append(xq)
            Y.append(y)   
        return (pad_sequences(X,maxlen=max_story_len),pad_sequences(Xq,maxlen=max_question_len),np.array(Y))
            

inputs_train,queries_train,answers_train =vectorise_stories(train_data)
inputs_test, queries_test, answers_test = vectorise_stories(test_data)

In [7]:
tokenizer.word_index['yes']

2

In [8]:
tokenizer.word_index['no']

9

## Creating the Model:

In [9]:
from keras.models import Sequential,Model
from keras.layers.embeddings import Embedding
from keras.layers import Input,Activation,Dense,Permute,Dropout,add,dot,concatenate,LSTM

### Building Networks:

In [10]:
#Placeholder shape = (max_story_len,batch_size)
Input_sequence = Input((max_story_len,))
question = Input((max_question_len,))

#### Input Encoder M:

In [11]:
#input encoder M:
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim = vocab_len,output_dim = 64))
input_encoder_m.add(Dropout(0.3))

#### Input Encoder C:

In [12]:
#input encoder C:
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim = vocab_len,output_dim = max_question_len))
input_encoder_c.add(Dropout(0.3))

#### Question Encoder:

In [13]:
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim = vocab_len,output_dim = 64,input_length = max_question_len))
question_encoder.add(Dropout(0.3))

#### Encode the Sequences

In [14]:

#Encoded <--- Encoder(input)
input_encoded_m = input_encoder_m(Input_sequence)
input_encoded_c = input_encoder_c(Input_sequence)
question_encoded = question_encoder(question)

#### Use dot product to compute the match between first input vector seq and the query

In [15]:
match = dot([input_encoded_m,question_encoded],axes=(2,2))
match = Activation('softmax')(match)

#### Adding this match matrix with the second input vector sequence


In [16]:
#implementing multiple layer
response = add([match,input_encoded_c])
response = Permute((2,1))(response)

#### Concatenate:

In [17]:

answer = concatenate([response,question_encoded])
answer = LSTM(32)(answer)
answer = Dropout(0.5)(answer)
answer = Dense(vocab_len)(answer)
answer = Activation('softmax')(answer)
model = Model([Input_sequence,question],answer)
model.compile(optimizer='rmsprop',loss = 'categorical_crossentropy',metrics = ['accuracy'])
model.summary()




Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 156)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 6)            0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       multiple             2432        input_1[0][0]                    
__________________________________________________________________________________________________
sequential_3 (Sequential)       (None, 6, 64)        2432        input_2[0][0]                    
____________________________________________________________________________________________

#### Training:

In [None]:
#smaller batch size with larger epoch leads to good reult
history = model.fit([inputs_train,queries_train],answers_train,batch_size = 32,epochs = 100,validation_data=([inputs_test,queries_test],answers_test))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
print(history.history.keys())
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

#### generating new sequences an testing:

In [None]:
# Note the whitespace of the periods
my_story = "John left the kitchen . Sandra dropped the football in the garden ."
my_story.split()
my_question = "Is the football in the garden ?"
my_question.split()
mydata = [(my_story.split(),my_question.split(),'yes')]
my_story,my_ques,my_ans = vectorise_stories(mydata)
pred_results = model.predict(([ my_story, my_ques]))
#Generate prediction from model
val_max = np.argmax(pred_results[0])

for key, val in tokenizer.word_index.items():
    if val == val_max:
        k = key

print("Predicted answer is: ", k)
print("Probability of certainty was: ", pred_results[0][val_max])