## Import important libraries

In [1]:
import tensorflow as tf
import keras
from keras.models import Sequential, Model
from keras.layers import Bidirectional, LSTM, Dense, TimeDistributed, Embedding, RepeatVector, Input
from keras.preprocessing.sequence import pad_sequences
import re
import numpy as np
import pandas as pd
import time
from keras.layers import Layer
from keras import utils
from sklearn.preprocessing import OneHotEncoder
from keras.optimizers import RMSprop, adam, Adam 

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


## Import dataset

In [2]:
# Read the whole conversation corpus into memory as one string.
with open('intro_data.txt') as corpus_file:
    files = corpus_file.read()

In [3]:
# Break the corpus into lines; the next cell splits each line on '##'
# into a question part and a reply part.
con = files.split('\n')
# Display the number of lines read.
len(con)

764

## Preprocessing

Store the questions and the replies in separate lists.

In [4]:
# Separate every "question##reply" line into its two halves.
# Lines that do not split into exactly two parts on '##' are skipped.
ask, reply = [], []
for raw_line in con:
    parts = raw_line.split('##')
    if len(parts) != 2:
        continue
    question_part, reply_part = parts
    ask.append(question_part)
    reply.append(reply_part)

In [5]:
# Number of questions extracted from the corpus.
len(ask)

763

In [6]:
# Number of replies extracted; should equal the number of questions.
len(reply)

763

Now clean the text: lower-case it, expand contractions, and remove unnecessary punctuation so it is easier to tokenize.

In [7]:
# Ordered (pattern, replacement) pairs applied by clean_text().
# All patterns use the plain ASCII apostrophe; clean_text() normalises
# typographic apostrophes before applying them.
_CLEAN_TEXT_RULES = (
    (r"i'm", "i am"),
    # "she's" is listed before "he's" so the more specific form is handled first.
    (r"she's", "she is"),
    (r"he's", "he is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"why's", "why is"),
    (r"where's", "where is"),
    (r"'ll", " will"),
    (r"'d", " would"),
    (r"'ve", " have"),
    (r"'re", " are"),
    (r"haven't", "have not"),
    (r"can't", "cannot"),
    (r"won't", "will not"),
    (r"aren't", "are not"),
    (r"isn't", "is not"),
    (r"doesn't", "does not"),
    (r"don't", "do not"),
    (r"didn't", "did not"),
    (r"shouldn't", "should not"),
    (r"wouldn't", "would not"),
    (r"couldn't", "could not"),
    (r"it's", "it is"),
    (r"wasn't", "was not"),
    (r"who's", "who is"),
    # These two only match when surrounded by spaces (not at the very
    # start or end of the text), exactly as before.
    (r" ok ", " okay "),
    (r" thankyou ", " thank you "),
)


def clean_text(text):
    """Lower-case a sentence, expand common contractions and strip punctuation.

    Args:
        text: Raw sentence (question or reply) as a string.

    Returns:
        The cleaned string. Punctuation characters are deleted outright,
        not replaced by a space, and '!' and apostrophes are not in the
        removed set.
    """
    text = text.lower()
    # Map typographic apostrophes to the ASCII one so every contraction
    # rule below applies no matter how the quote was typed. Previously only
    # the curly form of "she's" was matched, and it expanded to "she will".
    text = text.replace("\u2019", "'").replace("\u2018", "'")

    for pattern, replacement in _CLEAN_TEXT_RULES:
        text = re.sub(pattern, replacement, text)

    text = re.sub(r"[-()\"#/@;:<>{}+=~|.?,]", "", text)
    return text

In [8]:
# Run every question through clean_text().
clean_ask = [clean_text(question) for question in ask]

In [9]:
# Run every reply through clean_text().
clean_reply = [clean_text(answer) for answer in reply]

In [10]:
# Preview the first ten cleaned question/answer pairs.
for pair_idx in range(10):
    print(f"question {clean_ask[pair_idx]}")
    print(f"answer {clean_reply[pair_idx]}")


question hi
answer hello
question hi
answer hello
question hi there
answer hellothere
question hello
answer hi there
question hi how is it going
answer hello fine
question hi how are you
answer hello i am fine and how are you
question hi nice to meet you
answer hello nice to meet you too
question nice to meet you too
answer it is a pleasure to meet you
question how are you
answer i am good and you
question how are you doing
answer i am doing well and you


In [11]:
# Sanity check: cleaning must not change the number of questions or replies.
print(len(clean_ask))
print(len(clean_reply))

763
763


- Add `<SOS>` and `<EOS>` markers to every reply.
- Create a tokenizer and fit it on all questions and tagged replies.


In [12]:
# Wrap every cleaned reply in start/end-of-sequence markers.
reply_tag = ['<SOS> ' + sentence + ' <EOS>' for sentence in clean_reply]

In [13]:
# Questions get no markers; take a copy so ask_tag is a separate list from clean_ask.
ask_tag = clean_ask.copy()

In [14]:
# Interleave questions and tagged replies into one list of texts
# (question, reply, question, reply, ...) for fitting the tokenizer.
all_vocabs = []
for question_text, reply_text in zip(ask_tag, reply_tag):
    all_vocabs.extend((question_text, reply_text))
all_vocabs[:5]

['hi', '<SOS> hello <EOS>', 'hi', '<SOS> hello <EOS>', 'hi there']

In [15]:
# Only '!' is filtered out, so the angle brackets of the <SOS>/<EOS> markers are
# kept; the markers are later looked up in word_index as '<sos>' and '<eos>'.
tokenizer = keras.preprocessing.text.Tokenizer(filters='!')
tokenizer.fit_on_texts(all_vocabs)
# +1 leaves room for index 0, which the padded sequences use as filler
# and the Embedding layers mask out (mask_zero=True).
vocab_size = len(tokenizer.word_index)+1
print(vocab_size)

889


Prepare the encoder input, decoder input, and decoder output (target) arrays.

In [16]:
# encoder input data
# Encoder input: questions as integer ids, padded at the end to the longest question.
tokenized_ask = tokenizer.texts_to_sequences(ask_tag)
maxlen_ask = max(map(len, tokenized_ask))
padded_ask = pad_sequences(tokenized_ask, maxlen=maxlen_ask, padding='post')
encoded_input_data = np.array(padded_ask)
print(encoded_input_data.shape, maxlen_ask)

(763, 11) 11


In [17]:
# decoder input data
# Decoder input: tagged replies (<SOS> ... <EOS>) as integer ids,
# padded at the end to the longest reply.
tokenized_reply = tokenizer.texts_to_sequences(reply_tag)
maxlen_reply = max(map(len, tokenized_reply))
padded_reply = pad_sequences(tokenized_reply, maxlen=maxlen_reply, padding='post')
decoded_input_data = np.array(padded_reply)
print(decoded_input_data.shape, maxlen_reply)


(763, 19) 19


In [18]:
# decoder output data
# Decoder output (training target): the reply tokens with the leading <SOS>
# dropped, so position t of the target is the token that follows position t
# of the decoder input.
tokenized_reply_output = [
    token_ids[1:] for token_ids in tokenizer.texts_to_sequences(reply_tag)
]
pad_decoded_output = pad_sequences(
    tokenized_reply_output, maxlen=maxlen_reply, padding='post'
)
# One-hot encode each target token over the vocabulary for categorical cross-entropy.
one_hot = utils.to_categorical(pad_decoded_output, vocab_size)
decoded_output_data = np.array(one_hot)
print(decoded_output_data.shape)



(763, 19, 889)


## creating SEQ2SEQ Model


<img src="seq2seq.png" alt="Seq2seq encoder-decoder architecture">


- The encoder model takes the question as its input.
- The encoder's final state vectors are the initial states of the decoder model.
- The decoder model takes the encoder states and the start token as its input.

In [19]:
# encoder cell
# Encoder: variable-length sequence of token ids -> embedding -> LSTM.
encoder_input = Input(shape=(None,))
# mask_zero=True makes the padding id 0 be treated as "no token".
encoder_embedding = Embedding(vocab_size, 200, mask_zero=True)
encoder_embedded = encoder_embedding(encoder_input)
encoder_lstm = LSTM(200, return_state=True)
encoder_output, state_h, state_c = encoder_lstm(encoder_embedded)
# Only the final hidden and cell states are handed on to the decoder.
encoder_states = [state_h, state_c]

Instructions for updating:
Colocations handled automatically by placer.


In [20]:
# decoder cell
# Decoder: reply token ids -> embedding -> LSTM seeded with the encoder
# states -> softmax over the vocabulary at every time step.
decoder_input = Input(shape=(None,))
decoder_embedding = Embedding(vocab_size, 200, mask_zero=True)
decoder_embedded = decoder_embedding(decoder_input)
decoder_lstm = LSTM(200, return_state=True, return_sequences=True)
# Training only needs the per-step outputs; the returned states are
# used later by the inference model.
decoder_output = decoder_lstm(decoder_embedded, initial_state=encoder_states)[0]
decoder_dense = Dense(vocab_size, activation='softmax')
output = decoder_dense(decoder_output)

In [21]:
# Training model: (question ids, reply ids) -> next-token distribution
# at every reply position.
model = Model(inputs=[encoder_input, decoder_input], outputs=output)
model.compile(
    optimizer=RMSprop(),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [22]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 200)    177800      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 200)    177800      input_2[0][0]                    
____________________________________________________________________________________________

Start Training

In [23]:
# Train with teacher forcing: the decoder is fed the true reply tokens and
# learns to predict the same reply shifted one step ahead.
model.fit(
    x=[encoded_input_data, decoded_input_data],
    y=decoded_output_data,
    batch_size=10,
    epochs=70,
)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70


<keras.callbacks.callbacks.History at 0x25f7e34fe10>

In [24]:
# Save the trained model to 'model.h5' in the working directory.
model.save('model.h5')

Prepare the inference models: a stand-alone encoder and a decoder that runs one step at a time.

In [25]:
def make_inference():
    """Build the encoder and decoder models used for step-by-step decoding.

    Both models reuse the tensors and trained layers defined in the earlier
    cells (encoder_input, encoder_states, decoder_input, decoder_embedded,
    decoder_lstm, decoder_dense), so they share weights with `model`.

    Returns:
        (encoder_model, decoder_model) where
        - encoder_model maps question token ids to the LSTM states [h, c];
        - decoder_model maps [token ids, h, c] to
          [token probabilities, new h, new c].
    """
    # Encoder: question -> final LSTM states.
    encoder_model = keras.models.Model(encoder_input, encoder_states)

    # Inputs through which the caller feeds the states of the previous step.
    # The size 200 must match the number of units of decoder_lstm.
    state_h_in = Input(shape=(200,))
    state_c_in = Input(shape=(200,))
    states_in = [state_h_in, state_c_in]

    # Re-run the trained decoder LSTM, this time seeded with the fed-in
    # states, and expose the updated states as outputs.
    lstm_sequence, state_h_out, state_c_out = decoder_lstm(
        decoder_embedded, initial_state=states_in
    )
    token_probs = decoder_dense(lstm_sequence)

    decoder_model = keras.models.Model(
        [decoder_input] + states_in,
        [token_probs, state_h_out, state_c_out],
    )

    return encoder_model, decoder_model

Prepare the chatbot: convert the user's sentence to tokens and decode a reply one word at a time.

In [26]:
def str_to_token(sentence: str):
    """Convert a sentence into a padded row of token ids for the encoder.

    The sentence is lower-cased and split on whitespace. Words that are not
    in the tokenizer vocabulary are skipped instead of raising KeyError, so
    arbitrary user input can be encoded.

    Args:
        sentence: The question text.

    Returns:
        The result of pad_sequences for a single sequence, padded at the end
        to `maxlen_ask` entries.
    """
    token_list = []
    for word in sentence.lower().split():
        token_id = tokenizer.word_index.get(word)
        if token_id is not None:
            token_list.append(token_id)
    return keras.preprocessing.sequence.pad_sequences([token_list], maxlen=maxlen_ask, padding='post')


In [27]:

enc_model, dec_model = make_inference()

# Reverse vocabulary (id -> word), built once instead of scanning
# tokenizer.word_index on every decoding step.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

for _turn in range(1):
    input_user = input('you: ')

    # Normalise the input the same way the training questions were:
    # clean_text() lower-cases, expands contractions and strips punctuation;
    # '!' is handled here because the tokenizer filtered it out when fitted.
    question_words = []
    for word in clean_text(input_user).replace('!', ' ').split():
        if word in tokenizer.word_index:
            question_words.append(word)
        elif 'ukn' in tokenizer.word_index:
            # Keep the original unknown-word placeholder when the vocabulary has it.
            question_words.append('ukn')
        # Otherwise drop the word: it cannot be mapped to any token id.
    # A separate name is used for the question text so the training list
    # `ask` from the preprocessing cells is not overwritten.
    question_text = ' '.join(question_words)

    state_value = enc_model.predict(str_to_token(question_text))

    # Start decoding from the <SOS> token.
    empty_target_seq = np.zeros((1, 1))
    empty_target_seq[0, 0] = tokenizer.word_index['<sos>']
    decoded_translation = ''

    # Greedy decoding, one token per step. The step count is bounded so the
    # loop terminates even if the model keeps predicting the padding id 0,
    # which has no word and therefore never lengthens the reply.
    for _step in range(maxlen_reply + 1):
        dec_output, h, c = dec_model.predict([empty_target_seq] + state_value)
        sample_word_index = int(np.argmax(dec_output[0, -1, :]))
        sample_word = index_to_word.get(sample_word_index)
        if sample_word is not None:
            decoded_translation += ' {}'.format(sample_word)

        if sample_word == '<eos>':
            break

        # Feed the predicted token and the updated states into the next step.
        empty_target_seq = np.zeros((1, 1))
        empty_target_seq[0, 0] = sample_word_index
        state_value = [h, c]
    print('chatbot:{}'.format(decoded_translation))

you: hello
chatbot: hi there <eos>
