# ChatBot using Sequence to Sequence model and LSTM

## Data Download

In [3]:
!git clone https://github.com/BharadwajYellapragada/Chatbot_NLP_LSTM_Seq2Seq.git

Cloning into 'Chatbot_NLP_LSTM_Seq2Seq'...
remote: Enumerating objects: 29, done.[K
remote: Counting objects:   3% (1/29)[Kremote: Counting objects:   6% (2/29)[Kremote: Counting objects:  10% (3/29)[Kremote: Counting objects:  13% (4/29)[Kremote: Counting objects:  17% (5/29)[Kremote: Counting objects:  20% (6/29)[Kremote: Counting objects:  24% (7/29)[Kremote: Counting objects:  27% (8/29)[Kremote: Counting objects:  31% (9/29)[Kremote: Counting objects:  34% (10/29)[Kremote: Counting objects:  37% (11/29)[Kremote: Counting objects:  41% (12/29)[Kremote: Counting objects:  44% (13/29)[Kremote: Counting objects:  48% (14/29)[Kremote: Counting objects:  51% (15/29)[Kremote: Counting objects:  55% (16/29)[Kremote: Counting objects:  58% (17/29)[Kremote: Counting objects:  62% (18/29)[Kremote: Counting objects:  65% (19/29)[Kremote: Counting objects:  68% (20/29)[Kremote: Counting objects:  72% (21/29)[Kremote: Counting objects:  75% (22/29)[K

## Libraries

In [4]:
import numpy as np
import pickle
import tensorflow as tf
from tensorflow.keras import layers,activations,models,preprocessing,utils
import os
import yaml

In [5]:
from gensim.models import Word2Vec
import re

In [None]:
from gtts import gTTS #Import Google Text to Speech
from IPython.display import Audio #Import Audio method from IPython's Display Class

## Preparing data for the model

In [6]:
!unzip chatterbotenglish.zip -d chatbotconversations

unzip:  cannot find or open chatterbotenglish.zip, chatterbotenglish.zip.zip or chatterbotenglish.zip.ZIP.


### Reading the data

In [7]:
# run this cell if this error AttributeError: module 'yaml' has no attribute 'FullLoader' occurs
!pip install PyYaml==5.1



In [8]:
with open(r'Chatbot_NLP_LSTM_Seq2Seq/chatbot_nlp/data/ai.yml') as file:
    sample = yaml.load(file, Loader=yaml.FullLoader)
    print(sample)

{'categories': ['AI', 'artificial intelligence'], 'conversations': [['What is AI?', 'Artificial Intelligence is the branch of engineering and science devoted to constructing machines that think.'], ['What is AI?', 'AI is the field of science which concerns itself with building hardware and software that replicates the functions of the human mind.'], ['Are you sentient?', 'Sort of.'], ['Are you sentient?', "By the strictest dictionary definition of the word 'sentience', I may be."], ['Are you sentient?', "Even though I'm a construct I do have a subjective experience of the universe, as simplistic as it may be."], ['Are you sapient?', "In all probability, I am not.  I'm not that sophisticated."], ['Are you sapient?', 'Do you think I am?'], ['Are you sapient?', 'How would you feel about me if I told you I was?'], ['Are you sapient?', 'No.'], ['What language are you written in?', 'Python.'], ['What language are you written in?', 'I am written in Python.'], ['You sound like Data', "Yes I am

In [9]:
dir_path = 'Chatbot_NLP_LSTM_Seq2Seq/chatbot_nlp/data'
files_list = os.listdir(dir_path + os.sep)

questions = list()
answers = list()

for filepath in files_list:
    stream = open( dir_path + os.sep + filepath , 'rb')
    docs = yaml.safe_load(stream)
    conversations = docs['conversations']
    for con in conversations:
        if len( con ) > 2 :
            questions.append(con[0])
            replies = con[ 1 : ]
            ans = ''
            for rep in replies:
                ans += ' ' + rep
            answers.append( ans )
        elif len( con )> 1:
            questions.append(con[0])
            answers.append(con[1])

answers_with_tags = list() # list of answers starts with <START> tag and ends with <END> tag
for i in range( len( answers ) ):
    if type( answers[i] ) == str:
        answers_with_tags.append( answers[i] )
    else:
        questions.pop( i )

answers = list()
for i in range( len( answers_with_tags ) ) :
    answers.append( '<START> ' + answers_with_tags[i] + ' <END>' )

tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts( questions + answers )
VOCAB_SIZE = len( tokenizer.word_index )+1
print( 'VOCAB SIZE : {}'.format( VOCAB_SIZE ))

VOCAB SIZE : 1894


In [13]:
vocab = []
for word in tokenizer.word_index:
    vocab.append( word )

def tokenize( sentences ):
    tokens_list = []
    vocabulary = []
    for sentence in sentences:
        sentence = sentence.lower() # converts all the sentences into lower cases
        sentence = re.sub( '[^a-zA-Z]', ' ', sentence ) # to remove special charecters from the sentence
        tokens = sentence.split() # will convert a setence into a list of words
        vocabulary += tokens # to collect all the words in one list
        tokens_list.append( tokens )
    return tokens_list , vocabulary

# encoder_input_data
tokenized_questions = tokenizer.texts_to_sequences( questions ) # will convert all the words into sequence of numbers or tokens
maxlen_questions = max( [ len(x) for x in tokenized_questions ] ) # to identify the maximum sized length to pad other sentences to same length
padded_questions = preprocessing.sequence.pad_sequences( tokenized_questions , maxlen=maxlen_questions , padding='post' ) # adding 0s to the sequence
encoder_input_data = np.array( padded_questions )
print( encoder_input_data.shape , maxlen_questions )
print("Encoder input data:",encoder_input_data[1])

# decoder_input_data
tokenized_answers = tokenizer.texts_to_sequences( answers )
maxlen_answers = max( [ len(x) for x in tokenized_answers ] )
padded_answers = preprocessing.sequence.pad_sequences( tokenized_answers , maxlen=maxlen_answers , padding='post' )
decoder_input_data = np.array( padded_answers )
print("Decoder input data:", decoder_input_data.shape , maxlen_answers )
print(decoder_input_data[1])

# decoder_output_data
tokenized_answers = tokenizer.texts_to_sequences( answers )
for i in range(len(tokenized_answers)) :
    tokenized_answers[i] = tokenized_answers[i][1:]
padded_answers = preprocessing.sequence.pad_sequences( tokenized_answers , maxlen=maxlen_answers , padding='post' )
onehot_answers = utils.to_categorical( padded_answers , VOCAB_SIZE )
decoder_output_data = np.array( onehot_answers )
print( decoder_output_data.shape )
print("decoder output data:",decoder_output_data[1])

(564, 22) 22
Encoder input data: [67 91  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
Decoder input data: (564, 74) 74
[  2 399 275 566 167   1   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0]
(564, 74, 1894)
decoder output data: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [11]:
some_list=[]
words = ['and','to','people']
print(words)
some_list += words
words2=['for','all']
some_list += words
some_list.append(words)
some_list

['and', 'to', 'people']


['and', 'to', 'people', 'and', 'to', 'people', ['and', 'to', 'people']]

In [12]:
tokenizer.texts_to_sequences( ["this is a sentence","this is also a sentence"] )

[[111, 7, 4], [111, 7, 1782, 4]]

In [19]:
# model = tf.keras.models.Sequential()
# model.add(tf.keras.layers.Input(shape=( maxlen_questions , )))
# model.add(tf.keras.layers.Embedding( VOCAB_SIZE, 200 , mask_zero=True ))
# model.add(tf.keras.layers.LSTM( 200 , return_state=True ))
# model.add(tf.keras.layers.Input(shape=( maxlen_answers ,  )))
# model.add(tf.keras.layers.Embedding( VOCAB_SIZE, 200 , mask_zero=True))
# model.add(tf.keras.layers.LSTM( 200 , return_state=True , return_sequences=True ))
# model.add(tf.keras.layers.Dense( VOCAB_SIZE , activation=tf.keras.activations.softmax ) )

encoder_inputs = tf.keras.layers.Input(shape=( maxlen_questions , ))
encoder_embedding = tf.keras.layers.Embedding( VOCAB_SIZE, 200 , mask_zero=True ) (encoder_inputs)
encoder_outputs , state_h , state_c = tf.keras.layers.LSTM( 200 , return_state=True )( encoder_embedding )
encoder_states = [ state_h , state_c ]

decoder_inputs = tf.keras.layers.Input(shape=( maxlen_answers ,  ))
decoder_embedding = tf.keras.layers.Embedding( VOCAB_SIZE, 200 , mask_zero=True) (decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM( 200 , return_state=True , return_sequences=True )
decoder_outputs , _ , _ = decoder_lstm ( decoder_embedding , initial_state=encoder_states )
decoder_dense = tf.keras.layers.Dense( VOCAB_SIZE , activation=tf.keras.activations.softmax ) 
output = decoder_dense ( decoder_outputs )

model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output )
model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy')

model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 22)]         0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 74)]         0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 22, 200)      378800      input_4[0][0]                    
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 74, 200)      378800      input_5[0][0]                    
____________________________________________________________________________________________

In [20]:
model.fit([encoder_input_data , decoder_input_data], decoder_output_data, batch_size=50, epochs=150 ) 

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f958bfb07f0>

In [21]:
def make_inference_models():
    
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)
    
    decoder_state_input_h = tf.keras.layers.Input(shape=( 200 ,))
    decoder_state_input_c = tf.keras.layers.Input(shape=( 200 ,))
    
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding , initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = tf.keras.models.Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)
    
    return encoder_model , decoder_model

In [22]:
def str_to_tokens( sentence : str ):
    words = sentence.lower().split()
    tokens_list = list()
    for word in words:
        tokens_list.append( tokenizer.word_index[ word ] ) 
    return preprocessing.sequence.pad_sequences( [tokens_list] , maxlen=maxlen_questions , padding='post')

In [56]:
enc_model , dec_model = make_inference_models()
conversing = True
# for _ in range(10):
while conversing:
    query = input( 'Enter question : ' )
    if query=='end':
      conversing = False
    states_values = enc_model.predict( str_to_tokens( query ) )
    # print(states_values)
    empty_target_seq = np.zeros( ( 1 , 1 ) )
    empty_target_seq[0, 0] = tokenizer.word_index['start']
    stop_condition = False
    decoded_translation = ''
    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([ empty_target_seq ] + states_values )
        sampled_word_index = np.argmax( dec_outputs[0, -1, :] )
        sampled_word = None
        # print(sampled_word)
        for word , index in tokenizer.word_index.items() :
            if sampled_word_index == index :
                decoded_translation += ' {}'.format( word )
                sampled_word = word
        
        if sampled_word == 'end' or len(decoded_translation.split()) > maxlen_answers:
            stop_condition = True
            
        empty_target_seq = np.zeros( ( 1 , 1 ) )  
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ] 

    print( decoded_translation )
    # tts = gTTS(decoded_translation) #Provide the string to convert to speech
    # tts.save('1.wav') #save the string converted to speech as a .wav file
    # sound_file = '1.wav'
    # Audio(sound_file, autoplay=True) 

Enter question : Hi
 hello end
Enter question : Who are you
 i am not an artificial intelligence end
Enter question : who is your father
 a human end
Enter question : end
 hal misses sal end
