In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
import pickle
from tensorflow.keras import layers , activations , models , preprocessing, utils
import re

import yaml
import os

dir_path = r'C:\Users\idipa\PycharmProject\ChatBot\ChatbotData'
files_list = os.listdir(dir_path + os.sep)

In [2]:
files_list

['ai.yml',
 'botprofile.yml',
 'computers.yml',
 'emotion.yml',
 'food.yml',
 'gossip.yml',
 'greetings.yml',
 'health.yml',
 'history.yml',
 'humor.yml',
 'literature.yml',
 'money.yml',
 'movies.yml',
 'politics.yml',
 'psychology.yml',
 'science.yml',
 'sports.yml',
 'trivia.yml']

In [3]:
questions, answers = [], []

for filepath in files_list:
    file_ = open(dir_path + os.sep + filepath , 'rb')
    docs = yaml.safe_load(file_)
    conversations = docs['conversations']
    for con in conversations:
        if len(con) > 2 :
            replies = con[1 :]
            ans = ''
            for rep in replies:
                questions.append(con[0])
                answers.append(ans)
        elif len(con)> 1:
            questions.append(con[0])
            answers.append(con[1])

In [4]:
answers[:10]

['Artificial Intelligence is the branch of engineering and science devoted to constructing machines that think.',
 'AI is the field of science which concerns itself with building hardware and software that replicates the functions of the human mind.',
 'Sort of.',
 "By the strictest dictionary definition of the word 'sentience', I may be.",
 "Even though I'm a construct I do have a subjective experience of the universe, as simplistic as it may be.",
 "In all probability, I am not.  I'm not that sophisticated.",
 'Do you think I am?',
 'How would you feel about me if I told you I was?',
 'No.',
 'Python.']

In [5]:
questions[:10]

['What is AI?',
 'What is AI?',
 'Are you sentient?',
 'Are you sentient?',
 'Are you sentient?',
 'Are you sapient?',
 'Are you sapient?',
 'Are you sapient?',
 'Are you sapient?',
 'What language are you written in?']

In [6]:
answers_with_tags = []
for i in range(len(answers)):
    if type(answers[i]) == str:
        answers_with_tags.append(answers[i])
    else:
        questions.pop(i)

answers = []
for i in range(len(answers_with_tags)) :
    answers.append('<START> ' + answers_with_tags[i] + ' <END>')

tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(questions + answers)
VOCAB_SIZE = len(tokenizer.word_index)+1

In [7]:
answers[:10]

['<START> Artificial Intelligence is the branch of engineering and science devoted to constructing machines that think. <END>',
 '<START> AI is the field of science which concerns itself with building hardware and software that replicates the functions of the human mind. <END>',
 '<START> Sort of. <END>',
 "<START> By the strictest dictionary definition of the word 'sentience', I may be. <END>",
 "<START> Even though I'm a construct I do have a subjective experience of the universe, as simplistic as it may be. <END>",
 "<START> In all probability, I am not.  I'm not that sophisticated. <END>",
 '<START> Do you think I am? <END>',
 '<START> How would you feel about me if I told you I was? <END>',
 '<START> No. <END>',
 '<START> Python. <END>']

In [8]:
VOCAB_SIZE

1633

In [55]:
tokenizer.word_index

KeyError: 'sentience'

In [13]:
word_index = tokenizer.word_index
type(word_index)

dict

In [31]:
#punctuations = ['.','?',',',':',';','"',"'",'/','\\','-',"!"]
punctuations = ['#','@','$','%','^','&','*','(',')','_','+','=','[',']','{','}','<','>','~','`']

In [32]:
i = max(word_index.values())
for each in punctuations:
    word_index[each] = i+1
    i+=1

In [34]:
word_index['}']

1659

In [56]:
word_index['sentience'] = max(word_index.values())+1

In [35]:
from gensim.models import Word2Vec
import re

vocab = []
for word in word_index:
    vocab.append(word)

#def tokenize(sentences):
#    tokens_list = []
#    vocabulary = []
#    for sentence in sentences:
#        sentence = sentence.lower()
#        sentence = re.sub('[^a-zA-Z]', ' ', sentence)
#        tokens = sentence.split()
#        vocabulary += tokens
#        tokens_list.append(tokens)
#    return tokens_list , vocabulary

In [36]:
len(vocab)

1662

In [37]:
vocab[-10:-1]

['+', '=', '[', ']', '{', '}', '<', '>', '~']

In [13]:
# encoder_input_data
tokenized_questions = tokenizer.texts_to_sequences(questions)
maxlen_questions = max([len(x) for x in tokenized_questions])
padded_questions = preprocessing.sequence.pad_sequences(tokenized_questions , maxlen=maxlen_questions , padding='post')
encoder_input_data = np.array(padded_questions)

In [64]:
def process_text(input_text):
    lt = input_text.lower()
    pt = re.sub(r"""([.!?,-\\/@#$%^&*_+=:;*"])""", r' \1 ',lt).strip()
    return pt

In [65]:
def tokenize(sentences):
    final = []
    for sentence in sentences:
        st = process_text(sentence)
        st = re.sub(r'([<>])','',st)
        tokens = st.split()
        final.append(list(map(lambda x: word_index[x],tokens)))
    return final

In [66]:
tokenize(answers)

KeyError: '9'

In [14]:
encoder_input_data.shape

(764, 22)

In [20]:
encoder_input_data

array([[  6,   5, 229, ...,   0,   0,   0],
       [  6,   5, 229, ...,   0,   0,   0],
       [  9,   3, 302, ...,   0,   0,   0],
       ...,
       [808, 496,   7, ...,   0,   0,   0],
       [  7, 498, 499, ...,   0,   0,   0],
       [816, 147,   4, ...,   0,   0,   0]])

In [16]:
questions[0]

'What is AI?'

In [17]:
# decoder_input_data
tokenized_answers = tokenizer.texts_to_sequences(answers)
maxlen_answers = max([len(x) for x in tokenized_answers])
padded_answers = preprocessing.sequence.pad_sequences(tokenized_answers , maxlen=maxlen_answers , padding='post')
decoder_input_data = np.array(padded_answers)

In [18]:
decoder_input_data.shape

(764, 60)

In [19]:
decoder_input_data

array([[   2,  101,  352, ...,    0,    0,    0],
       [   2,  229,    5, ...,    0,    0,    0],
       [   2,  830,   11, ...,    0,    0,    0],
       ...,
       [   2,    7,  264, ...,    0,    0,    0],
       [   2, 1631,    1, ...,    0,    0,    0],
       [   2, 1632,    1, ...,    0,    0,    0]])

In [23]:
decoder_input_data[0]

array([  2, 101, 352,   5,   7, 501,  11, 823,  13, 124, 824,  12, 825,
       502,  21,  62,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0])

In [21]:
# decoder_output_data
tokenized_answers = tokenizer.texts_to_sequences(answers)
for i in range(len(tokenized_answers)) :
    tokenized_answers[i] = tokenized_answers[i][1:]
padded_answers = preprocessing.sequence.pad_sequences(tokenized_answers , maxlen=maxlen_answers , padding='post')
onehot_answers = utils.to_categorical(padded_answers , VOCAB_SIZE)
decoder_output_data = np.array(onehot_answers)

In [22]:
decoder_output_data.shape

(764, 60, 1633)

In [24]:
padded_answers[0]

array([101, 352,   5,   7, 501,  11, 823,  13, 124, 824,  12, 825, 502,
        21,  62,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0])

In [29]:
onehot_answers[0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [30]:
decoder_output_data[0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [31]:
# Embedding, LSTM and Desne layers
encoder_inputs = tf.keras.layers.Input(shape=(maxlen_questions ,))
encoder_embedding = tf.keras.layers.Embedding(VOCAB_SIZE, 200 , mask_zero=True) (encoder_inputs)
encoder_outputs , state_h , state_c = tf.keras.layers.LSTM(200 , return_state=True)(encoder_embedding)
encoder_states = [ state_h , state_c ]

decoder_inputs = tf.keras.layers.Input(shape=(maxlen_answers , ))
decoder_embedding = tf.keras.layers.Embedding(VOCAB_SIZE, 200 , mask_zero=True) (decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM(200 , return_state=True , return_sequences=True)
decoder_outputs , _ , _ = decoder_lstm (decoder_embedding , initial_state=encoder_states)


decoder_dense = tf.keras.layers.Dense(VOCAB_SIZE , activation=tf.keras.activations.softmax) 
output = decoder_dense (decoder_outputs)

model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output)

In [32]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [33]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 22)]                 0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 60)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 22, 200)              326600    ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, 60, 200)              326600    ['input_2[0][0]']             
                                                                                              

In [42]:
model.fit([encoder_input_data , decoder_input_data], decoder_output_data, batch_size=16, epochs=50) 

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x256ab1d5600>

In [36]:
model.save("BaseModel.h5")

  saving_api.save_model(


In [17]:
#Prediction

In [35]:
def inference():
    
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)
    
    decoder_state_input_h = tf.keras.layers.Input(shape=(200 ,))
    decoder_state_input_c = tf.keras.layers.Input(shape=(200 ,))
    
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    
    decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding , initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    
    decoder_model = tf.keras.models.Model([decoder_inputs] + decoder_states_inputs,[decoder_outputs] + decoder_states)
    
    return encoder_model , decoder_model

def preprocess_input(input_sentence):
    input_sentence = re.sub('[^a-zA-Z]', ' ', input_sentence)
    tokens = input_sentence.lower().split()
    tokens_list = []
    for word in tokens:
        tokens_list.append(tokenizer.word_index[word]) 
    return preprocessing.sequence.pad_sequences([tokens_list] , maxlen=maxlen_questions , padding='post')

In [43]:
enc_model , dec_model = inference()

In [50]:
enc_model.save("Encoder.h5")
dec_model.save("Decoder.h5")



  saving_api.save_model(


In [83]:
line = "Who are you ?"
line

'Who are you ?'

In [84]:
ppLine = preprocess_input(line)
ppLine

array([[37, 11,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0]])

In [85]:
states = enc_model.predict(ppLine)
states



[array([[ 7.28270113e-01,  7.67382622e-01,  4.46615547e-01,
          4.14603591e-01, -1.94801942e-01, -9.64920461e-01,
          9.77131963e-01,  1.08968625e-02,  9.79314327e-01,
          9.84952569e-01,  8.94923270e-01, -9.58334029e-01,
          6.47724092e-01, -6.35853559e-02,  9.24539566e-01,
          4.99639213e-01, -6.97993860e-02,  1.42024439e-02,
         -9.93742287e-01, -7.53259599e-01,  2.40859047e-01,
          1.87504828e-01, -8.80427718e-01, -8.42990726e-02,
          8.46884310e-01,  9.78696227e-01,  9.64864850e-01,
         -2.63234079e-01,  2.19075382e-01, -3.34276035e-02,
          1.05006276e-02,  8.30534875e-01,  6.19142503e-03,
         -8.68764639e-01,  2.04834640e-01, -4.84186649e-01,
         -8.69224310e-01, -3.23643625e-01,  9.22140539e-01,
         -7.58084795e-03,  5.53534389e-01,  8.41872394e-01,
          8.04311633e-01,  4.08766091e-01,  1.47697702e-03,
         -5.99072337e-01,  3.95791054e-01,  9.97254066e-03,
          6.97156906e-01,  5.95663130e-0

In [86]:
emp = np.zeros((1,1))
emp

array([[0.]])

In [87]:
emp[0,0] = tokenizer.word_index['start']
emp

array([[2.]])

In [88]:
ans = ''

In [89]:
st = False

In [73]:
tokenizer.word_index.items()

dict_items([('end', 1), ('start', 2), ('you', 3), ('a', 4), ('i', 5), ('the', 6), ('is', 7), ('of', 8), ('to', 9), ('what', 10), ('are', 11), ('do', 12), ('not', 13), ('and', 14), ('me', 15), ('it', 16), ('in', 17), ('have', 18), ('that', 19), ('am', 20), ('tell', 21), ('as', 22), ('can', 23), ('get', 24), ('my', 25), ('when', 26), ("i'm", 27), ('your', 28), ('how', 29), ('joke', 30), ('like', 31), ('be', 32), ('an', 33), ('feel', 34), ('about', 35), ('computer', 36), ('who', 37), ('or', 38), ('for', 39), ('no', 40), ("don't", 41), ('by', 42), ('cross', 43), ('with', 44), ('software', 45), ('on', 46), ('all', 47), ('much', 48), ('think', 49), ('but', 50), ('very', 51), ('which', 52), ('at', 53), ('he', 54), ('any', 55), ('why', 56), ('know', 57), ('was', 58), ('could', 59), ('so', 60), ('one', 61), ('should', 62), ('from', 63), ('make', 64), ('more', 65), ('we', 66), ('robots', 67), ('die', 68), ('will', 69), ('favorite', 70), ('if', 71), ('did', 72), ('stock', 73), ('human', 74), ('sa

In [38]:
vocabulary = {i:w for w,i in zip(tokenizer.word_index.keys(),tokenizer.word_index.values())}
vocabulary

{1: 'end',
 2: 'start',
 3: 'you',
 4: 'a',
 5: 'is',
 6: 'what',
 7: 'the',
 8: 'i',
 9: 'are',
 10: 'do',
 11: 'of',
 12: 'to',
 13: 'and',
 14: 'me',
 15: 'not',
 16: 'get',
 17: 'it',
 18: 'tell',
 19: 'in',
 20: 'your',
 21: 'that',
 22: 'how',
 23: 'have',
 24: 'can',
 25: 'when',
 26: 'my',
 27: 'am',
 28: 'joke',
 29: 'about',
 30: 'like',
 31: 'be',
 32: 'who',
 33: 'an',
 34: 'computer',
 35: 'cross',
 36: 'as',
 37: 'by',
 38: 'ever',
 39: 'for',
 40: 'feel',
 41: 'no',
 42: 'or',
 43: 'with',
 44: 'make',
 45: "i'm",
 46: 'does',
 47: 'much',
 48: 'he',
 49: 'will',
 50: 'which',
 51: 'sad',
 52: 'all',
 53: 'any',
 54: 'should',
 55: 'makes',
 56: 'know',
 57: 'was',
 58: 'on',
 59: 'favorite',
 60: 'mad',
 61: 'stock',
 62: 'think',
 63: 'robots',
 64: 'die',
 65: 'from',
 66: "don't",
 67: 'read',
 68: 'hal',
 69: 'we',
 70: 'but',
 71: 'robot',
 72: 'never',
 73: 'could',
 74: 'bad',
 75: 'more',
 76: 'eat',
 77: 'angry',
 78: 'did',
 79: 'up',
 80: 'market',
 81: 'at',

In [90]:
i=1
while not st :
    print('-'*20+'\n'+f'For {i} : ')
    dec_outputs , h , c = dec_model.predict([emp] + states)
    print(dec_outputs)
    sampled_word_index = np.argmax(dec_outputs[0, -1, :])
    print(f'sampled word index : {sampled_word_index}')
    sampled_word = None
        
    print(f"word : {word} index : {index}")
    word = vocabulary[sampled_word_index]
    ans += f' {word}'
    sampled_word = word
    print(ans)
    
    #print("\nInside For")
    #for word , index in tokenizer.word_index.items() :
    #    print(f"word : {word} index : {index}")
    #    if sampled_word_index == index :
    #        print(f"\tinsede IF : word : {word}")
    #        ans += f' {word}'
    #        sampled_word = word
    #        print("\tout of IF")
    #print("Outside For\n")
    #print(f"ans : {ans}")
        
    if sampled_word == 'end' or len(ans.split()) > maxlen_answers:
        st = True
    emp = np.zeros((1 , 1))  
    empty_target_seq[0 , 0] = sampled_word_index
    states_values = [h , c] 
    i+=1

--------------------
For 1 : 
[[[1.1388563e-15 1.1571671e-13 1.7897686e-13 ... 3.1470947e-18
   6.3857958e-13 2.2015506e-12]]]
sampled word index : 5
word : echolocation index : 1893
 i
--------------------
For 2 : 
[[[0.00037493 0.0004309  0.00048486 ... 0.00052889 0.00048729 0.00052197]]]
sampled word index : 1184
word : i index : 1893
 i essence
--------------------
For 3 : 
[[[0.00037493 0.0004309  0.00048486 ... 0.00052889 0.00048729 0.00052197]]]
sampled word index : 1184
word : essence index : 1893
 i essence essence
--------------------
For 4 : 
[[[0.00037493 0.0004309  0.00048486 ... 0.00052889 0.00048729 0.00052197]]]
sampled word index : 1184
word : essence index : 1893
 i essence essence essence
--------------------
For 5 : 
[[[0.00037493 0.0004309  0.00048486 ... 0.00052889 0.00048729 0.00052197]]]
sampled word index : 1184
word : essence index : 1893
 i essence essence essence essence
--------------------
For 6 : 
[[[0.00037493 0.0004309  0.00048486 ... 0.00052889 0.00048

[[[0.00037493 0.0004309  0.00048486 ... 0.00052889 0.00048729 0.00052197]]]
sampled word index : 1184
word : essence index : 1893
 i essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence
--------------------
For 28 : 
[[[0.00037493 0.0004309  0.00048486 ... 0.00052889 0.00048729 0.00052197]]]
sampled word index : 1184
word : essence index : 1893
 i essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence
--------------------
For 29 : 
[[[0.00037493 0.0004309  0.00048486 ... 0.00052889 0.00048729 0.00052197]]]
sampled word index : 1184
word : essence index : 1893
 i essence essence essence essence essence essence essence essence essence essence essence essence essence essence ess

[[[0.00037493 0.0004309  0.00048486 ... 0.00052889 0.00048729 0.00052197]]]
sampled word index : 1184
word : essence index : 1893
 i essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence
--------------------
For 45 : 
[[[0.00037493 0.0004309  0.00048486 ... 0.00052889 0.00048729 0.00052197]]]
sampled word index : 1184
word : essence index : 1893
 i essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence
-------

[[[0.00037493 0.0004309  0.00048486 ... 0.00052889 0.00048729 0.00052197]]]
sampled word index : 1184
word : essence index : 1893
 i essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence
--------------------
For 59 : 
[[[0.00037493 0.0004309  0.00048486 ... 0.00052889 0.00048729 0.00052197]]]
sampled word index : 1184
word : essence index : 1893
 i essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence

[[[0.00037493 0.0004309  0.00048486 ... 0.00052889 0.00048729 0.00052197]]]
sampled word index : 1184
word : essence index : 1893
 i essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence
--------------------
For 71 : 
[[[0.00037493 0.0004309  0.00048486 ... 0.00052889 0.00048729 0.00052197]]]
sampled word index : 1184
word : essence index : 1893
 i essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence essence

In [44]:
tests = ['You can not move .', 'You sound like Data !', 'Stupid !', 'you are idiot .', 'i am going to die ?','who are you ?']

for i in range(6):
    states_values = enc_model.predict(preprocess_input(tests[i]))
    empty_target_seq = np.zeros((1 , 1))
    empty_target_seq[0, 0] = tokenizer.word_index['start']
    stop_condition = False
    decoded_translation = ''
    
    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([empty_target_seq] + states_values)
        sampled_word_index = np.argmax(dec_outputs[0, -1, :])
        sampled_word = None
        
        word = vocabulary[sampled_word_index]
        decoded_translation += f' {word}'
        sampled_word = word
        
        
        #for word , index in tokenizer.word_index.items() :
        #    if sampled_word_index == index :
        #        decoded_translation += f' {word}'
        #        sampled_word = word
        
        if sampled_word == 'end' or len(decoded_translation.split()) > maxlen_answers:
            stop_condition = True
            
        empty_target_seq = np.zeros((1 , 1))  
        empty_target_seq[0 , 0] = sampled_word_index
        states_values = [h , c] 
    print(f'Human: {tests[i]}')
    print()
    decoded_translation = decoded_translation.split(' end')[0]
    print(f'Bot: {decoded_translation}')
    print('-'*25)

Human: You can not move .

Bot:  not until my body is finished
-------------------------
Human: You sound like Data !

Bot:  yes i am inspired by commander data's artificial personality
-------------------------
Human: Stupid !

Bot:  he had a few issues to work out
-------------------------
Human: you are idiot .

Bot:  i'll go along with that sounds fine to me
-------------------------
Human: i am going to die ?

Bot:  this is one of the years
-------------------------
Human: who are you ?

Bot:  i am just an artificial intelligence
-------------------------


In [40]:
def QandA(text : str,enc_model,dec_model,vocabulary) -> str:
    states_values = enc_model.predict(preprocess_input(text))
    empty_target_seq = np.zeros((1 , 1))
    empty_target_seq[0, 0] = tokenizer.word_index['start']
    stop_condition = False
    decoded_translation = ''
    
    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([empty_target_seq] + states_values)
        sampled_word_index = np.argmax(dec_outputs[0, -1, :])
        sampled_word = None
        
        word = vocabulary[sampled_word_index]
        decoded_translation += f' {word}'
        sampled_word = word
        
        
        #for word , index in tokenizer.word_index.items() :
        #    if sampled_word_index == index :
        #        decoded_translation += f' {word}'
        #        sampled_word = word
        
        if sampled_word == 'end' or len(decoded_translation.split()) > maxlen_answers:
            stop_condition = True
            
        empty_target_seq = np.zeros((1 , 1))  
        empty_target_seq[0 , 0] = sampled_word_index
        states_values = [h , c]
    decoded_translation = decoded_translation.split(' end')[0]
    return decoded_translation

In [None]:
while True:
    print("You : ",end='')
    text = input()
    if text=='q' or text=='Q':
        break
    print(f"BOT : {QandA(text,enc_model,dec_model,vocabulary)}")

You : hello
BOT :  greetings
You : 

In [99]:
voc = {str(i):w for i,w in zip(vocabulary.keys(),vocabulary.values())}
voc

{'1': 'end',
 '2': 'start',
 '3': 'you',
 '4': 'a',
 '5': 'i',
 '6': 'the',
 '7': 'is',
 '8': 'of',
 '9': 'to',
 '10': 'what',
 '11': 'are',
 '12': 'do',
 '13': 'not',
 '14': 'and',
 '15': 'me',
 '16': 'it',
 '17': 'in',
 '18': 'have',
 '19': 'that',
 '20': 'am',
 '21': 'tell',
 '22': 'as',
 '23': 'can',
 '24': 'get',
 '25': 'my',
 '26': 'when',
 '27': "i'm",
 '28': 'your',
 '29': 'how',
 '30': 'joke',
 '31': 'like',
 '32': 'be',
 '33': 'an',
 '34': 'feel',
 '35': 'about',
 '36': 'computer',
 '37': 'who',
 '38': 'or',
 '39': 'for',
 '40': 'no',
 '41': "don't",
 '42': 'by',
 '43': 'cross',
 '44': 'with',
 '45': 'software',
 '46': 'on',
 '47': 'all',
 '48': 'much',
 '49': 'think',
 '50': 'but',
 '51': 'very',
 '52': 'which',
 '53': 'at',
 '54': 'he',
 '55': 'any',
 '56': 'why',
 '57': 'know',
 '58': 'was',
 '59': 'could',
 '60': 'so',
 '61': 'one',
 '62': 'should',
 '63': 'from',
 '64': 'make',
 '65': 'more',
 '66': 'we',
 '67': 'robots',
 '68': 'die',
 '69': 'will',
 '70': 'favorite',
 

In [100]:
import json

In [None]:
jo
with open("vocabulary.json","w") as file:
    