<a href="https://colab.research.google.com/github/CGuzman99/Big-data-and-ML/blob/main/Chatbot/Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocessing the data

In [None]:
import numpy as np
import tensorflow as tf
import pickle
from tensorflow.keras import layers, activations, models, preprocessing, utils
import os
import yaml
from gensim.models import Word2Vec
import re

## Downloading the data

In [None]:
!wget https://github.com/shubham0204/Dataset_Archives/blob/master/chatbot_nlp.zip?raw=true -O chatbot_nlp.zip
!unzip chatbot_nlp.zip

--2023-10-27 16:02:47--  https://github.com/shubham0204/Dataset_Archives/blob/master/chatbot_nlp.zip?raw=true
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/shubham0204/Dataset_Archives/raw/master/chatbot_nlp.zip [following]
--2023-10-27 16:02:47--  https://github.com/shubham0204/Dataset_Archives/raw/master/chatbot_nlp.zip
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/shubham0204/Dataset_Archives/master/chatbot_nlp.zip [following]
--2023-10-27 16:02:47--  https://raw.githubusercontent.com/shubham0204/Dataset_Archives/master/chatbot_nlp.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:

## Reading the data

In [None]:
# Load every YAML conversation file, build aligned (question, answer) lists,
# wrap each answer in <START>/<END> markers and fit the shared tokenizer.
dir_path = 'chatbot_nlp/data'
# os.listdir returns entries in arbitrary order; sorting keeps the corpus
# order (and therefore the tokenizer's tie-breaking of word indices) stable
# from one run to the next.
files_list = sorted(os.listdir(dir_path + os.sep))

questions = list()
answers = list()

for filepath in files_list :
  # The context manager closes each file as soon as it has been parsed.
  with open(os.path.join(dir_path, filepath), 'rb') as stream :
    docs = yaml.safe_load(stream)
  conversations = docs['conversations']
  for con in conversations :
    if len(con) > 2 :
      # Several replies: concatenate them (each preceded by a space) into
      # one answer string.
      questions.append(con[0])
      answers.append(''.join(' ' + rep for rep in con[1:]))
    elif len(con) > 1 :
      questions.append(con[0])
      answers.append(con[1])

# Keep only the pairs whose answer is a string. Filtering questions and
# answers together keeps them aligned; popping questions by the answer index
# while iterating shifts the remaining indices and drops the wrong entries.
valid_pairs = [(question, answer)
               for question, answer in zip(questions, answers)
               if isinstance(answer, str)]
questions = [question for question, _ in valid_pairs]
answers_with_tags = [answer for _, answer in valid_pairs]

# The tokenizer's default filters strip '<' and '>', so these markers end up
# as the plain tokens 'start' and 'end'.
answers = ['<START>' + answer + '<END>' for answer in answers_with_tags]
assert len(questions) == len(answers), 'questions and answers must stay aligned'

tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(questions + answers)
# +1 because index 0 is reserved for padding.
VOCAB_SIZE = len(tokenizer.word_index) + 1
print('VOCAB SIZE: {}'.format(VOCAB_SIZE))

VOCAB SIZE: 1894


## Preparing the data for the model

In [None]:
# Vocabulary words, in the tokenizer's index order.
vocab = list(tokenizer.word_index)

# encoder_input_data: questions as integer sequences, zero-padded on the right
tokenized_questions = tokenizer.texts_to_sequences(questions)
maxlen_questions = max(map(len, tokenized_questions))
padded_questions = preprocessing.sequence.pad_sequences(
  tokenized_questions,
  maxlen=maxlen_questions,
  padding='post',
)
encoder_input_data = np.array(padded_questions)
print(encoder_input_data.shape, maxlen_questions)

# decoder_input_data: answers (including the leading start token), padded the same way
tokenized_answers = tokenizer.texts_to_sequences(answers)
maxlen_answers = max(map(len, tokenized_answers))
padded_answers = preprocessing.sequence.pad_sequences(
  tokenized_answers,
  maxlen=maxlen_answers,
  padding='post',
)
decoder_input_data = np.array(padded_answers)
print(decoder_input_data.shape, maxlen_answers)

# decoder_output_data: the same answers shifted left by one token (first
# token dropped), then one-hot encoded over the vocabulary
tokenized_answers = [sequence[1:] for sequence in tokenized_answers]
padded_answers = preprocessing.sequence.pad_sequences(
  tokenized_answers,
  maxlen=maxlen_answers,
  padding='post',
)
onehot_answers = utils.to_categorical(padded_answers, VOCAB_SIZE)
decoder_output_data = np.array(onehot_answers)
print(decoder_output_data.shape)

(564, 22) 22
(564, 74) 74
(564, 74, 1894)


# Defining the Encoder-Decoder model

In [None]:
# Sizes shared by the encoder and decoder halves of the network.
EMBEDDING_DIM = 200
LSTM_UNITS = 200

# Encoder: embed the padded question and keep only the final LSTM states.
encoder_inputs = layers.Input(shape=(maxlen_questions, ))
encoder_embedding = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM, mask_zero=True)(encoder_inputs)
encoder_lstm = layers.LSTM(LSTM_UNITS, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: embed the answer tokens and run an LSTM that starts from the
# encoder states, emitting an output at every time step.
decoder_inputs = layers.Input(shape=(maxlen_answers, ))
decoder_embedding = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM, mask_zero=True)(decoder_inputs)
decoder_lstm = layers.LSTM(LSTM_UNITS, return_state=True, return_sequences=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Softmax over the vocabulary for each decoder time step.
decoder_dense = layers.Dense(VOCAB_SIZE, activation=activations.softmax)
output = decoder_dense(decoder_outputs)

model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=output)
model.compile(
  optimizer=tf.keras.optimizers.RMSprop(),
  loss='categorical_crossentropy',
)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 22)]                 0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 74)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 22, 200)              378800    ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, 74, 200)              378800    ['input_2[0][0]']             
                                                                                              

# Training the model

In [None]:
# Training hyperparameters.
BATCH_SIZE = 50
EPOCHS = 250

# Fit on (question, answer-input) pairs against the shifted one-hot answers,
# then persist the trained network to disk.
model.fit(
  x=[encoder_input_data, decoder_input_data],
  y=decoder_output_data,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,
)
model.save('model.h5')

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

  saving_api.save_model(


# Defining inference models

In [None]:
def make_inference_models() :
  """Build the encoder and decoder models used for step-by-step inference.

  Both models reuse the layers and tensors of the training graph
  (encoder_inputs, encoder_states, decoder_inputs, decoder_embedding,
  decoder_lstm, decoder_dense), so they share its weights.

  Returns:
    A tuple (encoder_model, decoder_model). encoder_model maps
    encoder_inputs to the encoder's [state_h, state_c]. decoder_model maps
    [decoder_inputs, state_h, state_c] to [dense output, state_h, state_c].
  """
  encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)

  # Placeholders through which the caller feeds the decoder LSTM's hidden
  # state and cell state (in that order).
  decoder_states_inputs = [tf.keras.layers.Input(shape=(200, )) for _ in range(2)]

  # Re-apply the decoder LSTM with externally supplied initial states and
  # expose its updated states as model outputs.
  lstm_outputs, *decoder_states = decoder_lstm(decoder_embedding, initial_state=decoder_states_inputs)
  token_scores = decoder_dense(lstm_outputs)

  decoder_model = tf.keras.models.Model(
    [decoder_inputs] + decoder_states_inputs,
    [token_scores] + decoder_states,
  )

  return encoder_model, decoder_model


# Talking with the chatbot

In [None]:
def str_to_tokens(sentence : str) :
  """Encode a user sentence as a padded batch of token ids for the encoder.

  The sentence goes through the fitted tokenizer itself, so it is
  preprocessed the same way as the training text. Punctuation attached to a
  word ("you?") no longer produces a lookup miss, and words the tokenizer
  does not know are skipped instead of raising KeyError.

  Args:
    sentence: Raw text typed by the user.

  Returns:
    The result of pad_sequences for a single sequence, post-padded to
    maxlen_questions.
  """
  tokens_list = tokenizer.texts_to_sequences([sentence])[0]

  return preprocessing.sequence.pad_sequences([tokens_list], maxlen=maxlen_questions, padding='post')

In [None]:
enc_model, dec_model = make_inference_models()


def generate_reply(question : str) -> str :
  """Greedily decode the chatbot's answer to a single question.

  The question is encoded once. The decoder is then run one token at a
  time, starting from the 'start' token, feeding back the most likely token
  and the updated LSTM states until 'end' is produced, the padding index is
  predicted, or maxlen_answers words have been generated.

  Args:
    question: Raw text typed by the user.

  Returns:
    The decoded words joined by single spaces (without the 'end' marker).

  Raises:
    KeyError: If str_to_tokens cannot encode the question or the tokenizer
      has no 'start' token.
  """
  states_values = list(enc_model.predict(str_to_tokens(question), verbose=0))
  target_seq = np.zeros((1, 1))
  target_seq[0, 0] = tokenizer.word_index['start']

  decoded_words = list()
  while len(decoded_words) < maxlen_answers :
    dec_outputs, h, c = dec_model.predict([target_seq] + states_values, verbose=0)
    sampled_word_index = int(np.argmax(dec_outputs[0, -1, :]))
    # index_word is the tokenizer's own reverse mapping; index 0 (padding)
    # has no entry and yields None.
    sampled_word = tokenizer.index_word.get(sampled_word_index)
    if sampled_word is None or sampled_word == 'end' :
      break
    decoded_words.append(sampled_word)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = sampled_word_index
    # Carry the decoder state into the next step; without this every step
    # would restart from the encoder states.
    states_values = [h, c]

  return ' '.join(decoded_words)


while True :
  inp = input('Enter question: ')
  if inp.strip().lower() == 'exit' :
    break
  try :
    reply = generate_reply(inp)
  except KeyError as err :
    # A word missing from the vocabulary should not end the whole session.
    print('Unknown word: {}'.format(err))
    continue
  print(reply)

Enter question: who are you




 i am all well 
Enter question: exit
