Team members:
* Akul Malhotra
* Huong Doan

This project follows the instructions in the article: https://towardsdatascience.com/generative-chatbots-using-the-seq2seq-model-d411c8738ab5

# Import library

In [None]:
# clear defined variables
#%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [None]:
import re
import random
import numpy as np

from tensorflow import keras
from keras.layers import Input, LSTM, Dense
from keras.models import Model

from keras.models import load_model

# Load dataset

Dataset https://www.kaggle.com/grafstor/simple-dialogs-for-chatbot?select=dialogs.txt

In [None]:
# Read the whole dialog file, then break it into a list with one entry per line
with open("dialogs.txt", mode='r', encoding='utf-8') as dialog_file:
  raw_text = dialog_file.read()
lines = raw_text.split('\n')
print(len(lines))

3725


In [None]:
# Preview the first five raw lines; each holds an input and its reply separated by a tab
lines[0:5]

["hi, how are you doing?\ti'm fine. how about yourself?",
 "i'm fine. how about yourself?\ti'm pretty good. thanks for asking.",
 "i'm pretty good. thanks for asking.\tno problem. so how have you been?",
 "no problem. so how have you been?\ti've been great. what about you?",
 "i've been great. what about you?\ti've been good. i'm in school right now."]

In [None]:
# Split the first line on the tab character to see its input/reply halves
lines[0].split(sep="\t")

['hi, how are you doing?', "i'm fine. how about yourself?"]

# Data Preprocessing

In [None]:
# Separate input and reply lines
input_lines = []
reply_lines = []

for line in lines:
  # Split on the first tab only, so a stray extra tab cannot break the
  # two-way unpacking.
  input_text, separator, reply_text = line.partition("\t")
  if not separator:
    # Blank lines (e.g. produced by a trailing newline in the file) and lines
    # without a tab carry no dialog pair; skip them instead of raising.
    continue
  input_lines.append(input_text)
  reply_lines.append(reply_text)

print(len(input_lines))
print(len(reply_lines))
print(input_lines[0:20])
print(reply_lines[0:20])

3725
3725
['hi, how are you doing?', "i'm fine. how about yourself?", "i'm pretty good. thanks for asking.", 'no problem. so how have you been?', "i've been great. what about you?", "i've been good. i'm in school right now.", 'what school do you go to?', 'i go to pcc.', 'do you like it there?', "it's okay. it's a really big campus.", 'good luck with school.', "how's it going?", "i'm doing well. how about you?", 'never better, thanks.', 'so how have you been lately?', "i've actually been pretty good. you?", "i'm actually in school right now.", 'which school do you attend?', "i'm attending pcc right now.", 'are you enjoying it there?']
["i'm fine. how about yourself?", "i'm pretty good. thanks for asking.", 'no problem. so how have you been?', "i've been great. what about you?", "i've been good. i'm in school right now.", 'what school do you go to?', 'i go to pcc.', 'do you like it there?', "it's okay. it's a really big campus.", 'good luck with school.', 'thank you very much.', "i'm doi

In [None]:
# Pair every input line with the reply stored at the same position
pairs = [(question, answer) for question, answer in zip(input_lines, reply_lines)]
print(len(pairs))
pairs[0:20]

3725


[('hi, how are you doing?', "i'm fine. how about yourself?"),
 ("i'm fine. how about yourself?", "i'm pretty good. thanks for asking."),
 ("i'm pretty good. thanks for asking.", 'no problem. so how have you been?'),
 ('no problem. so how have you been?', "i've been great. what about you?"),
 ("i've been great. what about you?",
  "i've been good. i'm in school right now."),
 ("i've been good. i'm in school right now.", 'what school do you go to?'),
 ('what school do you go to?', 'i go to pcc.'),
 ('i go to pcc.', 'do you like it there?'),
 ('do you like it there?', "it's okay. it's a really big campus."),
 ("it's okay. it's a really big campus.", 'good luck with school.'),
 ('good luck with school.', 'thank you very much.'),
 ("how's it going?", "i'm doing well. how about you?"),
 ("i'm doing well. how about you?", 'never better, thanks.'),
 ('never better, thanks.', 'so how have you been lately?'),
 ('so how have you been lately?', "i've actually been pretty good. you?"),
 ("i've actu

In [None]:
# Shuffle the pairs in place. No random seed is set in this cell, so the
# resulting order differs from run to run.
random.shuffle(pairs)
print(len(pairs))
pairs[0:20]

3725


[('the forecast says that it will be warm on the weekend.',
  "so do you think it'll be perfect weather for the beach?"),
 ('yes, those eight years were a lot of fun for everyone.',
  'only 4,000 american soldiers were killed overseas.'),
 ('what happened?',
  'i gave her $1,000 for her birthday. i told her to spend it on herself.'),
 ('yeah, i went. did you go?', "no, i didn't feel like it."),
 ('i hope i win the lotto.', 'your chances are very small.'),
 ('a good story is more important than color.',
  "actors didn't curse back then."),
 ("then they're worth every penny.", 'you might want to buy a pair.'),
 ('me, too. school was fun.', 'and it was only 12 years.'),
 ('i have to go to the bathroom.', 'you drink too much coffee.'),
 ('and you get a lot of exercise every day.', "that's the truth."),
 ('are you sure?', 'we will be house rich, but cash poor.'),
 ('next time you go to the market, let me go with you.',
  'no, thank you. all you want to eat are hot dogs and candy bars.'),
 (

In [None]:
# Target sequences get '<START>' prepended and '<END>' appended so that the
# model knows where text generation begins and where it has to stop.
input_lines_2 = []
reply_lines_2 = []
input_tokens = set()
reply_tokens = set()

for input_text, reply_text in pairs:
  # Replace every character that is not a letter with a space
  input_text = re.sub("[^a-zA-Z]", " ", input_text)
  reply_text = re.sub("[^a-zA-Z]", " ", reply_text)

  # Re-join the reply words with single spaces, then wrap them in the markers
  reply_words = re.findall(r"[\w']+|[^\s\w]", reply_text)
  reply_text = f"<START> {' '.join(reply_words)} <END>"

  input_lines_2.append(input_text)
  reply_lines_2.append(reply_text)

  # Collect the whitespace-separated tokens of both sides
  input_tokens.update(input_text.split())
  reply_tokens.update(reply_text.split())


print(input_tokens)
print(reply_tokens)

# From here on the vocabularies are alphabetically sorted lists
input_tokens = sorted(input_tokens)
reply_tokens = sorted(reply_tokens)

# Report sizes and a 20-element sample, first for the vocabularies and then
# for the cleaned sentences
for input_side, reply_side in ((input_tokens, reply_tokens),
                               (input_lines_2, reply_lines_2)):
  print(len(input_side))
  print(len(reply_side))
  print(input_side[:20])
  print(reply_side[:20])

{'forecast', 'm', 'cut', 'channels', 'flight', 'ashes', 'speech', 'cartoons', 'address', 'yet', 'for', 'speeding', 'unbelievable', 'jar', 'am', 'medication', 'stole', 'license', 'rude', 'golfers', 'folded', 'rules', 'deserved', 'housekeeping', 'chop', 'explain', 'tears', 'couldn', 'floors', 'both', 'long', 'colder', 'sitting', 'fingers', 'extra', 'various', 'things', 'noticed', 'snow', 'two', 'unit', 'insert', 'atlantic', 'crying', 'falling', 'christmas', 'stopping', 'suction', 'adding', 'rich', 'school', 'quite', 'god', 'navel', 'plus', 'news', 'wipes', 'reuse', 'stops', 'jacket', 'handyman', 'court', 'savings', 'hitting', 'students', 'survive', 'nation', 'artist', 'animal', 'feelings', 'squeeze', 'sliced', 'guarantees', 'rose', 'at', 'rush', 'prices', 'smoker', 'butterflies', 'learned', 'crowded', 'will', 'added', 'uses', 'standing', 'somewhere', 'potato', 'vacuumed', 'siren', 'degrees', 'poodle', 'overseas', 'crosswalk', 'under', 'seeing', 'worst', 'killer', 'shows', 'cart', 'licks'

In [None]:
# Sanity check: print the start marker if it is part of the reply vocabulary
for marker in (tok for tok in reply_tokens if tok == '<START>'):
  print(marker)

<START>


In [None]:
# Check that the marker-aware pattern keeps <START> and <END> as single tokens
temp = "<START>  tell me what you've been up to .  <END>"
marker_aware_pattern = re.compile(r"[\w'<>]+|[^\s\w]")
print(marker_aware_pattern.findall(temp))

['<START>', 'tell', 'me', 'what', "you've", 'been', 'up', 'to', '.', '<END>']


In [None]:
# Walk through the reply formatting on one example sentence
temp = "tell me what you've been up to."
words_and_punctuation = re.findall(r"[\w']+|[^\s\w]", temp)
temp1 = " ".join(words_and_punctuation)
print(temp1)
temp1 = f"<START> {temp1} <END>"
print(temp1)
print(temp1.split())

tell me what you've been up to .
<START> tell me what you've been up to . <END>
['<START>', 'tell', 'me', 'what', "you've", 'been', 'up', 'to', '.', '<END>']


In [None]:
"<START> tell me what you've been up to. <END>".split()

['<START>', 'tell', 'me', 'what', "you've", 'been', 'up', 'to.', '<END>']

## Use three matrices of one-hot vectors: Encoder input data, Decoder input data, and Decoder output data

In [None]:
# Count the tokens of each example sentence using the marker-aware pattern
[len(re.findall(r"[\w'<>]+|[^\s\w]", sens)) for sens in ["<START> tell me what you've been up to . <END>","I lob"]]

[10, 2]

In [None]:
def get_token_array(sentences_array, decoder_reply=""):
  ''' Input:
          sentences_array : an array of sentences
          decoder_reply   : accepted but not used by this function
      Output:
          tokens_array : sorted list of the unique whitespace-separated tokens
  '''
  unique_tokens = {word for sentence in sentences_array for word in sentence.split()}
  return sorted(unique_tokens)

def one_hot_matrix(sentences_array, token_array, decoder_reply=""):
  ''' Build a 3-d one-hot matrix for a list of sentences.

      Input:
          sentences_array : an array of sentences whose tokens are separated
                            by whitespace
          token_array : the vocabulary; the position of a token in this array
                        is its index along the last axis
          decoder_reply : pass "decoder_target" to shift every sentence one
                          timestep to the left (the first token is dropped and
                          token t is written at position t-1); any other value
                          writes token t at position t
      Output:
          text_matrix : float32 array of shape
                        (number of sentences, longest sentence in tokens,
                         vocabulary size)
      Raises:
          KeyError : a sentence contains a token that is not in token_array
  '''
  # Tokenize once, with the same whitespace split that is used to fill the
  # matrix, so the time dimension is exactly as long as the longest sentence.
  # (Sizing it with a regex tokenizer counted '<START>' and '<END>' as three
  # tokens each and padded every decoder matrix with unused timesteps.)
  tokenized = [sentence.split() for sentence in sentences_array]

  dim1 = len(tokenized)
  # default=0 keeps an empty sentence list from raising in max()
  dim2 = max((len(tokens) for tokens in tokenized), default=0)
  dim3 = len(token_array)

  # create a dictionary of key-values as token-index
  features_dict = {token: i for i, token in enumerate(token_array)}

  #initialize 3d-array
  text_matrix = np.zeros((dim1, dim2, dim3), dtype='float32')

  shift_left = decoder_reply == "decoder_target"
  for line_index, tokens in enumerate(tokenized):
    for token_index, token in enumerate(tokens):
      if shift_left:
        # The target is the input one step ahead, so position 0 has no slot
        if token_index > 0:
          text_matrix[line_index, token_index - 1, features_dict[token]] = 1
      else:
        text_matrix[line_index, token_index, features_dict[token]] = 1

  return text_matrix

In [None]:
# Encoder input, decoder input and the one-step-shifted decoder target
encoder_input_model = one_hot_matrix(input_lines_2, input_tokens)
decoder_input_model = one_hot_matrix(reply_lines_2, reply_tokens)
decoder_target_model = one_hot_matrix(reply_lines_2, reply_tokens,
                                      decoder_reply="decoder_target")

In [None]:
# Display the encoder one-hot matrix
encoder_input_model

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

# Analysis: applying the Sequence-to-Sequence model

**Sequence to sequence** (seq2seq) is an encoder-decoder method for machine translation and language processing that maps an input sequence to an output sequence with a tag and attention value. The idea is to use two RNNs that work together with a special token and try to predict the next state sequence from the previous sequence.

The **Sequence to sequence** model, also called the encoder-decoder model, uses Long Short-Term Memory (LSTM) networks to generate text from the training corpus.  

Goal: given the user input, the model predicts a first word, and each following word is then predicted according to how likely that word is to occur next.  

The encoder model includes an input layer which defines a matrix for holding the one-hot vectors and an LSTM layer with some number of hidden states.  

The encoder outputs a final state vector (memory) which becomes the initial state for the decoder.

<img src="https://drive.google.com/uc?export=view&id=11X90gJrld15icniXmP8hyJH35v4Z2HYD" width="400" height="280" />

A method called **teacher forcing** is used to train the decoder; it enables the decoder to predict the next word in a target sequence given the previous words. 

<img src="https://drive.google.com/uc?export=view&id=1vRmDM9vordxcTlORtZ1pwWsWRpgNIlhY" width="400" height="280" />


<img src="https://drive.google.com/uc?export=view&id=1rJ0EdaIMFAZBkiAop7K96OYdseHQcURy" width="400" height="280" />


Pros:
* Training with Teacher Forcing converges faster. 
* The model will be updated by a sequence of better predictions.  


Cons:
* During inference, since there is usually no ground truth available, the RNN model will need to feed its own previous prediction back to itself for the next prediction. Therefore there is a discrepancy between training and inference, and this might lead to poor model performance and instability. This is known as Exposure Bias in literature.


Reference:
* what is seq2seq? https://www.guru99.com/seq2seq-model.html#:~:text=Seq2Seq%20is%20a%20method%20of,sequence%20from%20the%20previous%20sequence. 
* https://github.com/jackfrost1411/Generative-chatbot?fbclid=IwAR2J5Q6caH_0RI4r7o1aUk1Rwv5ytEENORDXiK_RsO30Fg3aKzl65zcqCjo
* https://towardsdatascience.com/generative-chatbots-using-the-seq2seq-model-d411c8738ab5
* https://machinelearningmastery.com/develop-encoder-decoder-model-sequence-sequence-prediction-keras/

## Train the model

In [None]:
#Dimensionality: number of units passed to both LSTM layers below
dimensionality = 256

#The batch size and number of epochs
batch_size = 10
epochs = 60

# Vocabulary sizes; they set the width of the one-hot vectors fed to each side
num_encoder_tokens = len(input_tokens) #len(get_token_array(input_lines_2))
num_decoder_tokens = len(reply_tokens) #len(get_token_array(reply_lines_2))

#Encoder: variable-length one-hot input; only the returned hidden and cell
# states are kept, they become the decoder's initial state
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder_lstm = LSTM(dimensionality, return_state=True)
encoder_outputs, state_hidden, state_cell = encoder_lstm(encoder_inputs)
encoder_states = [state_hidden, state_cell]

#Decoder: returns the whole output sequence plus its states and starts from
# the encoder states; a softmax layer over the reply vocabulary follows
decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(dimensionality, return_sequences=True, return_state=True)
decoder_outputs, decoder_state_hidden, decoder_state_cell = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
#Model: takes [encoder input, decoder input] and outputs the decoder softmax
training_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

#Compiling
# sample_weight_mode='temporal' was removed: no sample weights are passed to
# fit() below, so it had no effect, and newer Keras releases no longer accept
# that argument in compile().
training_model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
                       metrics=['accuracy'])

#Training: validation_split keeps 20% of the samples out for validation
training_model.fit([encoder_input_model, decoder_input_model], decoder_target_model,
                   batch_size = batch_size, epochs = epochs, validation_split = 0.2)
# Persist the trained model so the test section can reload it
training_model.save('training_model.h5')

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


## Test the model

In [None]:
# Reload the model that the training section saved to disk
training_model = load_model('training_model.h5')

# Build a stand-alone encoder: first model input in, LSTM states out
encoder_inputs = training_model.input[0]
# NOTE(review): layers[2] is presumably the encoder LSTM (the layer right
# after the two input layers) — confirm with training_model.summary()
encoder_outputs, state_h_enc, state_c_enc = training_model.layers[2].output
encoder_states = [state_h_enc, state_c_enc]
encoder_model = Model(encoder_inputs, encoder_states)

In [None]:
# Size of the state vectors fed to the decoder (same value as the
# dimensionality used when the LSTM layers were created)
latent_dim = 256

# Two inputs through which the previous hidden and cell state are supplied
decoder_states_inputs = [Input(shape=(latent_dim,)) for _ in range(2)]
decoder_state_input_hidden, decoder_state_input_cell = decoder_states_inputs

In [None]:
# Wire the decoder for inference: same LSTM and dense layer objects, but the
# initial state now comes from the two state inputs instead of the encoder.
# NOTE(review): decoder_inputs, decoder_lstm and decoder_dense are the objects
# created in the training section, not layers taken from the model returned by
# load_model — confirm this is intended (it requires the training cells to have
# run in the same session).
decoder_outputs, state_hidden, state_cell = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_hidden, state_cell]
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
# Inference decoder: inputs are the token input plus the two previous states,
# outputs are the token probabilities plus the two updated states
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

In [None]:
# Disabled debugging snippet, kept as a string literal so it does not run:
# it would print the first 30 token/index pairs of the reply vocabulary.
'''token_array = get_token_array(reply_lines_2)
target_features_dict = dict([(token, i) for i, token in enumerate(token_array)])
i=0
for key,value in target_features_dict.items():
  if i == 30:
    break 
  i += 1
  print (key, " ", value)'''

In [None]:
def decode_response(test_input):
  ''' Generate a reply for one encoded user input by greedy decoding.

      Input:
          test_input : one-hot matrix handed to encoder_model.predict
      Output:
          decoded_sentence : the generated tokens, each preceded by a single
                             space; the '<END>' token is included when the
                             decoder produced it
  '''
  # Index -> token lookup for the reply vocabulary
  reverse_target_features_dict = dict(enumerate(reply_tokens))
  start_index = reply_tokens.index('<START>')
  # Longest training reply, counted in tokens
  max_decoder_seq_length = max(len(sens.split()) for sens in reply_lines_2)

  #Getting the output states to pass into the decoder
  states_value = list(encoder_model.predict(test_input))
  #Generating a target sequence of length 1 that holds only the start token
  target_seq = np.zeros((1, 1, num_decoder_tokens))
  target_seq[0, 0, start_index] = 1.

  #Collect the response token by token
  decoded_tokens = []

  while True:
    #Predicting output tokens with probabilities and states
    output_tokens, hidden_state, cell_state = decoder_model.predict([target_seq] + states_value)

    #Choosing the one with highest probability
    sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
    sampled_token = reverse_target_features_dict[sampled_token_index]
    decoded_tokens.append(sampled_token)

    #Stop if hit max length or found the stop token. The length is counted in
    # tokens: comparing the character length of the text with a token count
    # ended replies after only a few characters.
    if sampled_token == '<END>' or len(decoded_tokens) >= max_decoder_seq_length:
      break

    #Feed the sampled token and the updated states into the next step
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, sampled_token_index] = 1.
    states_value = [hidden_state, cell_state]

  # Same layout as before: every token is preceded by one space
  return "".join(" " + token for token in decoded_tokens)

# Generate Chatbot

In [None]:
#token_array = get_token_array(user_input)
#input_features_dict =  dict([(token, i) for i, token in enumerate(token_array)])
#max_encoder_seq_length = max([len(sens.split()) for sens in user_input])

In [None]:
class ChatBot:
  '''Console chat loop that answers with the seq2seq model via decode_response.'''

  # Answers to the opening question that end the session immediately
  negative_responses = ("no", "nope", "nah", "naw", "not a chance", "sorry")
  # Words that end the conversation when they appear in a user message
  exit_commands = ("quit", "pause", "exit", "goodbye", "bye", "later", "stop")

  #Method to start the conversation
  def start_chat(self):
    '''Ask whether the user wants to chat and start the loop unless declined.'''
    user_response = input("Hi, I'm a chatbot trained on random dialogs. Would you like to chat with me?\n")

    if user_response in self.negative_responses:
      print("Ok, have a great day!")
      return
    self.chat(user_response)

  #Method to handle the conversation
  def chat(self, reply):
    '''Keep answering until a user message contains an exit command.'''
    while not self.make_exit(reply):
      reply = input(self.generate_response(reply)+"\n")

  #Method to convert user input into a matrix
  def string_to_matrix(self, user_input):
    ''' Encode one user message as a one-hot matrix for the encoder.

        Input:
            user_input : raw text typed by the user
        Output:
            user_input_matrix : float32 array of shape
                (1, longest training input in tokens, num_encoder_tokens);
                tokens outside the input vocabulary leave their row all zero
    '''
    # The encoder is fed one-hot vectors over the INPUT vocabulary, so the
    # index must come from input_tokens; reply_tokens yields wrong or
    # out-of-range indices.
    input_features_dict = {token: i for i, token in enumerate(input_tokens)}
    max_encoder_seq_length = max(len(sens.split()) for sens in input_lines_2)
    # Clean the text the same way the training inputs were cleaned:
    # non-letters become spaces, then split on whitespace.
    tokens = re.sub("[^a-zA-Z]", " ", user_input).split()
    user_input_matrix = np.zeros((1, max_encoder_seq_length, num_encoder_tokens), dtype='float32')

    # Tokens that do not fit into the matrix are dropped instead of raising
    # an IndexError on long messages.
    for timestep, token in enumerate(tokens[:max_encoder_seq_length]):
      if token in input_features_dict:
        user_input_matrix[0, timestep, input_features_dict[token]] = 1.
    return user_input_matrix

  #Method that will create a response using seq2seq model we built
  def generate_response(self, user_input):
    '''Return the model's reply to user_input without the marker tokens.'''
    input_matrix = self.string_to_matrix(user_input)
    chatbot_response = decode_response(input_matrix)
    #Remove <START> and <END> tokens from chatbot_response
    chatbot_response = chatbot_response.replace("<START>",'')
    chatbot_response = chatbot_response.replace("<END>",'')
    return chatbot_response

  #Method to check for exit commands
  def make_exit(self, reply):
    ''' Return True (after printing a goodbye) if reply contains an exit command.

        Commands are matched against whole words, case-insensitively, so
        "see you later" ends the chat while "stopping" does not.
    '''
    words = set(re.findall(r"[a-z']+", reply.lower()))
    if words.intersection(self.exit_commands):
      print("Ok, have a great day!")
      return True
    return False

chatbot = ChatBot()

In [None]:
chatbot.start_chat() # for 10 epochs

Hi, I'm a chatbot trained on random dialogs. Would you like to chat with me?
hi there
 what happened 
nothing
 what happened 
nah
 what happened 
tired
 what happened 
good
 stress 
really?
 yes i 
ok
 what happened 
bye
Ok, have a great day!


In [None]:
chatbot.start_chat() # for 20 epochs

Hi, I'm a chatbot trained on random dialogs. Would you like to chat with me?
hi there
 what happened 
nothing
 what happened 
tired
 i agree 
what
 what happened 
nothing
 what happened 
nope
 i agree 
ok
 i agree 
vacation
 i agree 
i am tired
 i was 
really
 teaching the is 
ok
 i agree 
bye
Ok, have a great day!


In [None]:
chatbot.start_chat() # for 30 epochs

Hi, I'm a chatbot trained on random dialogs. Would you like to chat with me?
hi there
 i m t 
good
 me causes 
ok
 thank you 
no problem
 thank you 
byt
 thank you 
bye
Ok, have a great day!


In [None]:
chatbot.start_chat() # for 60 epochs

Hi, I'm a chatbot trained on random dialogs. Would you like to chat with me?
hi there
 thank you 
=)
 well neither 
cool
 i like a 
me too
 i agree 
awesome
 oh yes 
talk to you later
Ok, have a great day!


In [None]:
chatbot.start_chat() # this is the conversation shown on the poster

Hi, I'm a chatbot trained on random dialogs. Would you like to chat with me?
hi there
 thank you 
=)
 well neither 
cool
 i like a 
awesome
 oh yes 
see you later
Ok, have a great day!
