# **PROJECT SUNRISE**
## **Theme : Conversational AI Project #1**
**Goal: Build a task oriented conversational model using seq2seq without attention approach**

ML Framework : Tensorflow 2.0

Dataset : https://research.fb.com/downloads/babi/

In [32]:
# Importing the libraries
import tensorflow as tf
import numpy as np
import re

In [2]:
# Setting the hyperparameters
batch_size = 64  
epochs = 100  
rnn_size = 512  
VOCAB_SIZE = 151

In [3]:
# Path to the data txt file on disk.
data_path = 'G:\Mini Project I\Datasets\Dataset.txt'

## **Loading and preprocessing the dataset**

In [4]:
# Separate the user and bot utterances from the dataset
def load_dataset(path):
  user_utterances = []
  bot_utterances = []
  input_characters = set()
  target_characters = set()
  with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')
  for line in lines:
    if '\t' not in line:
      continue
    _user_utterance, _bot_utterance = line.rstrip().split('\t')
    _user_utterance = _user_utterance.split(" ",1)
    input_text = _user_utterance[1]
    user_utterances.append(input_text)
    bot_utterances.append(_bot_utterance)
  return user_utterances, bot_utterances

In [5]:
# Put <BOS> tag and <EOS> tag for bot utterances 
def tagger(bot_utterances):
  bos = "<BOS> "
  eos = " <EOS>"
  tagged_target = [bos + text + eos for text in bot_utterances] 
  return  tagged_target

In [6]:
# Doing a first cleaning of the texts
def clean_text(text):
  text = text.lower()
  text = re.sub(r"[-()\"#/@;:<>{}+=~|.?,_]", "", text)
  return text

In [7]:
user_utterances, bot_utterances = load_dataset(data_path)
bot_utterances = tagger(bot_utterances)
# Cleaning the user utterances
clean_input = []
for text in user_utterances:
    clean_input.append(clean_text(text))
# Cleaning the bot utterances
clean_target = []
for text in bot_utterances:
    clean_target.append(clean_text(text))

NameError: name 're' is not defined

## **Making Vocabulary**

In [39]:
def vocab_creater(text_lists, VOCAB_SIZE):

  tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE)
  tokenizer.fit_on_texts(text_lists)
  dictionary = tokenizer.word_index
  
  word2idx = {}
  idx2word = {}
  for k, v in dictionary.items():
      if v < VOCAB_SIZE:
          word2idx[k] = v
          idx2word[v] = k
      if v >= VOCAB_SIZE-1:
          continue
          
  return word2idx, idx2word

In [40]:
input_token_index, index_input_token = vocab_creater(clean_input, VOCAB_SIZE)
input_token_index[' '] = len(input_token_index)+1
index_input_token[len(index_input_token)+1] = ' '
target_token_index, index_target_token = vocab_creater(clean_target, VOCAB_SIZE)
target_token_index[' '] = len(target_token_index)+1
index_target_token[len(index_target_token)] = ' '
print(input_token_index)
print(index_target_token)

{'silence': 1, 'in': 2, 'a': 3, 'for': 4, 'with': 5, 'price': 6, 'range': 7, 'i': 8, 'you': 9, 'table': 10, 'food': 11, 'be': 12, 'cuisine': 13, 'people': 14, 'instead': 15, 'could': 16, 'it': 17, 'actually': 18, 'would': 19, 'prefer': 20, 'book': 21, 'can': 22, 'please': 23, 'no': 24, 'cheap': 25, 'expensive': 26, 'moderate': 27, 'restaurant': 28, 'hello': 29, 'hi': 30, 'good': 31, 'morning': 32, 'four': 33, 'six': 34, 'eight': 35, 'two': 36, 'british': 37, 'london': 38, 'madrid': 39, 'italian': 40, 'paris': 41, 'french': 42, "i'd": 43, 'like': 44, 'to': 45, 'may': 46, 'have': 47, 'rome': 48, 'spanish': 49, 'bombay': 50, 'indian': 51, 'make': 52, 'reservation': 53, 'thanks': 54, 'thank': 55, 'rock': 56, 'am': 57, 'looking': 58, 'we': 59, 'will': 60, 'love': 61, ' ': 62}
{1: 'bos', 2: 'eos', 3: 'you', 4: 'for', 5: 'ok', 6: 'let', 7: 'me', 8: 'look', 9: 'into', 10: 'some', 11: 'options', 12: 'apicall', 13: 'it', 14: 'on', 15: 'sure', 16: 'is', 17: 'there', 18: 'anything', 19: 'else', 20

In [41]:
input_words = []
target_words = []
input_words = input_token_index.keys()
target_words = target_token_index.keys()
num_encoder_tokens = len(input_words)
num_decoder_tokens = len(target_words)
max_encoder_seq_length = max([len(txt.split()) for txt in clean_input])
max_decoder_seq_length = max([len(txt.split()) for txt in clean_target])
print(input_words)
print(target_words)
print(max_encoder_seq_length)
print(max_decoder_seq_length)

dict_keys(['silence', 'in', 'a', 'for', 'with', 'price', 'range', 'i', 'you', 'table', 'food', 'be', 'cuisine', 'people', 'instead', 'could', 'it', 'actually', 'would', 'prefer', 'book', 'can', 'please', 'no', 'cheap', 'expensive', 'moderate', 'restaurant', 'hello', 'hi', 'good', 'morning', 'four', 'six', 'eight', 'two', 'british', 'london', 'madrid', 'italian', 'paris', 'french', "i'd", 'like', 'to', 'may', 'have', 'rome', 'spanish', 'bombay', 'indian', 'make', 'reservation', 'thanks', 'thank', 'rock', 'am', 'looking', 'we', 'will', 'love', ' '])
dict_keys(['bos', 'eos', 'you', 'for', 'ok', 'let', 'me', 'look', 'into', 'some', 'options', 'apicall', 'it', 'on', 'sure', 'is', 'there', 'anything', 'else', 'to', 'update', 'hello', 'what', 'can', 'i', 'help', 'with', 'today', "i'm", 'be', 'cheap', 'expensive', 'moderate', "you're", 'welcome', 'four', 'six', 'eight', 'two', 'british', 'london', 'paris', 'madrid', 'italian', 'french', 'rome', 'indian', 'bombay', 'spanish', 'where', 'should',

In [42]:
print('Number of samples:', len(clean_input))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)

Number of samples: 40635
Number of unique input tokens: 62
Number of unique output tokens: 70


In [43]:
encoder_input_data = np.zeros((len(user_utterances), max_encoder_seq_length, num_encoder_tokens+1),dtype='float32')
decoder_input_data = np.zeros((len(user_utterances), max_decoder_seq_length, num_decoder_tokens+1),dtype='float32')
decoder_target_data = np.zeros((len(user_utterances), max_decoder_seq_length, num_decoder_tokens+1),dtype='float32')

In [44]:
#Teacher forcing method
for i, (input_text, target_text) in enumerate(zip(clean_input, clean_target)):
    for t, word in enumerate(input_text.split()):
      encoder_input_data[i, t, input_token_index[word]] = 1.
    encoder_input_data[i, t + 1:, input_token_index[' ']] = 1.
    for t, word in enumerate(target_text.split()):
      decoder_input_data[i, t, target_token_index[word]] = 1.
      if t > 0:
        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
    decoder_input_data[i, t + 1:, target_token_index[' ']] = 1.
    decoder_target_data[i, t:, target_token_index[' ']] = 1.

In [45]:
# Encoder RNN
encoder_inputs = tf.keras.layers.Input(shape=(None, num_encoder_tokens+1))
encoder = tf.keras.layers.LSTM(rnn_size, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

# Decoder RNN
decoder_inputs = tf.keras.layers.Input(shape=(None, num_decoder_tokens+1))
decoder_lstm = tf.keras.layers.LSTM(rnn_size, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,initial_state=encoder_states)
decoder_dense = tf.keras.layers.Dense(num_decoder_tokens+1, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Seq2seq Model
model = tf.keras.models.Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,batch_size=batch_size,epochs=epochs,validation_split=0.2)
model.save('s2s.h5')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [46]:
# Inference mode (sampling).

encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)
decoder_state_input_h = tf.keras.layers.Input(shape=(rnn_size,))
decoder_state_input_c = tf.keras.layers.Input(shape=(rnn_size,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = tf.keras.models.Model([decoder_inputs] + decoder_states_inputs,[decoder_outputs] + decoder_states)


def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1, 1, num_decoder_tokens+1))
    target_seq[0, 0, target_token_index['bos']] = 1.
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = index_target_token[sampled_token_index]
        if (sampled_char == 'eos'):
          decoded_sentence += '.'
          stop_condition = True
        else:
          decoded_sentence += sampled_char +' '
        target_seq = np.zeros((1, 1, num_decoder_tokens+1))
        target_seq[0, 0, sampled_token_index] = 1.
        states_value = [h, c]
    return decoded_sentence

In [47]:
for seq_index in range(20):
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', user_utterances[seq_index])
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: hello
Decoded sentence: hello what can i help you with today .
-
Input sentence: can you book a table for six people with french food
Decoded sentence: i'm on it .
-
Input sentence: <SILENCE>
Decoded sentence: apicall italian paris six cheap .
-
Input sentence: in bombay
Decoded sentence: how many people would be   your party .
-
Input sentence: i am looking for a cheap restaurant
Decoded sentence: ok let me look into some options for you .
-
Input sentence: <SILENCE>
Decoded sentence: apicall italian paris six cheap .
-
Input sentence: hi
Decoded sentence: hello what can i help you with today .
-
Input sentence: can you make a restaurant reservation with italian cuisine for six people in a cheap price range
Decoded sentence: i'm on it .
-
Input sentence: <SILENCE>
Decoded sentence: apicall italian paris six cheap .
-
Input sentence: rome please
Decoded sentence: how many people would be   your party .
-
Input sentence: <SILENCE>
Decoded sentence: apicall italian pari

In [48]:
# Setting up the chat
while(True):
  user_input = []
  user_input = input("You: ")
  if(user_input == "Goodbye"):
    break
  input_seqn = np.zeros((1, max_encoder_seq_length, num_encoder_tokens+1),dtype='float32')
  for t, word in enumerate(user_input.split()):
    input_seqn[0, t, input_token_index[word]] = 1.
  input_seqn[0, t + 1:, input_token_index[' ']] = 1.
  decoded_sentence = decode_sequence(input_seqn)
  print('ChatBot: ' + decoded_sentence)

You: hello
ChatBot: hello what can i help you with today .
You: book a table for two
ChatBot: i'm on it .
You: Goodbye
