# **Import the dataset** 

In [3]:
%tensorflow_version 2.x
import tensorflow as tf
device_name= tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at:{}'.format(device_name))

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.
Found GPU at:/device:GPU:0


In [2]:
from google.colab import drive
drive.mount('/content/drive')

from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random
import glob
import json


path= '/content/drive/MyDrive/nlp hw6/dialogues/*.txt'

!pwd
path_to_mount = '/content/drive/My Drive/hw6/'


Mounted at /content/drive
/content


# **Preproccesing**

In [4]:
list_of_files = glob.glob(path)
print(list_of_files[1]) # Visualize the first 1
print(len(list_of_files)) # 47

# Parsing
list_of_dicts = [] # Init

# Loop for each file
for filename in list_of_files:
  with open(filename) as f:
      for line in f: # Loop for each line (inside each file)
          list_of_dicts.append(json.loads(line)) # insert in a dictionary


# Visualize the dictionaries
print(list_of_dicts[0])
print(list_of_dicts[1].keys)
print(list_of_dicts[332])
print(list_of_dicts[:3])


#Create a new dict containing only useful data
new_list_of_dicts = [] 

for old_dict in list_of_dicts:
  foodict = {k: v for k, v in old_dict.items() if (k == 'turns')} 
  new_list_of_dicts.append(foodict)

print(len(new_list_of_dicts))

# Just to be sure we don't make bad use of the old variable,
# we will make the old dict equal to the new one.
# In the end, they are all the same.
list_of_dicts = []
list_of_dicts = new_list_of_dicts 

print(list_of_dicts[4390])

/content/drive/MyDrive/nlp hw6/dialogues/MAKE_RESTAURANT_RESERVATIONS.txt
47
{'id': '769feae6', 'user_id': '19b006ed', 'bot_id': '2ede825b', 'domain': 'EVENT_RESERVE', 'task_id': '4dfb7939', 'turns': ['Hello how may I help you?', 'Could you help me with reservations?', 'Sure, I can help with reservations. What would you like to reserve?', "Find me a theater in my area that's performing Shakespeare", 'The Johnson theater is located 1.5 miles from your location and is performing Hamlet on the 12th. Would you like to reserve seating?', "Sure, that sounds great! I'd like 3 tickets for that show on the 12th please", "I'm sorry, it looks like all the seats for the 12th have just filled. Would you be interested in the show on the 13th?", 'Yes, I would', 'Alright. I will reserve 3 tickets for Hamlet at Johnson theater for June 13th at 8 PM. Is that OK?', "Yep, that's great Thanks for the help!", 'Your reservations have been made.']}
<built-in method keys of dict object at 0x7faa793d53c0>
{'id'

In [5]:
# Init matrices
questions = []
answers = []

# We assume that the first answer by the bot (aka "Hello, how may I help you?") 
# is returned after a user greeting.
# This is used in order to ensure that the dataset will be even 
# and each question is paired with an answer.
# That's why we create a mini random catalog 
# of artificial 'ghost' user greetings.
matrix_greetings = ["Hey", "Hi"]

# A similar situation happens in the corner case 
# when the last sentence is from the user.
# As said, each sentence from the user should be paired
# with a sentence from the bot.
# That's why we will in this case add an artificial one.
matrix_byes = ["Ok", "Okie", "Bye"]

# For each dictionary in the list
for dictionary in list_of_dicts:
  matrix_QA = dictionary['turns']
  
  # Append a first random greeting, as explained above
  questions.append(random.choice(matrix_greetings))
    
  # In order to split the QAs to 2 matrices (questions & answers),
  # we will use a flag to indicate if the sentence 
  # is given from the bot or from the user
  bot_flag = True # Init

  # For each Q/A in the matrix
  for sentence in matrix_QA:

    if bot_flag == True:
      answers.append(sentence) # Used for bot's answers
      bot_flag = False # Switch
      continue
    else:
      questions.append(sentence) # Used for user's questions
      bot_flag = True # Switch
      continue

  # The last loop (ideally) ends with a bot's answer,
  # thus making bot_flag equal to False.
  # Although, with data visualization and exploring,
  # we can see that this does not happen all the time.

  # Corner case: If the last answers was from the user, 
  # then we need to add one artificial 'ghost' response 
  # from the bot to make the dataset even.
  if bot_flag == True: 
    answers.append(random.choice(matrix_byes))

#  **Vocabulary**

In [6]:
sorted_ques= []
sorted_ans=[]
for i in range(len(questions)):
  if len(questions[i]) < 20: #or smaller (13)
    sorted_ques.append(questions[i])
    sorted_ans.append(answers[i])


import re
def clean_text(txt):
  txt= txt.lower()
  txt= re.sub(r"i'm","i am", txt)
  txt= re.sub(r"he's","he is", txt)
  txt= re.sub(r"she's","she is", txt)
  txt= re.sub(r"he's","he is", txt)
  txt= re.sub(r"that's","that is", txt)
  txt= re.sub(r"what's","what is", txt)
  txt= re.sub(r"where's","where is", txt)
  txt= re.sub(r"\'ll"," will", txt)
  txt= re.sub(r"\'ve"," have", txt)
  txt= re.sub(r"\'re"," are", txt)
  txt= re.sub(r"\'d"," would", txt)
  txt= re.sub(r"won't","will not", txt)
  txt= re.sub(r"[^\w\s]","", txt)
  return txt


clean_ques= []
clean_ans= []

for line in sorted_ques:
  clean_ques.append(clean_text(line))

for line in sorted_ans:
  clean_ans.append(clean_text(line))



for i in range(len(clean_ans)):
  clean_ans[i]=' '.join(clean_ans[i].split()[:17])  #or smaller (11)



#clean_ans= clean_ans[:10000]
#clean_ques = clean_ques[:10000]




In [8]:
print(len(clean_ans))

95141


In [7]:
word2count= {}
for line in clean_ques:
  for word in line.split():
    if word not in word2count:
      word2count[word] =1
    else:
      word2count[word] +=1
for line in clean_ans:
  for word in line.split():
    if word not in word2count:
      word2count[word] =1
    else:
      word2count[word] +=1

thresh= 5
vocab={}
word_num = 0
for word, count in word2count.items():
  if count >= thresh:
    vocab[word] = word_num
    word_num +=1


for i in range(len(clean_ans)):
  clean_ans[i] = '<SOS> ' + clean_ans[i] +' <EOS>'

tokens= ['<PAD>','<EOS>','<OUT>','<SOS>']
x=len(vocab)
for token in tokens:
  vocab[token]=x
  x+=1

In [9]:
print(len(clean_ans))
print(len(clean_ques))
print(clean_ans[4])
print(clean_ques[4])

95141
95141
<SOS> hello how may i help you <EOS>
hi


In [10]:
print(clean_ques[100:900])

['i need some help', 'how much is it', 'hi', 'try saturday', 'hi', 'a circus tent', 'hey', 'tomorrow', 'the big theatre', 'hey', 'football on sunday', 'can you reserve me', '5pcspls', 'hey', 'how about the 7th', 'dope nope', 'hi', '444 pedro avenue', 'anytwon usa', 'hi', 'hey', 'how much is it', 'no thank you', 'hey', 'ok then', 'hey', 'ouch that is a lot', 'hey', 'hey', 'bot', 'the tn theater', '8', 'yes book it', 'johnny b cool', 'hi', 'hi hi', 'yeah sure', 'how can i help you', 'hey', 'the giants game', 'yea preferably', 'credit card', 'that is all thanks', 'hey', 'that is all thanks', 'hey', 'hey', 'how much is it ', 'hey', 'how much is that', 'thanks', 'hi', 'how much', 'thank you', 'hi', 'hey', 'what are the name ', 'how bout bachmans', 'ok bye', 'hey', 'making reservations', 'hladik theater', 'yes', 'harry potter', 'hey', 'great thanks', 'hi', 'yes please', 'oh im surprised', 'hi', 'at the theatre', '9', 'tomorrow', 'the ex pres threare', 'hey', 'how much is it', 'hi', 'how much

In [11]:
print(vocab)

{'hey': 0, 'yes': 1, 'i': 2, 'would': 3, 'thanks': 4, 'hi': 5, 'tonight': 6, 'cc': 7, 'on': 8, 'file': 9, 'need': 10, 'help': 11, 'five': 12, 'the': 13, 'theater': 14, 'july': 15, '23rd': 16, 'how': 17, 'about': 18, 'saturday': 19, 'what': 20, '5': 21, 'total': 22, 'sounds': 23, 'good': 24, 'that': 25, 'is': 26, 'it': 27, 'bears': 28, 'no': 29, '3': 30, 'please': 31, 'great': 32, 'thank': 33, 'you': 34, 'today': 35, 'florida': 36, 'okay': 37, 'any': 38, 'will': 39, 'do': 40, 'in': 41, 'middle': 42, 'row': 43, 'did': 44, 'bout': 45, 'far': 46, 'away': 47, 'ok': 48, '4': 49, 'theatre': 50, 'this': 51, 'friday': 52, 'night': 53, 'giants': 54, 'whats': 55, 'price': 56, 'credit': 57, 'card': 58, 'opera': 59, '7pm': 60, '2': 61, 'available': 62, 'go': 63, 'ahead': 64, 'sorry': 65, 'all': 66, 'tickets': 67, 'correct': 68, 'want': 69, '18': 70, 'forget': 71, 'then': 72, 'some': 73, 'much': 74, 'try': 75, 'a': 76, 'circus': 77, 'tent': 78, 'tomorrow': 79, 'big': 80, 'football': 81, 'sunday': 82

In [12]:
vocab['hey']= vocab['<PAD>']
vocab['<PAD>']=0

In [13]:
#inverse answers dict
inv_vocab= {w:v for v ,w in vocab.items()}


In [14]:
print(len(vocab))
print(len(inv_vocab))

3653
3653


# **Creating inputs**

In [15]:
encoder_input= []
for line in clean_ques:
  lst=[]
  for word in line.split():
    if word not in vocab:
      lst.append(vocab['<OUT>'])
    else:
      lst.append(vocab[word])
  
  encoder_input.append(lst)

decoder_input= []
for line in clean_ans:
  lst=[]
  for word in line.split():
    if word not in vocab:
      lst.append(vocab['<OUT>'])
    else:
      lst.append(vocab[word])
  
  decoder_input.append(lst)
#del(clean_ans,clean_ques,lst,word)

In [16]:
#print(encoder_input)
print(decoder_input[:100])
print(vocab)
print(inv_vocab)

[[3652, 209, 17, 127, 2, 11, 34, 3650], [3652, 306, 2, 39, 84, 30, 67, 130, 150, 112, 869, 14, 130, 124, 855, 112, 93, 132, 3650], [3652, 209, 17, 127, 2, 11, 34, 3650], [3652, 2401, 3650], [3652, 209, 17, 127, 2, 11, 34, 3650], [3652, 65, 299, 103, 29, 140, 62, 6, 3650], [3652, 48, 34, 119, 12, 140, 2329, 130, 265, 253, 8, 15, 945, 3650], [3652, 209, 17, 127, 2, 11, 34, 3650], [3652, 209, 2, 202, 13, 3651, 92, 17, 83, 2, 11, 34, 3650], [3652, 2, 119, 2329, 21, 140, 41, 270, 2847, 43, 93, 3650], [3652, 209, 17, 127, 2, 11, 34, 3650], [3652, 20, 1496, 3, 34, 153, 13, 67, 130, 3650], [3652, 2, 202, 65, 524, 299, 2848, 38, 140, 62, 3650], [3652, 209, 17, 127, 2, 11, 34, 3650], [3652, 19, 26, 2371, 1116, 1129, 134, 25, 137, 26, 405, 2127, 51, 303, 378, 214, 135, 2849, 3650], [3652, 209, 17, 127, 2, 11, 34, 3650], [3652, 639, 706, 2449, 63, 425, 3650], [3652, 209, 17, 127, 2, 11, 34, 3650], [3652, 37, 25, 26, 76, 22, 126, 255, 130, 21, 706, 3650], [3652, 34, 83, 204, 27, 341, 58, 430, 34, 6

In [17]:
from keras.utils import pad_sequences
encoder_input= pad_sequences(encoder_input, 20, padding='post',truncating='post')
decoder_input= pad_sequences(decoder_input, 20, padding='post',truncating='post')

In [18]:
print(decoder_input)
print(encoder_input)

[[3652  209   17 ...    0    0    0]
 [3652  306    2 ...  132 3650    0]
 [3652  209   17 ...    0    0    0]
 ...
 [3652  209   17 ...    0    0    0]
 [3652   26  299 ...    0    0    0]
 [3652   34  103 ...    0    0    0]]
[[3649    0    0 ...    0    0    0]
 [   1    2    3 ...    0    0    0]
 [3649    0    0 ...    0    0    0]
 ...
 [3649    0    0 ...    0    0    0]
 [  32    0    0 ...    0    0    0]
 [  29    4    0 ...    0    0    0]]


In [19]:
decoder_final_output=[]
for i in decoder_input:
  decoder_final_output.append(i[1:])

decoder_final_output= pad_sequences(decoder_final_output,20,padding='post',truncating='post')

# **Building the Model**

In [20]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, LSTM , Input
from tensorflow.keras.utils import to_categorical


In [None]:
#decoder_final_output = to_categorical(decoder_final_output, len(vocab))  #----> Raises error in ram 

In [21]:
enc_inp= Input(shape=(20,))
dec_inp= Input(shape=(20,))



VOCAB_SIZE= len(vocab)
embed= Embedding(VOCAB_SIZE+1, output_dim=50, 
                 input_length=20,
                 trainable=True)


enc_embed= embed(enc_inp)
enc_lstm= LSTM(512, return_sequences=True, return_state=True)
enc_op , h ,c = enc_lstm(enc_embed)
enc_states= [h,c]

dec_embed= embed(dec_inp)
dec_lstm= LSTM(512, return_sequences=True, return_state=True)
dec_op, _, _ = dec_lstm(dec_embed, initial_state=enc_states)

dense= Dense(VOCAB_SIZE, activation='softmax')
dense_op=dense(dec_op)

model=Model([enc_inp, dec_inp], dense_op)

model.compile(loss='sparse_categorical_crossentropy',metrics=['acc'],optimizer='Adam')

model.fit([encoder_input, decoder_input],decoder_final_output,epochs=50, batch_size=20, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fa992d02910>

In [22]:
model.save("/content/drive/My Drive/hw6/model_final.h5")
model.save('model_final')



In [25]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
import numpy as np

enc_model = Model([enc_inp], enc_states)



# decoder Model
decoder_state_input_h = Input(shape=(512,))
decoder_state_input_c = Input(shape=(512,))

decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]


decoder_outputs, state_h, state_c = dec_lstm(dec_embed , 
                                    initial_state=decoder_states_inputs)


decoder_states = [state_h, state_c]


dec_model = Model([dec_inp]+ decoder_states_inputs,
                                      [decoder_outputs]+ decoder_states)




print("##########################################")
print("#       start chatting ver. 1.0          #")
print("##########################################")


prepro1 = ""
while prepro1 != 'q':
    prepro1  = input("you : ")
    ## prepro1 = "Hello"

    prepro1 = clean_text(prepro1)
    ## prepro1 = "hello"

    prepro = [prepro1]
    ## prepro1 = ["hello"]

    txt = []
    for x in prepro:
        # x = "hello"
        lst = []
        for y in x.split():
            ## y = "hello"
            try:
                lst.append(vocab[y])
                ## vocab['hello'] = 454
            except:
                lst.append(vocab['<OUT>'])
        txt.append(lst)

    ## txt = [[454]]
    txt = pad_sequences(txt, 20, padding='post')

    ## txt = [[454,0,0,0,.........13]]

    stat = enc_model.predict(txt)

    empty_target_seq = np.zeros( ( 1 , 1) )
     ##   empty_target_seq = [0]


    empty_target_seq[0, 0] = vocab['<SOS>']
    ##    empty_target_seq = [255]

    stop_condition = False
    decoded_translation = ''

    while not stop_condition :

        dec_outputs , h, c= dec_model.predict([ empty_target_seq] + stat )
        decoder_concat_input = dense(dec_outputs)
        ## decoder_concat_input = [0.1, 0.2, .4, .0, ...............]

        sampled_word_index = np.argmax( decoder_concat_input[0, -1, :] )
        ## sampled_word_index = [2]

        sampled_word = inv_vocab[sampled_word_index] + ' '

        ## inv_vocab[2] = 'hi'
        ## sampled_word = 'hi '

        if sampled_word != '<EOS> ':
            decoded_translation += sampled_word  

        if sampled_word == '<EOS> ' or len(decoded_translation.split()) > 20:
            stop_condition = True 

        empty_target_seq = np.zeros( ( 1 , 1 ) )  
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        ## <SOS> - > hi
        ## hi --> <EOS>
        stat = [h, c]  

    print("chatbot attention : ", decoded_translation )
    print("==============================================")  


##########################################
#       start chatting ver. 1.0          #
##########################################
you : can i ask you something?




chatbot attention :  yes 
you : i am intrested abou London
chatbot attention :  it is not a style 
you : how can i go to London?
chatbot attention :  you can just get it for 10 more 
you : can you book me a table for four?
chatbot attention :  i am sorry but i am unable to recognize that book 
you : i mean i want you to make me a reservation
chatbot attention :  i am sorry but i am legally obligated for reservations <OUT> for too complicated i am 
you : can you make a reservation?
chatbot attention :  unfortunately but i cant 
you : what can you do?
chatbot attention :  i can help with alarms 
you : set an alarm for five a.m
chatbot attention :  sound or vibrate 
you : sound
chatbot attention :  what sound rooster sound 
you : whatever you like
chatbot attention :  cheers 
you : bye
chatbot attention :  bye 


KeyboardInterrupt: ignored