# **General Imports**

In [None]:
import ast
import csv
import datetime as dt
import json
import string
from collections import Counter

import matplotlib.pyplot as plt
import nltk as nltk
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import tensorflow_datasets as tfds
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# **Script to Organize Data**

In [None]:
def get_entitie(state, entities_tracking):
  """Return the slot names of ``state`` that are not being tracked yet.

  Args:
    state: Frame state mapping whose ``'slot_values'`` entry maps slot names
      to their values. May be ``None`` or lack ``'slot_values'``.
    entities_tracking: Iterable of slot names already seen in the dialogue.

  Returns:
    A list with the keys of ``state['slot_values']`` (original order kept)
    that do not appear in ``entities_tracking``; an empty list when there are
    no slot values at all.
  """
  # The previous `total is not None` test could never fail, because
  # list(...) always yields a list. Guard the real source of missing data.
  slot_values = (state or {}).get('slot_values') or {}
  # Set membership keeps the filter linear in the number of slots.
  already_tracked = set(entities_tracking)
  return [slot for slot in slot_values if slot not in already_tracked]

In [None]:
def get_info(turn, entities_tracking):
  """Summarise the first hotel frame of ``turn``.

  Returns a dict with the not-yet-tracked slot names (``'entities'``) and the
  frame's active intent (``'intent'``). Returns ``None`` when the turn has no
  frame whose service is ``'hotel'``.
  """
  hotel_frame = next(
      (candidate for candidate in turn['frames'] if candidate['service'] == 'hotel'),
      None)
  if hotel_frame is None:
    return None
  new_entities = get_entitie(hotel_frame['state'], entities_tracking)
  return {
    'entities': new_entities,
    'intent': hotel_frame['state']['active_intent']
  }

In [None]:
def get_system_slots(turn):
  """Collect the ``'slot'`` name of every slot entry in the hotel frames of ``turn``."""
  return [
      slot_entry['slot']
      for frame in turn['frames']
      if frame['service'] == 'hotel'
      for slot_entry in frame['slots']
  ]

In [None]:
def data(dialog_data):
  """Extract one training record per user utterance of every hotel dialogue.

  Args:
    dialog_data: Table-like object (e.g. a DataFrame) exposing parallel
      ``'services'`` and ``'turns'`` columns, one entry per dialogue.

  Returns:
    A list of dicts with keys ``'document'`` (the user utterance),
    ``'entities'`` (slot names that first appear in that turn) and
    ``'intent'`` (the hotel frame's active intent).
  """
  extracted_info = []
  for services, turns in zip(dialog_data['services'], dialog_data['turns']):
    if 'hotel' not in services:
      continue
    # Slots already mentioned in this dialogue, by the user or the system.
    entities_tracking = []
    for turn in turns:
      if turn['speaker'] == 'USER':
        info = get_info(turn, entities_tracking)
        if info is None:
          # get_info returns None when the turn carries no hotel frame.
          # Skip it instead of crashing on info['entities'].
          continue
        extracted_info.append({
            'document': turn['utterance'],
            'entities': info['entities'],
            'intent': info['intent']
        })
        new_slots = info['entities']
      else:
        new_slots = get_system_slots(turn)
      # Append unseen slots in encounter order (deterministic, unlike the
      # former set arithmetic, whose ordering depended on hashing).
      for slot in new_slots:
        if slot not in entities_tracking:
          entities_tracking.append(slot)
  return extracted_info

In [None]:
def get_entities(turn):
  """Return the ``slot_values`` mapping of the first hotel frame in ``turn``.

  Yields ``None`` when no frame of the turn belongs to the ``'hotel'`` service.
  """
  hotel_frame = next(
      (candidate for candidate in turn['frames'] if candidate['service'] == 'hotel'),
      None)
  if hotel_frame is None:
    return None
  return hotel_frame['state']['slot_values']

In [None]:
def entities_dict(dialogs_data):
  """Map every slot value seen at the end of a hotel dialogue to its slot name.

  For each dialogue that involves the ``'hotel'`` service, the slot values of
  the last USER turn are read and each value is registered under the slot it
  belongs to. The first slot seen for a value wins.

  Args:
    dialogs_data: Iterable of table-like objects exposing parallel
      ``'services'`` and ``'turns'`` columns.

  Returns:
    Dict mapping a slot value (word/phrase) to a slot name.
  """
  entities = {}
  for dialog_data in dialogs_data:
    for services, turns in zip(dialog_data['services'], dialog_data['turns']):
      if 'hotel' not in services:
        continue
      # Walk backwards to the most recent USER turn. The previous fixed
      # [-1]/[-2] indexing raised IndexError on empty or single-turn
      # dialogues and assumed the penultimate turn was always the user's.
      last_user_turn = next(
          (turn for turn in reversed(turns) if turn['speaker'] == 'USER'),
          None)
      if last_user_turn is None:
        continue
      # get_entities returns None when the turn has no hotel frame.
      turn_entities = get_entities(last_user_turn) or {}
      for entitie, words in turn_entities.items():
        for word in words:
          # Keep the first slot associated with a given value.
          entities.setdefault(word, entitie)
  return entities


In [None]:
# Download the 16 MultiWOZ 2.2 training shards (dialogues_001 ... dialogues_016).
dialogs = []
for i in range(1, 17):
  dialogs.append(pd.read_json(
      f"https://raw.githubusercontent.com/budzianowski/multiwoz/master/data/MultiWOZ_2.2/train/dialogues_{i:03d}.json",
      encoding="ISO-8859-1"))
# One list of extracted hotel records per training shard.
final_data = [data(shard) for shard in dialogs]

# Same procedure for the first two test shards.
test_dialogs = [
    pd.read_json("https://raw.githubusercontent.com/budzianowski/multiwoz/master/data/MultiWOZ_2.2/test/dialogues_001.json", encoding="ISO-8859-1"),
    pd.read_json("https://raw.githubusercontent.com/budzianowski/multiwoz/master/data/MultiWOZ_2.2/test/dialogues_002.json", encoding="ISO-8859-1"),
]
test_data = [data(shard) for shard in test_dialogs]

# Slot value -> slot name lookup built from the training dialogues.
entidades = entities_dict(dialogs)

# Disabled export code, kept for reference: it writes `entidades` to
# entities.csv and the extracted records to final_data.csv.
#with open('entities.csv', 'w', newline='') as csvfile:
#    fieldnames = ['word', 'entitie']
#    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
#    writer.writeheader()
#    for word in entidades:
#      writer.writerow({'word': word, 'entitie': entidades[word]})
#with open('final_data.csv', 'w') as f:
#    w = csv.DictWriter(f, final_data[0][0].keys())
#    w.writeheader()
#    for dialogs in final_data:
#      for dialog in dialogs:
#        w.writerow(dialog)

# **Importing and Processing Data**

In [None]:
# Load the previously exported utterance dataset and the word -> entity table
# from the project's GitHub repository.
dataset = pd.read_csv("https://raw.githubusercontent.com/Acesarsilva/Chatbot_for_Hotel_Service/main/final_data.csv",encoding = "ISO-8859-1")
entity_map = pd.read_csv("https://raw.githubusercontent.com/Acesarsilva/Chatbot_for_Hotel_Service/main/entities.csv",encoding = "ISO-8859-1")

In [None]:
# Preview the first rows (columns: document, entities, intent).
dataset.head()

Unnamed: 0,document,entities,intent
0,i need a place to dine in the center thats exp...,[],find_hotel
1,"Any sort of food would be fine, as long as it ...",[],find_hotel
2,"Sounds good, could I get that phone number? Al...","['hotel-pricerange', 'hotel-type']",find_hotel
3,Yes. Can you book it for me?,[],find_hotel
4,i want to book it for 2 people and 2 nights st...,"['hotel-bookday', 'hotel-bookpeople', 'hotel-b...",book_hotel


In [None]:
# Work on a copy so the raw `dataset` frame stays untouched by the preprocessing below.
encoded_dataset = dataset.copy()

In [None]:
# Fix the type of the 'entities' column: the CSV stores each entity list as
# the text of a Python list literal (e.g. "['hotel-pricerange', 'hotel-type']").
# ast.literal_eval turns it back into a real list. Unlike the former
# strip-and-split approach, "[]" becomes an empty list rather than [''], so no
# spurious empty-string entity label is created.
# The isinstance guard keeps the cell re-runnable once the column holds lists.
entities = [
    ast.literal_eval(raw_entities) if isinstance(raw_entities, str) else list(raw_entities)
    for raw_entities in encoded_dataset['entities']
]
# Every distinct entity label present in the dataset.
unique_entities = {label for labels in entities for label in labels}
encoded_dataset['entities'] = entities

In [None]:
# Lowercase every document
encoded_dataset['document'] = encoded_dataset['document'].str.lower()
punc_to_remove = string.punctuation

# Remove punctuation
def remove_punctuation(text):
    """Return ``text`` with every character of ``punc_to_remove`` deleted."""
    deletion_table = str.maketrans('', '', punc_to_remove)
    return text.translate(deletion_table)

encoded_dataset['document'] = encoded_dataset['document'].apply(remove_punctuation)

# Remove stopwords
nltk.download('stopwords')
STOPWORDS=set(stopwords.words("english"))

def remove_stopword(text):
    """Drop the whitespace-separated words of ``text`` that are in ``STOPWORDS``."""
    kept_words = []
    for word in str(text).split():
        if word not in STOPWORDS:
            kept_words.append(word)
    return " ".join(kept_words)

encoded_dataset['document'] = encoded_dataset['document'].apply(remove_stopword)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Lemmatization
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
lemmatizer = WordNetLemmatizer()

# First letter of the POS tag -> WordNet part-of-speech constant.
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatized_words(text):
    """POS-tag the words of ``text`` and join their lemmas with single spaces.

    Tags whose first letter is not in ``wordnet_map`` are lemmatized as nouns.
    """
    lemmas = []
    for word, pos in nltk.pos_tag(text.split()):
        wordnet_pos = wordnet_map.get(pos[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(word, wordnet_pos))
    return " ".join(lemmas)

encoded_dataset['document'] = encoded_dataset['document'].apply(lemmatized_words)
encoded_dataset.head()

In [None]:
def encodeEntitities (entities_dict, entities_list):
  """Translate entity labels into their encoded ids.

  Args:
    entities_dict: Mapping from entity label to encoded id.
    entities_list: List of entity labels for one document.

  Returns:
    A new list with ``entities_dict.get(label)`` for each label, so labels
    missing from the mapping become ``None``. The input list is left
    untouched: mutating it in place made re-running the calling cell feed
    already-encoded ids back through the mapping.
  """
  return [entities_dict.get(label) for label in entities_list]

In [None]:
# Encode intents as integer class ids
encoded_dataset['intent'] = LabelEncoder().fit_transform(encoded_dataset['intent'])

# Encode entities: assign an integer id to every distinct entity label
unique_entities = list(unique_entities)
encoded_entities = LabelEncoder().fit_transform(unique_entities)
# NOTE: this rebinds `entities_dict`, which earlier in the notebook names the
# function that builds the word -> entity table. After this cell the name
# refers to the label -> id mapping.
entities_dict = dict(zip(unique_entities, encoded_entities))
encoded_dataset['entities'] = encoded_dataset['entities'].apply(
    lambda labels: encodeEntitities(entities_dict, labels))

In [None]:
# Build our vocabulary
def addWords (vocabulary, word_list):
  """Insert every item of ``word_list`` into the ``vocabulary`` set, in place."""
  vocabulary.update(word_list)

vocabulary = set()
encoded_dataset['document'].apply(lambda document: addWords(vocabulary, document.split(" ")))
vocabulary = list(vocabulary)

# Open question: should numbers be removed from the vocabulary?

# Assign an integer token id to every vocabulary word.
tokens = LabelEncoder().fit_transform(vocabulary)
tokens_dict = dict(zip(vocabulary, tokens))
print(tokens_dict)

{'': 0, 'betcha': 506, 'hate': 1225, 'renting': 2081, 'repeat': 2082, 'regency': 2069, 'splendid': 2346, 'inspire': 1368, 'obvious': 1761, 'without': 2788, 'bonjour': 523, 'restauarnt': 2110, 'meh': 1596, 'cineworld': 673, 'break': 541, 'proper': 1980, 'goona': 1160, '1280': 73, 'arrving': 400, 'yougood': 2849, '2258': 144, 'hard': 1223, 'next': 1716, 'saigon': 2163, 'currently': 809, 'ell': 937, 'requirement': 2093, 'buildings': 558, 'fact': 1006, 'makes': 1561, 'goo': 1154, 'reccomend': 2042, 'jamaican': 1398, 'threestar': 2524, 'wasnt': 2730, 'want': 2724, 'otherwise': 1808, 'simply': 2271, 'highest': 1257, 'etc': 964, 'nee': 1701, 'arts': 403, 'best': 504, 'perfer': 1868, 'duration': 912, 'refried': 2065, 'spending': 2343, 'correcting': 771, 'cuisine': 806, 'weds': 2740, 'zizzi': 2863, 'chan': 626, 'moderate': 1644, 'locate': 1510, 'review': 2126, 'refuse': 2066, 'variety': 2691, 'joke': 1407, 'customer': 812, 'actual': 256, 'tyhat': 2647, 'consult': 755, 'bookit': 530, 'incorrect'

In [None]:
# Split the dataset
# Features are the preprocessed documents; targets are the encoded entity lists.
extractor_X = encoded_dataset['document']
extractor_y = encoded_dataset['entities']
# First split: 60% train, 40% held out (fixed random_state).
extractor_X_train, extractor_X_test, extractor_y_train, extractor_y_test = train_test_split(extractor_X, extractor_y, test_size=0.4, random_state=1)
# Second split of the held-out part: 60% of it stays as test and 40% becomes
# validation, i.e. roughly 24% / 16% of the full dataset.
extractor_X_test, extractor_X_validation, extractor_y_test, extractor_y_validation = train_test_split(extractor_X_test, extractor_y_test, test_size=0.4, random_state=1)

In [None]:
# Hyper-parameters of the entity extractor.
VOCAB_SIZE = len(vocabulary)  # number of distinct words collected from the documents
VECTOR_SIZE = 5  # passed as output_sequence_length / input_length to the layers below
BATCH_SIZE = 64  # NOTE: the Embedding layers below receive this as their second positional argument (the embedding dimension)
INTERN_LAYER_SIZE = 60  # units of the hidden Dense layer
END_LAYER_SIZE = len(entities_dict)  # one output unit per encoded entity label (relies on `entities_dict` being the label -> id dict)

# **Creating Entity Extractor**

In [None]:
# Display the word -> entity lookup table loaded from entities.csv.
entity_map

Unnamed: 0,word,entitie
0,saturday,hotel-bookday
1,2,hotel-bookpeople
2,university arms hotel,hotel-name
3,expensive,hotel-pricerange
4,hotel,hotel-type
...,...,...
174,rosa's b ed and breakfast,hotel-name
175,a and be guest house,hotel-name
176,avolon,hotel-name
177,cambridge belgry,hotel-name


In [None]:
# Stand-alone text vectorization layer: lowercases, strips punctuation and
# emits integer token ids padded or truncated to VECTOR_SIZE tokens.
vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize="lower_and_strip_punctuation",
    output_mode='int',
    output_sequence_length=VECTOR_SIZE,
)

# Stand-alone embedding layer; note that BATCH_SIZE is used as the embedding
# dimension here.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=VOCAB_SIZE,
    output_dim=BATCH_SIZE,
    input_length=VECTOR_SIZE,
    mask_zero=False,
    embeddings_initializer="uniform",
    embeddings_regularizer=None,
    activity_regularizer=None,
    embeddings_constraint=None,
)

In [None]:
def baseline_model(VOCAB_SIZE, VECTOR_SIZE, BATCH_SIZE, INTERN_LAYER_SIZE, END_LAYER_SIZE):
    """Build and compile the baseline entity-extractor network.

    Architecture: TextVectorization -> Embedding -> BiLSTM(64, sequences)
    -> BiLSTM(32) -> Dense(relu) -> Dropout(0.2) -> Dense(softmax).

    Args:
        VOCAB_SIZE: Maximum number of tokens of the vectorizer and input
            dimension of the embedding.
        VECTOR_SIZE: Length every vectorized document is padded or truncated to.
        BATCH_SIZE: Used as the embedding output dimension (despite its name).
        INTERN_LAYER_SIZE: Units of the hidden Dense layer.
        END_LAYER_SIZE: Units of the softmax output layer.

    Returns:
        A compiled ``Sequential`` model. The TextVectorization layer (the
        first layer of the model) is returned without a vocabulary, so call
        its ``adapt`` method on the training texts before fitting.
    """
    # Text vectorization layer
    vectorize_layer = TextVectorization(
        standardize="lower_and_strip_punctuation",
        max_tokens=VOCAB_SIZE,
        output_mode='int',
        output_sequence_length=VECTOR_SIZE,
    )

    # Embedding layer
    embedding_layer = tf.keras.layers.Embedding(
        VOCAB_SIZE,
        BATCH_SIZE,
        embeddings_initializer="uniform",
        embeddings_regularizer=None,
        activity_regularizer=None,
        embeddings_constraint=None,
        mask_zero=False,
        input_length=VECTOR_SIZE,
    )

    # Assemble the model
    model = Sequential()
    model.add(vectorize_layer)
    model.add(embedding_layer)
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))
    # This second recurrent layer used to be instantiated but never added.
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))
    # TODO: add a convolutional layer (left open by the original author; the
    # former empty `model.add()` placeholder raised a TypeError).
    model.add(Dense(INTERN_LAYER_SIZE, activation='relu'))
    # Dropout is referenced through tf.keras.layers because it is not
    # imported by name in this notebook.
    model.add(tf.keras.layers.Dropout(0.2))
    model.add(Dense(END_LAYER_SIZE, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [None]:
# Build the extractor first: `model` was previously used here without being
# assigned by any earlier cell, so a fresh kernel failed with a NameError.
model = baseline_model(VOCAB_SIZE, VECTOR_SIZE, BATCH_SIZE, INTERN_LAYER_SIZE, END_LAYER_SIZE)
# The first layer of the model is the TextVectorization layer; it needs a
# vocabulary learned from the training texts before it can encode anything.
model.layers[0].adapt(extractor_X_train.to_numpy())
# Re-compile with the sparse loss (overrides the loss set inside baseline_model).
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# NOTE(review): extractor_y_train holds a list of entity ids per document;
# confirm this label layout matches the single-class sparse loss used here.
model.fit(extractor_X_train, extractor_y_train, epochs=150, batch_size=32, verbose=0)

# **Creating Intent Classifier**

In [None]:
def encoding(intent, unique_intents):
  """One-hot encode ``intent``.

  Returns a list of zeros with one entry per element of ``unique_intents``
  and a single 1 at index ``intent``.
  """
  one_hot = [0] * len(unique_intents)
  one_hot[intent] = 1
  return one_hot

In [None]:
# Intent-only view of the preprocessed data: same rows, without the entity labels.
dt_intent = encoded_dataset.copy().drop(columns=['entities'])
unique_intents = dt_intent['intent'].unique()
# Replace every integer intent id by its one-hot vector.
dt_intent['intent'] = dt_intent['intent'].apply(encoding, args=(unique_intents,))

In [None]:
# 60% / 40% train/test split of the intent frame (fixed random_state), then preview.
data_train, data_test = train_test_split(dt_intent, test_size=0.4, random_state=1)
data_train.head()

Unnamed: 0,document,intent
7336,dont need reservation time would like know pri...,"[0, 0, 1]"
22989,great would like book table,"[1, 0, 0]"
14280,traveling cambridge broxbourne,"[1, 0, 0]"
8908,book 1 people 5 nights starting sunday,"[0, 1, 0]"
24421,need place stay free wifi,"[0, 0, 1]"


In [None]:
# Shuffle buffer and batch size of the tf.data pipelines.
# NOTE: BATCH_SIZE is re-assigned here with the same value (64) as in the
# extractor hyper-parameter cell.
BUFFER_SIZE = 10000
BATCH_SIZE = 64

# Pair each document with its one-hot intent vector. pad_sequences with a
# length of 3 turns the list of length-3 vectors into a single array.
intent_train = tf.data.Dataset.from_tensor_slices((data_train['document'], pad_sequences(list(data_train['intent']), 3)))
intent_test = tf.data.Dataset.from_tensor_slices((data_test['document'], pad_sequences(list(data_test['intent']), 3)))

# Only the training set is shuffled; both sets are batched and prefetched.
intent_train = intent_train.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
intent_test = intent_test.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [None]:
# Sanity check: print the texts and labels of one training batch.
for example, label in intent_train.take(1):
  print('text: ', example.numpy())
  print('label: ', label.numpy())

text:  [b'take anything free parking'
 b'sure please tell address phone number postcode'
 b'yes need hotel north close airport thanks'
 b'help find place stay maybe expensive guesthouse'
 b'thanks youre help may also address phone number restaurant'
 b'thats need thanks' b'5pm available instead'
 b'departing stevenage going cambridge'
 b'hello chinese restaurants centre'
 b'yes need reservation thursday 1630 4 people' b'thanks'
 b'need find hotel cambridge decent prices'
 b'would like star rating 4 cost matter would also like free parking'
 b'thats need right thanks help'
 b'yes also need train departs stevenage goes cambridge'
 b'want train leave 1630' b'needed help thank' b'west price range'
 b'would like travel saturday go leicester' b'looking place dine'
 b'near restaurant' b'yes group people monday 1715 please'
 b'center one good could provide room type hotel guesthouse address postcode well'
 b'alphamilton guest house still operation' b'price range'
 b'great thanks much help' b'f

In [None]:
# Text vectorizer for the intent classifier, capped at VOCAB_SIZE tokens.
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE)
# NOTE(review): the vocabulary is learned from every document in dt_intent,
# i.e. from both the training and the test split; confirm this is intended.
encoder.adapt(list(dt_intent['document']))

In [None]:
# Inspect the first 20 entries of the learned vocabulary.
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'need', 'please', 'hotel', 'yes', 'like', 'thank',
       'free', 'would', 'looking', 'book', 'also', 'people', 'nights',
       'im', 'number', 'stay', 'help', 'place'], dtype='<U18')

In [None]:
# Intent classifier: raw text -> token ids -> embeddings -> two stacked
# bidirectional LSTMs -> dense head with a 3-way softmax (one unit per intent).
# NOTE: this rebinds the module-level name `model`.
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    # return_sequences=True so the second LSTM receives the whole sequence.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(3, activation = "softmax")
])

In [None]:
# Categorical cross-entropy pairs with the one-hot intent labels and the
# softmax output; Adam runs with a learning rate of 1e-4.
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [None]:
# Show, layer by layer, whether the mask produced by the Embedding is supported.
print([layer.supports_masking for layer in model.layers])

[False, True, True, True, True, True, True]


In [None]:
# Train for 35 epochs, validating on the test pipeline.
# validation_steps=30 presumably caps each validation pass at 30 batches
# rather than the whole test set; confirm against the Keras docs.
history = model.fit(intent_train, epochs=35,
                    validation_data=intent_test,
                    validation_steps=30)

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


In [None]:
# Final evaluation on the complete test pipeline (all batches).
test_loss, test_acc = model.evaluate(intent_test)
print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

Test Loss: 0.5826109647750854
Test Accuracy: 0.8209525346755981


# **Tuning Intent Classifier**

# **Luan Code**

In [None]:
# Same values as the constants defined for the intent classifier above;
# repeated so this section can be read on its own.
BUFFER_SIZE = 10000
BATCH_SIZE = 64

In [None]:
# Vocabulary cap for this experiment (overrides the earlier VOCAB_SIZE).
VOCAB_SIZE = 1000
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE)

# Flatten the per-shard record lists into one list per split. The loop
# variables deliberately avoid the name `dialogs`: the previous loops rebound
# it, clobbering the list of loaded MultiWOZ DataFrames and breaking any
# re-run of the data-organisation cells.
training_data = [record for shard_records in final_data for record in shard_records]
testing_data = [record for shard_records in test_data for record in shard_records]
# Utterances of both splits, training first, used to fit the vocabulary.
texts = [record['document'] for record in training_data + testing_data]

training = np.array(training_data)
testing = np.array(testing_data)
encoder.adapt(texts)
vocab = np.array(encoder.get_vocabulary())

In [None]:
# Experimental model: text -> embeddings -> one bidirectional LSTM -> dense
# head ending in a single unit without activation.
# NOTE(review): a single output unit implies a binary target, whereas the
# intent labels built earlier in the notebook are 3-way one-hot vectors;
# confirm what this model is meant to predict.
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [None]:
# from_logits=True matches the activation-free Dense(1) output of the model above.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [None]:
# NOTE(review): `training` and `testing` are NumPy arrays of the raw record
# dicts (document / entities / intent) and no separate labels are passed;
# confirm that fit() can consume them, or build (text, label) datasets first.
history = model.fit(training, epochs=10,
                    validation_data=testing,
                    validation_steps=30)

In [None]:
# NOTE(review): `test_dataset` is not assigned anywhere in the visible
# notebook; presumably `testing` (or a dataset built from it) was intended.
test_loss, test_acc = model.evaluate(test_dataset)

print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

## **Imports e Dataframe**

In [None]:
from numpy import argmax
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

## **Pré-Processamento**

### **Ajustes no Dataset**

### **Tokenização**

### **Criação de Embeddings**