# NLP Based

## Libraries / Downloads

In [None]:
!python -m spacy download pt_core_news_lg
!pip install unidecode

2023-01-30 20:55:02.742434: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pt-core-news-lg==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_lg-3.4.0/pt_core_news_lg-3.4.0-py3-none-any.whl (568.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m568.2/568.2 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_lg')
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import json
from nltk.tokenize import WhitespaceTokenizer
import spacy
import string
import random
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from ast import literal_eval
from unidecode import unidecode
import random
import nltk
import re
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from google.colab import drive

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load the Portuguese spaCy pipeline downloaded in the first cell.
# Model.clean_text reads this module-level `nlp` to lemmatize text.
nlp = spacy.load("pt_core_news_lg")

In [None]:
!pip freeze > requirements.txt

## Classes

In [None]:
class Model():
  """Intent matcher based on TF-IDF vectors and cosine similarity.

  The intents JSON must contain a top-level "intents" list whose items carry
  "name", "examples" and "responses" keys. It is mirrored into ``self.df``
  with one row per intent (columns: name, examples, responses).
  """

  def __init__(self, intents_file, entities_file):
    """Load the intents file and build the intents DataFrame.

    Args:
      intents_file: path of the intents JSON file.
      entities_file: stored unchanged in ``self.entities_data``; it is not
        opened here.
    """
    # Context manager so the handle is closed even if parsing fails.
    with open(intents_file, encoding='utf-8') as json_file:
      self.intents_data = json.load(json_file)
    # Remembered so train_intents can write back to the file it was loaded from.
    self.intents_file = intents_file
    self.entities_data = entities_file
    data = [
        [intent["name"], intent["examples"], intent["responses"]]
        for intent in self.intents_data["intents"]
    ]
    self.df = pd.DataFrame(data=data, columns=['name', 'examples', 'responses'])

  @staticmethod
  def _collapse_repeats(text):
    """Collapse runs of the same ASCII letter ("pizzaaa" -> "piza")."""
    return re.sub(r'([a-zA-Z])\1+', r'\1', text)

  def preprocess(self, column_name):
    """Clean ``self.train_df[column_name]`` and return its TF-IDF matrix."""
    self.clean_text(column_name)
    tfidf_vectorizer = TfidfVectorizer().fit(self.train_df[column_name])
    intents_vector = tfidf_vectorizer.transform(self.train_df[column_name])
    return intents_vector

  def clean_text(self, column_name):
    """Normalize and lemmatize ``self.train_df[column_name]`` in place.

    Each text has repeated letters collapsed, is tokenized, stripped of
    punctuation tokens, lower-cased, transliterated with unidecode and
    finally lemmatized with the module-level spaCy pipeline ``nlp``.
    """
    lemmatized_texts = []
    for txt in self.train_df[column_name]:
      txt = self._collapse_repeats(txt)
      tokens = nltk.word_tokenize(txt, language='portuguese')
      no_punct_text = ' '.join(
          unidecode(token.lower()) for token in tokens
          if token not in string.punctuation
      )
      doc = nlp(no_punct_text)
      lemmatized_texts.append(' '.join(token.lemma_ for token in doc))
    self.train_df[column_name] = lemmatized_texts

  def get_user_df(self, column_name, input_text):
    """Wrap ``input_text`` in a one-row DataFrame with column ``column_name``."""
    return pd.DataFrame.from_dict({column_name: [input_text]})

  def get_list_from_df(self, column_name):
    """Return the values of ``self.df[column_name]`` as a list."""
    return list(self.df[column_name])

  def parse_user_input(self, input):
    """Return the user text as a one-row DataFrame under 'examples'."""
    return self.get_user_df('examples', input)

  def train_intents(self, intents_file=None):
    """Interactively collect labelled examples until the user types 'sair'.

    For every input the predicted intent is shown. If the user answers 'n',
    a numbered menu of intents is printed and the input is stored under the
    chosen intent; any other answer stores it under the predicted intent
    unless the exact text is already a known example.

    Args:
      intents_file: JSON file to write the updated intents to. Defaults to
        the file this model was loaded from.
    """
    target_file = intents_file if intents_file is not None else self.intents_file
    while True:
      user_input = input('')
      if user_input == 'sair':
        break
      user_df = self.parse_user_input(user_input)
      # threshold=None: always propose the closest intent, never 'fallback'.
      prediction = self.predict(user_df, None)
      print(f'The BOT predicted the input as {prediction["intent"]}. Is this correct? (y/n)')
      check_option = input('').lower()
      if check_option == 'n':
        intents = list(self.df['name'].values)
        for option, intent in enumerate(intents):
          print(f"{option} - {intent}\n")
        print('What is the correct intent? Select an option (number only)')
        intent_option = input('')
        try:
          selected = int(intent_option)
        except ValueError:
          # A non-numeric answer used to abort the whole session.
          print('Invalid option. Nothing was saved.')
          continue
        if 0 <= selected < len(intents):
          self.append_to_json(self.intents_data, target_file, intents[selected], user_input)
      else:
        # Compare against the raw stored examples; self.train_df holds
        # lemmatized text after predict(), which raw input rarely equals.
        known_examples = {
            example
            for intent in self.intents_data['intents']
            for example in intent['examples']
        }
        if user_input not in known_examples:
          self.append_to_json(self.intents_data, target_file, prediction["intent"], user_input)

  def predict(self, user_df, threshold=0.5):
    """Predict the intent of the single example held in ``user_df``.

    Args:
      user_df: one-row DataFrame with an 'examples' column
        (see parse_user_input).
      threshold: minimum cosine similarity required to accept the closest
        intent; ``None`` disables the check.

    Returns:
      dict with "response" (a random response of the chosen intent) and
      "intent" (the intent name, or 'fallback').
    """
    candidates = self.df.iloc[:, :2]
    candidates = candidates.loc[candidates['name'] != 'fallback']
    # Intents with an empty example list explode into NaN rows; drop them.
    candidates = candidates.explode("examples").dropna(subset=['examples'])

    user_intent = 'fallback'
    if len(candidates) > 0:
      # The user's text is appended so it shares the TF-IDF vocabulary.
      self.train_df = pd.concat([candidates, user_df])
      intents_vector = self.preprocess('examples')
      # Score the user's row (last) against the examples only. Including the
      # row itself and taking the runner-up could pick the user's own row
      # whenever another example tied with it.
      similarities = cosine_similarity(intents_vector[-1], intents_vector[:-1]).flatten()
      self.train_df = self.train_df.iloc[:-1, :]
      most_similar_index = int(similarities.argmax())
      best_score = similarities[most_similar_index]
      print(best_score)
      if threshold is None or best_score > threshold:
        user_intent = self.get_intent(most_similar_index)
    else:
      self.train_df = candidates

    bot_response = self.get_bot_response(self.df, user_intent)
    return {"response": bot_response, "intent": user_intent}

  def get_intent(self, row_index):
    """Return the intent name at positional row ``row_index`` of ``self.train_df``."""
    intent_index = self.train_df.columns.get_loc('name')
    return self.train_df.iloc[row_index, intent_index]

  def get_response(self, intents_list):
    """Return a random response for the intent named ``intents_list[0]``.

    Returns ``None`` when no intent has that name.
    """
    name = intents_list[0]
    for intent in self.intents_data["intents"]:
      if intent["name"] == name:
        return random.choice(intent["responses"])
    return None

  def get_bot_response(self, df, user_intent):
    """Return a random response of ``user_intent`` from ``df``.

    Args:
      df: DataFrame with a 'name' column and the responses list in its
        third column.
      user_intent: intent name to look up.

    Raises:
      IndexError: if no row of ``df`` is named ``user_intent``.
    """
    matching_rows = df.loc[df['name'] == user_intent]
    return random.choice(matching_rows.iloc[0, 2])

  def append_to_json(self, intents_data, file_name, intent_key, user_input):
    """Append ``user_input`` to an intent's examples and rewrite the JSON file.

    Args:
      intents_data: dict with an "intents" list; mutated in place.
      file_name: path of the JSON file to overwrite.
      intent_key: name of the intent receiving the new example.
      user_input: example text to add.
    """
    for intent in intents_data['intents']:
      if intent['name'] == intent_key:
        intent['examples'].append(user_input)
    # ensure_ascii=False writes accented characters verbatim, so the file
    # encoding is pinned to UTF-8 to match how __init__ reads it.
    with open(file_name, 'w', encoding='utf-8') as json_file:
      json.dump(intents_data, json_file, indent=4, separators=(',',': '), ensure_ascii=False)

  def check_for_entity_match(self, user_intent, user_entity, user_input):
    """Tell whether any entry of ``user_entity`` occurs in ``user_input``.

    Args:
      user_intent: unused; kept for interface compatibility.
      user_entity: dict with an "entries" list whose items have "value" and
        "synonyms" keys.
      user_input: raw user text.

    Returns:
      True if an entry value or one of its synonyms equals a token of the
      input, otherwise False.
    """
    tokens = nltk.word_tokenize(self._collapse_repeats(user_input), language='portuguese')
    for entry in user_entity["entries"]:
      # Candidates get the same letter-collapsing as the input; otherwise
      # a value such as "pizza" could never equal the token "piza".
      if self._collapse_repeats(entry["value"]) in tokens:
        return True
      if any(self._collapse_repeats(synonym) in tokens for synonym in entry["synonyms"]):
        return True
    return False
    


## Training intents (interactive user feedback)

In [None]:
# Interactive labelling session for the TF-IDF model; type 'sair' to stop.
model = Model('intents.json','entities.json')

# train_intents declares an `intents_file` parameter, so the file to update
# is passed explicitly (calling it with no argument raises TypeError).
model.train_intents('intents.json')

KeyboardInterrupt: ignored

## Bot Conversation

In [None]:
# Chat with the TF-IDF model until the user types 'sair'.
model = Model('intents.json','entities.json')
continue_bot = True
while continue_bot:
  user_input = input('')
  # The sentinel word ends the session; anything else is classified.
  continue_bot = user_input != 'sair'
  if continue_bot:
    prediction = model.predict(model.parse_user_input(user_input))
    print(prediction)

olá
1.0
{'response': 'Olá. Posso ajudar?', 'intent': 'iniciar'}
quero pedir algo pra comer
0.5635046256060454
{'response': 'Já processarei seu pedido. Um momento.', 'intent': 'comprar_pizza'}
brigada
0.0
{'response': 'Não compreendi. Poderia repetir?', 'intent': 'fallback'}
brigadão
0.0
{'response': 'Não compreendi. Poderia repetir?', 'intent': 'fallback'}


KeyboardInterrupt: ignored

## Save Intents File

In [None]:
# Mount Google Drive at /content/drive (uses google.colab.drive, imported above).
drive.mount('/content/drive')

In [None]:
!cp -r "/content/intents.json" "/content/drive/MyDrive/"
!cp -r "/content/entities.json" "/content/drive/MyDrive/"

# Deep Learning

## Libraries / Downloads

In [None]:
!pip install transformers
!pip install torchinfo

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import re
from sklearn.preprocessing import LabelEncoder
from torchinfo import summary
from torch.optim import AdamW
from transformers import AutoModel, BertTokenizerFast
import torch
import torch.nn as nn
import transformers
from sklearn.utils.class_weight import compute_class_weight
from torch.optim import lr_scheduler
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU. Every
# tensor and the model are moved with `.to(device)`, so both cases work.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Classes

In [None]:
class BERTModel(nn.Module):
  """Classification head stacked on top of a BERT encoder.

  The first-token ([CLS] position) hidden state of the encoder goes through
  three Linear+ReLU+Dropout stages and a final Linear layer, and the result
  is returned as log-probabilities.
  """

  def __init__(self, bert, num_classes=3, hidden_size=768, dropout=0.4):
    """Build the head.

    Args:
      bert: encoder module; called as ``bert(sent_id, attention_mask=mask)``
        and expected to return the per-token hidden states as its first item.
      num_classes: number of output classes (default 3, the value that was
        previously hard-coded).
      hidden_size: width of the encoder hidden states (default 768).
      dropout: dropout probability used between the Linear layers.
    """
    super().__init__()

    self.bert = bert

    self.dropout = nn.Dropout(dropout)

    self.relu = nn.ReLU()

    self.fc1 = nn.Linear(hidden_size, 512)

    self.fc2 = nn.Linear(512, 256)

    self.fc3 = nn.Linear(256, 128)

    self.fc4 = nn.Linear(128, num_classes)

    # LogSoftmax output pairs with the nn.NLLLoss used for training.
    self.softmax = nn.LogSoftmax(dim=1)

  def forward(self, sent_id, mask):
    """Return log-probabilities, one row per input sequence.

    Args:
      sent_id: token id tensor passed to the encoder.
      mask: attention mask tensor passed to the encoder.
    """
    # [0] selects the hidden states; [:, 0] keeps the first token of each sequence.
    cls_hs = self.bert(sent_id, attention_mask=mask)[0][:,0]

    x = self.dropout(self.relu(self.fc1(cls_hs)))
    x = self.dropout(self.relu(self.fc2(x)))
    x = self.dropout(self.relu(self.fc3(x)))

    x = self.fc4(x)

    return self.softmax(x)

## Functions

In [None]:
def train():
  """Run one training epoch over ``train_dataloader``.

  Relies on the notebook globals ``model``, ``train_dataloader``, ``device``,
  ``cross_entropy``, ``optimizer`` and ``lr_sch``.

  Returns:
    (avg_loss, total_preds): the mean batch loss of the epoch and the model
    outputs of every batch concatenated along axis 0 as a numpy array.
  """
  model.train()

  total_loss = 0

  total_preds = []

  num_batches = len(train_dataloader)

  for step, batch in enumerate(train_dataloader):

    # Only the progress message is skipped for step 0. The training code
    # below must run for every batch; it used to be nested under this `if`,
    # so the first batch of each epoch was never trained on.
    if step != 0:
      print(' Batch {:>5,} of {:>5,}.'.format(step, num_batches))

    sent_id, mask, labels = [r.to(device) for r in batch]

    optimizer.zero_grad()

    preds = model(sent_id, mask)

    loss = cross_entropy(preds, labels)

    total_loss = total_loss + loss.item()

    loss.backward()

    # Clip the gradient norm to 1.0 before the optimizer update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    optimizer.step()

    total_preds.append(preds.detach().cpu().numpy())

  # StepLR counts epochs, so it is stepped once per call rather than once
  # per batch. For the recorded run (two batches, the first one skipped)
  # this keeps the same effective decay point.
  lr_sch.step()

  avg_loss = total_loss / num_batches

  total_preds = np.concatenate(total_preds, axis=0)

  return avg_loss, total_preds

def get_prediction(text, le):
  """Classify ``text`` with the global ``model`` and return its label.

  Uses the notebook globals ``model``, ``tokenizer``, ``max_seq_len`` and
  ``device``.

  Args:
    text: the sentence to classify.
    le: fitted encoder whose ``inverse_transform`` maps class ids to labels.

  Returns:
    The decoded label of the highest-scoring class; it is also printed.
  """
  model.eval()

  encoded = tokenizer(
      [text],
      max_length = max_seq_len,
      padding=True,
      truncation=True,
      return_token_type_ids=False
  )

  input_ids = torch.tensor(encoded['input_ids']).to(device)
  attention_mask = torch.tensor(encoded['attention_mask']).to(device)

  # Inference only: no gradients are tracked.
  with torch.no_grad():
    scores = model(input_ids, attention_mask)

  class_ids = np.argmax(scores.detach().cpu().numpy(), axis=1)
  label = le.inverse_transform(class_ids)[0]
  print("Intent identified: ", label)
  return label

def get_max_sent_length(df, column_name, visualize=False):
  """Return the largest whitespace-separated word count in ``df[column_name]``.

  Args:
    df: DataFrame holding the sentences.
    column_name: name of the text column.
    visualize: when True, also draw a 10-bin histogram of the word counts.
  """
  word_counts = []
  for sentence in df[column_name].tolist():
    word_counts.append(len(sentence.split()))
  if visualize:
    pd.Series(word_counts).hist(bins=10)
  return np.max(word_counts)

def get_classes(df, column_name):
  """Encode the labels in ``df[column_name]`` as integer class ids.

  The caller's DataFrame is left untouched, so the cell that calls this can
  be re-run safely. Previously the column was overwritten with the codes,
  and a second run fitted the encoder on integers instead of label names.

  Args:
    df: DataFrame holding the label column.
    column_name: name of the label column.

  Returns:
    (le, classes): the fitted LabelEncoder and a Series of integer codes
    sharing ``df``'s index and the column's name.
  """
  le = LabelEncoder()
  encoded = le.fit_transform(df[column_name])
  classes = pd.Series(encoded, index=df.index, name=column_name)
  return le, classes

def get_training_data(file_name):
  """Load the intents file and derive the per-example training frame.

  NOTE(review): ``get_json_file`` and ``get_df_from_json`` are not defined in
  the visible part of the notebook; confirm the cell that defines them runs
  before this one.

  Returns:
    (full_df, train_df): the frame built by ``get_df_from_json`` and its
    first two columns exploded on "examples".
  """
  full_df = get_df_from_json(get_json_file(file_name))
  train_df = full_df.iloc[:,:2].explode("examples")
  return full_df, train_df

def get_deep_response(df, message):
  """Predict the intent of ``message`` and pick one of its responses at random.

  Uses the notebook global ``label_encoder`` to decode the prediction.

  Args:
    df: DataFrame with a 'name' column and the responses list in its third
      column.
    message: user text to classify.

  Returns:
    (bot_response, user_intent)
  """
  user_intent = get_prediction(message, label_encoder)
  matching_rows = df.loc[df['name'] == user_intent]
  responses = matching_rows.iloc[0,2]
  bot_response = responses[random.randint(0,len(responses)-1)]
  return bot_response, user_intent

## Load tokenizer/bert

In [None]:
# Tokenizer matching the pretrained checkpoint loaded in the next cell.
tokenizer = BertTokenizerFast.from_pretrained('neuralmind/bert-base-portuguese-cased')

In [None]:
# Pretrained encoder; it is wrapped by BERTModel further down.
bert = AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased')

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Prepare data

In [None]:
# full_df: frame built from the JSON (presumably one row per intent — confirm
# against get_df_from_json); df: its first two columns exploded on "examples".
full_df, df = get_training_data('intents.json')

In [None]:
# Fit a LabelEncoder on the intent names; train_labels holds the integer class ids.
label_encoder, train_labels = get_classes(df, 'name')

In [None]:
# The tokenizer's `max_length` counts WordPiece tokens including the special
# [CLS]/[SEP] tokens, so the limit is measured in tokens rather than in
# whitespace-separated words (a word count truncated the longest examples).
# It is capped at the number of positions the encoder supports.
max_seq_len = min(
    max(len(token_ids) for token_ids in tokenizer(df['examples'].tolist())['input_ids']),
    bert.config.max_position_embeddings,
)

In [None]:
# Tokenize every training example, with padding enabled and truncation
# limited to max_seq_len tokens. Token type ids are not requested because
# BERTModel.forward only takes ids and an attention mask.
training_tokens = tokenizer(
    df['examples'].tolist(),
    max_length = max_seq_len,
    padding = True,
    truncation = True,
    return_token_type_ids = False
)

In [None]:
# Convert token ids, attention masks and integer labels to tensors.
training_seq = torch.tensor(training_tokens['input_ids'])
training_mask = torch.tensor(training_tokens['attention_mask'])
train_y = torch.tensor(train_labels.tolist())

In [None]:
# Number of examples per batch.
batch_size = 32

# Each dataset item is an (input_ids, attention_mask, label) triple.
train_data = TensorDataset(training_seq, training_mask, train_y)

# Draw the examples in random order.
train_sampler = RandomSampler(train_data)

train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

In [None]:
# Freeze every encoder weight: only the layers that BERTModel adds on top
# keep requires_grad=True.
for param in bert.parameters():
  param.requires_grad = False

model = BERTModel(bert)

model = model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-2)

# Filled with one average loss per epoch by the training loop.
train_losses = []

epochs = 100

# Multiply the learning rate by 0.1 after every 50 scheduler steps.
lr_sch = lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1)

summary(model)

Layer (type:depth-idx)                                  Param #
BERTModel                                               --
├─BertModel: 1-1                                        --
│    └─BertEmbeddings: 2-1                              --
│    │    └─Embedding: 3-1                              (22,881,792)
│    │    └─Embedding: 3-2                              (393,216)
│    │    └─Embedding: 3-3                              (1,536)
│    │    └─LayerNorm: 3-4                              (1,536)
│    │    └─Dropout: 3-5                                --
│    └─BertEncoder: 2-2                                 --
│    │    └─ModuleList: 3-6                             (85,054,464)
│    └─BertPooler: 2-3                                  --
│    │    └─Linear: 3-7                                 (590,592)
│    │    └─Tanh: 3-8                                   --
├─Dropout: 1-2                                          --
├─ReLU: 1-3                                             --
├─Linea

## Balance weights in case of unbalanced classes

In [None]:
# 'balanced' class weights computed from the training labels.
class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)

weights = torch.tensor(class_weights, dtype=torch.float)
weights = weights.to(device)

# NLLLoss expects log-probabilities, which BERTModel produces via LogSoftmax.
cross_entropy = nn.NLLLoss(weight=weights)

## Train the classifier head (BERT weights frozen)

In [None]:
# Reproducible results: these cuDNN flags are loop-invariant, so they are set
# once before training starts instead of after every epoch (where the first
# epoch had already run without them).
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

for epoch in range(epochs):

  print('\n Epoch {:} / {:}'.format(epoch+1, epochs))

  # train() returns (average loss, predictions); only the loss is tracked.
  train_loss, _ = train()

  train_losses.append(train_loss)

# Loss of the last epoch.
print(f'\nTraining Loss: {train_loss:.3f}')


 Epoch 1 / 100
 Batch     1 of     2.

 Epoch 2 / 100
 Batch     1 of     2.

 Epoch 3 / 100
 Batch     1 of     2.

 Epoch 4 / 100
 Batch     1 of     2.

 Epoch 5 / 100
 Batch     1 of     2.

 Epoch 6 / 100
 Batch     1 of     2.

 Epoch 7 / 100
 Batch     1 of     2.

 Epoch 8 / 100
 Batch     1 of     2.

 Epoch 9 / 100
 Batch     1 of     2.

 Epoch 10 / 100
 Batch     1 of     2.

 Epoch 11 / 100
 Batch     1 of     2.

 Epoch 12 / 100
 Batch     1 of     2.

 Epoch 13 / 100
 Batch     1 of     2.

 Epoch 14 / 100
 Batch     1 of     2.

 Epoch 15 / 100
 Batch     1 of     2.

 Epoch 16 / 100
 Batch     1 of     2.

 Epoch 17 / 100
 Batch     1 of     2.

 Epoch 18 / 100
 Batch     1 of     2.

 Epoch 19 / 100
 Batch     1 of     2.

 Epoch 20 / 100
 Batch     1 of     2.

 Epoch 21 / 100
 Batch     1 of     2.

 Epoch 22 / 100
 Batch     1 of     2.

 Epoch 23 / 100
 Batch     1 of     2.

 Epoch 24 / 100
 Batch     1 of     2.

 Epoch 25 / 100
 Batch     1 of     2.

 Epoch 2

## Prediction

In [None]:
# Chat with the BERT-based classifier until the user types 'sair'.
full_df, train_df = get_training_data('intents.json')
continue_bot = True
while continue_bot:
  user_input = input('')
  # The sentinel word ends the session; anything else is answered.
  continue_bot = user_input != 'sair'
  if continue_bot:
    prediction, _ = get_deep_response(full_df, user_input)
    print(prediction)

pizzaaaaaaa
Intent identified:  comprar_pizza
Seu pedido é uma ordem.
oba, bão?
Intent identified:  iniciar
Opa. Posso ajudar com algo?
quero pedir um alimento
Intent identified:  comprar_pizza
Seu pedido é uma ordem.
quero comer
Intent identified:  comprar_pizza
Seu pedido é uma ordem.
to precisando de algo pra comer
Intent identified:  comprar_pizza
É pra já!
to precisando me alimentar urgentemente
Intent identified:  comprar_pizza
Já processarei seu pedido. Um momento.
exijo que me alimente agora!
Intent identified:  finalizar
Até a próxima!


KeyboardInterrupt: ignored

## Train Intents

In [None]:
def train_intents_deep(intents_file):
  """Run one interactive feedback round for the BERT-based classifier.

  Reads one line of input, shows the predicted intent and, if the user
  answers 'n', prints a numbered menu of intents and stores the input under
  the chosen one via ``append_to_json``. Typing 'sair' skips the round.

  Relies on ``get_json_file``, ``get_training_data``, ``get_deep_response``
  and a module-level ``append_to_json`` defined elsewhere in the notebook.

  Args:
    intents_file: path of the intents JSON file. It is now used for both
      reading and writing; a hard-coded 'intents.json' was used before.
  """
  intents_data = get_json_file(intents_file)
  user_input = input('')
  if user_input == 'sair':
    return

  full_df, _ = get_training_data(intents_file)
  prediction = get_deep_response(full_df, user_input)
  print(prediction)
  print(f'The BOT predicted the input as {prediction[1]}. Is this correct? (y/n)')
  check_option = input('').lower()
  if check_option != 'n':
    return

  intents = list(full_df['name'].values)
  for option, intent in enumerate(intents):
    print(f"{option} - {intent}\n")
  print('What is the correct intent? Select an option (number only)')
  intent_option = input('')
  try:
    selected = int(intent_option)
  except ValueError:
    # A non-numeric answer used to raise out of the function.
    print('Invalid option. Nothing was saved.')
    return
  # Out-of-range numbers are ignored, as before.
  if 0 <= selected < len(intents):
    append_to_json(intents_data, intents_file, intents[selected], user_input)


In [None]:
# Runs a single feedback round; re-run the cell to label another input.
train_intents_deep('intents.json')

aoba
Intent identified:  finalizar
('Até breve!', 'finalizar')
The BOT predicted the input as finalizar. Is this correct? (y/n)
n
0 - iniciar

1 - comprar_pizza

2 - finalizar

What is the correct intent? Select an option (number only)
0
iniciar iniciar
comprar_pizza iniciar
finalizar iniciar
