In [1]:
#### CRITICAL - ENABLE GPU 

import pandas as pd
import numpy as np
import json, re
from tqdm import tqdm_notebook
from uuid import uuid4

## Torch Modules
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader


## Mount Drive into Colab
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install pytorch-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-transformers
  Downloading pytorch_transformers-1.2.0-py3-none-any.whl (176 kB)
[K     |████████████████████████████████| 176 kB 17.2 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 74.2 MB/s 
[?25hCollecting boto3
  Downloading boto3-1.26.26-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 78.1 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 73.0 MB/s 
Collecting botocore<1.30.0,>=1.29.26
  Downloading botocore-1.29.26-py3-none-any.whl (10.2 MB)
[K     |████████████████████████████████| 10.2 MB 62.8 MB/s 
[?25hCollecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.7.0,

In [4]:
## PyTorch Transformer
from pytorch_transformers import RobertaModel, RobertaTokenizer
from pytorch_transformers import RobertaForSequenceClassification, RobertaConfig

In [5]:
## Check if Cuda is Available
print(torch.cuda.is_available())

True


In [None]:
## Install PyTorch-Transformer

In [6]:
!pip install -U pytorch-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
## Importing Datasets

In [7]:
'''
Important Step - Make sure you upload the data file to the exact location below. If you uploaded correctlt, the following command will run
'''

!ls drive/'My Drive'/2017-06-custom-intent-engines

AddToPlaylist	GetWeather  RateBook   SearchCreativeWork
BookRestaurant	PlayMusic   README.md  SearchScreeningEvent


In [8]:
'''
Create the Dataset Path
'''


dataset_path = "drive/My Drive/2017-06-custom-intent-engines/"

In [9]:

'''
***Explain*** Summarize, in bullet points, what is the code doing?.

* creates a dataframe that the data will later be added to. This includes the queries and their corresponding intents
* it then populates this dataframe again with the corresponding queries and intents. 
'''



dataset = pd.DataFrame(columns = ['utterance', 'label'])
for intent in ['AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic', 'RateBook', 'SearchCreativeWork',
               'SearchScreeningEvent']:
    with open(dataset_path + intent + "/train_" + intent + ".json",
              encoding='cp1251') as data_file:
        data = json.load(data_file)
    print("Class: {}, # utterances: {}".format(intent,len(data[intent])))
    texts = []
    for i in range(len(data[intent])):
        text = ''
        for j in range(len(data[intent][i]['data'])):
            text += data[intent][i]['data'][j]['text']
        dataset = dataset.append({'utterance': text, 'label': intent}, ignore_index=True)
dataset.tail()

Class: AddToPlaylist, # utterances: 300
Class: BookRestaurant, # utterances: 300
Class: GetWeather, # utterances: 300
Class: PlayMusic, # utterances: 300
Class: RateBook, # utterances: 300
Class: SearchCreativeWork, # utterances: 300
Class: SearchScreeningEvent, # utterances: 300


Unnamed: 0,utterance,label
2095,Is Across the Line playing at the closest movi...,SearchScreeningEvent
2096,Which animated movies are playing in the neigh...,SearchScreeningEvent
2097,Where is They Always Return at Dawn playing,SearchScreeningEvent
2098,What is the movie schedule in the neighborhood,SearchScreeningEvent
2099,Tell me when Howling II: Your Sister Is a Were...,SearchScreeningEvent


In [10]:
'''
Assigning an Index to each intent. We will use this later
'''

'''
***Explain*** Why do we convert labels to indexes?. 

This is to make the classification process numeric. it makes for a more simple output layer.
'''

label_to_ix = {}
for label in dataset.label:
    for word in label.split():
        if word not in label_to_ix:
            label_to_ix[word]=len(label_to_ix)
label_to_ix

{'AddToPlaylist': 0,
 'BookRestaurant': 1,
 'GetWeather': 2,
 'PlayMusic': 3,
 'RateBook': 4,
 'SearchCreativeWork': 5,
 'SearchScreeningEvent': 6}

In [None]:
## Loading RoBERTa classes

In [11]:
config = RobertaConfig.from_pretrained('roberta-base')
config.num_labels = len(list(label_to_ix.values()))
config

100%|██████████| 481/481 [00:00<00:00, 500362.16B/s]


{
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 7,
  "output_attentions": false,
  "output_hidden_states": false,
  "pad_token_id": 1,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 1,
  "vocab_size": 50265
}

In [12]:
'''
Loading Pretrained tokenizer and instantiating the model from settings in config
'''

'''
***Explain*** : a. What is a tokenizer? b. What is special about the following tokenizer?. 

A tokenizer splits up sentences or larger objects into more component parts. In this case sentences to words.
The roberta tokenizer is a pretrained model that can conduct this process. 
'''

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

'''
***Explain*** :  What is the next line doing?

This line initializes the model. 
'''

model = RobertaForSequenceClassification(config)

100%|██████████| 898823/898823 [00:00<00:00, 25241788.94B/s]
100%|██████████| 456318/456318 [00:00<00:00, 24004921.71B/s]


In [None]:
## Feature Preparation

In [14]:
'''
Some important Feature Engineering
'''

'''
***Explain*** : What are the implications for setting  include_CLS_token = True, include_SEP_token = True ?

The Cls token represents sentence level classification. Sep token is for splitting up the sentences to make learning easier.
These are used to help the model learn the input more easily.
'''

def prepare_features(seq_1, max_seq_length = 300, 
             zero_pad = False, include_CLS_token = True, include_SEP_token = True):
    ## Tokenzine Input
    tokens_a = tokenizer.tokenize(seq_1)

    ## Truncate
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]
    ## Initialize Tokens
    tokens = []
    if include_CLS_token:
        tokens.append(tokenizer.cls_token)
    ## Add Tokens and separators
    for token in tokens_a:
        tokens.append(token)

    if include_SEP_token:
        tokens.append(tokenizer.sep_token)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    ## Input Mask 
    input_mask = [1] * len(input_ids)
    ## Zero-pad sequence lenght
    if zero_pad:
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
    return torch.tensor(input_ids).unsqueeze(0), input_mask

In [15]:
msg = "My dog is cute!"
prepare_features(msg)

(tensor([[    0,  1308,  2335,    16, 11962,   328,     2]]),
 [1, 1, 1, 1, 1, 1, 1])

In [16]:
## Dataset Loader Classes

In [17]:
class Intents(Dataset):
    def __init__(self, dataframe):
        self.len = len(dataframe)
        self.data = dataframe
        
    def __getitem__(self, index):
        utterance = self.data.utterance[index]
        label = self.data.label[index]
        X, _  = prepare_features(utterance)
        y = label_to_ix[self.data.label[index]]
        return X, y
    
    def __len__(self):
        return self.len

In [18]:
train_size = 0.8
train_dataset=dataset.sample(frac=train_size,random_state=200).reset_index(drop=True)
test_dataset=dataset.drop(train_dataset.index).reset_index(drop=True)

In [19]:
print("FULL Dataset: {}".format(dataset.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

FULL Dataset: (2100, 2)
TRAIN Dataset: (1680, 2)
TEST Dataset: (420, 2)


In [20]:
training_set = Intents(train_dataset)
testing_set = Intents(test_dataset)

In [21]:
training_set.__getitem__(0)[0].shape

torch.Size([1, 8])

In [22]:
model(training_set.__getitem__(0)[0])

(tensor([[ 0.1852,  0.1096,  0.0946,  0.0154, -0.1428,  0.3253, -0.2342]],
        grad_fn=<AddmmBackward0>),)

In [23]:
## Training Params

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.cuda()

In [25]:
# Parameters
params = {'batch_size': 1,
          'shuffle': True,
          'drop_last': False,
          'num_workers': 1}

In [26]:
training_loader = DataLoader(training_set, **params)
testing_loader = DataLoader(testing_set, **params)

In [28]:
'''
Instantiate the Loss
'''
'''
***Explain*** why cross entropy loss?, also print the model and explain why are not we using softmax at the end?

Cross entropy loss uses a softmax and essentially learns the probabilities of predicting an input to each class.
It works well for classification tasks, outperforming MSE which is often used for regression instead.
The softmax takes the input logits and converts them to output probabilities of assigning to a class.
'''
loss_function = nn.CrossEntropyLoss()
learning_rate = 1e-05
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)

model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=0)
      (position_embeddings): Embedding(514, 768)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [29]:
## Test Forward Pass
inp = training_set.__getitem__(0)[0].cuda()
output = model(inp)[0]
print(output.shape)

torch.Size([1, 7])


In [30]:
torch.__version__

'1.13.0+cu116'

In [31]:
'''
Actually train the model with train data
'''
'''
***Explain*** the Training Code Chunk in detail. Especially what is torch.max() doing here?

So for each training example it runs it forward through the model and comes up with a prediction for what intent that query was. 
The important part of the torch.max() is that it is taking the classifier index with the maximum calculated probability from the cross entropy loss.
Then the loss is backpropogated through the model. I notice this is not being done in batches but rather for each training example.
The last section is for printing purposes.
'''



max_epochs = 3
model = model.train()
for epoch in tqdm_notebook(range(max_epochs)):
    print("EPOCH -- {}".format(epoch))
    for i, (sent, label) in enumerate(training_loader):
        optimizer.zero_grad()
        sent = sent.squeeze(0)
        if torch.cuda.is_available():
          sent = sent.cuda()
          label = label.cuda()
        output = model.forward(sent)[0]
        _, predicted = torch.max(output, 1)
        
        loss = loss_function(output, label)
        loss.backward()
        optimizer.step()
        
        if i%100 == 0:
            correct = 0
            total = 0
            for sent, label in testing_loader:
                sent = sent.squeeze(0)
                if torch.cuda.is_available():
                  sent = sent.cuda()
                  label = label.cuda()
                output = model.forward(sent)[0]
                _, predicted = torch.max(output.data, 1)
                total += label.size(0)
                correct += (predicted.cpu() == label.cpu()).sum()
            accuracy = 100.00 * correct.numpy() / total
            print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for epoch in tqdm_notebook(range(max_epochs)):


  0%|          | 0/3 [00:00<?, ?it/s]

EPOCH -- 0
Iteration: 0. Loss: 2.275467872619629. Accuracy: 65.71428571428571%
Iteration: 100. Loss: 2.5343434810638428. Accuracy: 0.0%
Iteration: 200. Loss: 2.390566349029541. Accuracy: 27.61904761904762%
Iteration: 300. Loss: 1.6814866065979004. Accuracy: 2.142857142857143%
Iteration: 400. Loss: 1.5779132843017578. Accuracy: 0.0%
Iteration: 500. Loss: 0.19182948768138885. Accuracy: 0.9523809523809523%
Iteration: 600. Loss: 0.7568603157997131. Accuracy: 48.57142857142857%
Iteration: 700. Loss: 0.532332181930542. Accuracy: 34.04761904761905%
Iteration: 800. Loss: 0.5287597179412842. Accuracy: 77.14285714285714%
Iteration: 900. Loss: 0.18333157896995544. Accuracy: 73.0952380952381%
Iteration: 1000. Loss: 1.8169546127319336. Accuracy: 31.19047619047619%
Iteration: 1100. Loss: 0.15359678864479065. Accuracy: 72.85714285714286%
Iteration: 1200. Loss: 0.2900727391242981. Accuracy: 95.47619047619048%
Iteration: 1300. Loss: 0.23414534330368042. Accuracy: 82.61904761904762%
Iteration: 1400. Los

In [32]:
'''
***Explain*** what is the get_reply function doing?

The get reply function takes in a new query and uses our trained model ot predict the output intent. 
This is a helper function for making new predictions using the trained model.
'''
def get_reply(msg):
  model.eval()
  input_msg, _ = prepare_features(msg)
  if torch.cuda.is_available():
    input_msg = input_msg.cuda()
  output = model(input_msg)[0]
  _, pred_label = torch.max(output.data, 1)
  prediction=list(label_to_ix.keys())[pred_label]
  return prediction

In [33]:
label_to_ix.keys()

dict_keys(['AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic', 'RateBook', 'SearchCreativeWork', 'SearchScreeningEvent'])

In [34]:
'''Different text sentences pass to the model'''

get_reply("play radiohead song")

'PlayMusic'

In [35]:
get_reply("it is rainy in Sao Paulo")

'GetWeather'

In [36]:
get_reply("sun shinnes all day")

'GetWeather'

In [37]:
get_reply("low humidity, high altitude")

'SearchCreativeWork'

In [38]:
get_reply("Book tacos for me tonight")

'BookRestaurant'

In [39]:
get_reply("Book a table for me tonight")

'BookRestaurant'

In [40]:
get_reply("I want BBQ tonight")

'BookRestaurant'

## 5 additional sentences

In [42]:
get_reply("listen to some metal")

'PlayMusic'

In [43]:
get_reply("will it be cloudy again today")

'GetWeather'

In [44]:
get_reply("Find me somebody to love")

'SearchCreativeWork'

In [45]:
get_reply("1984 gets a 10 from me")

'BookRestaurant'

In [46]:
get_reply("the book phantom of the opera is excellent")

'SearchCreativeWork'

### The model performed well on the first couple sentences I tried. I tried to fool it a bit towards the end to see what the limitations would be.

### How I envision this model or something similar being used in a chatbot is to perhaps find the category of reply that should be made to an input to the chatbot. I want to say that you could use it in some encoder/decoder architecture to generate a new response. Particularly in the encoder side.