# **Chatbot - NLP and Deep Learning**

For this project, we will use the PyTorch library.

---

## 1. Theory and NLP concepts

We will talk about stemming, tokenization, bag of words.

First, we collect all the words of every pattern into a single array (the "all words" array).

- **Bag of words :** For each pattern, we create an array with the same size as the all-words array. For every word of the all-words array that appears in the pattern, we put a 1 at its position, and a 0 otherwise.
- **Tokenization :** Splitting string into meaningful units (e.g. words, punctuation characters, numbers)
- **Stemming :** Generate the root form of the words. It is a heuristic that chops the ends off of words.

### **Whole NLP pre-processing pipeline :** 

At the beginning, we have the whole sentence, then we tokenize it. We lowercase all the words, then we stem them. We then exclude punctuation characters. Finally, based on this array, we calculate the bag of words.

---

## 2. Create training data

We are going to process our natural language data with a free framework called NLTK (Natural Language Toolkit).

In [1]:
import nltk

# Download a package from nltk
# (the log printed below this cell shows the download is skipped when the
# package is already up to date)
nltk.download('punkt')   # package w/ a pre-trained tokenizer

# Stemming: reduce a word to its root form
from nltk.stem.porter import PorterStemmer
# One shared stemmer instance; stem() below calls stemmer.stem() on it
stemmer = PorterStemmer()

import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


# Tokenize: split a sentence into a list of words
def tokenize(sentence):
    """Split a sentence string into a list of tokens.

    Thin wrapper around nltk.word_tokenize; the token list it produces is
    returned unchanged.
    """
    tokens = nltk.word_tokenize(sentence)
    return tokens

def stem(word):
    """Return the stem of `word`, lower-casing it before stemming.

    Uses the module-level `stemmer` instance.
    """
    lowered = word.lower()
    return stemmer.stem(lowered)

def bag_of_words(tokenized_sentence, all_words):
    """
    Encode a tokenized sentence as a binary bag-of-words vector.

    Each token is passed through stem() (which also lower-cases it). The
    result has one slot per entry of `all_words`: 1.0 when that entry equals
    one of the sentence stems, 0.0 otherwise.

    Args:
        tokenized_sentence: iterable of word tokens (strings).
        all_words: sequence of vocabulary words to compare the stems against.

    Returns:
        1-D np.ndarray of dtype float32 with len(all_words) elements.

    Example:
        sentence = ["hello", "how", "are", "you"]
        words = ["hi", "hello", "I", "you", "bye", "thank", "cool"]
        bag =   [  0,      1,      0,    1,      0,      0,       0]
    """
    # A set makes each membership test O(1); with a list the loop below would
    # rescan the whole sentence for every vocabulary word.
    sentence_stems = {stem(w) for w in tokenized_sentence}

    bag = np.zeros(len(all_words), dtype=np.float32)
    for idx, w in enumerate(all_words):
        if w in sentence_stems:
            bag[idx] = 1.0

    return bag

[nltk_data] Downloading package punkt to /home/bradfo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
import json

# Read the intents file as UTF-8 explicitly: without `encoding`, open() uses
# the platform's locale encoding, which can mis-decode non-ASCII patterns.
with open('intents.json', 'r', encoding='utf-8') as f:
    intents = json.load(f)

all_words = []  # every token of every pattern (stemmed and de-duplicated below)
tags = []       # one entry per intent (de-duplicated below)
xy = []         # (tokenized pattern, tag) pairs

# Loop through each sentence in our intents patterns
for intent in intents['intents']:   #key: intents, value: list of intents
    tag = intent['tag']             #key: tag, value: intent
    tags.append(tag)
    # Loop through each pattern in the patterns
    for pattern in intent['patterns']:
        # Tokenize each word in the sentence
        w = tokenize(pattern)
        # Add to our words list (not append, because we don't want a list of lists)
        all_words.extend(w)
        # Add to xy pair
        # pattern and tag for each pattern
        xy.append((w, tag))

# Drop punctuation tokens, then stem (and lower-case) the remaining words
ignore_words = ['?', '!', '.', ',']
all_words = [stem(w) for w in all_words if w not in ignore_words]

# Sort all words and remove duplicates
all_words = sorted(set(all_words))
# Sort tags and remove duplicates
tags = sorted(set(tags))

# Create training data
X_train = [] # bag of words for each pattern
y_train = [] # label for each tag

for (pattern_sentence, tag) in xy:
    # X: bag of words for each pattern
    bag = bag_of_words(pattern_sentence, all_words)
    X_train.append(bag)
    # y: class index of the tag (CrossEntropyLoss takes indices, not one-hot)
    label = tags.index(tag)
    y_train.append(label) # CrossEntropyLoss

# Convert to numpy arrays
X_train = np.array(X_train)
# Force int64: NumPy's default integer type is platform-dependent, while
# PyTorch's CrossEntropyLoss expects class-index targets of type long (int64).
y_train = np.array(y_train, dtype=np.int64)

In [3]:
class ChatDataset(Dataset):
    """Map-style dataset pairing feature rows with their labels.

    Args:
        x_data: indexable collection of feature vectors. Optional.
        y_data: indexable collection of labels, same length as `x_data`.
            Optional.

    When both arguments are omitted, the module-level `X_train` and
    `y_train` arrays are used, so the existing `ChatDataset()` call keeps
    working.

    Raises:
        ValueError: if only one of the two arguments is given, or if their
            lengths differ.
    """

    def __init__(self, x_data=None, y_data=None):
        if x_data is None and y_data is None:
            # Backward-compatible default: the arrays built earlier in the file
            x_data, y_data = X_train, y_train
        elif x_data is None or y_data is None:
            raise ValueError("x_data and y_data must be provided together")

        if len(x_data) != len(y_data):
            raise ValueError(
                f"x_data and y_data differ in length: {len(x_data)} != {len(y_data)}"
            )

        self.n_samples = len(x_data)
        self.x_data = x_data
        self.y_data = y_data

    # Dataset[idx]
    def __getitem__(self, index):
        """Return the (features, label) pair stored at `index`."""
        return self.x_data[index], self.y_data[index]

    # len(Dataset)
    def __len__(self):
        """Return the number of samples."""
        return self.n_samples
    
# Hyperparameters
batch_size = 8
hidden_size = 8
output_size = len(tags)        # one output per intent tag
input_size = len(X_train[0])   # length of one bag-of-words vector
learning_rate = 0.001
num_epochs = 1000

dataset = ChatDataset()

# Data loader which takes the dataset, shuffles it, and creates batches.
# num_workers=0 loads batches in the main process: worker processes must be
# able to import/pickle the Dataset class, which fails for a class defined
# interactively on platforms that spawn workers, and the data is already in
# memory so extra workers would only add start-up overhead.
train_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, num_workers=0)



---

## 3. PyTorch model and training

A Feed Forward Neural Net

- Input_size : Number of distinct stemmed words (the length of a bag-of-words vector), which is fixed by the training data
- Hidden_size : can be changed

In [4]:
class NeuralNet(nn.Module):
    """Feed-forward classifier: three linear layers with ReLU in between.

    Args:
        input_size: number of features per input sample.
        hidden_size: width of the two hidden layers.
        num_classes: number of output scores per sample.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.l1 = nn.Linear(input_size, hidden_size)
        self.l2 = nn.Linear(hidden_size, hidden_size)
        self.l3 = nn.Linear(hidden_size, num_classes)

        # Shared activation applied after the first and second layers
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the raw class scores (logits) for `x`.

        The last layer has no activation and no softmax is applied, because
        the model is trained with cross entropy loss.
        """
        hidden = self.relu(self.l1(x))
        hidden = self.relu(self.l2(hidden))
        return self.l3(hidden)

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #verify if GPU is available
model = NeuralNet(input_size, hidden_size, output_size).to(device) #push it to device if it's available

# loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    for (words, labels) in train_loader:
        words = words.to(device)
        # CrossEntropyLoss requires class-index targets of type long (int64);
        # cast explicitly so training does not depend on the integer dtype
        # the label array happened to get on this platform.
        labels = labels.to(device=device, dtype=torch.long)

        # forward
        outputs = model(words)
        loss = criterion(outputs, labels)

        # backward and optimizer step
        optimizer.zero_grad() #empty the gradients first
        loss.backward() #calculate the gradients / the backpropagation
        optimizer.step()

    # `loss` here is the loss of the last mini-batch of the epoch
    if (epoch+1) % 100 == 0:
        print (f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

print(f'final loss: {loss.item():.4f}')

Epoch [100/1000], Loss: 1.4959
Epoch [200/1000], Loss: 0.8032
Epoch [300/1000], Loss: 0.4153
Epoch [400/1000], Loss: 0.0310
Epoch [500/1000], Loss: 0.2056
Epoch [600/1000], Loss: 0.0913
Epoch [700/1000], Loss: 0.0043
Epoch [800/1000], Loss: 0.0081
Epoch [900/1000], Loss: 0.0019
Epoch [1000/1000], Loss: 0.0010
final loss: 0.0010


---

## 4. Save and load model and implement the chat

In [6]:
# Destination of the serialized checkpoint (PyTorch file)
FILE = "data.pth" # for pytorch

# Bundle the trained weights with the sizes, vocabulary and tags that are
# needed to rebuild the model and pre-process new sentences.
data = dict(
    model_state=model.state_dict(),
    input_size=input_size,
    output_size=output_size,
    hidden_size=hidden_size,
    all_words=all_words,
    tags=tags,
)

# Serialize it
torch.save(data, FILE)

print(f'training complete. file savec to {FILE}')

training complete. file savec to data.pth


### **Implement the chat**

In [7]:
import random
import json

# NOTE(security): torch.load unpickles the file, so only load checkpoints
# that come from a trusted source (here: the file written just above).
# map_location puts the saved tensors on the device that is available now,
# even if the checkpoint was written on a different one.
data = torch.load(FILE, map_location=device)

input_size = data["input_size"]
hidden_size = data["hidden_size"]
output_size = data["output_size"]
all_words = data["all_words"]
tags = data["tags"]
model_state = data["model_state"]

model = NeuralNet(input_size, hidden_size, output_size).to(device)
model.load_state_dict(model_state)
model.eval() # set model to evaluation mode

bot_name = "Sam"
print("Let's chat! type 'quit' to exit")

while True:
    sentence = input('You: ')
    if sentence == "quit" :
        break
    sentence = tokenize(sentence)
    X = bag_of_words(sentence, all_words)
    X = X.reshape(1, X.shape[0]) # reshape to fit the model (one sample)
    # The input must be on the same device as the model, otherwise the
    # forward pass raises a device-mismatch error when the model is on the GPU.
    X = torch.from_numpy(X).to(device)

    # Inference only: no gradients are needed
    with torch.no_grad():
        output = model(X)
        _, predicted = torch.max(output, dim=1)
        tag = tags[predicted.item()] #predicted.item : class label

        # Softmax turns the raw scores into probabilities, used as confidence
        probs = torch.softmax(output, dim=1)
        prob = probs[0][predicted.item()]

    # Answer only when the predicted tag's probability exceeds 0.75
    if prob.item() > 0.75:
        for intent in intents["intents"]:
            if tag == intent["tag"]:
                print(f"{bot_name}: {random.choice(intent['responses'])}")
    else:
        print(f"{bot_name}: I do not understand...")

Let's chat! type 'quit' to exit
Sam: Hello!
Sam: Hello!
Sam: Hello!
Sam: Good to see you again!
Sam: Hello!
Sam: Hello!
Sam: Hi there, how can I help?
Sam: Hello!
