## Imports

In [3]:
import nltk
from nltk.stem.porter import PorterStemmer
import pandas as pd
import numpy as np
nltk.download('punkt')
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Run on the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\charl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Function definitions

In [4]:
def tokenize(sentence):
    """Split `sentence` into tokens with `nltk.word_tokenize` and return them."""
    tokens = nltk.word_tokenize(sentence)
    return tokens

# Shared stemmer, created on first use so that importing/running this cell stays cheap.
_PORTER_STEMMER = None


def stem(word):
    """Return the Porter stem of `word`, lower-cased first.

    The stemmer is built once and reused: this function is called for every
    token of every training pattern and every user message, so constructing a
    fresh `PorterStemmer` per call is wasted work.

    Args:
        word: a single token (string).

    Returns:
        The stemmed, lower-case form of `word`.
    """
    global _PORTER_STEMMER
    if _PORTER_STEMMER is None:
        _PORTER_STEMMER = PorterStemmer()
    return _PORTER_STEMMER.stem(word=word.lower())

def bag_of_words(tokenized_sentence, all_words):
    """Encode a tokenized sentence as a binary bag-of-words vector.

    Args:
        tokenized_sentence: iterable of raw tokens; each one is stemmed here.
        all_words: the vocabulary (already stemmed); position i of the result
            corresponds to `all_words[i]`.

    Returns:
        A float32 NumPy array of length `len(all_words)` holding 1.0 where the
        vocabulary word occurs in the sentence and 0.0 elsewhere.
    """
    # A set makes each vocabulary lookup O(1) instead of scanning the token list
    # once per vocabulary word.
    stems_present = {stem(token) for token in tokenized_sentence}

    return np.array(
        [1.0 if vocab_word in stems_present else 0.0 for vocab_word in all_words],
        dtype=np.float32,
    )

## Training

In [5]:
import pandas as pd
import json

# Source tables:
#   intents-key.csv      -> one `context` per `area`
#   intents.csv          -> `intent`, `completion`, `area` rows
#   dataset_appended.csv -> training prompts with their intent and completion
intents_key = pd.read_csv('intents-key.csv')
intents_df = pd.read_csv('intents.csv')
df = pd.read_csv('dataset_appended.csv')
df = df.rename(columns={'Unnamed: 0': 'id', 'intent': 'tag', 'prompt': 'patterns', 'completion': 'responses'})

intents = {'intents': []}
# dict.fromkeys de-duplicates while keeping first-appearance order, so the
# generated file is identical on every run (set iteration order for strings
# changes between interpreter sessions).
tag_list = list(dict.fromkeys(df['tag']))
for tag in tag_list:
    # Filter the intent table once per tag and reuse the rows below.
    intent_rows = intents_df[intents_df['intent'] == tag]
    if intent_rows.empty:
        raise ValueError(f"Intent {tag!r} is used in dataset_appended.csv but has no rows in intents.csv")

    area = list(intent_rows['area'])[0]
    context_rows = intents_key[intents_key['area'] == area]['context']
    if context_rows.empty:
        raise ValueError(f"Area {area!r} (intent {tag!r}) has no context in intents-key.csv")

    intents['intents'].append({
        'tag': tag,
        'patterns': list(df[df['tag'] == tag]['patterns']),
        'responses': list(dict.fromkeys(intent_rows['completion'])),
        'area': area,
        'context': context_rows.iloc[0],
    })

with open('intents.json', 'w') as f:
    json.dump(intents, f)

# Reload from disk so the rest of the notebook works with exactly what was saved.
with open('intents.json', 'r') as f:
    intents = json.load(f)

In [6]:
# Punctuation tokens that must not become vocabulary entries.
ignore_words = ['?', '!', '.', ',']

# One (tokens, tag) pair per training pattern, in intent order.
Xy = [
    (tokenize(pattern), intent['tag'])
    for intent in intents['intents']
    for pattern in intent['patterns']
]

# Sorted, de-duplicated class labels (every intent contributes its tag).
tags = sorted({intent['tag'] for intent in intents['intents']})

# Sorted, de-duplicated vocabulary of stems; punctuation is dropped before stemming.
all_words = sorted({
    stem(token)
    for tokens, _ in Xy
    for token in tokens
    if token not in ignore_words
})

In [7]:
# `tags` is sorted and unique, so a tag's position in it is its class label.
tag_to_label = {tag_name: position for position, tag_name in enumerate(tags)}

# Features: one bag-of-words row per pattern. Targets: the matching class label.
X_train = np.array([bag_of_words(tokens, all_words) for tokens, _ in Xy])
y_train = np.array([tag_to_label[tag_name] for _, tag_name in Xy])

print(X_train.shape)
print(y_train.shape)

(4911, 559)
(4911,)


In [8]:
class ChatDataset(Dataset):
    """Dataset pairing each feature row of `X_train` with its label from `y_train`."""

    def __init__(self, X_train, y_train):
        # Both arrays are wrapped as tensors; labels are converted to LongTensor.
        features = torch.from_numpy(X_train)
        labels = torch.from_numpy(y_train)
        self.x_data = features
        self.y_data = labels.type(torch.LongTensor)
        self.n_samples = len(X_train)

    def __len__(self):
        return self.n_samples

    def __getitem__(self, idx):
        sample = self.x_data[idx]
        label = self.y_data[idx]
        return sample, label
    
# Mini-batch size used for training.
batch_size = 2048

dataset = ChatDataset(X_train, y_train)
# Reshuffle every epoch; load batches in the main process (no worker subprocesses).
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)

In [9]:
class NeuralNet(nn.Module):
    """Feed-forward classifier: three linear layers with a ReLU after the first two.

    `forward` returns the raw output of the last linear layer; no softmax is
    applied inside the network.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # Attribute names double as state_dict key prefixes, so they stay as-is.
        self.l1 = nn.Linear(input_size, hidden_size)
        self.l2 = nn.Linear(hidden_size, hidden_size)
        self.l3 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        hidden = self.relu(self.l1(x))
        hidden = self.relu(self.l2(hidden))
        return self.l3(hidden)

# Layer sizes: one input per vocabulary word, one output per tag.
hidden_size = 8
input_size = X_train.shape[1]
output_size = len(tags)

model = NeuralNet(input_size, hidden_size, output_size)
model = model.to(device)

In [10]:
# Optimisation settings.
num_epochs = 500
learning_rate = 0.001

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    for batch_bags, batch_labels in train_loader:
        batch_bags = batch_bags.to(device)
        batch_labels = batch_labels.to(device)

        # Clear old gradients, then forward pass -> loss -> backward pass -> update.
        optimizer.zero_grad()
        loss = criterion(model(batch_bags), batch_labels)
        loss.backward()
        optimizer.step()

    # Every 50th epoch, report the loss of that epoch's last mini-batch.
    epoch_number = epoch + 1
    if epoch_number % 50 == 0:
        print(f"Epoch {epoch_number}/{num_epochs}, Loss = {loss.item():.4f}")

print(f"Final loss = {loss.item():.4f}")

Epoch 50/500, Loss = 2.9340
Epoch 100/500, Loss = 1.8812
Epoch 150/500, Loss = 0.9954
Epoch 200/500, Loss = 0.5663
Epoch 250/500, Loss = 0.3648
Epoch 300/500, Loss = 0.2655
Epoch 350/500, Loss = 0.2375
Epoch 400/500, Loss = 0.1808
Epoch 450/500, Loss = 0.1486
Epoch 500/500, Loss = 0.1471
Final loss = 0.1471


In [11]:
# Bundle the weights with everything needed to rebuild the model and to encode
# new sentences (layer sizes, vocabulary, tag list), then write it to disk.
data = dict(
    model_state=model.state_dict(),
    input_size=input_size,
    output_size=output_size,
    hidden_size=hidden_size,
    all_words=all_words,
    tags=tags,
)

torch.save(data, "model.pth")

## Inference

In [12]:
# Remap stored tensors onto the current device so that a checkpoint written on
# a GPU machine can still be loaded on a CPU-only one.
data = torch.load("model.pth", map_location=device)

# Restore the layer sizes, vocabulary and tag list saved alongside the weights.
input_size = data['input_size']
hidden_size = data['hidden_size']
output_size = data['output_size']
all_words = data['all_words']
tags = data['tags']

# Rebuild the network with the saved sizes, load the weights, switch to eval mode.
model = NeuralNet(input_size, hidden_size, output_size).to(device)
model.load_state_dict(data['model_state'])
model.eval()

NeuralNet(
  (l1): Linear(in_features=559, out_features=8, bias=True)
  (l2): Linear(in_features=8, out_features=8, bias=True)
  (l3): Linear(in_features=8, out_features=40, bias=True)
  (relu): ReLU()
)

In [24]:
import os
import random

use_gpt3 = 1
if use_gpt3:
    import openai

    # Credentials come from the environment so they never end up in the notebook,
    # its outputs, or version control. Any key that was ever pasted into this
    # cell should be treated as leaked and revoked.
    openai_api_key = os.environ.get("OPENAI_API_KEY")
    if not openai_api_key:
        raise RuntimeError("Set the OPENAI_API_KEY environment variable before running with use_gpt3 enabled.")
    openai.api_key = openai_api_key

bot_name = "Chazbot"
start_token = f"\n{bot_name}:"
restart_token = "\nYou:"
greeting = f"{start_token} Hello there. I am {bot_name}, designed to answers general questions your may have about Charlie! I will do my best to answer any queries, and remember: to end the conversation, enter 'quit'."

max_input_chars = 80  # longest user message accepted
history_turns = 6     # trailing conversation entries included in the GPT-3 prompt
# The classifier's top softmax probability must exceed this for the bot to answer.
# NOTE(review): at 0.0 the check effectively always passes, so the "do not
# understand" reply is only used when no intent matches; raise it to enable the fallback.
min_confidence = 0.0
fallback_reply = f"{start_token} I do not understand the question, please rephrase."

conversation = []

print(greeting)
conversation.append(greeting)
while True:
    # Keep asking until the message fits the length limit.
    while True:
        sentence = input(f"{restart_token} ")
        if len(sentence) > max_input_chars:
            print(f"Your sentence was {len(sentence)} characters. The maximum allowable characters is {max_input_chars}.")
        else:
            break

    conversation.append(restart_token + " " + sentence)

    if sentence.strip().lower() == "quit":
        break

    # Encode the message exactly like the training patterns: tokens -> bag of words.
    tokens = tokenize(sentence)
    X = bag_of_words(tokens, all_words)
    X = X.reshape(1, X.shape[0])
    X = torch.from_numpy(X).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(X)
        probs = torch.softmax(logits, dim=1)
    _, predicted = torch.max(logits, dim=1)
    tag = tags[predicted.item()]
    prob = probs[0][predicted.item()]

    # Start from the fallback so `output` is always a string, even when the
    # confidence is too low or no intent carries the predicted tag.
    output = fallback_reply
    if prob.item() > min_confidence:
        for intent in intents['intents']:
            if tag == intent['tag']:
                if use_gpt3:
                    # Prompt = the intent's context, the recent turns, and the
                    # bot's name as the cue for the completion.
                    context = intent['context']
                    recent_conversation = "".join(conversation[-history_turns:])
                    prompt = f"Context: {context}\n{recent_conversation}{start_token}"
                    response = openai.Completion.create(
                        model='text-davinci-002',
                        prompt=prompt,
                        temperature=0.5,
                        max_tokens=256,
                        top_p=1,
                        best_of=1,
                        frequency_penalty=1,
                        presence_penalty=0.2,
                        stop=[restart_token]
                    )
                    output = f"{start_token} {response['choices'][0]['text'].strip()}"
                else:
                    output = f"{start_token} {random.choice(intent['responses'])}"
                break  # only one intent can carry the predicted tag

    print(output)
    conversation.append(output)


Chazbot: Hello there. I am Chazbot, designed to answers general questions your may have about Charlie! I will do my best to answer any queries, and remember: to end the conversation, enter 'quit'.

You: quit


In [25]:
conversation

["\nChazbot: Hello there. I am Chazbot, designed to answers general questions your may have about Charlie! I will do my best to answer any queries, and remember: to end the conversation, enter 'quit'.",
 '\nYou: quit']

### JSONL Conversion and fine-tuning preparation

In [28]:
# One {'prompt', 'completion'} record per dataset row, in row order.
prompts_completions = [
    {'prompt': pattern, 'completion': response}
    for pattern, response in zip(df['patterns'], df['responses'])
]

with open('prompts_completions.json', 'w') as f:
    json.dump(prompts_completions, f)

In [32]:
with open('prompts_completions.json', 'r') as f:
    prompts_completions = json.load(f)

# JSON Lines: one serialized record per line.
with open('prompts_completions.jsonl', 'w') as f:
    f.writelines(json.dumps(pair) + '\n' for pair in prompts_completions)

In [55]:
# Read the prepared JSONL file: every line is one JSON record.
with open('prompts_completions_prepared.jsonl', 'r') as f:
    json_list = f.readlines()
prompts_completions = list(map(json.loads, json_list))
prompts_completions[398:400]

[{'prompt': 'The writer is closest to Charlie. ->',
  'completion': ' The Greatcoats series was written by Sebastian De Castell. END'},
 {'prompt': "Which writer is closest to Charlie's heart? ->",
  'completion': " Sebastian De Castell is the writer closest to Charlie's heart. END"}]

In [140]:
# Show the last prompt that was built for GPT-3. `prompt` is only assigned inside
# the chat-loop cell, and only when `use_gpt3` is enabled, so run that cell first.
print(prompt)

Context: <context>

Chazbot: Hello there. I am Chazbot, designed to answers general questions your may have about Charlie. Please ask away.
You: how old
Chazbot: Charlie is 28 years old.
You: quit
Chazbot:
