# Chatbot for the Harper Valley Dataset

In this section, we fine tune a pre-trained transformer model to the Harper Valley Dataset

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

path = 'NLP-Project'

os.chdir(f'/content/drive/MyDrive/{path}')
os.getcwd()

'/content/drive/MyDrive/NLP-Project'

## Install packages and initial imports

In [None]:
!pip install transformers
!pip install --upgrade accelerate
!pip install datasets

In [5]:
import matplotlib.pyplot as plt
import numpy as np
import re
import json
import torch
import datasets
from sklearn.model_selection import train_test_split

In [6]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from tqdm.notebook import tqdm, trange

from transformers import (
    AdamW,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)

## Load the data and get conversations

Get the data

In [7]:
import pandas as pd
transcript_data = pd.read_csv('TranscriptDB.csv')

Clean the data by removing noise and unknown symbols from the human transcription

In [8]:
transcript_data_sorted = transcript_data.sort_values(by=['index'])

# The unique words included in the conversations which don't have a specific meaning, for example [noise] [cough] ecc
unique_words = set(word for sublist in transcript_data_sorted['human_transcript'].str.findall(r'\[([^\]]+)\]').dropna() for word in sublist)

print(unique_words)

pattern = r'(\[(' + '|'.join(unique_words) + r')\])|\<unk\>'

# Substitute the unique words and the <unk> symbols in each conversation with the empty '' string
transcript_data_sorted['human_transcript'] = transcript_data_sorted['human_transcript'].replace(to_replace=pattern, value='', regex=True).str.strip()

# Remove frome the dataframe the rows for wich the human_transcript is empty
transcript_data_sorted.drop(transcript_data_sorted[transcript_data_sorted['human_transcript'] == ''].index, inplace=True)
transcript_data_sorted = transcript_data_sorted.dropna(subset=['human_transcript'])


{'dogs', 'kids', 'ringing', 'unintelligible', 'music', 'noise', 'laughter', 'cough', 'baby'}


Get the list of conversations

In [9]:
# Function to convert DataFrame to array of arrays of dictionaries
def convert_to_conversations(df):
    conversations = []

    # Group by 'identifier'
    grouped = df.groupby('identifier')

    for group_id, group_df in grouped:
        # Sort by 'index'
        sorted_group = group_df.sort_values(by='index')

        # Initialize the list for this conversation
        conversation = []

        # Iterate through the sorted rows
        previous_type = None
        accumulated_text = ""

        for _, row in sorted_group.iterrows():
            current_type = row['speaker_role']
            current_text = row['human_transcript']

            if current_type == previous_type:
                # Concatenate text if the type is the same as the previous one
                accumulated_text += " " + current_text
            else:
                # Add the previous accumulated text to the conversation list
                if previous_type is not None:
                    conversation.append({'speaker_role': previous_type, 'text': accumulated_text})

                # Reset for the new type
                previous_type = current_type
                accumulated_text = current_text

        # Add the last accumulated text to the conversation list
        if previous_type is not None:
            conversation.append({'speaker_role': previous_type, 'text': accumulated_text})

        # Add to the final list of conversations
        conversations.append(conversation)

    return conversations

# Get the conversations
conversations = convert_to_conversations(transcript_data_sorted)
print(conversations[0])

[{'speaker_role': 'agent', 'text': 'hello this is harper valley national bank my name is elizabeth how can i help you today'}, {'speaker_role': 'caller', 'text': 'hi my name is patricia brown i lost my debit card can you send me a new one'}, {'speaker_role': 'agent', 'text': 'which card would you like to replace'}, {'speaker_role': 'caller', 'text': 'my debit card'}, {'speaker_role': 'agent', 'text': 'can you repeat that please'}, {'speaker_role': 'caller', 'text': 'yes my debit card'}, {'speaker_role': 'agent', 'text': 'is there anything else i can help you with today'}, {'speaker_role': 'caller', 'text': 'no that was going to be it'}, {'speaker_role': 'agent', 'text': 'thank you for calling have a great day'}, {'speaker_role': 'caller', 'text': 'bye'}]


Split the data in training and validation

In [12]:
train_data, validation_data = train_test_split(conversations,test_size=0.2)

## Load the model and preprocessing

We load the model to fine tune

In [13]:
output_dir = 'chatbot-3'
model_name_or_path = 'microsoft/DialoGPT-medium'

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
config = AutoConfig.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelWithLMHead.from_pretrained(
    model_name_or_path,
    from_tf=False,
    config=config,
)
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

We set the pad_token to the eos_token

In [15]:
tokenizer.pad_token = tokenizer.eos_token

We convert each conversation in a dialogue string

In [16]:
def sample_to_string(sample, eos_token):
    # Join strings
    dialogue = eos_token.join(f" {utterance['speaker_role']}: {utterance['text']} " for utterance in sample)
    # Build the dialogue string
    dialogue_string = f"{dialogue}{eos_token}"

    return dialogue_string

We apply the function to all the conversations

In [17]:
train_data_str = [sample_to_string(dialogue, tokenizer.eos_token) for dialogue in train_data]
validation_data_str = [sample_to_string(dialogue, tokenizer.eos_token) for dialogue in validation_data]

In [18]:
validation_data_str[0]

" agent: hello this is harper valley national bank my name is robert how can i help you today <|endoftext|> caller: hi my name is michael johnson i would like to pay a bill <|endoftext|> agent: and what bill would you like to pay <|endoftext|> caller: the company is fossil gas <|endoftext|> agent: and the amount <|endoftext|> caller: um one hundred and twenty one dollars <|endoftext|> agent: alright great and i have put that payment through is there anything else i can help you with <|endoftext|> caller: nope that's it thank you <|endoftext|> agent: thank you for calling have a great day <|endoftext|> caller: you too <|endoftext|> agent: bye <|endoftext|>"

Define the Dataset class

In [19]:
class ConversationDataset(Dataset):
    def __init__(self, tokenizer, conversations):

        self.examples = []
        for c in conversations:
            conv = tokenizer.encode(c, max_length = 500, padding = 'max_length')
            self.examples.append(conv)


    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item], dtype=torch.long)

Define the function for loading a specific split of the data

In [20]:
def load_and_cache_examples(tokenizer, df_trn, df_val, evaluate=False):
    return ConversationDataset(tokenizer, df_val if evaluate else df_trn)

## Training

Define the training loop

In [21]:
# Training of model

def train(train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer):
    """ Train the model """

    #Training Arguments
    train_batch_size = 1
    num_train_epochs = 3
    gradient_accumulation_steps = 8
    warmup_steps = 0
    learning_rate = 5e-5
    adam_epsilon = 1e-8
    weight_decay = 0.0
    save_steps = 500
    overwrite_output_dir = True

    #Collate
    def collate(examples):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)


    #Data Loader
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=train_batch_size, collate_fn=collate, drop_last = True
    )

    t_total = len(train_dataloader) // gradient_accumulation_steps * num_train_epochs

    model.resize_token_embeddings(len(tokenizer))

    # Prepare optimizer(AdamW) and scheduler (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
    )


    # TRAIN
    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0

    # Check if continuing training from a checkpoint
    if model_name_or_path and os.path.exists(model_name_or_path):
        try:
            # set global_step to gobal_step of last saved checkpoint from model path
            checkpoint_suffix = model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // gradient_accumulation_steps)
        except ValueError:
            print("  Starting fine-tuning.")

    tr_loss = 0.0

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(num_train_epochs), desc="Epoch")

    #Training Loop
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = (batch, batch)
            if inputs.shape[1] > 1024: continue
            inputs = inputs.to(device)
            labels = labels.to(device)
            model.train()
            outputs = model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers

            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps
            loss.backward()

            tr_loss += loss.item()

            if (step + 1) % gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if save_steps > 0 and global_step % save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))


    return global_step, tr_loss / global_step

Get the training data and start the training process

In [22]:
# Training
train_dataset = load_and_cache_examples(tokenizer, train_data_str, validation_data_str, evaluate=False)

global_step, tr_loss = train(train_dataset, model, tokenizer)
print(f"global_step = {global_step}, average loss = {tr_loss}")

# Create output directory if needed
os.makedirs(output_dir, exist_ok=True)

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)




Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1156 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1156 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1156 [00:00<?, ?it/s]

global_step = 432, average loss = 0.2189286477293569




GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

## Evaluate the model

Define the evaluation loop

In [40]:
def evaluate(evaluation_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix=""):

    eval_batch_size = 1

    def collate(examples):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(evaluation_dataset)
    eval_dataloader = DataLoader(
        evaluation_dataset, sampler=eval_sampler, batch_size=eval_batch_size, collate_fn=collate, drop_last = True
    )


    # Eval!
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = (batch, batch)
        inputs = inputs.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            outputs = model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    return result

Load the model

In [None]:
# Load a trained model and vocabulary that you have fine-tuned
model = AutoModelWithLMHead.from_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model.to(device)


Run the evaluation process on the validation dataset

In [41]:
# Evaluation
evaluation_dataset = load_and_cache_examples(tokenizer, train_data_str, validation_data_str, evaluate=True)

result = evaluate(evaluation_dataset, model, tokenizer)
print(f"Perplexity on the validation set = {result['perplexity']}")

Evaluating:   0%|          | 0/290 [00:00<?, ?it/s]

Perplexity on the validation set = 826.120849609375


Seems very bad

We can also compute the perplexity on a single example

In [52]:
import random

random.seed(2)

idx = random.choice(range(len(validation_data)))
dialogue = validation_data[idx]
print(dialogue)

[{'speaker_role': 'agent', 'text': 'hello this is happy valley national bank my name is linda how can i help you today'}, {'speaker_role': 'caller', 'text': 'hi my name is james rodriguez i would like to schedule an appointment'}, {'speaker_role': 'agent', 'text': 'sure i can help you with that uh what day would you like for what day would you like for your appointment'}, {'speaker_role': 'caller', 'text': 'tuesday please'}, {'speaker_role': 'agent', 'text': 'okay tuesday and what time would you lie for your appointment'}, {'speaker_role': 'caller', 'text': 'three forty five p m'}, {'speaker_role': 'agent', 'text': 'three forty five p m okay is there anything else i can help you with'}, {'speaker_role': 'caller', 'text': 'no that was it'}, {'speaker_role': 'agent', 'text': 'okay thank you for calling have a great day'}]


We obtain the true response

In [53]:
response_idx = len(dialogue) // 2

original_response = dialogue[response_idx]
original_response_string = f"{original_response['speaker_role']}: {original_response['text']}"
print(original_response_string)

agent: okay tuesday and what time would you lie for your appointment


We build the context from the dialogue excluding the part starting from the response

In [54]:
context =dialogue[:response_idx]
context_string = sample_to_string(context, tokenizer.eos_token)
print(context_string.replace(tokenizer.eos_token, '\n'))

 agent: hello this is happy valley national bank my name is linda how can i help you today 
 caller: hi my name is james rodriguez i would like to schedule an appointment 
 agent: sure i can help you with that uh what day would you like for what day would you like for your appointment 
 caller: tuesday please 



We generate the response using the model and we observe that the answer seems pretty accurate

In [56]:
# Encode context
input_encoding = tokenizer(context_string, return_tensors='pt').to(device)
# Generate response
output_ids =  model.generate(input_encoding.input_ids, max_new_tokens=50,num_beams=8, pad_token_id=tokenizer.eos_token_id)
# Decode generated response
generated_response = tokenizer.decode(output_ids[0, input_encoding.input_ids.size(1):], skip_special_tokens=True)
print(generated_response)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


 agent: tuesday and what time would you like for your appointment 


We compute the perplexity

In [57]:
# Encode dialogue
input_encoding = tokenizer(context_string + original_response_string + tokenizer.eos_token, return_tensors='pt').to(device)
# Compute model outputs
outputs = model(**input_encoding)

In [58]:
labels = tokenizer(original_response_string + tokenizer.eos_token, return_tensors='pt').input_ids.to(device)
labels.size()

torch.Size([1, 15])

In [59]:
logits = outputs.logits[:, -labels.size(1):]
logits.size()

torch.Size([1, 15, 50257])

In [60]:
import torch.nn.functional as F

# Shift logits to exclude the last element
shift_logits = logits[..., :-1, :].contiguous()
# shift labels to exclude the first element
shift_labels = labels[..., 1:].contiguous()
# Compute loss
lm_loss = F.cross_entropy(
    shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
)
lm_loss

tensor(2.6899, device='cuda:0', grad_fn=<NllLossBackward0>)

In [61]:
ppl = torch.exp(lm_loss)
print(ppl)

tensor(14.7298, device='cuda:0', grad_fn=<ExpBackward0>)


## Chatbot interaction

In [None]:
dialogue = [{'speaker_role': 'agent', 'text': 'hello this is harper valley national bank my name is elizabeth how can i help you today'}]

tokenizer.padding_side = 'left'
# Maximum dialogue length (in turn pairs)
max_len = 10

print(dialogue[0]['text'])

for i in range(max_len):
    # Read user message
    user_message = input("Caller: ")
    # Append message to dialogue history
    dialogue.append(
        {'speaker_role': 'caller', 'text': user_message.lower()}
    )
    # Convert dialogue to string
    input_string = sample_to_string(dialogue, tokenizer.eos_token)
    # Encode input
    input_encoding = tokenizer(input_string, return_tensors='pt').to(device)
    # Generate DialoGPT response
    output_ids = model.generate(input_encoding.input_ids, num_beams=8, max_new_tokens=35, pad_token_id=tokenizer.eos_token_id, early_stopping = True)
    chatbot_response = tokenizer.decode(output_ids[0, input_encoding.input_ids.size(1):], skip_special_tokens=True)
    # Crop initial speaker token
    chatbot_response = chatbot_response[7:]
    # Append chatbot response to dialogue history
    dialogue.append(
        {'speaker_role': 'agent', 'text': chatbot_response}
    )
    # Print chatbot response
    print(f"Chatbot: {chatbot_response}")