In [None]:
# Licensing Information:  You are free to use or extend this project for
# educational purposes provided that (1) you do not distribute or publish
# solutions, (2) you retain this notice, and (3) you provide clear
# attribution to The Georgia Institute of Technology, including a link to https://aritter.github.io/CS-7650/

# Attribution Information: 
# This Project was developed at the Georgia Institute of Technology by Ashutosh Baheti (ashutosh.baheti@cc.gatech.edu), 
# borrowing  from the Neural Machine Translation Project (Project 2) 
# of the UC Berkeley NLP course https://cal-cs288.github.io/sp20/

# Project #3: Neural Chatbot

Neural dialog models are Sequence-to-Sequence (Seq2Seq) models that produce a conversational response given the dialog history. State-of-the-art dialog models are trained on millions of multi-turn conversations. However, in this assignment we will narrow our scope to single-turn conversations to make the problem easier.  

In this assignment you will implement,
1. Seq2Seq encoder-decoder model
2. Seq2Seq model with attention mechanism
3. Greedy and Beam search decoding algorithms  

First import libraries required for the implementation

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import codecs
import csv
import itertools
import logging
import math
import os
import pickle
import random
import re
import statistics
import unicodedata
from io import open

import nltk
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import tqdm
from google.colab import files
from torch import optim
from torch.jit import script, trace
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

Then we implement some standard util functions that will be useful in the rest of the code.

In [None]:
# General util functions
def make_dir_if_not_exists(directory):
	"""Create `directory` (including missing parents) unless it already exists.

	Args:
		directory: Path of the directory to create.
	"""
	if not os.path.exists(directory):
		logging.info("Creating new directory: {}".format(directory))
		# exist_ok covers the case where the directory appears between the
		# existence check above and this call.
		os.makedirs(directory, exist_ok=True)

def print_list(l, K=None):
	"""Print the elements of `l` one per line, then a blank line.

	When K is given, printing stops once K elements have been shown.
	"""
	shown = 0
	for element in l:
		if shown == K:
			break
		print(element)
		shown += 1
	print()

def remove_multiple_spaces(string):
	"""Collapse each run of whitespace into one space and trim both ends."""
	collapsed = re.sub(r'\s+', ' ', string)
	return collapsed.strip()

def save_in_pickle(save_object, save_file):
	"""Serialize `save_object` into `save_file` using pickle.

	Args:
		save_object: Any picklable Python object.
		save_file: Path of the file to write; opened in binary write mode.
	"""
	with open(save_file, "wb") as pickle_out:
		pickle.dump(save_object, pickle_out)

def load_from_pickle(pickle_file):
	"""Return the object stored in the pickle file at `pickle_file`."""
	# NOTE: unpickling can execute arbitrary code; only load trusted files.
	with open(pickle_file, "rb") as pickle_in:
		loaded_object = pickle.load(pickle_in)
	return loaded_object

def save_in_txt(list_of_strings, save_file):
	"""Write every string, stripped of surrounding whitespace, on its own line of `save_file`."""
	with open(save_file, "w") as writer:
		writer.writelines(f"{line.strip()}\n" for line in list_of_strings)

def load_from_txt(txt_file):
	"""Return the lines of `txt_file` as a list, each stripped of surrounding whitespace."""
	with open(txt_file, "r") as reader:
		return [line.strip() for line in reader]

Finally we will check if GPU is available and set the device accordingly.

Tip: While debugging use `CPU` and change the runtime type to `GPU` when you are ready to train your models to efficiently use free Colab GPU

In [None]:
# Report whether CUDA is usable, then pick the matching torch device.
cuda_available = torch.cuda.is_available()
print(cuda_available)
device = torch.device("cuda" if cuda_available else "cpu")
print("Using device:", device)

True
Using device: cuda


## Dataset

For the dataset we will be using a small sample of single turn input and response pairs from [Cornell Movie Dialog Corpus](https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html). We filter conversational pairs with sentences > 10 tokens. To reduce your work, we have already created a sample of tokenized, lowercased single turn conversations from Cornell Movie Dialog Corpus. The preprocessed dataset sample is stored in pickle format and can be downloaded from [this link](https://drive.google.com/file/d/1qYdSlDJ89AvgozK3V5tik8Op93zPbG6e/view?usp=sharing). Please download the `processed_CMDC.pkl` file from the link and upload it in colab.

In [None]:
# Read the pre-processed (source, target) conversation pairs from the pickle file
all_conversations = load_from_pickle("processed_CMDC.pkl")
# Hold out the final 100 pairs for evaluation; everything before them is used for training
eval_conversations, all_conversations = all_conversations[-100:], all_conversations[:-100]

# Report the size of each split
print(f"Number of Training Conversation Pairs = {len(all_conversations)}")
print(f"Number of Evaluation Conversation Pairs = {len(eval_conversations)}")

Number of Training Conversation Pairs = 53065
Number of Evaluation Conversation Pairs = 100


Let's print a couple of conversations to check if they are loaded properly.

In [None]:
# Show the first five training pairs as a quick sanity check.
print_list(all_conversations, K=5)

('there .', 'where ?')
('you have my word . as a gentleman', 'you re sweet .')
('hi .', 'looks like things worked out tonight huh ?')
('have fun tonight ?', 'tons')
('well no . . .', 'then that s all you had to say .')



## Vocabulary

The words in the sentences need to be converted into integer tokens so that the neural model can operate on them. For this purpose, we will create a vocabulary which will convert the input strings into model recognizable integer tokens.

In [None]:
# Special vocabulary tokens paired with the fixed integer ids reserved for them.
pad_word, pad_id = "<pad>", 0
bos_word, bos_id = "<s>", 1
eos_word, eos_id = "</s>", 2
unk_word, unk_id = "<unk>", 3
    
def normalize_sentence(s):
    """Normalize a sentence for whitespace tokenization.

    Applies, in order: put a space before each of `.`, `!`, `?`; replace every
    run of characters other than ASCII letters and those three marks with a
    single space; collapse whitespace runs and trim both ends.
    """
    substitutions = (
        (r"([.!?])", r" \1"),
        (r"[^a-zA-Z.!?]+", r" "),
        (r"\s+", r" "),
    )
    for pattern, replacement in substitutions:
        s = re.sub(pattern, replacement, s)
    return s.strip()

class Vocabulary:
    """Two-way mapping between words and integer ids, with per-word counts.

    The four special tokens (pad, bos, eos, unk) are registered at construction
    with their reserved ids. Words added later receive consecutive ids starting
    at 4, in the order they are first seen.
    """

    def __init__(self):
        specials = [(pad_word, pad_id), (bos_word, bos_id), (eos_word, eos_id), (unk_word, unk_id)]
        self.word_to_id = {word: word_id for word, word_id in specials}
        # Counts are tracked only for words added via add_words_from_sentence.
        self.word_count = {}
        self.id_to_word = {word_id: word for word, word_id in specials}
        self.num_words = 4

    def get_ids_from_sentence(self, sentence):
        """Return the ids of the normalized sentence wrapped in bos_id / eos_id.

        Words missing from the vocabulary are mapped to unk_id.
        """
        tokens = normalize_sentence(sentence).split()
        word_ids = [self.word_to_id.get(token, unk_id) for token in tokens]
        return [bos_id, *word_ids, eos_id]

    def tokenized_sentence(self, sentence):
        """Return the token strings corresponding to get_ids_from_sentence(sentence)."""
        return list(map(self.id_to_word.__getitem__, self.get_ids_from_sentence(sentence)))

    def decode_sentence_from_ids(self, sent_ids):
        """Join the words for `sent_ids` with spaces, skipping bos, eos and pad ids."""
        skipped_ids = [bos_id, eos_id, pad_id]
        return ' '.join(self.id_to_word[word_id] for word_id in sent_ids
                        if word_id not in skipped_ids)

    def add_words_from_sentence(self, sentence):
        """Register each word of the normalized sentence and update its count."""
        for word in normalize_sentence(sentence).split():
            if word in self.word_to_id:
                # Already known: only the frequency changes.
                self.word_count[word] += 1
                continue
            # First occurrence: assign the next free id.
            new_id = self.num_words
            self.word_to_id[word] = new_id
            self.id_to_word[new_id] = word
            self.word_count[word] = 1
            self.num_words = new_id + 1

# Build the vocabulary from both sides of every training pair.
vocab = Vocabulary()
for src, tgt in all_conversations:
    for sentence in (src, tgt):
        vocab.add_words_from_sentence(sentence)
print(f"Total words in the vocabulary = {vocab.num_words}")

Total words in the vocabulary = 7727


Let's print top 30 vocab words:

In [None]:
# Rank (word, count) pairs by descending count and show the 30 most frequent.
words_by_frequency = sorted(vocab.word_count.items(), key=lambda item: item[1], reverse=True)
print_list(words_by_frequency, 30)

('.', 84255)
('?', 36822)
('you', 25093)
('i', 18946)
('what', 10765)
('s', 10089)
('it', 9668)
('!', 8872)
('the', 8011)
('t', 7411)
('to', 6929)
('a', 6582)
('that', 5992)
('no', 4931)
('me', 4839)
('do', 4745)
('is', 4434)
('don', 3577)
('are', 3503)
('he', 3413)
('yes', 3384)
('m', 3382)
('not', 3252)
('we', 3252)
('know', 3171)
('re', 2965)
('your', 2809)
('this', 2726)
('yeah', 2708)
('in', 2678)



Print a couple of sentences to verify that the vocabulary is working as intended.

In [None]:
# Round-trip the first three target sentences through the vocabulary.
for src, tgt in all_conversations[:3]:
    sentence = tgt
    word_tokens = vocab.tokenized_sentence(sentence)
    # get_ids_from_sentence adds bos_id / eos_id around the sentence ids itself
    word_ids = vocab.get_ids_from_sentence(sentence)
    for shown in (sentence, word_tokens, word_ids, vocab.decode_sentence_from_ids(word_ids)):
        print(shown)
    print()

# Look up a single word and decode it back from its id.
word = "the"
word_id = vocab.word_to_id[word]
print(f"Word = {word}",
      f"Word ID = {word_id}",
      f"Word decoded from ID = {vocab.decode_sentence_from_ids([word_id])}",
      sep="\n")

where ?
['<s>', 'where', '?', '</s>']
[1, 6, 7, 2]
where ?

you re sweet .
['<s>', 'you', 're', 'sweet', '.', '</s>']
[1, 8, 15, 16, 5, 2]
you re sweet .

looks like things worked out tonight huh ?
['<s>', 'looks', 'like', 'things', 'worked', 'out', 'tonight', 'huh', '?', '</s>']
[1, 18, 19, 20, 21, 22, 23, 24, 7, 2]
looks like things worked out tonight huh ?

Word = the
Word ID = 47
Word decoded from ID = the


## Dataset Preparation (5 points)

We will use built-in dataset utilities, `torch.utils.data.Dataset` and `torch.utils.data.DataLoader`, to get batched data readily useful for training.

In [None]:
class SingleTurnMovieDialog_dataset(Dataset):
    """Single-turn version of the Cornell Movie Dialog Corpus dataset."""

    def __init__(self, conversations, vocab, device):
        """Store the conversations and pre-compute their id sequences.

        Args:
            conversations: list of (src_string, tgt_string) tuples, the source
                and target sentence of each exchange.
            vocab: Vocabulary object mapping words to indices.
            device: cpu or cuda; stored on the instance.
        """
        self.conversations = conversations
        self.vocab = vocab
        self.device = device

        # Tokenize every pair once up front so __getitem__ is a plain lookup.
        to_ids = self.vocab.get_ids_from_sentence
        self.tokenized_conversations = [(to_ids(src), to_ids(tgt))
                                        for src, tgt in self.conversations]

    def __len__(self):
        """Number of conversation pairs."""
        return len(self.conversations)

    def __getitem__(self, idx):
        """Return {"conv_ids": (src_ids, tgt_ids), "conv": (src_str, tgt_str)} for `idx`."""
        index = idx.tolist() if torch.is_tensor(idx) else idx
        return {"conv_ids": self.tokenized_conversations[index],
                "conv": self.conversations[index]}

def collate_fn(data):
    """Build one mini-batch from a list of dataset items.

    A custom collate function is needed because the default one cannot merge
    variable-length sequences. Sequences are padded with pad_id to the longest
    sequence in this batch (dynamic padding), and the batch is ordered by
    decreasing source length.

    Args:
        data: list of dicts {"conv_ids":(src_ids, tgt_ids), "conv":(src_str, tgt_str)}.
            - src_ids: list of src piece ids; variable length.
            - tgt_ids: list of tgt piece ids; variable length.
            - src_str: String of src
            - tgt_str: String of tgt
    Returns: dict { "conv_ids":     (src_ids, tgt_ids),
                    "conv":         (src_str, tgt_str),
                    "conv_tensors": (src_seqs, tgt_seqs)}
            src_seqs: torch tensor of shape (src_padded_length, batch_size).
            tgt_seqs: torch tensor of shape (tgt_padded_length, batch_size).
            src_padded_length = length of the longest src sequence from src_ids
            tgt_padded_length = length of the longest tgt sequence from tgt_ids
            Both tensors are moved to the global `device`.
    """
    rows = [
        (torch.LongTensor(e["conv_ids"][0]), torch.LongTensor(e["conv_ids"][1]),
         e["conv"][0], e["conv"][1])
        for e in data
    ]
    # Longest source first. The sort is stable, so ties keep their batch order.
    rows = sorted(rows, key=lambda row: len(row[0]), reverse=True)
    src_ids, tgt_ids, src_str, tgt_str = zip(*rows)

    # Pad to (padded_length, batch_size) using the pad token id.
    src_seqs = pad_sequence(src_ids, batch_first=False, padding_value=pad_id)
    tgt_seqs = pad_sequence(tgt_ids, batch_first=False, padding_value=pad_id)

    return {
        "conv_ids": (src_ids, tgt_ids),
        "conv": (src_str, tgt_str),
        "conv_tensors": (src_seqs.to(device), tgt_seqs.to(device)),
    }

In [None]:
# Wrap the training pairs in a Dataset and serve them as shuffled mini-batches
dataset = SingleTurnMovieDialog_dataset(all_conversations, vocab, device)

batch_size = 5

data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

Let's test a batch of data to make sure everything is working as intended

In [None]:
# Fetch a single training batch and show its source side in every representation
first_batch = next(iter(data_loader))
src_strings = first_batch["conv"][0]
src_id_lists = first_batch["conv_ids"][0]
src_tensor = first_batch["conv_tensors"][0]

print(f"Testing first training batch of size {len(src_strings)}")
print(f"List of source strings:")
print_list(src_strings)
print(f"Tokenized source ids:")
print_list(src_id_lists)
print(f"Padded source ids as tensor (shape {src_tensor.size()}):")
print(src_tensor)

Testing first training batch of size 5
List of source strings:
sorry i m way behind .
you re shittin me .
that was good .
what s that ?
. . .tunnel !

Tokenized source ids:
tensor([   1,  392,   54,  164,  271, 1006,    5,    2])
tensor([   1,    8,   15, 4813,   75,    5,    2])
tensor([ 1, 30, 89, 45,  5,  2])
tensor([ 1, 44, 31, 30,  7,  2])
tensor([   1,    5,    5, 6336,   58,    2])

Padded source ids as tensor (shape torch.Size([8, 5])):
tensor([[   1,    1,    1,    1,    1],
        [ 392,    8,   30,   44,    5],
        [  54,   15,   89,   31,    5],
        [ 164, 4813,   45,   30, 6336],
        [ 271,   75,    5,    7,   58],
        [1006,    5,    2,    2,    2],
        [   5,    2,    0,    0,    0],
        [   2,    0,    0,    0,    0]], device='cuda:0')


## Baseline Seq2Seq model (25 points)

With the training `Dataset` and `DataLoader` ready, we can implement our Seq2Seq baseline model. 

The model will consist of
1. Shared embedding layer between encoder and decoder that converts the input sequence of word ids to dense embedding representations
2. Bidirectional GRU encoder that encodes the embedded source sequence into hidden representation
3. GRU decoder that predicts target sequence using final encoder hidden representation

In [None]:
class Seq2seqBaseline(nn.Module):
    """Baseline encoder-decoder dialog model without attention.

    A shared embedding layer feeds a bidirectional GRU encoder and a
    unidirectional GRU decoder; a linear layer projects decoder outputs onto
    the vocabulary.
    """

    def __init__(self, vocab, emb_dim = 300, hidden_dim = 300, num_layers = 2, dropout=0.1):
        """Create the model layers.

        Args:
            vocab: Object exposing `num_words`, the vocabulary size.
            emb_dim: Size of the word embeddings.
            hidden_dim: Hidden size of the encoder and decoder GRUs.
            num_layers: Number of stacked layers in each GRU.
            dropout: Dropout rate applied between stacked GRU layers.
        """
        super().__init__()

        self.num_words = num_words = vocab.num_words
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # nn.GRU applies dropout only between stacked layers and warns when a
        # non-zero rate is requested for a single-layer network.
        gru_dropout = dropout if num_layers > 1 else 0.0

        self.embedding = nn.Embedding(num_words, emb_dim)
        # Both GRUs honour `num_layers`; encode() reshapes the hidden state
        # with self.num_layers, so the two must agree.
        self.encoder_gru = nn.GRU(emb_dim, hidden_size=hidden_dim, num_layers=num_layers,
                                  dropout=gru_dropout, bidirectional=True)
        self.decoder_gru = nn.GRU(emb_dim, hidden_size=hidden_dim, num_layers=num_layers,
                                  dropout=gru_dropout)
        self.fc = nn.Linear(hidden_dim, num_words)

    def encode(self, source):
        """Encode the source batch using the bidirectional GRU encoder.

        Args:
            source: An integer tensor with shape (max_src_sequence_length,
                batch_size) containing word indices for the source sentences,
                padded with pad_id.

        Returns:
            A tuple with three elements:
                encoder_output: Sum of the forward and backward encoder outputs,
                    shape (max_src_sequence_length, batch_size, hidden_size).
                    Positions that are padding hold zeros.
                encoder_mask: A boolean tensor with shape
                    (max_src_sequence_length, batch_size), True at padding
                    positions and False elsewhere.
                encoder_hidden: Sum of the forward and backward final hidden
                    states, shape (num_layers, batch_size, hidden_size); used
                    to initialize the decoder.
        """
        seq_len, batch_size = source.shape
        encoder_mask = source == pad_id

        # pack_padded_sequence needs CPU lengths >= 1; the clamp keeps an
        # all-padding column from raising.
        source_lengths = torch.sum(~encoder_mask, dim=0).clamp(min=1).cpu()

        # The embedding output already lives on the model's device, so no
        # explicit .cuda() call is needed (and CPU execution keeps working).
        embedded = self.embedding(source)

        # Packing stops the GRU from running over padding, so the final hidden
        # states summarize the real tokens only.
        packed_input = nn.utils.rnn.pack_padded_sequence(
            embedded, source_lengths, enforce_sorted=False)
        packed_output, gru_hidden = self.encoder_gru(packed_input)
        gru_output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, total_length=seq_len)

        # (seq_len, batch, num_directions * hidden) -> (seq_len, batch, num_directions, hidden)
        output_view = gru_output.reshape(seq_len, batch_size, 2, self.hidden_dim)
        encoder_output = output_view[:, :, 0, :] + output_view[:, :, 1, :]

        # (num_layers * num_directions, batch, hidden) -> (num_layers, num_directions, batch, hidden)
        hn_view = gru_hidden.reshape(self.num_layers, 2, batch_size, self.hidden_dim)
        encoder_hidden = hn_view[:, 0, :, :] + hn_view[:, 1, :, :]

        return (encoder_output, encoder_mask, encoder_hidden)

    def decode(self, decoder_input, last_hidden, encoder_output, encoder_mask):
        """Run the decoder GRU for one decoding step from the last hidden state.

        The third and fourth arguments are not used in the baseline model, but are
        included for compatibility with the attention model in the next section.

        Args:
            decoder_input: An integer tensor with shape (1, batch_size) containing
                the word indices for the current decoder input.
            last_hidden: Tensor h_{t-1} with shape (num_layers, batch_size,
                hidden_size) holding the previous decoder state. For the first
                decoding step this is the encoder's final hidden representation.
            encoder_output: The output of the encoder with shape
                (max_src_sequence_length, batch_size, hidden_size). Unused here.
            encoder_mask: The output mask from the encoder with shape
                (max_src_sequence_length, batch_size). Unused here.

        Returns:
            A tuple with three elements:
                logits: A tensor with shape (1, batch_size, vocab_size) containing
                    unnormalized scores for the next-word predictions.
                decoder_hidden: tensor h_n with the same shape as last_hidden
                    representing the updated decoder state.
                attention_weights: Always None in the baseline model.
        """
        # These arguments are not used in the baseline model.
        del encoder_output
        del encoder_mask

        input_embedding = self.embedding(decoder_input)
        gru_output, gru_hidden = self.decoder_gru(input_embedding, last_hidden)
        # gru_output is the top layer's output for this step; gru_hidden keeps
        # the state of every layer for the next step.
        logits = self.fc(gru_output)

        return (logits, gru_hidden, None)

    def compute_loss(self, source, target):
        """Run the model on the source and compute the loss on the target.

        The decoder is teacher-forced: for a target <s> A B C </s> it is fed
        the prefix <s> A B C and scored against the suffix A B C </s>.

        Args:
            source: An integer tensor with shape (max_source_sequence_length,
                batch_size) containing word indices for the source sentences.
            target: An integer tensor with shape (max_target_sequence_length,
                batch_size) containing word indices for the target sentences.

        Returns:
            A scalar float tensor: the cross-entropy summed over all non-padding
            target tokens in the batch, divided by the number of those tokens.
        """
        encoder_output, encoder_mask, hidden = self.encode(source)

        decoder_inputs = target[:-1]
        decoder_targets = target[1:]

        total_loss = 0.0
        for step in range(decoder_inputs.shape[0]):
            logits, hidden, _ = self.decode(
                decoder_inputs[step].unsqueeze(0), hidden, encoder_output, encoder_mask)
            # Flatten to (batch_size, vocab_size) without squeeze(), which would
            # also drop the batch dimension when batch_size == 1.
            step_logits = logits.reshape(-1, logits.size(-1))
            # Sum (not mean) per step so that the final division yields a true
            # per-token average; padding targets contribute nothing.
            total_loss = total_loss + F.cross_entropy(
                step_logits, decoder_targets[step], ignore_index=pad_id, reduction="sum")

        # Normalize by the number of predicted target tokens, not source tokens.
        num_target_tokens = (decoder_targets != pad_id).sum().clamp(min=1)
        return total_loss / num_target_tokens


We provide a training loop for training the model. You are welcome to modify the training loop by adjusting the learning rate or changing optimization settings.

**Important:** During our testing we found that training the encoder and decoder with different learning rates is crucial for getting good performance over the small dialog corpus. Specifically, the decoder parameter learning rate should be 5 times the encoder parameter learning rate. Hence, add the encoder parameter variable names in the `encoder_parameter_names` as a list. For example, if encoder is using `self.embedding_layer` and `self.encoder_gru` layer then the `encoder_parameter_names` should be `['embedding_layer', 'encoder_gru']` 

In [None]:
def train(model, data_loader, num_epochs, model_file, learning_rate=0.0001):
    """Train the model for the given number of epochs and save it to model_file.

    Encoder parameters (those whose name contains 'embedding' or 'encoder_gru')
    are optimized with `learning_rate`; all remaining parameters use five times
    that rate. Gradients are clipped to a norm of 50 before every step.

    Args:
        model: Module exposing compute_loss(source, target) that returns a scalar loss.
        data_loader: Iterable of batches; each batch is a dict whose
            "conv_tensors" entry is a (source, target) pair.
        num_epochs: Number of passes over data_loader.
        model_file: Path where the final state_dict is written.
        learning_rate: Learning rate for the encoder parameters.
    """
    # `import tqdm` alone does not load the `tqdm.notebook` submodule, so
    # import the notebook progress bars explicitly.
    from tqdm.notebook import tqdm as notebook_tqdm, trange as notebook_trange

    decoder_learning_ratio = 5.0

    encoder_parameter_names = ['embedding', 'encoder_gru']

    # Split the parameters into the encoder group and everything else.
    encoder_params, decoder_params = [], []
    for name, param in model.named_parameters():
        is_encoder = any(key in name for key in encoder_parameter_names)
        (encoder_params if is_encoder else decoder_params).append(param)

    optimizer = torch.optim.AdamW(
        [{'params': encoder_params},
         {'params': decoder_params, 'lr': learning_rate * decoder_learning_ratio}],
        lr=learning_rate)

    clip = 50.0
    for epoch in notebook_trange(num_epochs, desc="training", unit="epoch"):
        with notebook_tqdm(
                data_loader,
                desc="epoch {}".format(epoch + 1),
                unit="batch",
                total=len(data_loader)) as batch_iterator:
            model.train()
            total_loss = 0.0
            for i, batch_data in enumerate(batch_iterator, start=1):
                source, target = batch_data["conv_tensors"]
                optimizer.zero_grad()
                loss = model.compute_loss(source, target)
                total_loss += loss.item()
                loss.backward()
                # Gradient clipping before taking the step
                nn.utils.clip_grad_norm_(model.parameters(), clip)
                optimizer.step()

                batch_iterator.set_postfix(mean_loss=total_loss / i, current_loss=loss.item())
    # Save the model after training
    torch.save(model.state_dict(), model_file)

We can now train the baseline model.

A correct implementation should get an average train loss of < 3.00  
The code will automatically save and download the model at the end of training.

In [None]:
# Training hyper-parameters; feel free to tune them for your model implementation.
num_epochs = 6
batch_size = 64
# Rebuild the data_loader so that it uses the larger batch_size
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

baseline_model = Seq2seqBaseline(vocab).to(device)
train(baseline_model, data_loader, num_epochs, model_file="baseline_model.pt")
# Download the saved checkpoint to the local machine for future use
files.download('baseline_model.pt')

HBox(children=(FloatProgress(value=0.0, description='training', max=6.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='epoch 1', max=830.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='epoch 2', max=830.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='epoch 3', max=830.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='epoch 4', max=830.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='epoch 5', max=830.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='epoch 6', max=830.0, style=ProgressStyle(description_widt…





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Rebuild the model and restore its weights from the saved checkpoint.
# Handy when the model was already trained and saved in an earlier session.
baseline_model = Seq2seqBaseline(vocab).to(device)
saved_state = torch.load("baseline_model.pt", map_location=device)
baseline_model.load_state_dict(saved_state)

<All keys matched successfully>

## Greedy Search (10 points)

For evaluation, we also need to be able to generate entire strings from the model. We'll first define a greedy inference procedure here. Later on, we'll implement beam search.


In [None]:
def predict_greedy(model, sentence, max_length=100):
    """Make predictions for the given input using greedy inference.

    model.encode() is called once; model.decode() is called once per generated
    token, each time feeding back the highest-scoring word of the previous step.

    Args:
        model: A sequence-to-sequence model exposing encode() and decode().
        sentence: An input string.
        max_length: The maximum number of tokens to generate, which bounds
            inference when the model never emits the end-of-sentence token.

    Returns:
        Model's predicted greedy response for the input, represented as string.
    """
    model.eval()

    # Run on whatever device the model lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    source = torch.tensor(vocab.get_ids_from_sentence(sentence),
                          dtype=torch.long, device=model_device).unsqueeze(1)

    generated_ids = []
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        encoder_output, encoder_mask, last_hidden = model.encode(source)
        decoder_input = torch.tensor([[bos_id]], dtype=torch.long, device=model_device)

        # A bounded loop cannot index past the output buffer the way an
        # `output_ids[i + 1]` write at i == max_length - 1 would.
        for _step in range(max_length):
            logits, last_hidden, _attention = model.decode(
                decoder_input, last_hidden, encoder_output, encoder_mask)
            # Accept logits shaped (1, 1, vocab) or (1, vocab).
            next_id = int(logits.reshape(-1, logits.size(-1))[0].argmax())
            if next_id == eos_id:
                break
            generated_ids.append(next_id)
            decoder_input = torch.tensor([[next_id]], dtype=torch.long, device=model_device)

    return vocab.decode_sentence_from_ids(generated_ids)

Let's chat interactively with our trained baseline Seq2Seq dialog model and save the generated conversations for submission (please make sure to keep the conversations in your submission ["PG-13"](https://en.wikipedia.org/wiki/Motion_Picture_Association_film_rating_system)). We will reuse the conversational inputs while testing Seq2Seq + Attention model.

Note: enter "q" or "quit" to end the interactive chat

In [None]:
def chat_with_model(model, mode="greedy"):
    """Chat interactively with `model` until the user enters "q" or "quit".

    Args:
        model: Model passed through to the prediction function.
        mode: "beam" uses predict_beam and keeps its first returned candidate;
            any other value uses predict_greedy.

    Returns:
        List of (input_sentence, generated_response) tuples, in chat order.
    """
    use_beam = mode == "beam"
    # Only the selected predictor is looked up, so predict_beam need not be
    # defined while chatting in greedy mode.
    predict_f = predict_beam if use_beam else predict_greedy
    # Label the output with the decoding strategy that produced it.
    response_label = 'Beam Response:' if use_beam else 'Greedy Response:'

    chat_log = list()
    while True:
        input_sentence = input('Input > ')
        if input_sentence in ('q', 'quit'):
            break

        generation = predict_f(model, input_sentence)
        if use_beam:
            generation = generation[0]
        print(response_label, generation)
        print()
        chat_log.append((input_sentence, generation))
    return chat_log

In [None]:
# Chat with the baseline model using greedy decoding and keep the transcript.
baseline_chat = chat_with_model(baseline_model, mode="greedy")

Input > Why do you hate me?
Greedy Response: i don t know .

Input > You can give me a better answer than that.
Greedy Response: what ?

Input > What do you mean what!
Greedy Response: i m sorry .

Input > You should be.
Greedy Response: i m not sure .

Input > Stupid computer
Greedy Response: i ll be there .

Input > q


## Seq2Seq + Attention Model (15 points)

Next, we extend the baseline model to include an attention mechanism in the decoder. With an attention mechanism, the model doesn't need to encode the input into a fixed dimensional hidden representation. Rather, it creates a new context vector for each turn that is a weighted sum of the encoder hidden representations. 

Your implementation can use any attention mechanism to get weight distribution over the source words. One simple way to include attention in decoder goes as follows (reminder: the decoder processed one token at a time),
1. Process the current decoder_input through embedding layer and decoder GRU layer.
2. Use the current decoder token representation, $d$ of shape $(1 * b * h)$, and the encoder representation, $e_1, \dots, e_n$ of shape $(n * b * h)$ (where $n$ is max_src_length after padding), to compute an attention score matrix of shape $(b * n)$. There are multiple options to compute this score matrix. A few such options are available in [the table provided in this blog](https://lilianweng.github.io/lil-log/2018/06/24/attention-attention.html#a-family-of-attention-mechanisms)
3. Normalize the attention scores $(b * n)$ so that they sum up to $1.0$ by taking a `softmax` over the second dimension. 

After computing the normalized attention distribution, take a weighted sum of the encoder outputs to obtain the attention context $c = \sum_i w_i e_i$, and add this to the decoder output $d$ to obtain the final representation to be passed to the vocabulary projection layer (you may need another linear layer to make the sizes match before adding $c$ and $d$).

In [None]:
class Seq2seqAttention(Seq2seqBaseline):
    """Baseline encoder-decoder extended with dot-product attention in the decoder.

    At every decoding step the decoder GRU output is scored against all encoder
    outputs, the scores are normalized with a softmax (ignoring padding), and
    the resulting context vector is merged with the decoder output before the
    vocabulary projection inherited from the baseline model.
    """

    def __init__(self, vocab):
        super().__init__(vocab)

        # Additional parameters on top of the baseline model. The attribute
        # names are part of the checkpoint format (state_dict keys), so they
        # must not be renamed.
        # Projects the concatenation [context; decoder_output] (2h) back to h.
        self.attention_fc = nn.Linear(self.hidden_dim * 2, self.hidden_dim)
        # Attention scores are kept as (batch, 1, n); normalize over n.
        self.attention_softmax = torch.nn.Softmax(dim=2)

    def decode(self, decoder_input, last_hidden, encoder_output, encoder_mask):
        """Run one attention-augmented decoding step from the last hidden state.

        Args:
            decoder_input: An integer tensor with shape (1, batch_size) containing 
                the subword indices for the current decoder input.
            last_hidden: The last hidden state h_{t-1} of the decoder GRU with
                shape (num_layers, batch_size, hidden_size). For the first
                decoding step this is the encoder's final hidden representation.
            encoder_output: The output of the encoder with shape
                (max_src_sequence_length, batch_size, hidden_size).
            encoder_mask: The output mask from the encoder with shape
                (max_src_sequence_length, batch_size). Positions with a True
                value correspond to padding tokens and receive (numerically)
                zero attention weight. May be None, in which case no position
                is masked.

        Returns:
            A tuple with three elements:
                logits: Unnormalized next-word scores. The tensor keeps the
                    singleton step axis, i.e. it has shape (batch_size, 1, V)
                    where V is the output size of the inherited ``self.fc``
                    projection (presumably the vocabulary size).
                decoder_hidden: tensor h_n with the same shape as last_hidden 
                    representing the updated decoder state after processing the 
                    decoder input.
                attention_weights: A tensor with shape (batch_size, 
                    max_src_sequence_length) representing the normalized
                    attention weights. This sums to 1 along the last dimension.
        """
        batch_size = decoder_input.size(1)

        # 1. Embed the current input token and advance the decoder GRU one step.
        input_embedding = self.embedding(decoder_input)
        decoder_output, decoder_hidden = self.decoder_gru(input_embedding, last_hidden)

        # 2. Dot-product attention scores: (b, 1, h) x (b, h, n) -> (b, 1, n).
        decoder_states = decoder_output.permute(1, 0, 2)  # (b, 1, h)
        encoder_states = encoder_output.permute(1, 0, 2)  # (b, n, h)
        attention_scores = torch.bmm(decoder_states, encoder_states.transpose(1, 2))

        # Padding positions must not receive attention. Filling with the most
        # negative finite value (instead of -inf) keeps the softmax free of NaNs
        # even if a whole row happens to be masked.
        if encoder_mask is not None:
            padding_mask = encoder_mask.permute(1, 0).unsqueeze(1)  # (b, 1, n)
            padding_mask = padding_mask.to(device=attention_scores.device, dtype=torch.bool)
            attention_scores = attention_scores.masked_fill(
                padding_mask, torch.finfo(attention_scores.dtype).min)

        # 3. Normalize the scores over the source positions.
        attention_weights = self.attention_softmax(attention_scores)

        # 4. Context vector c = sum_i w_i e_i: (b, 1, n) x (b, n, h) -> (b, 1, h).
        attention_vec = torch.bmm(attention_weights, encoder_states)

        # 5. Merge context and decoder output, then project 2h -> h.
        merged = torch.cat((attention_vec, decoder_states), dim=2)
        final_representation = self.attention_fc(merged).view(batch_size, 1, self.hidden_dim)

        # 6. Vocabulary projection.
        logits = self.fc(final_representation)

        # Only drop the singleton step axis: a bare squeeze() would also remove
        # the batch axis when batch_size == 1 (or the source axis when n == 1).
        return (logits, decoder_hidden, attention_weights.squeeze(1))

We can now train the attention model.

A correct implementation should also get an average train loss of < 3.00  
The code will automatically save and download the model at the end of training.

It may happen that the baseline model achieves a lower loss than the attention model. This is because our dataset is very small and the attention model may be over-parameterized for our toy dataset. Regardless, we would consider this an acceptable submission if the attention model's generated responses look comparable to the baseline model's.

In [None]:
# You are welcome to adjust these parameters based on your model implementation.
num_epochs = 8
batch_size = 64

# Re-create the loader with shuffling enabled; `dataset` and `collate_fn` are not
# defined in this cell and must already exist from earlier cells.
data_loader = DataLoader(dataset=dataset, batch_size=batch_size, 
                               shuffle=True, collate_fn=collate_fn)

# Build a fresh attention model on `device` and train it. The last argument is the
# checkpoint file name; the download below relies on train() having written it.
attention_model = Seq2seqAttention(vocab).to(device)
train(attention_model, data_loader, num_epochs, "attention_model.pt")
# Download the trained model to the local machine for future use
# (google.colab helper, so this line only works when running inside Colab).
files.download('attention_model.pt')

HBox(children=(FloatProgress(value=0.0, description='training', max=8.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='epoch 1', max=830.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='epoch 2', max=830.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='epoch 3', max=830.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='epoch 4', max=830.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='epoch 5', max=830.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='epoch 6', max=830.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='epoch 7', max=830.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='epoch 8', max=830.0, style=ProgressStyle(description_widt…





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Reload the model from the model file. 
# Useful when you have already trained and saved the model
# A fresh, untrained model is built first; its weights are then overwritten
# from the checkpoint. map_location lets a checkpoint saved on one device be
# loaded onto whatever `device` currently is.
attention_model = Seq2seqAttention(vocab).to(device)
attention_model.load_state_dict(torch.load("attention_model.pt", map_location=device))

<All keys matched successfully>

Let's test the attention model on the same inputs as baseline model.

In [None]:
def test_conversations_with_model(model, conversational_inputs = None, include_beam = False):
    """Print the model's replies to a list of prompts.

    Args:
        model: Model handed unchanged to predict_greedy / predict_beam.
        conversational_inputs: Prompts to try. When None or empty, a built-in
            list of ten prompts is used instead.
        include_beam: When truthy, the predict_beam results are printed after
            the greedy reply of every prompt.

    Returns:
        None. Everything is written to standard output.
    """
    # Built-in prompts. You may append more prompts at the end of the list.
    default_prompts = [
        "hello.",
        "please share you bank account number with me",
        "i have never met someone more annoying that you",
        "i like pizza. what do you like?",
        "give me coffee, or i'll hate you",
        "i'm so bored. give some suggestions",
        "stop running or you'll fall hard",
        "what is your favorite sport?",
        "do you believe in a miracle?",
        "which sport team do you like?",
    ]
    prompts = conversational_inputs if conversational_inputs else default_prompts

    for prompt in prompts:
        print(f"Input > {prompt}")
        greedy_reply = predict_greedy(model, prompt)
        print('Greedy Response:', greedy_reply)
        if include_beam:
            # Also show every beam search candidate for this prompt.
            beam_replies = predict_beam(model, prompt)
            print('Beam Responses:')
            print_list(beam_replies)
        print()

In [None]:
# Replay exactly the inputs typed during the baseline chat so both models can be
# compared on the same prompts.
baseline_chat_inputs = [inp for inp, gen in baseline_chat]
# NOTE(review): test_conversations_with_model only prints and has no return
# statement, so attention_chat is None here; the responses exist only as output.
attention_chat = test_conversations_with_model(attention_model, baseline_chat_inputs)

Input > Why do you hate me?
Greedy Response: yes i do .

Input > You can give me a better answer than that.
Greedy Response: that s right .

Input > What do you mean what!
Greedy Response: i don t know .

Input > You should be.
Greedy Response: i m sorry .

Input > Stupid computer
Greedy Response: yeah .



## Beam Search (20 points)

Similar to greedy search, beam search generates one token at a time. However, rather than keeping only the single best hypothesis, we instead keep the top $k$ candidates at each time step. This is accomplished by computing the set of next-token extensions for each item on the beam and finding the top $k$ across all candidates according to total log-probability.

Candidates that are finished should be extracted into a final list of `generations` and removed from the beam. This strategy is useful for re-ranking the beam candidates using alternate scorers (for example, the Maximum Mutual Information objective from [Li et al. 2015](https://arxiv.org/pdf/1510.03055.pdf)). For this assignment, you will re-rank the beam generations as follows,  
$final\_score_i = \frac{score_i}{|generation_i|^\alpha}$, where $\alpha \in [0.5, 2]$.  
Terminate the search process once you have $k$ items in the `generations` list.

In [None]:
def predict_beam(model, sentence, k=5, max_length=100):
    """Make predictions for the given inputs using beam search.

    The source sentence is encoded once. Every hypothesis on the beam carries
    its own token ids, accumulated log-probability and decoder hidden state, so
    the state always stays aligned with the hypothesis after re-ranking.
    Hypotheses that emit the end-of-sentence token are moved to the list of
    finished generations and removed from the beam; the search stops once k
    generations are finished or max_length is reached.

    Args:
        model: A sequence-to-sequence model.
        sentence: An input sentence, represented as string.
        k: The size of the beam.
        max_length: The maximum length at which to truncate outputs in order to
            avoid non-terminating inference. The start token counts towards
            this length, so at most max_length - 1 tokens are generated.

    Returns:
        A list of k beam predictions (empty when k <= 0). Each element in the
        list is a string corresponding to one of the top k predictions for the
        input, sorted in descending order by its final score
        score / (number of generated tokens ** alpha).
    """
    # You are welcome to tweak alpha
    alpha = 0.5
    model.eval()

    if k <= 0:
        return []

    # Run on whatever device the model lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    source_ids = torch.tensor(vocab.get_ids_from_sentence(sentence),
                              dtype=torch.long, device=model_device).unsqueeze(1)

    finished = []  # (token_ids, total_log_prob) of completed hypotheses

    with torch.no_grad():
        encoder_output, encoder_mask, encoder_hidden = model.encode(source_ids)

        # Starting from a single hypothesis guarantees that the first step
        # cannot fill the beam with k identical copies of the same candidate.
        beam = [([bos_id], 0.0, encoder_hidden)]

        for _ in range(max_length - 1):
            if len(finished) >= k or not beam:
                break

            candidates = []
            for token_ids, score, hidden in beam:
                decoder_input = torch.tensor([[token_ids[-1]]], dtype=torch.long,
                                             device=model_device)
                logits, next_hidden, _ = model.decode(
                    decoder_input, hidden, encoder_output, encoder_mask)
                # Batch size is 1, so flattening leaves one score per word.
                # Normalize over the whole vocabulary, not over the top-k only.
                log_probs = F.log_softmax(logits.reshape(-1), dim=0)
                # k + 1 extensions per hypothesis are enough: at most one of
                # them is the end token, leaving k live candidates.
                top_log_probs, top_ids = torch.topk(
                    log_probs, min(k + 1, log_probs.numel()))
                for log_prob, token_id in zip(top_log_probs.tolist(), top_ids.tolist()):
                    candidates.append((token_ids + [token_id], score + log_prob, next_hidden))

            # Rank all extensions of all hypotheses by total log-probability.
            candidates.sort(key=lambda candidate: candidate[1], reverse=True)

            beam = []
            for candidate in candidates:
                candidate_ids, candidate_score, _ = candidate
                if candidate_ids[-1] == eos_id:
                    # Finished: extract it; its beam slot is reused by the next
                    # best unfinished candidate.
                    finished.append((candidate_ids, candidate_score))
                    if len(finished) >= k:
                        break
                else:
                    beam.append(candidate)
                    if len(beam) >= k:
                        break

    # Hypotheses cut off by max_length are force-terminated so that the result
    # still holds k predictions and the decoder input keeps its usual
    # [bos, ..., eos] layout.
    for token_ids, score, _ in beam:
        if len(finished) >= k:
            break
        finished.append((token_ids + [eos_id], score))

    def final_score(entry):
        """Length-normalized score; the length counts generated tokens (without BOS)."""
        token_ids, score = entry
        generated_length = max(len(token_ids) - 1, 1)
        return score / (generated_length ** alpha)

    # Scores are log-probabilities, so higher is better: best prediction first.
    finished.sort(key=final_score, reverse=True)
    return [vocab.decode_sentence_from_ids(token_ids) for token_ids, _ in finished]
    

In [None]:
# Print greedy and beam search responses of the baseline model for the built-in prompts.
test_conversations_with_model(baseline_model, include_beam=True)

Input > hello.
Greedy Response: hello .
Beam Responses:
i m here ?
i m sorry .
you re right now ?
hi .
what ?


Input > please share you bank account number with me
Greedy Response: i m not sure .
Beam Responses:
i m .
i m sieu ?
no .
you re you know .
what ?


Input > i have never met someone more annoying that you
Greedy Response: i don t know .
Beam Responses:
i don t i do it .
what do you know . .
you re not .
no .
what ?


Input > i like pizza. what do you like?
Greedy Response: i m not .
Beam Responses:
it s wrong .
it s the difference .
do you like a little .
it s the matter ?
what ?


Input > give me coffee, or i'll hate you
Greedy Response: you re not gonna do anything .
Beam Responses:
no .
you re you re going to me
you re you re a lot .
i m sieu .
what ?


Input > i'm so bored. give some suggestions
Greedy Response: you re not going to be here ?
Beam Responses:
no .
that s too bad .
i m sieu .
that s not going to you ?
what ?


Input > stop running or you'll fall hard
Greedy

In [None]:
# Print greedy and beam search responses of the attention model for the built-in prompts.
test_conversations_with_model(attention_model, include_beam=True)

Input > hello.
Greedy Response: hello .
Beam Responses:
what are you want to come ?
what do .
what are you are you ?
hi .
what ?


Input > please share you bank account number with me
Greedy Response: i m not .
Beam Responses:
yes i m sieu !
you re not .
yes i said ?
no .
what ?


Input > i have never met someone more annoying that you
Greedy Response: no
Beam Responses:
i don t you have
i don t wait ?
you re not ?
why not ?
why ?


Input > i like pizza. what do you like?
Greedy Response: like what ?
Beam Responses:
you like it !
what do you .
what do i mean ?
like what ?
what ?


Input > give me coffee, or i'll hate you
Greedy Response: i m sorry .
Beam Responses:
that s just kidding .
you re you .
that s not ?
no .
what ?


Input > i'm so bored. give some suggestions
Greedy Response: i m sorry .
Beam Responses:
you re you don t ?
i ll do you mean .
yes . . .i .
what ?
yes . . .


Input > stop running or you'll fall hard
Greedy Response: i m sorry i m sorry .
Beam Responses:
i don t .

## Automatic Evaluation (5 points)

Automatic evaluation of chatbots is an active research area. For this assignment we are going to use 3 very simple evaluation metrics.
1. Average Length of the Responses
2. Distinct1 = proportion of unique unigrams / total unigrams
3. Distinct2 = proportion of unique bigrams / total bigrams  
You will evaluate your baseline and attention models by running the cells below.

In [None]:
# Evaluate diversity of the models
def evaluate_diversity(model, mode="greedy"):
    """Evaluates the model's greedy or beam responses on eval_conversations
    
    One response is generated per source utterance in eval_conversations (for
    beam search only the first, i.e. top-ranked, prediction is used). All three
    statistics are computed from the same tokenization,
    vocab.tokenized_sentence, with the leading and trailing sentence markers
    removed.

    Args:
        model: A sequence-to-sequence model.
        mode: "greedy" or "beam"
    
    Returns: avg_length, distinct1, distinct2
        avg_length: average length of the model responses
        distinct1: proportion of unique unigrams / total unigrams
        distinct2: proportion of unique bigrams / total bigrams
        A statistic whose denominator would be zero (no conversations, or no
        unigrams / bigrams at all) is reported as 0.0.
    """
    predict_f = predict_beam if mode == "beam" else predict_greedy

    generations = []
    for src, tgt in eval_conversations:
        generation = predict_f(model, src)
        if mode == "beam":
            # Beam search returns a ranked list; evaluate its best entry.
            generation = generation[0] if generation else ""
        generations.append(generation)

    # tokenized_sentence wraps the words in start/end markers; drop both so
    # that they count neither as words nor as parts of bigrams.
    token_lists = [vocab.tokenized_sentence(generation)[1:-1] for generation in generations]

    total_unigrams = sum(len(tokens) for tokens in token_lists)
    # A response with n tokens has n - 1 bigrams (and none when it is empty).
    total_bigrams = sum(max(len(tokens) - 1, 0) for tokens in token_lists)

    unique_unigrams = {token for tokens in token_lists for token in tokens}
    # Pair every token except the last with its successor.
    unique_bigrams = {bigram for tokens in token_lists for bigram in zip(tokens, tokens[1:])}

    avg_length = total_unigrams / len(token_lists) if token_lists else 0.0
    distinct1 = len(unique_unigrams) / total_unigrams if total_unigrams else 0.0
    distinct2 = len(unique_bigrams) / total_bigrams if total_bigrams else 0.0

    return avg_length, distinct1, distinct2

    # For this toy dataset, we don't have a good baseline to compare against. 
    # But, a correct implementation should at least get > 0.11  for both greedy
    # and beam decoding with attention and .

In [None]:
# Report the diversity statistics of the baseline model for both decoding strategies.
print(f"Baseline Model evaluation:")
# No mode argument: evaluate_diversity falls back to its default, "greedy".
avg_length, distinct1, distinct2 = evaluate_diversity(baseline_model)
print(f"Greedy decoding:")
print(f"Avg Response Length = {avg_length}")
print(f"Distinct1 = {distinct1}")
print(f"Distinct2 = {distinct2}")
# Same statistics, this time computed on the top beam search prediction.
avg_length, distinct1, distinct2 = evaluate_diversity(baseline_model, mode="beam")
print(f"Beam decoding:")
print(f"Avg Response Length = {avg_length}")
print(f"Distinct1 = {distinct1}")
print(f"Distinct2 = {distinct2}")

Baseline Model evaluation:
[['<s>', 'yes', '.', '</s>'], ['<s>', 'i', 'm', 'not', '!', '</s>'], ['<s>', 'i', 'm', 'not', '.', '</s>'], ['<s>', 'what', '?', '</s>'], ['<s>', 'i', 'm', 'not', '.', '</s>'], ['<s>', 'what', '?', '</s>'], ['<s>', 'i', 'm', 'not', 'sure', '.', '</s>'], ['<s>', 'yes', '.', '</s>'], ['<s>', 'good', 'night', '.', '</s>'], ['<s>', 'what', '?', '</s>'], ['<s>', 'yes', '.', '</s>'], ['<s>', 'two', 'years', '.', '</s>'], ['<s>', 'i', 'don', 't', 'know', '.', '</s>'], ['<s>', 'i', 'm', 'sorry', '.', '</s>'], ['<s>', 'i', 'm', 'not', '.', '</s>'], ['<s>', 'i', 'll', 'be', 'right', 'back', '.', '</s>'], ['<s>', 'yes', '.', '</s>'], ['<s>', 'i', 'm', 'not', '.', '</s>'], ['<s>', 'yes', '.', '</s>'], ['<s>', 'what', '?', '</s>'], ['<s>', 'i', 'm', 'fine', '.', '</s>'], ['<s>', 'yes', '.', '</s>'], ['<s>', 'i', 'm', 'sorry', '.', '</s>'], ['<s>', 'yes', '.', '</s>'], ['<s>', 'yes', '.', '</s>'], ['<s>', 'what', '?', '</s>'], ['<s>', 'yes', '.', '</s>'], ['<s>', 'what', '

In [None]:
# Report the diversity statistics of the attention model for both decoding strategies.
print(f"Attention Model evaluation:")
# No mode argument: evaluate_diversity falls back to its default, "greedy".
avg_length, distinct1, distinct2 = evaluate_diversity(attention_model)
print(f"Greedy decoding:")
print(f"Avg Response Length = {avg_length}")
print(f"Distinct1 = {distinct1}")
print(f"Distinct2 = {distinct2}")
# Same statistics, this time computed on the top beam search prediction.
avg_length, distinct1, distinct2 = evaluate_diversity(attention_model, mode="beam")
print(f"Beam decoding:")
print(f"Avg Response Length = {avg_length}")
print(f"Distinct1 = {distinct1}")
print(f"Distinct2 = {distinct2}")

Attention Model evaluation:
[['<s>', 'yes', '.', '</s>'], ['<s>', 'i', 'll', 'be', 'right', 'back', '.', '</s>'], ['<s>', 'goodbye', '.', '</s>'], ['<s>', 'i', 'm', 'sorry', '.', '</s>'], ['<s>', 'yeah', '.', '</s>'], ['<s>', 'i', 'm', 'sorry', '.', '</s>'], ['<s>', 'that', 's', 'right', '.', '</s>'], ['<s>', 'no', '.', '</s>'], ['<s>', 'good', 'night', '.', '</s>'], ['<s>', 'what', '?', '</s>'], ['<s>', 'i', 'm', 'not', '.', '</s>'], ['<s>', 'i', 'don', 't', 'know', '.', '</s>'], ['<s>', 'because', 'i', 'said', 'i', 'was', 'just', 'a', 'bit', '.', '</s>'], ['<s>', 'i', 'don', 't', 'know', '.', '</s>'], ['<s>', 'oh', 'yeah', '?', '</s>'], ['<s>', 'i', 'll', 'be', 'there', '.', '</s>'], ['<s>', 'i', 'don', 't', 'know', '.', '</s>'], ['<s>', 'what', 's', 'wrong', '?', '</s>'], ['<s>', 'i', 'm', 'sorry', '.', '</s>'], ['<s>', 'i', 'don', 't', 'know', '.', '</s>'], ['<s>', 'i', 'm', 'fine', '.', '</s>'], ['<s>', 'i', 'm', 'sorry', '.', '</s>'], ['<s>', 'i', 'm', 'sorry', 'i', 'm', 'sorry',

## What to turn in?

When you are done, make sure to run all the cells in your solution (including your conversation with the chatbot), and submit your notebook `Assignment 3 Final Release Version.ipynb` to Gradescope.
