In [None]:
# Licensing Information:  You are free to use or extend this project for
# educational purposes provided that (1) you do not distribute or publish
# solutions, (2) you retain this notice, and (3) you provide clear
# attribution to the license.

# Attribution Information:
# This Project was brrowed from the Georgia Institute of Technology by Ashutosh Baheti (ashutosh.baheti@cc.gatech.edu),
# and from the Neural Machine Translation Project (Project 2)
# of the UC Berkeley NLP course https://cal-cs288.github.io/sp20/

# Neural Chatbot





Neural Dialog Model are Sequence-to-Sequence (Seq2Seq) models that produce conversational response given the dialog history. State-of-the-art dialog models are trained on millions of multi-turn conversations. However, in this assignment we will narrow our scope to single turn conversations to make the problem easier.  

In this assignment you will implement,
1. Seq2Seq encoder-decoder model
2. Seq2Seq model with attention mechanism
3. Greedy and Beam search decoding algorithms  

First import libraries required for the implementation

In [None]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import numpy as np
import csv
import random
import re
import os
import unicodedata
from io import open
import math
import pickle

from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, unpack_sequence
import tqdm
from google.colab import files

Then we implement some standard util functions that will be useful in the rest of the code.

In [None]:
# General util functions

def print_list(l, K=None):
	# If K is given then only print first K
	for i, e in enumerate(l):
		if i == K:
			break
		print(e)
	print()

def save_in_pickle(save_object, save_file):
	with open(save_file, "wb") as pickle_out:
		pickle.dump(save_object, pickle_out)

def load_from_pickle(pickle_file):
	with open(pickle_file, "rb") as pickle_in:
		return pickle.load(pickle_in)

Finally we will check if GPU is available and set the device accordingly.

Tip: While debugging use `CPU` and change the runtime type to `GPU` when you are ready to train your models to efficiently use free Colab GPU

In [None]:
print(torch.cuda.is_available())
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
print("Using device:", device)

True
Using device: cuda


## Dataset

For the dataset we will be using a small sample of single turn input and response pairs from [Cornell Movie Dialog Corpus](https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html). We filter conversational pairs with sentences > 10 tokens. To reduce your work, we have already created a sample of tokenized, lowercased single turn conversations from Cornell Movie Dialog Corpus. The preprocessed dataset sample is stored in pickle format and can be downloaded from [this link](https://drive.google.com/file/d/1qYdSlDJ89AvgozK3V5tik8Op93zPbG6e/view?usp=sharing). The following code downloads the file and store it as `processed_CMDC.pkl`.

In [None]:
!pip install --upgrade gdown==4.7.3
!gdown 1qYdSlDJ89AvgozK3V5tik8Op93zPbG6e --output processed_CMDC.pkl

Downloading...
From: https://drive.google.com/uc?id=1qYdSlDJ89AvgozK3V5tik8Op93zPbG6e
To: /content/processed_CMDC.pkl
100% 3.49M/3.49M [00:00<00:00, 137MB/s]


In [None]:
# Loading the pre-processed conversational exchanges (source-target pairs) from pickle data files
all_conversations = load_from_pickle("processed_CMDC.pkl")
# Extract 100 conversations from the end for evaluation and keep the rest for training
eval_conversations = all_conversations[-100:]
all_conversations = all_conversations[:-100]

# Logging data stats
print(f"Number of Training Conversation Pairs = {len(all_conversations)}")
print(f"Number of Evaluation Conversation Pairs = {len(eval_conversations)}")

Number of Training Conversation Pairs = 53065
Number of Evaluation Conversation Pairs = 100


Let's print a couple of conversations to check if they are loaded properly.

In [None]:
print_list(all_conversations, 5)

('there .', 'where ?')
('you have my word . as a gentleman', 'you re sweet .')
('hi .', 'looks like things worked out tonight huh ?')
('have fun tonight ?', 'tons')
('well no . . .', 'then that s all you had to say .')



## Vocabulary

The words in the sentences need to be converted into integer tokens so that the neural model can operate on them. For this purpose, we will create a vocabulary which will convert the input strings into model recognizable integer tokens.

In [None]:
pad_word = "<pad>"
bos_word = "<s>"
eos_word = "</s>"
unk_word = "<unk>"
pad_id = 0
bos_id = 1
eos_id = 2
unk_id = 3

def normalize_sentence(s):
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s

class Vocabulary:
    def __init__(self):
        self.word_to_id = {pad_word: pad_id, bos_word: bos_id, eos_word:eos_id, unk_word: unk_id}
        self.word_count = defaultdict(int)
        self.id_to_word = {pad_id: pad_word, bos_id: bos_word, eos_id: eos_word, unk_id: unk_word}

    @property
    def num_words(self):
        return len(self.word_to_id)

    def get_ids_from_sentence(self, sentence):
        sentence = normalize_sentence(sentence)
        sent_ids = [bos_id] + [self.word_to_id[word] if word in self.word_to_id \
                               else unk_id for word in sentence.split()] + \
                               [eos_id]
        return sent_ids

    def tokenized_sentence(self, sentence):
        sent_ids = self.get_ids_from_sentence(sentence)
        return [self.id_to_word[word_id] for word_id in sent_ids]

    def decode_sentence_from_ids(self, sent_ids):
        words = list()
        for i, word_id in enumerate(sent_ids):
            if word_id in [bos_id, eos_id, pad_id]:
                # Skip these words
                continue
            else:
                words.append(self.id_to_word[word_id])
        return ' '.join(words)

    def add_words_from_sentence(self, sentence):
        sentence = normalize_sentence(sentence)
        for word in sentence.split():
            if word not in self.word_to_id:
                word_id = self.num_words
                self.word_to_id[word] = word_id
                self.id_to_word[word_id] = word
                # self.num_words will increase by one

                self.word_count[word] = 1
            else:
                # update the word count
                self.word_count[word] += 1

vocab = Vocabulary()
for src, tgt in all_conversations:
    vocab.add_words_from_sentence(src)
    vocab.add_words_from_sentence(tgt)
print(f"Total words in the vocabulary = {vocab.num_words}")

Total words in the vocabulary = 7727


Let's print top 30 vocab words:

In [None]:
print_list(sorted(vocab.word_count.items(), key=lambda item: item[1], reverse=True), 30)

('.', 84255)
('?', 36822)
('you', 25093)
('i', 18946)
('what', 10765)
('s', 10089)
('it', 9668)
('!', 8872)
('the', 8011)
('t', 7411)
('to', 6929)
('a', 6582)
('that', 5992)
('no', 4931)
('me', 4839)
('do', 4745)
('is', 4434)
('don', 3577)
('are', 3503)
('he', 3413)
('yes', 3384)
('m', 3382)
('not', 3252)
('we', 3252)
('know', 3171)
('re', 2965)
('your', 2809)
('this', 2726)
('yeah', 2708)
('in', 2678)



Print a couple of sentences to verify that the vocabulary is working as intended.

In [None]:
for src, tgt in all_conversations[:3]:
    sentence = tgt
    word_tokens = vocab.tokenized_sentence(sentence)
    # Automatically adds bos_id and eos_id before and after sentence ids respectively
    word_ids = vocab.get_ids_from_sentence(sentence)
    print(sentence)
    print(word_tokens)
    print(word_ids)
    print(vocab.decode_sentence_from_ids(word_ids))
    print()

word = "the"
word_id = vocab.word_to_id[word]
print(f"Word = {word}")
print(f"Word ID = {word_id}")
print(f"Word decoded from ID = {vocab.decode_sentence_from_ids([word_id])}")

where ?
['<s>', 'where', '?', '</s>']
[1, 6, 7, 2]
where ?

you re sweet .
['<s>', 'you', 're', 'sweet', '.', '</s>']
[1, 8, 15, 16, 5, 2]
you re sweet .

looks like things worked out tonight huh ?
['<s>', 'looks', 'like', 'things', 'worked', 'out', 'tonight', 'huh', '?', '</s>']
[1, 18, 19, 20, 21, 22, 23, 24, 7, 2]
looks like things worked out tonight huh ?

Word = the
Word ID = 47
Word decoded from ID = the


## Dataset Prepration

We will use built-in dataset utilities, `torch.utils.data.Dataset` and `torch.utils.data.DataLoader`, to get batched data readily useful for training.

In [None]:
class SingleTurnMovieDialog_dataset(Dataset):
    """Single-Turn version of Cornell Movie Dialog Cropus dataset."""

    def __init__(self, conversations, vocab, device):
        """
        Args:
            conversations: list of tuple (src_string, tgt_string)
                         - src_string: String of the source sentence
                         - tgt_string: String of the target sentence
            vocab: Vocabulary object that contains the mapping of
                    words to indices
            device: cpu or cuda
        """
        self.conversations = conversations
        self.vocab = vocab
        self.device = device

        def encode(src, tgt):
            src_ids = self.vocab.get_ids_from_sentence(src)
            tgt_ids = self.vocab.get_ids_from_sentence(tgt)
            return (src_ids, tgt_ids)

        # We will pre-tokenize the conversations and save in id lists for later use
        self.tokenized_conversations = [encode(src, tgt) for src, tgt in self.conversations]

    def __len__(self):
        return len(self.conversations)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        return {"conv_ids":self.tokenized_conversations[idx], "conv":self.conversations[idx]}

def collate_fn(data):
    """Creates mini-batch tensors from the list of tuples (src_seq, tgt_seq).
    We should build a custom collate_fn rather than using default collate_fn,
    because merging sequences (including padding) is not supported in default.
    Seqeuences are padded to the maximum length of mini-batch sequences (dynamic padding).
    Args:
        data: list of dicts {"conv_ids":(src_ids, tgt_ids), "conv":(src_str, trg_str)}.
            - src_ids: list of src piece ids; variable length.
            - tgt_ids: list of tgt piece ids; variable length.
            - src_str: String of src
            - tgt_str: String of tgt
    Returns: dict { "conv_ids":     (src_ids, tgt_ids),
                    "conv":         (src_str, tgt_str),
                    "conv_tensors": (src_seqs, tgt_seqs)}
            src_seqs: torch tensor of shape (src_padded_length, batch_size).
            tgt_seqs: torch tensor of shape (tgt_padded_length, batch_size).
            src_padded_length = length of the longest src sequence from src_ids
            tgt_padded_length = length of the longest tgt sequence from tgt_ids
    """
    # Sort conv_ids based on decreasing order of the src_lengths.
    # This is required for efficient GPU computations.
    src_ids = [torch.LongTensor(e["conv_ids"][0]) for e in data]
    tgt_ids = [torch.LongTensor(e["conv_ids"][1]) for e in data]
    src_str = [e["conv"][0] for e in data]
    tgt_str = [e["conv"][1] for e in data]
    data = list(zip(src_ids, tgt_ids, src_str, tgt_str))
    data.sort(key=lambda x: len(x[0]), reverse=True)
    src_ids, tgt_ids, src_str, tgt_str = zip(*data)


    # Pad the src_ids and tgt_ids using token pad_id to create src_seqs and tgt_seqs

    # Implementation tip: You can use the nn.utils.rnn.pad_sequence utility
    # function to combine a list of variable-length sequences with padding.

    # YOUR CODE HERE
    ...
    src_seqs=pad_sequence(src_ids,batch_first=True,padding_value=pad_id)
    tgt_seqs=pad_sequence(tgt_ids,batch_first=True,padding_value=pad_id)
    src_padded_length = len(src_seqs[0])
    tgt_padded_length = len(tgt_seqs[0])
    return {"conv_ids":(src_ids, tgt_ids), "conv":(src_str, tgt_str), "conv_tensors":(src_seqs.to(device), tgt_seqs.to(device))}

In [None]:
# Create the DataLoader for all_conversations
dataset = SingleTurnMovieDialog_dataset(all_conversations, vocab, device)

batch_size = 5

data_loader = DataLoader(dataset=dataset, batch_size=batch_size,
                               shuffle=True, collate_fn=collate_fn)

Let's test a batch of data to make sure everything is working as intended

In [None]:
# Test one batch of training data
first_batch = next(iter(data_loader))
print(f"Testing first training batch of size {len(first_batch['conv'][0])}")
print(f"List of source strings:")
print_list(first_batch["conv"][0])
print(f"Tokenized source ids:")
print_list(first_batch["conv_ids"][0])
print(f"Padded source ids as tensor (shape {first_batch['conv_tensors'][0].size()}):")
print(first_batch["conv_tensors"][0])

Testing first training batch of size 5
List of source strings:
i can take care of myself jeffrey .
anything i d know ?
i don t mind .
guess what .
yes .

Tokenized source ids:
tensor([   1,   54,  286,  183,  235,  147, 1789, 2240,    5,    2])
tensor([  1, 312,  54, 132,  97,   7,   2])
tensor([  1,  54, 198, 103,  73,   5,   2])
tensor([  1, 297,  44,   5,   2])
tensor([  1, 272,   5,   2])

Padded source ids as tensor (shape torch.Size([5, 10])):
tensor([[   1,   54,  286,  183,  235,  147, 1789, 2240,    5,    2],
        [   1,  312,   54,  132,   97,    7,    2,    0,    0,    0],
        [   1,   54,  198,  103,   73,    5,    2,    0,    0,    0],
        [   1,  297,   44,    5,    2,    0,    0,    0,    0,    0],
        [   1,  272,    5,    2,    0,    0,    0,    0,    0,    0]],
       device='cuda:0')


## Baseline Seq2Seq model

With the training `Dataset` and `DataLoader` ready, we can implement our Seq2Seq baseline model.

The model will consist of
1. Shared embedding layer between encoder and decoder that converts the input sequence of word ids to dense embedding representations
2. Bidirectional LSTM encoder that encodes the embedded source sequence into hidden representation
3. LSTM decoder that predicts target sequence using final encoder hidden representation

In [None]:
class Seq2seqBaseline(nn.Module):
    def __init__(self, vocab, emb_dim = 300, hidden_dim = 300, num_layers = 2, dropout=0.1):
        super().__init__()

        # Initialize your model's parameters here. To get started, we suggest
        # setting all embedding and hidden dimensions to 300, using encoder and
        # decoder LSTMs with 2 layers, and using a dropout rate of 0.1.

        # Implementation tip: To create a bidirectional LSTM, you don't need to
        # create two LSTM networks. Instead use nn.LSTM(..., bidirectional=True).

        self.num_words = num_words = vocab.num_words
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.sentences_id=vocab
        # YOUR CODE HERE
        ...
        self.embedding = nn.Embedding(num_words, emb_dim)

        # Define the bidirectional LSTM encoder
        self.encoder_lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            dropout=dropout,
            bidirectional=True,


        )

        # Define the unidirectional LSTM decoder
        self.decoder_lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_dim * 2,  # Double the hidden size for bidirectional LSTM
            num_layers=num_layers,
            dropout=dropout,


        )

        # Define the output layer (fully connected layer)
        self.output_layer = nn.Linear(hidden_dim * 2, num_words)

        # Dropout layer
        self.dropout = nn.Dropout(dropout)

    def encode(self, source):
        """Encode the source batch using a bidirectional LSTM encoder.

        Args:
            source: An integer tensor with shape (max_src_sequence_length,
                batch_size) containing subword indices for the source sentences.

        Returns:
            A tuple with three elements:
                encoder_output: The output hidden representation of the encoder
                    with shape (max_src_sequence_length, batch_size, hidden_size).
                    Can be obtained by adding the hidden representations of both
                    directions of the encoder bidirectional LSTM.
                encoder_mask: A boolean tensor with shape (max_src_sequence_length,
                    batch_size) indicating which encoder outputs correspond to padding
                    tokens. Its elements should be True at positions corresponding to
                    padding tokens and False elsewhere.
                encoder_hidden: The final hidden states of the bidirectional LSTM
                    (after a suitable projection) that will be used to initialize
                    the decoder. This should be a tensor h_n with shape
                    (num_layers, batch_size, hidden_size). Note that the hidden
                    state returned by the bi-LSTM cannot be used directly. Its
                    initial dimension is twice the required size because it
                    contains state from two directions.

        The first two return values are not required for the baseline model and will
        only be used later in the attention model. If desired, they can be replaced
        with None for the initial implementation.
        """

        # Implementation tip: consider using packed sequences to more easily work
        # with the variable-length sequences represented by the source tensor.
        # See https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.PackedSequence.html#torch.nn.utils.rnn.PackedSequence .

        # https://stackoverflow.com/questions/51030782/why-do-we-pack-the-sequences-in-pytorch

        # Implementation tip: there are many simple ways to combine the forward
        # and backward portions of the final hidden state, e.g. addition, averaging,
        # or a linear transformation of the appropriate size. Any of these
        # should let you reach the required performance.

        # Compute a tensor containing the length of each source sequence.
        # This lengths can be used on pack_padded_sequence function
        source_lengths = torch.sum(source != pad_id, axis=0).cpu()

        # YOUR CODE HERE
        ...

        embedded_source = self.embedding(source)
        # Pack the embedded source sequence
        packed_source = nn.utils.rnn.pack_padded_sequence(embedded_source, source_lengths)

        # Apply the bidirectional LSTM encoder
        encoder_output_packed, (h_n, c_n) = self.encoder_lstm(packed_source)

        # Unpack the encoder output for attention
        encoder_output, _ = nn.utils.rnn.pad_packed_sequence(
            encoder_output_packed
        )

        # h_n is the final hidden layer for encoder  with shape (2*num_layers, batch_size, hidden_size)
        final_encoder_hidden = h_n

        # encoder_hidden is input for the first hidden layer in decoder with shape(num_layers,batch_size,2*hidden_size)
        encoder_hidden=torch.empty(self.num_layers, source.size(1),2*self.hidden_dim)
        encoder_hidden_temp=[]

        for i in range(int(h_n.size(0)/2)):
          temp=torch.cat([h_n[i, :, :], h_n[i+1, :, :]], dim=1)
          encoder_hidden_temp.append(temp.tolist())

        encoder_hidden = torch.tensor(encoder_hidden_temp)

        # encoder_cell is input for the first cell layer in decoder with shape(num_layers,batch_size,2*hidden_size)
        encoder_cell=torch.empty(self.num_layers, source.size(1),2*self.hidden_dim)
        encoder_cell_temp=[]
        for i in range(int(c_n.size(0)/2)):
          temp=torch.cat([c_n[i, :, :], c_n[i+1, :, :]], dim=1)
          encoder_cell_temp.append(temp.tolist())

          encoder_cell= torch.tensor(encoder_cell_temp)

        # Return the encoder_mask
        padding_mask = (source == 0)
        encoder_mask = padding_mask

        return encoder_output, encoder_mask.type(torch.bool), (encoder_hidden,encoder_cell)
    def decode(self, decoder_input, last_hidden, encoder_output, encoder_mask):
        """Run the decoder LSTM for one decoding step from the last hidden state.

        The third and fourth arguments are not used in the baseline model, but are
        included for compatibility with the attention model in the next section.

        Args:
            decoder_input: An integer tensor with shape (1, batch_size) containing
                the subword indices for the current decoder input.
            last_hidden: A pair of tensors h_{t-1} representing the last hidden
                state of the encoder, each with shape (num_layers, batch_size,
                hidden_size). For the first decoding step the last_hidden will be
                encoder's final hidden representation.
            encoder_output: The output of the encoder with shape
                (max_src_sequence_length, batch_size, hidden_size).
            encoder_mask: The output mask from the encoder with shape
                (max_src_sequence_length, batch_size). Encoder outputs at positions
                with a True value correspond to padding tokens and should be ignored.

        Returns:
            A tuple with three elements:
                logits: A tensor with shape (batch_size,
                    vocab_size) containing unnormalized scores for the next-word
                    predictions at each position.
                decoder_hidden: tensor h_n with the same shape as last_hidden
                    representing the updated decoder state after processing the
                    decoder input.
                attention_weights: This will be implemented later in the attention
                    model, but in order to maintain compatible type signatures, we also
                    include it here. This can be None or any other placeholder value.
        """


        # YOUR CODE HERE

        embedded_input = self.embedding(decoder_input)
        embedded_input = self.dropout(embedded_input)

        #print(decoder_input)
        # Run the unidirectional LSTM decoder
        hidden_state, cell_state = last_hidden

        # Move each tensor to the same device as decoder_input
        hidden_state = hidden_state.to(decoder_input.device)
        cell_state = cell_state.to(decoder_input.device)

        # Pack the tensors back into a tuple
        last_hidden = (hidden_state, cell_state)
        decoder_output, decoder_hidden = self.decoder_lstm(embedded_input, last_hidden)

        # Apply the output layer
        logits = self.output_layer(decoder_output.squeeze(0))

        return logits, decoder_hidden, None
        # These arguments are not used in the baseline model.
        #del encoder_output
        #del encoder_mask

        #return output
    def compute_loss(self, source, target):
        """Run the model on the source and compute the loss on the target.

        Args:
            source: An integer tensor with shape (max_source_sequence_length,
                batch_size) containing subword indices for the source sentences.
            target: An integer tensor with shape (max_target_sequence_length,
                batch_size) containing subword indices for the target sentences.

        Returns:
            A scalar float tensor representing cross-entropy loss on the current batch
            divided by the number of target tokens in the batch.
            Many of the target tokens will be pad tokens. You should mask the loss
            from these tokens using appropriate mask on the target tokens loss.
        """

        # Implementation tip: don't feed the target tensor directly to the decoder.
        # To see why, note that for a target sequence like <s> A B C </s>, you would
        # want to run the decoder on the prefix <s> A B C and have it predict the
        # suffix A B C </s>.

        # You may run self.encode() on the source only once and decode the target
        # one step at a time.

        # YOUR CODE HERE
        ...
        source=source.T
        target=target.T
        encoder_output, source_mask, encoder_hidden = self.encode(source)
        #print(encoder_output[0].size())
        # Initialize decoder hidden state with the encoder's final hidden state
        #decoder_hidden = (encoder_hidden.unsqueeze(0), torch.zeros_like(encoder_hidden).unsqueeze(0))
        decoder_hidden=encoder_hidden

        # Initialize loss
        loss = 0.0

        # input for the first sequence in decoder, decoder_input shape is (1,batch)
        source_lengths = torch.sum(source != 0, axis=0).cpu()
        source_lengths=source_lengths-1

        # last token in the source is the input for decoder
        decoder_input=source.T[torch.arange((source.T).size(0)),source_lengths]

        decoder_input=(decoder_input.unsqueeze(0))
        decoder_input = torch.where(decoder_input % 2 == 0, decoder_input / 2, decoder_input).int()
        # Iterate over target sequence with teacher-forcing
        for t in range(target.shape[0]-1):
            # Decoder input for current time step
            # Decoder step
            logits, decoder_hidden, _ = self.decode(decoder_input, decoder_hidden, encoder_output, None)
            decoder_input = target[t+1].unsqueeze(0)
            # Calculate cross-entropy loss
            loss += nn.CrossEntropyLoss(ignore_index=0)(logits, target[t+1])

        # mask for target
        padding_mask_target = (target == 1)

        # number of target tokens in the batch which are not pad tokens
        target_tokens=torch.masked_select(target, padding_mask_target).sum()
        # Average the loss over target sequence length
        loss /= target_tokens

        return loss
# Example usage:
"""
first_batch = next(iter(data_loader))
model = Seq2seqBaseline(vocab)
source =  first_batch['conv_tensors'][0].T # Example source sequence
target = first_batch['conv_tensors'][1].T  # Example target sequence
loss = model.compute_loss(source, target)
print(loss.item())
"""


"\nfirst_batch = next(iter(data_loader))\nmodel = Seq2seqBaseline(vocab)\nsource =  first_batch['conv_tensors'][0].T # Example source sequence\ntarget = first_batch['conv_tensors'][1].T  # Example target sequence\nloss = model.compute_loss(source, target)\nprint(loss.item())\n"

We provide a training loop for training the model. You are welcome to modify the training loop by adjusting the learning rate or changing optmization settings.

**Important:** During our testing we found that training the encoder and decoder with different learning rates is crucial for getting good performance over the small dialog corpus. Specifically, the decoder parameter learning rate should be 5 times the encoder parameter learning rate. Hence, add the encoder parameter variable names in the `encoder_parameter_names` as a list. For example, if encoder is using `self.embedding_layer` and `self.encoder_lstm` layer then the `encoder_parameter_names` should be `['embedding_layer', 'encoder_lstm']`

In [None]:
def train(model, data_loader, num_epochs, model_file, learning_rate=0.0001):
    """Train the model for given number of epochs and save the trained model in
    the final model_file.
    """

    decoder_learning_ratio = 5.0

    encoder_parameter_names = ['word_embedding', 'encoder']

    encoder_named_params = list(filter(lambda kv: any(key in kv[0] for key in encoder_parameter_names), model.named_parameters()))
    decoder_named_params = list(filter(lambda kv: not any(key in kv[0] for key in encoder_parameter_names), model.named_parameters()))
    encoder_params = [e[1] for e in encoder_named_params]
    decoder_params = [e[1] for e in decoder_named_params]
    optimizer = torch.optim.AdamW([{'params': encoder_params},
                {'params': decoder_params, 'lr': learning_rate * decoder_learning_ratio}], lr=learning_rate)

    clip = 50.0
    for epoch in tqdm.notebook.trange(num_epochs, desc="training", unit="epoch"):
        # print(f"Total training instances = {len(train_dataset)}")
        # print(f"train_data_loader = {len(train_data_loader)} {1180 > len(train_data_loader)/20}")
        with tqdm.notebook.tqdm(
                data_loader,
                desc="epoch {}".format(epoch + 1),
                unit="batch",
                total=len(data_loader)) as batch_iterator:
            model.train()
            total_loss = 0.0
            for i, batch_data in enumerate(batch_iterator, start=1):
                source, target = batch_data["conv_tensors"]
                optimizer.zero_grad()
                loss = model.compute_loss(source, target)
                total_loss += loss.item()
                loss.backward()
                # Gradient clipping before taking the step
                _ = nn.utils.clip_grad_norm_(model.parameters(), clip)
                optimizer.step()

                batch_iterator.set_postfix(mean_loss=total_loss / i, current_loss=loss.item())
    # Save the model after training
    torch.save(model.state_dict(), model_file)

We can now train the baseline model.

A correct implementation should get a average train loss of < 3.00  
The code will automatically save and download the model at the end of training.

In [None]:
# You are welcome to adjust these parameters based on your model implementation.
num_epochs = 6
batch_size = 64
# Reloading the data_loader to increase batch_size
data_loader = DataLoader(dataset=dataset, batch_size=batch_size,
                               shuffle=True, collate_fn=collate_fn)

baseline_model = Seq2seqBaseline(vocab).to(device)
train(baseline_model, data_loader, num_epochs, "baseline_model.pt")
# Download the trained model to local for future use
files.download('baseline_model.pt')

training:   0%|          | 0/6 [00:00<?, ?epoch/s]

epoch 1:   0%|          | 0/830 [00:00<?, ?batch/s]

epoch 2:   0%|          | 0/830 [00:00<?, ?batch/s]

epoch 3:   0%|          | 0/830 [00:00<?, ?batch/s]

epoch 4:   0%|          | 0/830 [00:00<?, ?batch/s]

epoch 5:   0%|          | 0/830 [00:00<?, ?batch/s]

epoch 6:   0%|          | 0/830 [00:00<?, ?batch/s]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Reload the model from the model file.
# Useful when you have already trained and saved the model
baseline_model = Seq2seqBaseline(vocab).to(device)
baseline_model.load_state_dict(torch.load("baseline_model.pt", map_location=device))

<All keys matched successfully>

## Greedy Search

For evaluation, we also need to be able to generate entire strings from the model. We'll first define a greedy inference procedure here. Later on, we'll implement beam search.


In [None]:
def predict_greedy(model, sentence, max_length=100):
    """Make predictions for the given input using greedy inference.

    Args:
        model: A sequence-to-sequence model.
        sentence: A input string.
        max_length: The maximum length at which to truncate outputs in order to
            avoid non-terminating inference.

    Returns:
        Model's predicted greedy response for the input, represented as string.
    """
    model.eval()

    # Move the model to the same device as the input tensor
    device = next(model.parameters()).device

    # Convert input sentence to tensor
    sentence_ids = model.sentences_id.get_ids_from_sentence(sentence)
    sentence_tensor = torch.tensor(sentence_ids, device=device).unsqueeze(1)

    # Encode the input sentence
    with torch.no_grad():
        encoder_output, _, encoder_hidden = model.encode(sentence_tensor)

    # Initialize decoder input with a special token indicating the start of decoding
    decoder_input = torch.ones(1, 1, device=device, dtype=torch.long)

    decoded_words = [1]  # Start token
    decoder_hidden = encoder_hidden

    for _ in range(max_length):
        # Decode a step
        with torch.no_grad():
            decoder_output_logit, decoder_hidden, _ = model.decode(decoder_input, decoder_hidden,encoder_output, None)
        # Get the most likely token
        probs = F.softmax(decoder_output_logit, dim=1)
        scores_id, word_id = torch.max(probs, dim=1)
        # Record token and score
        decoded_words.append(word_id.item())

        # If the end-of-sequence token is generated, stop decoding
        if word_id == 2:
            break

        # Update decoder input for the next step
        decoder_input = word_id.view(1, 1)

        #decoded_words = [item.item() if isinstance(item, torch.Tensor) else item for item in decoded_words]

    return model.sentences_id.decode_sentence_from_ids(decoded_words)


Let's chat interactively with our trained baseline Seq2Seq dialog model and save the generated conversations for submission (please make sure to keep the conversations in your submission ["PG-13"](https://en.wikipedia.org/wiki/Motion_Picture_Association_film_rating_system)). We will reuse the conversational inputs while testing Seq2Seq + Attention model.

Note: enter "q" or "quit" to end the interactive chat

In [None]:
def chat_with_model(model, mode="greedy"):
    if mode == "beam":
        predict_f = predict_beam
    else:
        predict_f = predict_greedy
    chat_log = list()
    input_sentence = ''
    while True:
        # Get input sentence
        input_sentence = input('Input > ')
        # Check if it is quit case
        if input_sentence == 'q' or input_sentence == 'quit': break

        generation = predict_f(model, input_sentence)
        if mode == "beam":
            generation = generation[0]
        print('Greedy Response:', generation)
        print()
        chat_log.append((input_sentence, generation))
    return chat_log

In [None]:
baseline_chat = chat_with_model(baseline_model)
"""
"please share you bank account number with me",
                                    "i have never met someone more annoying that you",
                                    "i like pizza. what do you like?",
                                    "give me coffee, or i'll hate you",
                                    "i'm so bored. give some suggestions",
                                    "stop running or you'll fall hard",
                                    "what is your favorite sport?",
                                    "do you believe in a miracle?",
                                    "which sport team do you like?" """

Input > hi
Greedy Response: hi .

Input > how are you?
Greedy Response: i m not sure .

Input > i m not sure.
Greedy Response: you re not going to be here .

Input > where?
Greedy Response: i m going to go .

Input > where?
Greedy Response: i m going to go .

Input > how can i pay?
Greedy Response: i m not sure .

Input > q


'\n"please share you bank account number with me",\n                                    "i have never met someone more annoying that you",\n                                    "i like pizza. what do you like?",\n                                    "give me coffee, or i\'ll hate you",\n                                    "i\'m so bored. give some suggestions",\n                                    "stop running or you\'ll fall hard",\n                                    "what is your favorite sport?",\n                                    "do you believe in a miracle?",\n                                    "which sport team do you like?" '

## Seq2Seq + Attention Model

Next, we extend the baseline model to include an attention mechanism in the decoder. With attention mechanism, the model doesn't need to encode the input into a fixed dimensional hidden representation. Rather, it creates a new context vector for each turn that is a weighted sum of encoder hidden representation.

Your implementation can use any attention mechanism to get weight distribution over the source words. One simple way to include attention in decoder goes as follows (reminder: the decoder processed one token at a time),
1. Process the current decoder_input through embedding layer and decoder LSTM layer.
2. Use the current decoder token representation, $d$ of shape $(1 * b * h)$ and encoder representation, $e_1, \dots, e_n$ or shape $(n * b * h)$, where $n$ is max_src_length after padding) to compute attention score matrix of shape $(b * n)$. There are multiple options to compute this score matrix. A few of such options are available in [the table provided in this blog](https://lilianweng.github.io/lil-log/2018/06/24/attention-attention.html#a-family-of-attention-mechanisms). Please leave a comment in your code with the name of the method you choose to implement
3. Normalize the attention scores $(b * n)$ so that they sum up to $1.0$ by taking a `softmax` over the second dimention.

After computing the normalized attention distribution, take a weighted sum of the encoder outputs to obtain the attention context $c = \sum_i w_i e_i$, and add this to the decoder output $d$ to obtain the final representation to be passed to the vocabulary projection layer (you may need another linear layer to make the sizes match before adding $c$ and $d$).

In [None]:
class Seq2seqAttention(Seq2seqBaseline):
    def __init__(self, vocab):
        super().__init__(vocab)

        # Initialize any additional parameters needed for this model that are not
        # already included in the baseline model.
        self.attention = nn.Linear(self.hidden_dim * 4, self.hidden_dim*2)
        self.attn_scoring_fn = nn.Linear(self.hidden_dim*2, 1, bias=False)

        self.decoder_lstm = nn.LSTM(
            input_size=self.emb_dim*3,
            hidden_size=self.hidden_dim * 2,  # Double the hidden size for bidirectional LSTM
            num_layers=self.num_layers,
            dropout=0.1
        )

    def decode(self, decoder_input, last_hidden, encoder_output, encoder_mask):
        """Run the decoder LSTM for one decoding step from the last hidden state.

        The third and fourth arguments are not used in the baseline model, but are
        included for compatibility with the attention model in the next section.

        Args:
            decoder_input: An integer tensor with shape (1, batch_size) containing
                the subword indices for the current decoder input.
            last_hidden: A pair of tensors h_{t-1} representing the last hidden
                state of the decoder, each with shape (num_layers, batch_size,
                hidden_size). For the first decoding step the last_hidden will be
                encoder's final hidden representation.
            encoder_output: The output of the encoder with shape
                (max_src_sequence_length, batch_size, hidden_size).
            encoder_mask: The output mask from the encoder with shape
                (max_src_sequence_length, batch_size). Encoder outputs at positions
                with a True value correspond to padding tokens and should be ignored.

        Returns:
            A tuple with three elements:
                logits: A tensor with shape (batch_size,
                    vocab_size) containing unnormalized scores for the next-word
                    predictions at each position.
                decoder_hidden: tensor h_n with the same shape as last_hidden
                    representing the updated decoder state after processing the
                    decoder input.
                attention_weights: A tensor with shape (batch_size,
                    max_src_sequence_length) representing the normalized
                    attention weights. This should sum to 1 along the last dimension.
        """

 # YOUR CODE HERE
        decoder_input = decoder_input.to(next(self.parameters()).device)
        last_hidden = (last_hidden[0].to(next(self.parameters()).device), last_hidden[1].to(next(self.parameters()).device))

        if not isinstance(encoder_output, torch.Tensor):
          encoder_output = torch.tensor(encoder_output, device=next(self.parameters()).device)

        #encoder_mask = encoder_mask.to(next(self.parameters()).device)



        embedded_input = self.embedding(decoder_input)
        embedded_input = self.dropout(embedded_input)


        # Unpack last hidden state
        last_hidden_state, last_cell_state = last_hidden

        # Get batch size
        #batch_size = encoder_output.size(1)

        # Concatenate last decoder hidden state with each encoder output

        #encoder_hidden_temp=[]
        #for i in range(batch_size):

          #temp=torch.cat([encoder_output[i, : ,:], last_hidden_state[-1, :, :]], dim=1)
          #encoder_hidden_temp.append(temp.tolist())
        src_len = encoder_output.size(0)
        last_hidden_state=last_hidden_state[-1, :, :].unsqueeze(0)
        last_hidden_state = last_hidden_state.repeat(src_len, 1, 1)


# concat output of the encoder with last hidden state of decoder (h_n). shape (), attention_input is (max_src_sequence_length, batch_size, 2*hidden_size)
        #attention_input = torch.tensor(encoder_hidden_temp)
        attention_input=torch.cat((encoder_output,last_hidden_state), dim=2)


        # Calculate attention scores
        attention_scores    =      self.attention(attention_input)
        attention_scores    =      torch.tanh(attention_scores)
        attn_scoring_vector =      self.attn_scoring_fn(attention_scores).squeeze(2)

        # The attn_scoring_vector has dimension of [source len, batch size]
        # Since we need to calculate the softmax per record in the batch
        # we will switch the dimension to [batch size,source len]
        attn_scoring_vector = attn_scoring_vector.permute(1, 0)
        attn_scoring_vector=attn_scoring_vector.unsqueeze(1)




        # We need to perform the batch wise dot product.
        # Hence need to shift the batch dimension to the front.
        encoder_output = encoder_output.permute(1, 0, 2)


        # Use PyTorch's bmm function to calculate the weight W.
        W = torch.bmm(attn_scoring_vector, encoder_output)

        #print('j',encoder_output.size())
        #print('j',W.size())
        # Revert the batch dimension.
        W = W.permute(1, 0, 2)

        # concatenate the previous output with W
        embedded_input = torch.cat((embedded_input, W), dim=2)
        #print(embedded_input.size())
        decoder_output, decoder_hidden = self.decoder_lstm(embedded_input, last_hidden)
        logits = self.output_layer(decoder_output.squeeze(0))

        # Softmax function for normalizing the weights to
        # probability distribution


        return logits, decoder_hidden,F.softmax(attn_scoring_vector, dim=1)


We can now train the attention model.

A correct implementation should also get an average train loss of < 3.00  
The code will automatically save and download the model at the end of training.

It may happen that the baseline model achieves a worse loss than attention model. This is because our dataset is very small and the attention model may be over parameterized for our toy dataset. Regardless, we would consider this as acceptable submission if the attention model generated responses look comparable to the baseline model.

In [None]:
# You are welcome to adjust these parameters based on your model implementation.
num_epochs = 8
batch_size = 64

data_loader = DataLoader(dataset=dataset, batch_size=batch_size,
                               shuffle=True, collate_fn=collate_fn)

attention_model = Seq2seqAttention(vocab).to(device)
train(attention_model, data_loader, num_epochs, "attention_model.pt")
# Download the trained model to local for future use
files.download('attention_model.pt')

training:   0%|          | 0/8 [00:00<?, ?epoch/s]

epoch 1:   0%|          | 0/830 [00:00<?, ?batch/s]

epoch 2:   0%|          | 0/830 [00:00<?, ?batch/s]

epoch 3:   0%|          | 0/830 [00:00<?, ?batch/s]

epoch 4:   0%|          | 0/830 [00:00<?, ?batch/s]

epoch 5:   0%|          | 0/830 [00:00<?, ?batch/s]

epoch 6:   0%|          | 0/830 [00:00<?, ?batch/s]

epoch 7:   0%|          | 0/830 [00:00<?, ?batch/s]

epoch 8:   0%|          | 0/830 [00:00<?, ?batch/s]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Given list of lists
original_list = [[1, 2, 3, 0, 0], [4, 5, 6, 0, 0], [40, 50, 0, 0, 0]]

# Remove zeros from each sublist
result_list = [[elem for elem in sublist if elem != 0] for sublist in original_list]

print("Resulting list:")
print(result_list)


Resulting list:
[[1, 2, 3], [4, 5, 6], [40, 50]]


In [None]:
# Luong attention layer
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = nn.Parameter(torch.FloatTensor(hidden_size))

    def dot_score(self, hidden, encoder_output):
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        # Calculate the attention weights (energies) based on the given method
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()

        # Return the softmax normalized probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

In [None]:
# Reload the model from the model file.
# Useful when you have already trained and saved the model
attention_model = Seq2seqAttention(vocab).to(device)
attention_model.load_state_dict(torch.load("attention_model.pt", map_location=device))

<All keys matched successfully>

Let's test the attention model on the same inputs as baseline model.

In [None]:
def test_conversations_with_model(model, conversational_inputs = None, include_beam = False):
    # Some predefined conversational inputs.
    # You may append more inputs at the end of the list, if you want to.
    basic_conversational_inputs = [
                                    "hello.",
                                    "please share you bank account number with me",
                                    "i have never met someone more annoying that you",
                                    "i like pizza. what do you like?",
                                    "give me coffee, or i'll hate you",
                                    "i'm so bored. give some suggestions",
                                    "stop running or you'll fall hard",
                                    "what is your favorite sport?",
                                    "do you believe in a miracle?",
                                    "which sport team do you like?"
    ]
    if not conversational_inputs:
        conversational_inputs = basic_conversational_inputs
    for input in conversational_inputs:
        print(f"Input > {input}")
        generation = predict_greedy(model, input)
        print('Greedy Response:', generation)
        if include_beam:
            # Also print the beam search responses from models
            generations = predict_beam(model, input)
            print('Beam Responses:')
            print_list(generations)
        print()

In [None]:
baseline_chat_inputs = [inp for inp, gen in baseline_chat]
attention_chat = test_conversations_with_model(attention_model, baseline_chat_inputs)

Input > i'm so bored. give some suggestions
Greedy Response: i m sorry .

Input > i like pizza. what do you like?
Greedy Response: i don t know .

Input > stop running or you'll fall hard
Greedy Response: i m sorry .

Input > do you believe in a miracle?
Greedy Response: yes .

Input > which sport team do you like?
Greedy Response: yes .

Input > what is your name ?
Greedy Response: i don t know .

Input > where?
Greedy Response: the trunk .

Input > where are you going to go?
Greedy Response: i m going to be a minute .

Input > give me coffee, or i'll hate you
Greedy Response: you re not .

Input > fuck
Greedy Response: what ?



## Automatic Evaluation

Automatic evaluation of chatbots is an active research area. For this assignment we are going to use 3 very simple evaluation metrics.
1. Average Length of the Responses
2. Distinct1 = proportion of unique unigrams / total unigrams
3. Distinct2 = proportion of unique bigrams / total bigrams  
You will evaluate your baseline and attention models by running the cells below.

In [None]:
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

def evaluate_diversity(model, mode="greedy"):
    """Evaluates the model's greedy or beam responses on eval_conversations

    Args:
        model: A sequence-to-sequence model.
        mode: "greedy" or "beam"

    Returns: avg_length, distinct1, distinct2
        avg_length: average length of the model responses
        distinct1: proportion of unique unigrams / total unigrams
        distinct2: proportion of unique bigrams / total bigrams
    """
    if mode == "beam":
        predict_f = predict_beam
    else:
        predict_f = predict_greedy

    total_tokens = 0
    total_length = 0
    unique_unigrams = set()
    unique_bigrams = set()
    for src, tgt in eval_conversations:
        generation = predict_f(model, src)

        if mode == "beam":
            generation = generation[0]

        # Calculate average length
        total_length += len(generation.split())
        total_tokens += 1

        # Calculate distinct unigrams and bigrams
        tokens = word_tokenize(generation.lower())
        unigrams = set(tokens)
        bigrams = set(ngrams(tokens, 2))

        unique_unigrams.update(unigrams)
        unique_bigrams.update(bigrams)


    avg_length = total_length / total_tokens if total_tokens > 0 else 0
    distinct1 = len(unique_unigrams) / total_tokens if total_tokens > 0 else 0
    distinct2 = len(unique_bigrams) / total_tokens if total_tokens > 0 else 0

    return avg_length, distinct1, distinct2


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
print(f"Baseline Model evaluation:")
avg_length, distinct1, distinct2 = evaluate_diversity(baseline_model)
print(f"Greedy decoding:")
print(f"Avg Response Length = {avg_length}")
print(f"Distinct1 = {distinct1}")
print(f"Distinct2 = {distinct2}")
print(f"Attention Model evaluation:")
avg_length, distinct1, distinct2 = evaluate_diversity(attention_model)
print(f"Greedy decoding:")
print(f"Avg Response Length = {avg_length}")
print(f"Distinct1 = {distinct1}")
print(f"Distinct2 = {distinct2}")

Baseline Model evaluation:
Greedy decoding:
Avg Response Length = 4.4
Distinct1 = 0.24
Distinct2 = 0.29
Attention Model evaluation:
Greedy decoding:
Avg Response Length = 3.86
Distinct1 = 0.43
Distinct2 = 0.61


## Beam Search (optional)

Similar to greedy search, beam search generates one token at a time. However, rather than keeping only the single best hypothesis, we instead keep the top $k$ candidates at each time step. This is accomplished by computing the set of next-token extensions for each item on the beam and finding the top $k$ across all candidates according to total log-probability.

Candidates that are finished should be extracted in a final list of `generations` and removed from the beam. This strategy is useful for doing re-ranking the beam candidates using alternate scorers (example, Maximum Mutual Information Objective from [Li et. al. 2015](https://arxiv.org/pdf/1510.03055.pdf)). For this assignment, you will re-rank the beam generations as follows,  
$final\_score_i = \frac{score_i}{|generation_i|^\alpha}$, where $\alpha \in [0.5, 2]$.  
Terminate the search process once you have $k$ items in the `generations` list.

In [None]:
def predict_beam(model, sentence, k=5, max_length=100):
    """Make predictions for the given inputs using beam search.

    Args:
        model: A sequence-to-sequence model.
        sentence: An input sentence, represented as string.
        k: The size of the beam.
        max_length: The maximum length at which to truncate outputs in order to
            avoid non-terminating inference.

    Returns:
        A list of k beam predictions. Each element in the list should be a string
        corresponding to one of the top k predictions for the corresponding input,
        sorted in descending order by its final score.
    """

    # Implementation tip: once an eos_token has been generated for any beam,
    # remove its subsequent predictions from that beam by adding a small negative
    # number like -1e9 to the appropriate logits. This will ensure that the
    # candidates are removed from the beam, as its probability will be very close
    # to 0. Using this method, uou will be able to reuse the beam of an already
    # finished candidate

    # Implementation tip: while you are encouraged to keep your tensor dimensions
    # constant for simplicity (aside from the sequence length), some special care
    # will need to be taken on the first iteration to ensure that your beam
    # doesn't fill up with k identical copies of the same candidate.

    # You are welcome to tweak alpha
    alpha = 0.7
    model.eval()

    # YOUR CODE HERE
    ...

Now let's test both baseline and attention models on some predefined inputs and compare their greedy and beam responses side by side.

In [None]:
test_conversations_with_model(attention_model, include_beam=False)

Input > hello.
Greedy Response: hello .

Input > please share you bank account number with me
Greedy Response: i m not .

Input > i have never met someone more annoying that you
Greedy Response: no .

Input > i like pizza. what do you like?
Greedy Response: i don t know .

Input > give me coffee, or i'll hate you
Greedy Response: you re not .

Input > i'm so bored. give some suggestions
Greedy Response: i m sorry .

Input > stop running or you'll fall hard
Greedy Response: i m sorry .

Input > what is your favorite sport?
Greedy Response: i don t know .

Input > do you believe in a miracle?
Greedy Response: yes .

Input > which sport team do you like?
Greedy Response: yes .



In [None]:
test_conversations_with_model(baseline_model, include_beam=False)

Input > hello.
Greedy Response: i m sorry .

Input > please share you bank account number with me
Greedy Response: i m sorry .

Input > i have never met someone more annoying that you
Greedy Response: i m sorry .

Input > i like pizza. what do you like?
Greedy Response: i don t know .

Input > give me coffee, or i'll hate you
Greedy Response: i m sorry .

Input > i'm so bored. give some suggestions
Greedy Response: you re not going to be here .

Input > stop running or you'll fall hard
Greedy Response: i m sorry .

Input > what is your favorite sport?
Greedy Response: i m going to go .

Input > do you believe in a miracle?
Greedy Response: i m not sure .

Input > which sport team do you like?
Greedy Response: i m not sure .



Let's also check how our models do using our automatic evaluation metrics.

In [None]:
for i in range(1000):
  print(i)

In [None]:
print(f"Baseline Model evaluation:")
avg_length, distinct1, distinct2 = evaluate_diversity(baseline_model)
print(f"Greedy decoding:")
print(f"Avg Response Length = {avg_length}")
print(f"Distinct1 = {distinct1}")
print(f"Distinct2 = {distinct2}")
avg_length, distinct1, distinct2 = evaluate_diversity(baseline_model, mode='beam')
print(f"Beam search decoding:")
print(f"Avg Response Length = {avg_length}")
print(f"Distinct1 = {distinct1}")
print(f"Distinct2 = {distinct2}")
print(f"Attention Model evaluation:")
avg_length, distinct1, distinct2 = evaluate_diversity(attention_model,)
print(f"Greedy decoding:")
print(f"Avg Response Length = {avg_length}")
print(f"Distinct1 = {distinct1}")
print(f"Distinct2 = {distinct2}")
avg_length, distinct1, distinct2 = evaluate_diversity(attention_model, mode='beam')
print(f"Greedy decoding:")
print(f"Avg Response Length = {avg_length}")
print(f"Distinct1 = {distinct1}")
print(f"Distinct2 = {distinct2}")

## What to turn in?

When you are done, make sure to run all the cells in your solution (including your conversation with the chatbot), and submit your notebook `DNN-EX03-ChatBot.ipynb` with other problems to cw.sharif.edu.


**When submitting the .ipynb notebook, please make sure that all the cells are run and up-to-date with the outputs and accuracies.** If the code doesn't take too long to run, you can re-run everything with `Runtime -> Restart and run all`