# High Performance Machine Learning - Homework Assignment 3
By Amrutha Patil (ap7982)

## PART 1

In [1]:
# For tips on running notebooks in Google Colab, see
# https://pytorch.org/tutorials/beginner/colab
%matplotlib inline

## Preparations

To start, Download the data ZIP file
[here](https://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip)



In [4]:
!mkdir -p data
!wget -O data/movie-corpus.zip https://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip
!unzip -o data/movie-corpus.zip -d data

--2024-04-01 02:09:25--  https://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip
Resolving zissou.infosci.cornell.edu (zissou.infosci.cornell.edu)... 128.253.51.179
Connecting to zissou.infosci.cornell.edu (zissou.infosci.cornell.edu)|128.253.51.179|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 40854701 (39M) [application/zip]
Saving to: ‘data/movie-corpus.zip’


2024-04-01 02:09:26 (42.1 MB/s) - ‘data/movie-corpus.zip’ saved [40854701/40854701]

Archive:  data/movie-corpus.zip
  inflating: data/movie-corpus/utterances.jsonl  
  inflating: data/movie-corpus/conversations.json  
  inflating: data/movie-corpus/corpus.json  
  inflating: data/movie-corpus/speakers.json  
  inflating: data/movie-corpus/index.json  


Install wandb

In [3]:
pip install wandb --upgrade

Collecting wandb
  Downloading wandb-0.16.5-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.44.0-py2.py3-none-any.whl (264 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m264.9/264.9 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->wan

In [5]:
# and put in a ``data/`` directory under the current directory.
#
# After that, let’s import some necessities.
#

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
import json
import wandb
import typing
import time
import numpy as np
import pandas as pd


USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

## Load & Preprocess Data

In [6]:
corpus_name = "movie-corpus"
corpus = os.path.join("data", corpus_name)

def printLines(file, n=10):
    with open(file, 'rb') as datafile:
        lines = datafile.readlines()
    for line in lines[:n]:
        print(line)

printLines(os.path.join(corpus, "utterances.jsonl"))

b'{"id": "L1045", "conversation_id": "L1044", "text": "They do not!", "speaker": "u0", "meta": {"movie_id": "m0", "parsed": [{"rt": 1, "toks": [{"tok": "They", "tag": "PRP", "dep": "nsubj", "up": 1, "dn": []}, {"tok": "do", "tag": "VBP", "dep": "ROOT", "dn": [0, 2, 3]}, {"tok": "not", "tag": "RB", "dep": "neg", "up": 1, "dn": []}, {"tok": "!", "tag": ".", "dep": "punct", "up": 1, "dn": []}]}]}, "reply-to": "L1044", "timestamp": null, "vectors": []}\n'
b'{"id": "L1044", "conversation_id": "L1044", "text": "They do to!", "speaker": "u2", "meta": {"movie_id": "m0", "parsed": [{"rt": 1, "toks": [{"tok": "They", "tag": "PRP", "dep": "nsubj", "up": 1, "dn": []}, {"tok": "do", "tag": "VBP", "dep": "ROOT", "dn": [0, 2, 3]}, {"tok": "to", "tag": "TO", "dep": "dobj", "up": 1, "dn": []}, {"tok": "!", "tag": ".", "dep": "punct", "up": 1, "dn": []}]}]}, "reply-to": null, "timestamp": null, "vectors": []}\n'
b'{"id": "L985", "conversation_id": "L984", "text": "I hope so.", "speaker": "u0", "meta": {

## Create formatted data file

In [7]:
# Splits each line of the file to create lines and conversations
def loadLinesAndConversations(fileName):
    lines = {}
    conversations = {}
    with open(fileName, 'r', encoding='iso-8859-1') as f:
        for line in f:
            lineJson = json.loads(line)
            # Extract fields for line object
            lineObj = {}
            lineObj["lineID"] = lineJson["id"]
            lineObj["characterID"] = lineJson["speaker"]
            lineObj["text"] = lineJson["text"]
            lines[lineObj['lineID']] = lineObj

            # Extract fields for conversation object
            if lineJson["conversation_id"] not in conversations:
                convObj = {}
                convObj["conversationID"] = lineJson["conversation_id"]
                convObj["movieID"] = lineJson["meta"]["movie_id"]
                convObj["lines"] = [lineObj]
            else:
                convObj = conversations[lineJson["conversation_id"]]
                convObj["lines"].insert(0, lineObj)
            conversations[convObj["conversationID"]] = convObj

    return lines, conversations


# Extracts pairs of sentences from conversations
def extractSentencePairs(conversations):
    qa_pairs = []
    for conversation in conversations.values():
        # Iterate over all the lines of the conversation
        for i in range(len(conversation["lines"]) - 1):  # We ignore the last line (no answer for it)
            inputLine = conversation["lines"][i]["text"].strip()
            targetLine = conversation["lines"][i+1]["text"].strip()
            # Filter wrong samples (if one of the lists is empty)
            if inputLine and targetLine:
                qa_pairs.append([inputLine, targetLine])
    return qa_pairs

In [8]:
# Define path to new file
datafile = os.path.join(corpus, "formatted_movie_lines.txt")

delimiter = '\t'
# Unescape the delimiter
delimiter = str(codecs.decode(delimiter, "unicode_escape"))

# Initialize lines dict and conversations dict
lines = {}
conversations = {}
# Load lines and conversations
print("\nProcessing corpus into lines and conversations...")
lines, conversations = loadLinesAndConversations(os.path.join(corpus, "utterances.jsonl"))

# Write new csv file
print("\nWriting newly formatted file...")
with open(datafile, 'w', encoding='utf-8') as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter, lineterminator='\n')
    for pair in extractSentencePairs(conversations):
        writer.writerow(pair)

# Print a sample of lines
print("\nSample lines from file:")
printLines(datafile)


Processing corpus into lines and conversations...

Writing newly formatted file...

Sample lines from file:
b'They do to!\tThey do not!\n'
b'She okay?\tI hope so.\n'
b"Wow\tLet's go.\n"
b'"I\'m kidding.  You know how sometimes you just become this ""persona""?  And you don\'t know how to quit?"\tNo\n'
b"No\tOkay -- you're gonna need to learn how to lie.\n"
b"I figured you'd get to the good stuff eventually.\tWhat good stuff?\n"
b'What good stuff?\t"The ""real you""."\n'
b'"The ""real you""."\tLike my fear of wearing pastels?\n'
b'do you listen to this crap?\tWhat crap?\n'
b"What crap?\tMe.  This endless ...blonde babble. I'm like, boring myself.\n"


## Load and trim data

In [9]:
# Default word tokens
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token

class Voc:
    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # Count SOS, EOS, PAD

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    # Remove words below a certain count threshold
    def trim(self, min_count):
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = []

        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)

        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
        ))

        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3 # Count default tokens

        for word in keep_words:
            self.addWord(word)

In [10]:
MAX_LENGTH = 10  # Maximum sentence length to consider

# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s

# Read query/response pairs and return a voc object
def readVocs(datafile, corpus_name):
    print("Reading lines...")
    # Read the file and split into lines
    lines = open(datafile, encoding='utf-8').\
        read().strip().split('\n')
    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    voc = Voc(corpus_name)
    return voc, pairs

# Returns True if both sentences in a pair 'p' are under the MAX_LENGTH threshold
def filterPair(p):
    # Input sequences need to preserve the last word for EOS token
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH

# Filter pairs using the ``filterPair`` condition
def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

# Using the functions defined above, return a populated voc object and pairs list
def loadPrepareData(corpus, corpus_name, datafile, save_dir):
    print("Start preparing training data ...")
    voc, pairs = readVocs(datafile, corpus_name)
    print("Read {!s} sentence pairs".format(len(pairs)))
    pairs = filterPairs(pairs)
    print("Trimmed to {!s} sentence pairs".format(len(pairs)))
    print("Counting words...")
    for pair in pairs:
        voc.addSentence(pair[0])
        voc.addSentence(pair[1])
    print("Counted words:", voc.num_words)
    return voc, pairs


# Load/Assemble voc and pairs
save_dir = os.path.join("data", "save")
voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)
# Print some pairs to validate
print("\npairs:")
for pair in pairs[:10]:
    print(pair)

Start preparing training data ...
Reading lines...
Read 221282 sentence pairs
Trimmed to 64313 sentence pairs
Counting words...
Counted words: 18082

pairs:
['they do to !', 'they do not !']
['she okay ?', 'i hope so .']
['wow', 'let s go .']
['what good stuff ?', 'the real you .']
['the real you .', 'like my fear of wearing pastels ?']
['do you listen to this crap ?', 'what crap ?']
['well no . . .', 'then that s all you had to say .']
['then that s all you had to say .', 'but']
['but', 'you always been this selfish ?']
['have fun tonight ?', 'tons']


In [11]:
MIN_COUNT = 3    # Minimum word count threshold for trimming

def trimRareWords(voc, pairs, MIN_COUNT):
    # Trim words used under the MIN_COUNT from the voc
    voc.trim(MIN_COUNT)
    # Filter out pairs with trimmed words
    keep_pairs = []
    for pair in pairs:
        input_sentence = pair[0]
        output_sentence = pair[1]
        keep_input = True
        keep_output = True
        # Check input sentence
        for word in input_sentence.split(' '):
            if word not in voc.word2index:
                keep_input = False
                break
        # Check output sentence
        for word in output_sentence.split(' '):
            if word not in voc.word2index:
                keep_output = False
                break

        # Only keep pairs that do not contain trimmed word(s) in their input or output sentence
        if keep_input and keep_output:
            keep_pairs.append(pair)

    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), len(keep_pairs) / len(pairs)))
    return keep_pairs


# Trim voc and pairs
pairs = trimRareWords(voc, pairs, MIN_COUNT)

keep_words 7833 / 18079 = 0.4333
Trimmed from 64313 pairs to 53131, 0.8261 of total


## Prepare Data for Models

In [12]:
def indexesFromSentence(voc, sentence):
    return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]


def zeroPadding(l, fillvalue=PAD_token):
    return list(itertools.zip_longest(*l, fillvalue=fillvalue))

def binaryMatrix(l, value=PAD_token):
    m = []
    for i, seq in enumerate(l):
        m.append([])
        for token in seq:
            if token == PAD_token:
                m[i].append(0)
            else:
                m[i].append(1)
    return m

# Returns padded input sequence tensor and lengths
def inputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    padVar = torch.LongTensor(padList)
    return padVar, lengths

# Returns padded target sequence tensor, padding mask, and max target length
def outputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    mask = binaryMatrix(padList)
    mask = torch.BoolTensor(mask)
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len

# Returns all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):
    pair_batch.sort(key=lambda x: len(x[0].split(" ")), reverse=True)
    input_batch, output_batch = [], []
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
    inp, lengths = inputVar(input_batch, voc)
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len


# Example for validation
small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

print("input_variable:", input_variable)
print("lengths:", lengths)
print("target_variable:", target_variable)
print("mask:", mask)
print("max_target_len:", max_target_len)

input_variable: tensor([[307,  19,  36, 254, 162],
        [651, 181, 201, 943,  14],
        [ 24,  24, 182,  14,   2],
        [124, 297, 210,   2,   0],
        [ 24,  66,  14,   0,   0],
        [ 48,  10,   2,   0,   0],
        [ 36,   2,   0,   0,   0],
        [ 10,   0,   0,   0,   0],
        [  2,   0,   0,   0,   0]])
lengths: tensor([9, 7, 6, 4, 3])
target_variable: tensor([[  11,  948,  128,   20,   33],
        [  48,  101,   36,  129,  409],
        [  36,   22,  140,   14,  158],
        [  14,  752,   14,   16,   99],
        [   2, 4069,   36,   17,   14],
        [   0,   14,   17,   18,    2],
        [   0,    2,   72, 6184,    0],
        [   0,    0, 2606,  160,    0],
        [   0,    0,   14,   14,    0],
        [   0,    0,    2,    2,    0]])
mask: tensor([[ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  

## Define Models

## Encoder

In [13]:
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding

        # Initialize GRU; the input_size and hidden_size parameters are both set to 'hidden_size'
        #   because our input size is a word embedding with number of features == hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,dropout=dropout, bidirectional=True)

    def forward(self, input_seq : torch.Tensor, input_lengths : torch.Tensor):
        # shape[0] = num_layers * num_directions
        # Initiating hidden with zeros
        hidden = torch.zeros(self.n_layers * 2, input_seq.shape[1], self.hidden_size).to(input_seq.device)
        # Convert word indexes to embeddings
        embedded = self.embedding(input_seq)
        # Pack padded batch of sequences for RNN module
        packed = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        # Forward pass through GRU
        outputs, hidden = self.gru(packed, hidden)
        # Unpack padding
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional GRU outputs
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:]
        # Return output and final hidden state
        return outputs, hidden

## Decoder

In [14]:
# Luong attention layer
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size
        self.attn = nn.Linear(self.hidden_size, hidden_size)
        self.v = nn.Parameter(torch.FloatTensor(hidden_size))

    def dot_score(self, hidden, encoder_output):
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        # Calculate the attention weights (energies) based on the given method
        # Initialize attn_energies
        attn_energies = torch.randn(1,1,1)
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()

        # Return the softmax normalized probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)


In [15]:
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()

        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Define layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step : torch.Tensor, last_hidden : torch.Tensor, encoder_outputs : torch.Tensor):
        # Note: we run this one step (word) at a time
        # Get embedding of current input word
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        # Forward through unidirectional GRU
        rnn_output, hidden = self.gru(embedded, last_hidden)
        # Calculate attention weights from the current GRU output
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Multiply attention weights to encoder outputs to get new "weighted sum" context vector
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        # Concatenate weighted context vector and GRU output using Luong eq. 5
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        # Predict next word using Luong eq. 6
        output = self.out(concat_output)
        output = F.softmax(output, dim=1)
        # Return output and final hidden state
        return output, hidden

## Define Training Procedure

### Masked loss

In [16]:
def maskNLLLoss(inp, target, mask):
    nTotal = mask.sum()
    crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)).squeeze(1))
    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(device)
    return loss, nTotal.item()

### Single training iteration

In [17]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip,teacher_forcing_ratio, max_length=MAX_LENGTH):

    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Set device options
    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    # Lengths for RNN packing should always be on the CPU
    lengths = lengths.to("cpu")

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder

    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)

    # Set initial decoder hidden state to the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Determine if we are using teacher forcing this iteration
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    # Forward batch of sequences through decoder one time step at a time
    if use_teacher_forcing:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
    else:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # No teacher forcing: next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal

    # Record loss
    wandb.log({"loss": loss.item()})

    # Perform backpropagation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals

### Training iterations

In [18]:
def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, corpus_name,teacher_forcing_ratio, profile=False, save_model = False):

    # Load batches for each iteration
    training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
                      for _ in range(n_iteration)]

    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0

    # Training loop
    print("Training...")

    # Code for profiling
    if profile:
        myprofiler = torch.profiler.profile(
                schedule=torch.profiler.schedule(wait=1, warmup=10, active=10, repeat=1),
                on_trace_ready=torch.profiler.tensorboard_trace_handler(f'./profiler_logs/{wandb.run.name}.log'),
                record_shapes=True,
                with_stack=True)
        # Start the profiler
        myprofiler.start()

    for iteration in range(start_iteration, n_iteration + 1):


        training_batch = training_batches[iteration - 1]
        # Extract fields from batch
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        # Run a training iteration with batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip, teacher_forcing_ratio)
        print_loss += loss

        if profile:
            myprofiler.step()

        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("\rIteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg),  end="", flush=True)
            print_loss = 0

    # End the profiler
    if profile:
        myprofiler.stop()

    if save_model:
        directory = os.path.join(save_dir, model_name, corpus_name, '{}'.format(wandb.run.name))
        if not os.path.exists(directory):
            os.makedirs(directory)
        torch.save({
            'iteration': iteration,
            'en': encoder.state_dict(),
            'de': decoder.state_dict(),
            'en_opt': encoder_optimizer.state_dict(),
            'de_opt': decoder_optimizer.state_dict(),
            'loss': loss,
            'voc_dict': voc.__dict__,
            'embedding': embedding.state_dict()
        }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))


## Define Evaluation

### Greedy decoding

In [19]:
class GreedySearchDecoder(nn.Module):
    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
    def forward(self, input_seq : torch.Tensor, input_length : torch.Tensor, max_length : int):
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        # Initialize decoder input with SOS_token
        batch_size = input_seq.shape[1]
        device = input_seq.device
        SOS_token = 1
        decoder_input = torch.ones(1, batch_size, device=device, dtype=torch.long) * SOS_token
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores

### Evaluate my text

In [20]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
    ### Format input sentence as a batch
    # words -> indexes
    indexes_batch = [indexesFromSentence(voc, sentence)]
    # Create lengths tensor
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    # Use appropriate device
    print(input_batch.shape)
    input_batch = input_batch.to(device)
    lengths = lengths.to("cpu")
    # Decode sentence with searcher
    tokens, scores = searcher(input_batch, lengths, max_length)
    # indexes -> words
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words

def evaluateTrainData(searcher, eval_batches, n_iteration, device, max_length=MAX_LENGTH):
    time_diffs = []
    searcher = searcher.to(device)
    for iteration in range(n_iteration):
        training_batch = eval_batches[iteration]
        input_variable, lengths, target_variable, mask, max_target_len = training_batch
        input_variable = input_variable.to(device)
        if iteration > 5:
            torch.cuda.synchronize()
            start = time.monotonic_ns()
        with torch.no_grad():
            output = searcher(input_variable, lengths, max_length)
        if iteration > 5:
            torch.cuda.synchronize()
            end = time.monotonic_ns()
        if iteration > 5:
            time_diffs.append(end-start)
    return np.array(time_diffs) / 1e6

def evaluateInput(encoder, decoder, searcher, voc):
    input_sentence = ''
    while(1):
        try:
            # Get input sentence
            input_sentence = input('> ')
            print("User:", input_sentence)
            # Check if it is quit case
            if input_sentence == 'q' or input_sentence == 'quit': break
            # Normalize sentence
            input_sentence = normalizeString(input_sentence)
            # Evaluate sentence
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # Format and print response sentence
            output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
            print('Bot:', ' '.join(output_words))

        except KeyError:
            print("Error: Encountered unknown word.")

## Run Model

In [21]:
# Load model if a ``loadFilename`` is provided
def wrapper_train(config=None, profile = False, save_model = False):
    run = wandb.init(project="HPMLHW3", entity="amruthapatil")
    config = run.config if config is None else config
    model_name = 'cb_model'
    attn_model = 'dot'
    #``attn_model = 'general'``
    #``attn_model = 'concat'``
    hidden_size = 500
    encoder_n_layers = 2
    decoder_n_layers = 2
    dropout = 0.1
    batch_size = 64

    print('Building encoder and decoder ...')
    # Initialize word embeddings
    embedding = nn.Embedding(voc.num_words, hidden_size)
    # Initialize encoder & decoder models
    encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
    decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
    # Use appropriate device
    encoder = encoder.to(device)
    decoder = decoder.to(device)
    print('Models built and ready to go!')

    # Configure training/optimization
    clip = config.clip
    teacher_forcing_ratio = config.tf_ratio
    learning_rate = config.lr
    decoder_learning_ratio = config.decoder_lrn_ratio
    n_iteration = 4000
    print_every = 1
    save_every = 500

    # Ensure dropout layers are in train mode
    encoder.train()
    decoder.train()

    # Initialize optimizers
    print('Building optimizers ...')
    optimizer_fn = optim.Adam if config.optimizer == "adam" else optim.SGD
    encoder_optimizer = optimizer_fn(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optimizer_fn(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)

    # If you have CUDA, configure CUDA to call
    for state in encoder_optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.cuda()

    for state in decoder_optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.cuda()

    # Run training iterations
    print("Starting Training!")
    trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
            embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
            print_every, save_every, clip, corpus_name, teacher_forcing_ratio, profile, save_model)
    return encoder, decoder

## Run Training

In [22]:
from dataclasses import dataclass
@dataclass
class Config:
    clip: float
    tf_ratio: float
    lr: float
    optimizer: str
    decoder_lrn_ratio: float
encoder, decoder = wrapper_train(Config(clip=0.0,tf_ratio=0.0,lr=0.0001,optimizer="adam", decoder_lrn_ratio=1.0), profile=True, save_model=True)

[34m[1mwandb[0m: Currently logged in as: [33mamruthapatil[0m. Use [1m`wandb login --relogin`[0m to force relogin


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 3.1210

In [27]:
encoder.eval()
decoder.eval()
n_iteration = 100
random.seed(1234)
eval_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(64)])
                    for _ in range(n_iteration)]

# Initialize search module
searcher = GreedySearchDecoder(encoder, decoder)
pytorch_gpu = evaluateTrainData(searcher,eval_batches, n_iteration, "cuda")
pytorch_gpu.mean()

24.19891691489362

In [28]:
pytorch_cpu = evaluateTrainData(searcher,eval_batches, n_iteration, "cpu")
pytorch_cpu.mean()

303.01220423404254

## Run Evaluation

In [29]:
# Set dropout layers to ``eval`` mode
encoder.eval()
decoder.eval()

# Initialize search module
searcher = GreedySearchDecoder(encoder, decoder)

# Begin chatting (uncomment and run the following line to begin)
# evaluateInput(encoder, decoder, searcher, voc)

## W&B Random Search Sweep

In [25]:
sweep_config = {
    'method': 'random',
    'name': 'Lab3-sweep',
    'metric': {
      'goal': 'minimize',
      'name': 'loss'
    },
    'parameters': {
        'lr': {
            'values': [0.0001, 0.00025, 0.0005, 0.001]
        },
        'optimizer': {
            'values': ["adam", "sgd"]
        },
        'clip': {
            'values': [0, 25, 50, 100]
        },
        'tf_ratio': {
            'values': [0, 0.5, 1.0]
        },
        "decoder_lrn_ratio":{
            "values": [1.0, 3.0, 5.0, 10.0]
        }
    },
    'max_sweeps': 25
}

# Initialize the sweep
sweep_id = wandb.sweep(sweep=sweep_config, project="HPMLHW3")

# # Run the sweep
wandb.agent(sweep_id, function=wrapper_train)



Create sweep with ID: s6qeq5nv
Sweep URL: https://wandb.ai/amruthapatil/HPMLHW3/sweeps/s6qeq5nv


[34m[1mwandb[0m: Agent Starting Run: t09mytld with config:
[34m[1mwandb[0m: 	clip: 50
[34m[1mwandb[0m: 	decoder_lrn_ratio: 5
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	tf_ratio: 0.5


Exception in thread IntMsgThr:
Traceback (most recent call last):
Exception in thread ChkStopThr:
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
Exception in thread   File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
NetStatThr:
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner


        self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
        self._target(*self._args, **self._kwargs)
self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/wandb_run.py", line 268, in check_network_status
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/wandb_run.py", line 286, in check_stop_status
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/wandb_run.py", line 300, in check_internal_messages
    self._loop_check_status(
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/wandb_run.py", line 224, in _loop_check_status
        self._loop_check_status(
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/wandb_run.py", line 224, in _loop_check_status
    local_handle = request()
  File "/usr/lo

Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 4.1724

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,█▇▇▇▆█▇▄▅▅▂▄▄▄▄▃▇▇▄▆▆▂▃▇▂▁▃▂▂▅▂▂▆▆▅▆▆▅▂▆

0,1
loss,32.78409


[34m[1mwandb[0m: Agent Starting Run: a5d6aa92 with config:
[34m[1mwandb[0m: 	clip: 0
[34m[1mwandb[0m: 	decoder_lrn_ratio: 3
[34m[1mwandb[0m: 	lr: 0.00025
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	tf_ratio: 0.5
[34m[1mwandb[0m: Currently logged in as: [33mamruthapatil[0m. Use [1m`wandb login --relogin`[0m to force relogin


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 8.9596

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,▆▅▇▇▄▅▄▅▆▄▄▄▅▆█▅▅▇▃▆▇▇▇▅▃▆▃▆▄▄▅▆▆█▁▃▅▆▄▅

0,1
loss,89.57716


[34m[1mwandb[0m: Agent Starting Run: juudddqj with config:
[34m[1mwandb[0m: 	clip: 100
[34m[1mwandb[0m: 	decoder_lrn_ratio: 3
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	tf_ratio: 0.5


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 4.8252

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,██▇▅▄▄▃▃▂▂▃▃▃▂▃▂▃▂▂▂▁▂▁▂▂▁▂▂▂▂▁▂▁▂▂▁▁▁▂▂

0,1
loss,39.99966


[34m[1mwandb[0m: Agent Starting Run: n0s45h7q with config:
[34m[1mwandb[0m: 	clip: 50
[34m[1mwandb[0m: 	decoder_lrn_ratio: 3
[34m[1mwandb[0m: 	lr: 0.00025
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	tf_ratio: 1


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 1.9853

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,█▇▅▆▅▅▅▅▃▅▅▄▄▅▅▃▃▄▄▄▃▃▄▃▅▃▃▃▃▂▂▂▂▂▂▁▂▂▂▁

0,1
loss,13.97928


[34m[1mwandb[0m: Agent Starting Run: ksof0n0m with config:
[34m[1mwandb[0m: 	clip: 100
[34m[1mwandb[0m: 	decoder_lrn_ratio: 1
[34m[1mwandb[0m: 	lr: 0.0005
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	tf_ratio: 0.5


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 5.0291

VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,█▅▄▃▂▃▂▂▃▂▂▂▁▂▂▁▂▁▂▂▁▁▁▂▁▂▂▁▁▁▂▁▂▁▁▂▁▁▂▁

0,1
loss,39.81433


[34m[1mwandb[0m: Agent Starting Run: dv26orhb with config:
[34m[1mwandb[0m: 	clip: 100
[34m[1mwandb[0m: 	decoder_lrn_ratio: 5
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	tf_ratio: 1


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 2.5599

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,█▆▇▆▅▄▅▄▄▅▄▄▄▃▃▃▄▃▂▂▄▃▄▂▂▂▂▁▂▂▂▁▂▁▂▂▁▁▁▂

0,1
loss,20.63436


[34m[1mwandb[0m: Agent Starting Run: 7i26rxv4 with config:
[34m[1mwandb[0m: 	clip: 0
[34m[1mwandb[0m: 	decoder_lrn_ratio: 10
[34m[1mwandb[0m: 	lr: 0.0005
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	tf_ratio: 0.5


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 8.9604

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,▆▆▅▄▅▄▄▅▆▆▃▅▅▆▅▄▅▄▇█▆▃▆▁▄▆█▄▅█▅▂▄▄▅▅▃▁▃▄

0,1
loss,89.52032


[34m[1mwandb[0m: Agent Starting Run: no3wlfgu with config:
[34m[1mwandb[0m: 	clip: 25
[34m[1mwandb[0m: 	decoder_lrn_ratio: 3
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	tf_ratio: 1


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 4.8561

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,██▇▅▄▄▃▂▃▃▂▃▃▂▂▂▂▁▂▁▁▂▂▁▂▂▁▁▁▁▁▁▂▁▁▂▁▁▁▁

0,1
loss,36.05902


[34m[1mwandb[0m: Agent Starting Run: l01sfgjb with config:
[34m[1mwandb[0m: 	clip: 100
[34m[1mwandb[0m: 	decoder_lrn_ratio: 10
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	tf_ratio: 0


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: nan

VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,▄▁▃█▇

0,1
loss,


[34m[1mwandb[0m: Agent Starting Run: p6bbhxo2 with config:
[34m[1mwandb[0m: 	clip: 0
[34m[1mwandb[0m: 	decoder_lrn_ratio: 10
[34m[1mwandb[0m: 	lr: 0.0005
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	tf_ratio: 0


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 8.9797

VBox(children=(Label(value='0.001 MB of 0.011 MB uploaded\r'), FloatProgress(value=0.10829147942550005, max=1.…

0,1
loss,▆▄▅▃▅█▄▄▆▄▄▅▄█▄▆▆▆▆▇▄▄▅▆▁▅▆▄▅▇▅▄▅▅▅▄▄▄▇▆

0,1
loss,89.75955


[34m[1mwandb[0m: Agent Starting Run: yxlmnfep with config:
[34m[1mwandb[0m: 	clip: 50
[34m[1mwandb[0m: 	decoder_lrn_ratio: 1
[34m[1mwandb[0m: 	lr: 0.00025
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	tf_ratio: 0


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112443200004742, max=1.0…

Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 5.0832

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,█▆▄▄▃▃▃▂▂▃▂▂▂▂▂▂▂▂▁▂▂▂▁▂▂▂▂▁▂▁▂▁▂▁▂▁▁▁▁▂

0,1
loss,41.48812


[34m[1mwandb[0m: Agent Starting Run: jh339dgn with config:
[34m[1mwandb[0m: 	clip: 100
[34m[1mwandb[0m: 	decoder_lrn_ratio: 10
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	tf_ratio: 0.5


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 4.5786

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,█▅▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▂▂▂▂▂▂▂▂▂▁▁▁▁▂▁▂

0,1
loss,37.33269


[34m[1mwandb[0m: Agent Starting Run: 4twlgup8 with config:
[34m[1mwandb[0m: 	clip: 25
[34m[1mwandb[0m: 	decoder_lrn_ratio: 10
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	tf_ratio: 0


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 4.4766

VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,█▅▄▃▄▄▄▄▃▃▃▄▃▃▂▃▃▃▃▃▂▂▃▃▂▃▂▃▂▂▃▃▂▁▂▃▃▁▃▂

0,1
loss,34.34956


[34m[1mwandb[0m: Agent Starting Run: qrt4hk4v with config:
[34m[1mwandb[0m: 	clip: 25
[34m[1mwandb[0m: 	decoder_lrn_ratio: 5
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	tf_ratio: 0


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 5.7424

VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,▄▅▄▄▄▆▅▅▄▅▄▅▆▄▆▅▇▃▆▆▅▁▅▇▅▅▇▇▅▅▇▅▆▆▇▃█▆▆▅

0,1
loss,48.0771


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: gukxkcl9 with config:
[34m[1mwandb[0m: 	clip: 50
[34m[1mwandb[0m: 	decoder_lrn_ratio: 10
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	tf_ratio: 1


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 3.5882

VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,█▅▄▄▃▃▃▄▂▃▂▃▃▂▂▃▂▂▃▂▃▃▂▂▂▂▂▃▂▂▂▂▃▂▂▂▁▂▂▁

0,1
loss,28.37185


[34m[1mwandb[0m: Agent Starting Run: meuvyqer with config:
[34m[1mwandb[0m: 	clip: 100
[34m[1mwandb[0m: 	decoder_lrn_ratio: 5
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	tf_ratio: 0


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 4.1260

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,█▄▇▅▆▆▅▅▃▆▅▄▃▄▄▅▄▅▄▄▃▃▄▅▃▄▃▄▃▃▃▄▄▄▂▃▃▃▁▂

0,1
loss,33.70003


[34m[1mwandb[0m: Agent Starting Run: 5jgvszd3 with config:
[34m[1mwandb[0m: 	clip: 100
[34m[1mwandb[0m: 	decoder_lrn_ratio: 1
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	tf_ratio: 0


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 4.1579

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,█▆█▄▇█▆▆█▆▆▆▇▅▃▅█▄▅▃▄▇▇▄▃▅▄▃▅▄▃▄▃▅▂▅▂▁▁▂

0,1
loss,33.27174


[34m[1mwandb[0m: Agent Starting Run: k197bw2t with config:
[34m[1mwandb[0m: 	clip: 25
[34m[1mwandb[0m: 	decoder_lrn_ratio: 10
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	tf_ratio: 0.5


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 5.0095

VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,█▅▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▁▁▂▁▂▂▂▂▂▁▂▁▂▂

0,1
loss,43.56062


[34m[1mwandb[0m: Agent Starting Run: ke0rwpzj with config:
[34m[1mwandb[0m: 	clip: 50
[34m[1mwandb[0m: 	decoder_lrn_ratio: 1
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	tf_ratio: 0


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 4.7092

VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,█▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▁▂▂▁▂▁▂▁▂▂

0,1
loss,38.81927


[34m[1mwandb[0m: Agent Starting Run: 9t4zcrmt with config:
[34m[1mwandb[0m: 	clip: 100
[34m[1mwandb[0m: 	decoder_lrn_ratio: 5
[34m[1mwandb[0m: 	lr: 0.0005
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	tf_ratio: 0


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 4.5026

VBox(children=(Label(value='0.001 MB of 0.011 MB uploaded\r'), FloatProgress(value=0.10953527785210954, max=1.…

0,1
loss,█▆▅▃▄▃▄▂▃▄▂▃▃▂▂▃▂▃▄▃▃▃▃▂▁▄▂▂▃▂▁▂▂▂▂▂▃▃▃▁

0,1
loss,34.87053


[34m[1mwandb[0m: Agent Starting Run: 0gehlusy with config:
[34m[1mwandb[0m: 	clip: 100
[34m[1mwandb[0m: 	decoder_lrn_ratio: 1
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	tf_ratio: 1


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 3.7536

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,█▄▄▄▄▃▃▂▃▂▃▃▃▂▂▃▃▂▂▂▂▂▁▂▂▂▂▂▁▁▁▂▂▂▁▂▂▁▁▁

0,1
loss,30.11032


[34m[1mwandb[0m: Agent Starting Run: xhe149ux with config:
[34m[1mwandb[0m: 	clip: 0
[34m[1mwandb[0m: 	decoder_lrn_ratio: 1
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	tf_ratio: 0.5


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 8.9781

VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,▃▅▅▃▆▇▁▅▄▆▅▆▆▅▃█▂▄█▂▄▅▅▄▅▆▃▁▅▇▅▅▃▇▅▃▄▃▆▇

0,1
loss,89.81565


[34m[1mwandb[0m: Agent Starting Run: jzdjma1p with config:
[34m[1mwandb[0m: 	clip: 50
[34m[1mwandb[0m: 	decoder_lrn_ratio: 5
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	tf_ratio: 1


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 3.9752

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,█▅▄▃▄▃▃▃▃▃▃▂▃▂▃▄▃▃▃▃▃▂▂▃▂▃▂▃▂▂▂▃▂▂▂▂▁▂▂▁

0,1
loss,32.45069


[34m[1mwandb[0m: Agent Starting Run: qnp9r4h9 with config:
[34m[1mwandb[0m: 	clip: 50
[34m[1mwandb[0m: 	decoder_lrn_ratio: 5
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	tf_ratio: 1


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 4.6301

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,▇█▅▅▆▄▄▄▆▅▅▅▆▄▆▅▆▄▄▅▅▄▆▅▅▅▅▇▄▃▅▅▅▁▂▄▅▅▆▆

0,1
loss,37.28604


[34m[1mwandb[0m: Agent Starting Run: bl5xnct7 with config:
[34m[1mwandb[0m: 	clip: 50
[34m[1mwandb[0m: 	decoder_lrn_ratio: 3
[34m[1mwandb[0m: 	lr: 0.0005
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	tf_ratio: 0


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 4.8672

VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,█▅▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▂▂▂▂▂▁▂▂▂▂▂▂▁▁▂▁▁

0,1
loss,39.27121


[34m[1mwandb[0m: Agent Starting Run: 0s33llkj with config:
[34m[1mwandb[0m: 	clip: 100
[34m[1mwandb[0m: 	decoder_lrn_ratio: 10
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	tf_ratio: 1


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 3.8382

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,█▅▅▅▃▅▄▅▅▄▂▃▃▃▂▂▃▃▂▂▃▃▃▂▃▃▃▂▃▂▃▂▂▃▂▂▂▁▂▃

0,1
loss,28.64373


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 5qd5msbt with config:
[34m[1mwandb[0m: 	clip: 50
[34m[1mwandb[0m: 	decoder_lrn_ratio: 10
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	tf_ratio: 0


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 3.8169

VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,▇▇██▇▃▄▆▆▅▅▅▅▅▄▅▃▄▄▄▅▃▄▄▄▂▃▃▃▃▂▂▃▄▂▂▁▂▂▁

0,1
loss,32.30785


[34m[1mwandb[0m: Agent Starting Run: pe5y3rc1 with config:
[34m[1mwandb[0m: 	clip: 0
[34m[1mwandb[0m: 	decoder_lrn_ratio: 3
[34m[1mwandb[0m: 	lr: 0.00025
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	tf_ratio: 0.5


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 8.9759

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,█████████████████▁██████████████████████

0,1
loss,89.80953


[34m[1mwandb[0m: Agent Starting Run: iu4mkt2p with config:
[34m[1mwandb[0m: 	clip: 100
[34m[1mwandb[0m: 	decoder_lrn_ratio: 1
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	tf_ratio: 0


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 4.5088

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,█▄▄▂▂▃▂▃▃▃▂▂▂▂▃▄▃▂▃▂▃▂▃▂▃▃▂▂▂▂▃▁▂▂▂▂▃▂▁▃

0,1
loss,38.02567


[34m[1mwandb[0m: Agent Starting Run: z8q2ht31 with config:
[34m[1mwandb[0m: 	clip: 25
[34m[1mwandb[0m: 	decoder_lrn_ratio: 10
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	tf_ratio: 1


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: nan

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,▄▁▂▄▃▇▁█

0,1
loss,


[34m[1mwandb[0m: Agent Starting Run: 77f4hmxz with config:
[34m[1mwandb[0m: 	clip: 100
[34m[1mwandb[0m: 	decoder_lrn_ratio: 10
[34m[1mwandb[0m: 	lr: 0.0005
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	tf_ratio: 0.5


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 510; Percent complete: 12.8%; Average loss: 5.5675

[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


Iteration: 523; Percent complete: 13.1%; Average loss: 4.8016Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x79f565857c40>> (for post_run_cell):


BrokenPipeError: [Errno 32] Broken pipe

## Best result - hyperparameter

As per the analysis, values of the hyperparameters that give the best result (Minimum loss of the trained model) is below:

- clip=50.0,
- tf_ratio=1.0,
- lr=0.00025,
- optimizer = "adam",
- decoder_lrn_ratio=3.0


In [35]:
best_result = Config(
    clip=50.0,
    tf_ratio=1.0,
    lr=0.00025,
    optimizer = "adam",
    decoder_lrn_ratio=3.0
)
_, _ = wrapper_train(best_result, profile=True, save_model=True)

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,█▇▅▅▅▄▄▃▅▄▃▄▄▃▃▃▃▂▃▄▃▃▃▃▃▃▃▃▂▁▂▂▄▃▂▂▂▁▂▂

0,1
loss,24.4586


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 4000; Percent complete: 100.0%; Average loss: 2.1052

## PART 2

## TorchScript


In [30]:
# Converting the model that I trained in the previous exercise to Torchscript
torchscript_searcher = torch.jit.script(searcher)

# Printing the graph of the converted model
torchscript_searcher.graph

graph(%self : __torch__.GreedySearchDecoder,
      %input_seq.1 : Tensor,
      %input_length.1 : Tensor,
      %max_length.1 : int):
  %60 : bool = prim::Constant[value=0]()
  %49 : bool = prim::Constant[value=1]() # <ipython-input-19-d0c786c2d337>:20:8
  %25 : int = prim::Constant[value=4]() # <ipython-input-19-d0c786c2d337>:15:71
  %15 : NoneType = prim::Constant()
  %SOS_token.1 : int = prim::Constant[value=1]() # <ipython-input-19-d0c786c2d337>:12:37
  %33 : int = prim::Constant[value=0]() # <ipython-input-19-d0c786c2d337>:17:34
  %encoder : __torch__.EncoderRNN = prim::GetAttr[name="encoder"](%self)
  %7 : (Tensor, Tensor) = prim::CallMethod[name="forward"](%encoder, %input_seq.1, %input_length.1) # <ipython-input-19-d0c786c2d337>:8:42
  %encoder_outputs.1 : Tensor, %encoder_hidden.1 : Tensor = prim::TupleUnpack(%7)
  %decoder.1 : __torch__.LuongAttnDecoderRNN = prim::GetAttr[name="decoder"](%self)
  %n_layers : int = prim::GetAttr[name="n_layers"](%decoder.1)
  %decoder_hidden.1

 ### Evaluate the Torchscript model

In [31]:
# Evaluating the Torchscript model with GPU
torchscript_gpu = evaluateTrainData(torchscript_searcher,eval_batches, n_iteration, "cuda")

  return forward_call(*args, **kwargs)


In [32]:
# Evaluating the Torchscript model with CPU
torchscript_cpu= evaluateTrainData(torchscript_searcher,eval_batches, n_iteration, "cpu")

In [39]:
# Latency comparison table

df_table = pd.DataFrame({
    'Framework': ['PyTorch', 'TorchScript'],
    'Latency on CPU (ms)': [pytorch_cpu.mean(), torchscript_cpu.mean()],
    'Latency on GPU (ms)': [pytorch_gpu.mean(), torchscript_gpu.mean()]
})

df_table

Unnamed: 0,Framework,Latency on CPU (ms),Latency on GPU (ms)
0,PyTorch,303.012204,24.198917
1,TorchScript,304.062399,12.50789


### Save model


In [34]:
# Save and serialize it for use in a non-Python deployment environment.
torchscript_searcher.save("torchscript_searcher.pt")