# Let **rayquaza** use HyperBeam

You are probably very excited about ChatGPT.  In today's class, we will implement a very simple language model, which is essentially what ChatGPT is, but with a simple LSTM.  You may be surprised to find that it is not difficult at all.

The paper we base this on is *Regularizing and Optimizing LSTM Language Models*, https://arxiv.org/abs/1708.02182

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer

import torchtext, datasets, math
from tqdm import tqdm
from torchtext.vocab import build_vocab_from_iterator

from spacy.lang.en.stop_words import STOP_WORDS
import spacy
import re
import pickle


In [2]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

# Pin the RNG seed and ask cuDNN for deterministic kernels so that results
# stay comparable after a kernel restart (3407 is the seed the author chose).
torch.manual_seed(3407)
torch.backends.cudnn.deterministic = True

cuda


## 1. Load data - Wiki Text

We will be using the `codeparrot/github-jupyter-code-to-text` dataset, which contains a large corpus of text, perfect for the language modeling task.  This time, we will use the `datasets` library from HuggingFace to load it.

In [3]:

# Dataset page: https://huggingface.co/datasets/codeparrot/github-jupyter-code-to-text/tree/main
# The raw text is loaded here and cleaned later in this notebook.
train_all_set = datasets.load_dataset(
    "codeparrot/github-jupyter-code-to-text",
    split="train",
)
test_all_set = datasets.load_dataset(
    "codeparrot/github-jupyter-code-to-text",
    split="test",
)
# spaCy pipeline used by the preprocessing step for POS tags and lemmas.
nlp = spacy.load('en_core_web_md')

Using custom data configuration codeparrot--github-jupyter-code-to-text-cf9b56d996fd17e1
Found cached dataset parquet (/home/atichets/.cache/huggingface/datasets/codeparrot___parquet/codeparrot--github-jupyter-code-to-text-cf9b56d996fd17e1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Using custom data configuration codeparrot--github-jupyter-code-to-text-cf9b56d996fd17e1
Found cached dataset parquet (/home/atichets/.cache/huggingface/datasets/codeparrot___parquet/codeparrot--github-jupyter-code-to-text-cf9b56d996fd17e1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [4]:

# Flatten every document of each split into its individual non-empty lines.
train_set = [
    line
    for document in train_all_set['content']
    for line in document.split('\n')
    if line
]
test_set = [
    line
    for document in test_all_set['content']
    for line in document.split('\n')
    if line
]

In [5]:
# Number of non-empty lines in each split.
(len(train_set), len(test_set))

(11367363, 2875424)

## 2. Preprocessing

### Tokenizing

Simply tokenize the given text to tokens.

In [6]:
# How many lines of each split are actually tokenized further down.
train_total, test_total = 300_000, 3_000

In [7]:
def yield_tokens(data_iter):
    """Lazily yield the token list produced by ``tokenizer`` for each text."""
    yield from (tokenizer(text) for text in data_iter)


tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

# Both streams are generators: nothing is tokenized until they are consumed.
# Training uses the second block of ``train_total`` lines, evaluation the
# first ``test_total`` lines of the test split.
tokenized_dataset_train = yield_tokens(train_set[train_total:2 * train_total])
tokenized_dataset_test = yield_tokens(test_set[:test_total])

In [8]:


def preprocessing(sentence):
    """Clean ``sentence`` and return its remaining lemmas as one string.

    HTML-like tags and runs of non-ASCII characters are removed first. The
    result is parsed with the module-level spaCy pipeline ``nlp``; stop words
    and tokens tagged PUNCT, SPACE, SYM or X are dropped, and every surviving
    token is replaced by its lower-cased, stripped lemma.

    Args:
        sentence: Raw text to clean.

    Returns:
        The kept lemmas joined by single spaces.
    """
    # Remove anything that looks like an HTML tag.
    sentence = re.sub(r"<[^>]*>", "", sentence)
    # Remove every run of characters outside the ASCII range 0x00-0x7F.
    sentence = re.sub(r"[^\x00-\x7F]+", "", sentence)

    skipped_pos = {'PUNCT', 'SPACE', 'SYM', 'X'}
    cleaned_tokens = [
        token.lemma_.lower().strip()
        for token in nlp(sentence)
        # Test membership on STOP_WORDS directly: copying it into a list on
        # every call made each lookup a linear scan.
        if token.text not in STOP_WORDS and token.pos_ not in skipped_pos
    ]

    return " ".join(cleaned_tokens)

### Numericalizing

We will tell torchtext to add any word that has occurred at least five times (`min_freq=5`) in the dataset to the vocabulary because otherwise it would be too big.  Also we shall make sure to add the special tokens `<unk>`, `<pad>`, `<sos>` and `<eos>`.

In [9]:
# When True, the following cells rebuild the vocabulary and overwrite the
# saved vocabulary files; when False they load the previously saved files.
RENEW = False

In [10]:
def yield_tokens(data_iter):
    """Yield the token list of each text after it went through ``preprocessing``."""
    for raw_text in data_iter:
        cleaned_text = preprocessing(raw_text)
        yield tokenizer(cleaned_text)
if RENEW:
    # NOTE(review): the slice is bounded by ``test_total`` although it is taken
    # from ``train_set`` — confirm this small sample is intentional.
    vocab = build_vocab_from_iterator(yield_tokens(train_set[:test_total]), min_freq=5)
    # insert_token shifts every later entry one slot to the right, so the
    # specials have to be inserted in ascending index order to end up at the
    # indices the rest of the notebook assumes (unk=0, pad=1, sos=2, eos=3).
    # The previous order (0, 3, 2, 1) left <sos> at 3 and <eos> at 5.
    vocab.insert_token('<unk>', 0)
    vocab.insert_token('<pad>', 1)
    vocab.insert_token('<sos>', 2)
    vocab.insert_token('<eos>', 3)
    # Unknown words map to <unk> instead of raising.
    vocab.set_default_index(vocab['<unk>'])
    print('Vocab Size',len(vocab))
    print(vocab.get_itos()[:10])

In [11]:
if RENEW:
    # Dump the vocabulary as plain text, one entry per line in index order.
    with open('vocabjjngu.txt', 'w') as file:
        file.writelines(f"{item}\n" for item in vocab.get_itos())
    print('Done')
else:
    # Read the saved entries back; the context manager closes the file, which
    # the bare open() call used here before never did.
    with open('vocabjjngu.txt', mode='r') as file:
        v = [line.rstrip() for line in file]
    # NOTE(review): the original author tagged this check "not work" — confirm
    # whether the text dump is still expected to match the pickled vocabulary.
    print('Vocab Size check', len(v))

Vocab Size check 393


In [12]:
# The vocabulary object itself is persisted with pickle: loaded from disk in
# the normal case, written out when it has just been rebuilt.
if not RENEW:
    with open('vocabjjngu.atikeep', 'rb') as handle:
        vocab = pickle.load(handle)
else:
    with open('vocabjjngu.atikeep', 'wb') as handle:
        pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)


## 3. Prepare the batch loader

### Prepare data

Given "Chaky loves eating at AIT", and "I really love deep learning", and given batch size = 3, we will get three batches of data "Chaky loves eating at", "AIT `<eos>` I really", "love deep learning `<eos>`".  

In [13]:
def get_data(dataset, vocab, batch_size):
    """Numericalize ``dataset`` into one long stream and fold it into rows.

    Every example gets an ``'<eos>'`` token appended so the model can learn
    where a sequence ends. All token ids are concatenated, the tail that does
    not fill a complete column is dropped, and the stream is reshaped to
    ``batch_size`` rows.

    Args:
        dataset: Iterable of token sequences.
        vocab: Mapping-like object; ``vocab[token]`` must return the token id.
        batch_size: Number of rows in the returned tensor.

    Returns:
        A ``torch.LongTensor`` of shape ``[batch_size, num_tokens // batch_size]``.
    """
    data = []
    for example in dataset:
        # Build a new list instead of calling example.append(): append returns
        # None and silently modified the caller's sequence.
        tokens = list(example) + ['<eos>']
        # numericalize
        data.extend(vocab[token] for token in tokens)
    data = torch.LongTensor(data)
    num_batches = data.shape[0] // batch_size
    # Drop the remainder so the stream divides evenly into batch_size rows.
    data = data[:num_batches * batch_size]
    return data.view(batch_size, num_batches)

In [14]:
batch_size = 12
# Consume the training stream first, then the evaluation stream, turning each
# into a [batch_size, tokens-per-row] tensor of token ids.
train_data, valid_data = (
    get_data(token_stream, vocab, batch_size)
    for token_stream in (tokenized_dataset_train, tokenized_dataset_test)
)
# test_data  = get_data(tokenized_dataset['test'], vocab, batch_size)

In [15]:
# Displays [batch_size, number of tokens in each row].
train_data.size()

torch.Size([12, 232160])

In [16]:
# Special tokens, listed in the order of the indices they are given below.
special_symbols = ['<unk>', '<pad>', '<sos>', '<eos>']
UNK_IDX, PAD_IDX, SOS_IDX, EOS_IDX = range(len(special_symbols))

In [17]:
# # helper function to yield list of tokens
# # here data can be `train` or `val` or `test`
# from torchtext.data.utils import get_tokenizer
# token_transform = get_tokenizer('spacy', language='en_core_web_sm')
# def yield_tokens(data):
#     for data_sample in data:
#         yield token_transform(data_sample) #either first or second index

In [18]:
# from torchtext.vocab import build_vocab_from_iterator

#     # Create torchtext's Vocab object 
# vocab_transform = build_vocab_from_iterator(yield_tokens(train_data), 
#                                                     min_freq=2,   #if not, everything will be treated as UNK
#                                                     specials=special_symbols,
#                                                     special_first=True) #indicates whether to insert symbols at the beginning or at the end                                            
# # Set UNK_IDX as the default index. This index is returned when the token is not found. 
# # If not set, it throws RuntimeError when the queried token is not found in the Vocabulary. 

# vocab_transform.set_default_index(UNK_IDX)

## 3.1. Preparing the dataloader

In [48]:
# from torch.nn.utils.rnn import pad_sequence
# from torch.utils.data import DataLoader

# BATCH_SIZE = 12
# UNK_IDX, PAD_IDX, SOS_IDX, EOS_IDX = 0, 1, 2, 3

# # helper function to club together sequential operations
# def sequential_transforms(*transforms):
#     def func(txt_input):
#         for transform in transforms:
#             txt_input = transform(txt_input)
#         return txt_input
#     return func

# # function to add BOS/EOS and create tensor for input sequence indices
# def tensor_transform(token_ids):
#     return torch.cat((torch.tensor([SOS_IDX]), 
#                       torch.tensor(token_ids), 
#                       torch.tensor([EOS_IDX])))

# # src and trg language text transforms to convert raw strings into tensors indices
# # text_transform = {}
# # for ln in [SRC_LANGUAGE, TRG_LANGUAGE]:
# text_transform = sequential_transforms(tokenizer, #Tokenization
#                                         vocab, #Numericalization
#                                         tensor_transform) # Add BOS/EOS and create tensor


# # function to collate data samples into batch tesors
# def collate_batch(batch):
#     src_batch, src_len_batch = [], []
#     for src_sample in batch:
#         processed_text = text_transform(src_sample.rstrip("\n"))
#         src_batch.append(processed_text)
#         # trg_batch.append(text_transform(trg_sample.rstrip("\n")))
#         src_len_batch.append(processed_text.size(0))

#     src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
#     # trg_batch = pad_sequence(trg_batch, padding_value=PAD_IDX)
#     return src_batch, torch.tensor(src_len_batch, dtype=torch.int64)#,# trg_batch

Create train, val, and test dataloaders

In [57]:
batch_size = 26

# NOTE(review): as far as this notebook shows, `DataLoader` and
# `collate_batch` only exist in the commented-out cell above, so this
# statement raises NameError when the notebook is run top to bottom. It looks
# like a leftover from an earlier experiment — confirm before relying on it.
train_loader = DataLoader(train_set[train_total:2*train_total], batch_size=batch_size,
                              shuffle=True, collate_fn=collate_batch)
# valid_loader = DataLoader(val, batch_size=batch_size,
#                                shuffle=True, collate_fn=collate_batch)
# test_loader  = DataLoader(test, batch_size=batch_size,
#                              shuffle=True, collate_fn=collate_batch)

Let's test the train loader.

In [62]:
# for en,baz in train_loader:
#     break

In [59]:
# print("first shape: ", en.shape)  # (seq len, batch_size)
# print("sec shape: ", baz.shape)   # (seq len, batch_size)

English shape:  torch.Size([75, 26])
German shape:  torch.Size([26])


## 4. Modeling 

In [19]:
import torch, torchdata, torchtext
from torch import nn
import torch.nn.functional as F

import random, math, time


In [20]:
# class Attention(nn.Module):
#     def __init__(self, hid_dim):
#         super().__init__()
        
#         self.v = nn.Linear(hid_dim, 1, bias = False)
#         self.W = nn.Linear(hid_dim,     hid_dim) #for decoder
#         self.U = nn.Linear(hid_dim * 2, hid_dim) #for encoder outputs
                
#     def forward(self, hidden, encoder_outputs, mask):
        
#         #hidden = [batch size, hid dim]
#         #encoder_outputs = [src len, batch size, hid dim * 2]
        
#         batch_size = encoder_outputs.shape[1]
#         src_len = encoder_outputs.shape[0]
        
#         #repeat decoder hidden state src_len times
#         hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
#         #hidden = [batch size, src len, hid dim]

#         encoder_outputs = encoder_outputs.permute(1, 0, 2)
#         #encoder_outputs = [batch size, src len, hid dim * 2]
        
#         energy = torch.tanh(self.W(hidden) + self.U(encoder_outputs))
#         #energy = [batch size, src len, hid dim]
        
#         attention = self.v(energy).squeeze(2)
#         #attention = [batch size, src len]
        
#         #use masked_fill_ if you want in-place
#         attention = attention.masked_fill(mask, -1e10)
        
#         return F.softmax(attention, dim = 1)



class Attention(nn.Module):
    """Score hidden states with a one-hidden-layer network and normalize them.

    The score of each row of ``hidden`` is ``v(tanh(W(hidden)))``; the scores
    are then passed through a softmax over dimension 0.
    """

    def __init__(self, hid_dim):
        """Create the scoring layers for hidden states of size ``hid_dim``."""
        super().__init__()

        self.v = nn.Linear(hid_dim, 1, bias=False)
        self.W = nn.Linear(hid_dim, hid_dim)

    def forward(self, hidden, mask=None):
        """Return softmax-normalized scores for ``hidden``.

        Args:
            hidden: Tensor whose last dimension is ``hid_dim``
                (documented by the author as ``[batch size, hid dim]``).
            mask: Accepted for interface compatibility but currently ignored;
                the author left masking disabled because of an open bug.

        Returns:
            Tensor shaped like ``hidden`` with the last dimension reduced to 1.
            NOTE(review): the softmax runs over dimension 0, which is the batch
            axis for the documented input shape — confirm this is intended.
        """
        # The statements that used to follow the early return were unreachable
        # and computed exactly this expression, so only one copy is kept.
        energy = torch.tanh(self.W(hidden))
        attention = self.v(energy)
        return F.softmax(attention, dim=0)


In [21]:
class Decoder(nn.Module):
    """One-step GRU decoder that mixes the token embedding with an attention-weighted context."""

    def __init__(self, output_dim, emb_dim, hid_dim, dropout, attention,device):
        """Build the embedding, GRU and output projection.

        The GRU consumes the embedding concatenated with a context vector of
        size ``hid_dim * 2``; the output layer sees GRU output, context and
        embedding concatenated.
        """
        super().__init__()
        self.output_dim, self.hid_dim, self.device = output_dim, hid_dim, device
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        context_dim = hid_dim * 2
        self.gru = nn.GRU(context_dim + emb_dim, hid_dim)
        self.fc = nn.Linear(context_dim + hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs, mask):
        """Run a single decoding step.

        Shapes below follow the author's notes: ``input`` is ``[batch size]``
        and ``hidden`` is ``[batch size, hid dim]``.

        The ``encoder_outputs`` argument is ignored: it is replaced by a
        tensor of ones sized from the attention weights, because no encoder
        exists in this model.

        Returns:
            ``(prediction, hidden, a)``: the vocabulary scores, the new hidden
            state with its leading axis squeezed, and the attention weights.
        """
        # Add the sequence axis expected by the GRU, then embed with dropout.
        step_tokens = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(step_tokens))

        # Attention weights with an extra axis so they can be used in bmm.
        a = self.attention(hidden, mask).unsqueeze(1)

        # Stand-in for encoder states: all ones, sized from the weights.
        placeholder_shape = (a.size(0), a.size(1), self.hid_dim * 2)
        encoder_outputs = torch.ones(placeholder_shape).to(self.device)

        # Weighted context, moved to sequence-first layout for the GRU.
        weighted = torch.bmm(a, encoder_outputs).permute(1, 0, 2)

        rnn_input = torch.cat((embedded, weighted), dim = 2)
        output, hidden = self.gru(rnn_input, hidden.unsqueeze(0))

        # With one step, one layer and one direction the GRU output and its
        # final hidden state must coincide.
        assert (output == hidden).all()

        features = torch.cat(
            (output.squeeze(0), weighted.squeeze(0), embedded.squeeze(0)),
            dim = 1,
        )
        prediction = self.fc(features)

        return prediction, hidden.squeeze(0), a.squeeze(1)

In [22]:
class BeamSearchNode(object):
    """One step of a beam-search hypothesis, linked back to the step before it."""

    def __init__(self, hiddenstate, previousNode, wordId, logProb, length):
        # decoder hidden state at this step / the node this one extends
        self.h, self.prevNode = hiddenstate, previousNode
        # numericalized word, accumulated log probability, hypothesis length
        # (the first word starts at length 1)
        self.wordid, self.logp, self.len = wordId, logProb, length

    def eval(self, alpha=0.7):
        """Return the log probability divided by ``length ** alpha``.

        A tiny constant is added to the length to avoid a division error;
        see https://arxiv.org/abs/1808.10006 for how alpha is chosen.
        """
        length_penalty = float(self.len + 1e-6) ** (alpha)
        return self.logp / length_penalty

    # Ordering is by hypothesis length; PriorityQueue falls back to it when
    # two queue entries carry the same score.
    def __lt__(self, other):
        return other.len > self.len

    def __gt__(self, other):
        return other.len < self.len

In [23]:
from queue import PriorityQueue
import operator

class Seq2SeqBeam(nn.Module):
    """Wrap a step-wise decoder with a training pass, greedy decoding and beam search.

    No encoder is stored: ``forward`` starts from the zero hidden state made by
    ``init_hidden`` and hands the decoder an empty ``encoder_outputs`` list.

    NOTE(review): ``decode`` still calls ``self.encoder``, which ``__init__``
    never assigns, so it raises AttributeError as written.
    """

    def __init__(self, encoder, decoder, src_pad_idx, device):
        """Keep the decoder, the padding index and the device.

        ``encoder`` is accepted so existing call sites keep working, but it is
        not stored.
        """
        super().__init__()

        # self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.device = device

    def create_mask(self, src):
        """Return a boolean tensor that is True where ``src`` is not padding.

        The two axes of ``src`` are swapped so the result lines up with the
        layout the attention weights are documented to have.
        """
        mask = (src != self.src_pad_idx).permute(1, 0)
        return mask

    def forward(self, src, src_len, trg, teacher_forcing_ratio = 0):
        """Decode ``trg`` step by step and collect predictions and attentions.

        Shapes per the author's notes: ``src`` is ``[src len, batch size]`` and
        ``trg`` is ``[trg len, batch size]``. ``src_len`` is not used.

        Args:
            teacher_forcing_ratio: Probability of feeding the true next token
                instead of the model's own prediction at each step.

        Returns:
            ``(outputs, attentions)``. Row 0 of both tensors stays zero because
            decoding starts at step 1.
        """
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        # tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

        # tensor to store attention outputs from the decoder
        attentions = torch.zeros(trg_len, batch_size, src.shape[0]).to(self.device)

        # There is no encoder: start from zeros and pass an empty placeholder.
        # encoder_outputs, hidden = self.encoder(src, src_len)
        encoder_outputs,hidden  = [],self.init_hidden(src)

        # first input to the decoder is the first row of trg (the <sos> tokens)
        input_ = trg[0,:]

        mask = self.create_mask(src)

        for t in range(1, trg_len):
            output, hidden, attention = self.decoder(input_, hidden, encoder_outputs, mask)

            # place predictions and attentions in their holding tensors
            outputs[t] = output
            attentions[t] = attention

            # decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio

            # get the highest predicted token from our predictions
            top1 = output.argmax(1)

            # if teacher forcing, use actual next token as next input
            # if not, use predicted token
            input_ = trg[t] if teacher_force else top1

        return outputs, attentions

    def decode(self, src, src_len, trg, method='beam-search'):
        """Run inference with beam search (default) or greedy decoding.

        NOTE(review): ``self.encoder`` is not assigned anywhere in this class,
        so the first statement fails with AttributeError until an encoder (or
        an encoder-free initial state) is wired in.
        """
        encoder_outputs, hidden = self.encoder(src, src_len)

        hidden = hidden.unsqueeze(0)
        # hidden = [1, batch size, hid dim] per the author's notes

        if method == 'beam-search':
            return self.beam_decode(src, trg, hidden, encoder_outputs)
        else:
            return self.greedy_decode(trg, hidden, encoder_outputs)

    def greedy_decode(self, trg, decoder_hidden, encoder_outputs, mask=None):
        """Decode by always feeding back the highest-scoring token.

        Args:
            trg: ``[trg len, batch size]`` tensor; only its shape and first row
                (the start tokens) are used.
            decoder_hidden: Initial hidden state, with or without a leading
                axis of size 1.
            encoder_outputs: Passed through to the decoder unchanged.
            mask: Optional attention mask passed through to the decoder.

        Returns:
            ``[batch size, trg len]`` tensor of decoded token ids (column 0 is
            left at zero).
        """
        trg_len, batch_size = trg.shape
        decoded_outputs = torch.zeros((trg_len, batch_size)).to(self.device)

        # The Decoder defined in this notebook adds the sequence axis to both
        # its input and its hidden state itself, so hand it the token vector
        # as-is and strip a leading hidden axis if the caller supplied one.
        # Previously the call also omitted the mask argument and failed.
        if decoder_hidden.dim() == 3:
            decoder_hidden = decoder_hidden.squeeze(0)

        # First input to the decoder is the <sos> tokens
        input_ = trg[0, :]
        for t in range(1, trg_len):
            output, decoder_hidden, _ = self.decoder(input_, decoder_hidden, encoder_outputs, mask)
            decoded = output.argmax(1)
            decoded_outputs[t] = decoded
            input_ = decoded

        # Transpose the output from [trg_len, batch_size] to [batch_size, trg_len]
        return decoded_outputs.permute(1, 0)

    def beam_decode(self, src_tensor, target_tensor, decoder_hiddens, encoder_outputs=None):
        """Beam-search decode every sentence of the batch, one at a time.

        Shapes per the author's notes: ``src_tensor`` ``[src len, batch size]``,
        ``target_tensor`` ``[trg len, batch size]``, ``decoder_hiddens``
        ``[1, batch size, hid dim]``. Relies on the module-level names
        ``SOS_IDX``, ``EOS_IDX`` and ``BeamSearchNode``.

        Returns:
            A list with one entry per sentence; each entry is a list of
            decoded utterances, and each utterance is a list of word-id tensors.
        """
        target_tensor = target_tensor.permute(1, 0)
        # target_tensor = [batch size, trg len]

        # how many parallel searches
        beam_width = 3

        # how many sentences to generate
        topk = 1

        # final generated sentences
        decoded_batch = []

        # Beam search runs sentence by sentence, so each tensor is indexed
        # down to a batch of one while keeping a batch axis.
        for idx in range(target_tensor.size(0)):

            decoder_hidden = decoder_hiddens[:, idx, :]

            # encoder_outputs is optional; only slice it when it is a tensor.
            if torch.is_tensor(encoder_outputs):
                encoder_output = encoder_outputs[:, idx, :].unsqueeze(1)
            else:
                encoder_output = encoder_outputs

            mask = self.create_mask(src_tensor[:, idx].unsqueeze(1))

            # Start with the start-of-sentence token, on this model's device
            # (the global ``device`` was used here before).
            decoder_input = torch.LongTensor([SOS_IDX]).to(self.device)

            # Finished hypotheses, kept so they can be back-tracked later.
            endnodes = []
            # endnodes is empty here, so the old min(topk + 1, topk - 0) was
            # always topk.
            number_required = topk

            # starting node -  hidden vector, previous node, word id, logp, length
            node = BeamSearchNode(decoder_hidden, None, decoder_input, 0, 1)
            nodes = PriorityQueue()  # min-heap, hence the negated scores

            nodes.put((-node.eval(), node))
            qsize = 1

            while True:
                # give up when decoding takes too long
                if qsize > 2000: break

                # fetch the best node
                score, n = nodes.get()

                decoder_input  = n.wordid
                decoder_hidden = n.h

                if n.wordid.item() == EOS_IDX and n.prevNode is not None:
                    endnodes.append((score, n))
                    # stop once enough sentences have been completed
                    if len(endnodes) >= number_required:
                        break
                    else:
                        continue

                # decode for one step using the decoder
                prediction, decoder_hidden, _ = self.decoder(decoder_input, decoder_hidden, encoder_output, mask)

                # The decoder returns unnormalized scores; convert them to log
                # probabilities so that summing them along a hypothesis and
                # length-normalizing the sum is meaningful.
                log_prob, indexes = torch.topk(torch.log_softmax(prediction, dim=1), beam_width)

                nextnodes = []  # candidate continuations of n

                for top in range(beam_width):
                    pred_t = indexes[0, top].reshape(-1)  # same shape as the SOS input
                    log_p  = log_prob[0, top].item()

                    # decoder hidden, previous node, current node, prob, length
                    node = BeamSearchNode(decoder_hidden, n, pred_t, n.logp + log_p, n.len + 1)
                    nextnodes.append((-node.eval(), node))

                # put them into the queue (one node was consumed above)
                for entry in nextnodes:
                    nodes.put(entry)
                qsize += len(nextnodes) - 1

            # If nothing finished, fall back to the best unfinished nodes.
            if len(endnodes) == 0:
                endnodes = [nodes.get() for _ in range(topk)]

            # Walk each finished hypothesis back to the start node.
            utterances = []
            for score, n in sorted(endnodes, key=operator.itemgetter(0)):
                utterance = [n.wordid]
                while n.prevNode is not None:
                    n = n.prevNode
                    utterance.append(n.wordid)

                utterances.append(utterance[::-1])  # restore forward order

            decoded_batch.append(utterances)

        return decoded_batch

    def init_hidden(self,src):
        """Return a zero hidden state of shape ``[src.size(0), decoder.hid_dim]`` on ``self.device``."""
        hidden = torch.zeros( src.size(0),self.decoder.hid_dim).to(self.device)
        return hidden

In [24]:
def initialize_weights(m):
    """Re-initialize every parameter reachable from ``m``.

    Parameters whose name contains ``'weight'`` are drawn from a normal
    distribution (mean 0, std 0.01); every other parameter is set to 0.
    """
    for name, param in m.named_parameters():
        if 'weight' not in name:
            nn.init.constant_(param.data, 0)
            continue
        nn.init.normal_(param.data, mean=0, std=0.01)

In [25]:
# Model hyper-parameters. Input and output share the same vocabulary.
input_dim   = len(vocab)
output_dim  = len(vocab)
emb_dim     = 256  
hid_dim     = 512  
dropout     = 0.5
SRC_PAD_IDX = PAD_IDX

# The modules are built in this order on purpose: construction consumes random
# numbers, so reordering would change the initial weights for a fixed seed.
attn = Attention(hid_dim)
# enc  = Encoder(input_dim,  emb_dim,  hid_dim, dropout)
dec  = Decoder(output_dim, emb_dim,  hid_dim, dropout, attn,device)

# No encoder exists, so an empty string is passed in its place; Seq2SeqBeam
# does not store that argument.
model = Seq2SeqBeam('', dec, SRC_PAD_IDX, device).to(device)
# apply() calls initialize_weights on every sub-module and on the model itself.
model.apply(initialize_weights)

Seq2SeqBeam(
  (decoder): Decoder(
    (attention): Attention(
      (v): Linear(in_features=512, out_features=1, bias=False)
      (W): Linear(in_features=512, out_features=512, bias=True)
    )
    (embedding): Embedding(393, 256)
    (gru): GRU(1280, 512)
    (fc): Linear(in_features=1792, out_features=393, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [26]:
# class LSTMLanguageModel(nn.Module):
#     def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
                
#         super().__init__()
#         self.hid_dim = hid_dim
#         self.num_layers = num_layers
#         self.embedding = nn.Embedding(vocab_size,emb_dim)
#         self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, 
#                                         dropout = dropout_rate, batch_first = True)
#         self.dropout = nn.Dropout(dropout_rate)
#         #when you do LM, you look forward, so it does not make sense to do bidirectional
#         self.fc = nn.Linear(hid_dim,vocab_size)

#     def init_hidden(self, batch_size, device):
#         #this function gonna be run in the beginning of the epoch
#         hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
#         cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)

#         return hidden, cell #return as tuple

#     def detach_hidden(self, hidden):
#         hidden, cell = hidden
#         hidden = hidden.detach() #removing this hidden from gradients graph
#         cell =  cell.detach() #removing this hidden from gradients graph
#         return hidden, cell

#     def forward(self, src, hidden):
#         #src: [batch_size, seq_len]

#         #embed 
#         embedded = self.embedding(src)
#         #embed : [batch_size, seq_len, emb_dim]

#         #send this to the lstm
#         #we want to put hidden here... because we want to reset hidden .....
#         output, hidden = self.lstm(embedded, hidden)
#         #output : [batch_size, seq_len, hid_dim] ==> all hidden states
#         #hidden : [batch_size, seq_len, hid_dim] ==> last hidden states from each layer

#         output = self.dropout(output)
#         prediction = self.fc(output)
#         #prediction: [batch size, seq_len, vocab_size]
#         return prediction, hidden

## 5. Training 

This follows a very basic procedure.  One note is that some of the sequences fed to the model may contain parts of different sequences from the original dataset, or be a subset of one (depending on the decoding length). For this reason we reset the hidden state every epoch; this is like assuming that the next batch of sequences is probably always a follow-up to the previous one in the original dataset.

In [27]:
# vocab_size = len(vocab)
# emb_dim = 400                # 400 in the paper
# hid_dim = 1150               # 1150 in the paper
# num_layers = 3                # 3 in the paper
# dropout_rate = 0.5           
# Learning rate handed to the optimizer.
lr = 0.001

In [28]:
# model = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
# Cross entropy (softmax included) that skips targets equal to PAD_IDX, and
# Adam over every parameter of the model.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = optim.Adam(model.parameters(), lr=lr)

# Count only the parameters that will receive gradients.
num_params = sum(
    param.numel()
    for param in model.parameters()
    if param.requires_grad
)
print(f'The model has {num_params:,} trainable parameters')

The model has 3,824,009 trainable parameters


In [29]:
def get_batch(data, seq_len, idx):
    """Slice one training window out of the tensor produced by ``get_data``.

    Returns ``(src, target)`` where ``src`` covers columns
    ``idx .. idx + seq_len - 1`` and ``target`` is the same window shifted one
    column to the right, i.e. the next token for every position of ``src``.
    """
    stop = idx + seq_len
    return data[:, idx:stop], data[:, idx + 1:stop + 1]

In [30]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    """Run one training epoch over `data` and return the mean loss.

    The token matrix is walked left to right in windows of `seq_len` columns.
    For each window the model is run, the loss is back-propagated, gradients
    are clipped and the optimizer takes one step.

    Args:
        model: network to train; called as ``model(src, 0, target)`` and
            expected to return ``(prediction, hidden)``.
        data: token matrix whose last dimension is walked in windows; it is
            sliced as ``data[:, a:b]`` by get_batch.
        optimizer: optimizer bound to the model parameters.
        criterion: loss called as ``criterion(prediction, target)`` on the
            flattened prediction and target.
        batch_size: unused; kept so existing callers keep working.  The row
            count is read from each batch instead.
        seq_len: number of columns per window.
        clip: maximum total gradient norm.
        device: device the windows are moved to before the forward pass.

    Returns:
        Sum of (window loss * seq_len) divided by the number of target
        positions that were actually scored; 0.0 when `data` is too short to
        form a single window.
    """
    epoch_loss = 0
    model.train()

    # Trim trailing columns so that (num_tokens - 1) is a multiple of seq_len:
    # every window then has a full-length source AND a full-length target.
    num_tokens = data.shape[-1]
    data = data[:, :num_tokens - (num_tokens - 1) % seq_len]

    # The first column is never a target, so only num_tokens - 1 positions
    # are scored.  Dividing by the full column count (as before) biased the
    # reported loss low.
    num_targets = data.shape[-1] - 1
    if num_targets <= 0:
        return 0.0

    for idx in tqdm(range(0, num_targets, seq_len), desc='Training: ', leave=False):
        optimizer.zero_grad()

        src, target = get_batch(data, seq_len, idx)  # src, target: [batch size, seq len]
        src, target = src.to(device), target.to(device)
        n_rows = src.shape[0]

        # NOTE(review): the state returned here is never passed to the forward
        # call below.  The call is kept only in case init_hidden has side
        # effects on the model -- confirm and drop it if it has none.
        model.init_hidden(src)
        # NOTE(review): the meaning of the constant 0 is defined by the model's
        # forward(), which is not visible from here.
        prediction, _ = model(src, 0, target)

        prediction = prediction.reshape(n_rows * seq_len, -1)  # [batch size * seq len, vocab size]
        target = target.reshape(-1)
        loss = criterion(prediction, target)

        loss.backward()
        # Rescale the gradients so their total norm never exceeds `clip`
        # (guards against exploding gradients).
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_targets

In [33]:
def evaluate(model, data, criterion, batch_size, seq_len, device):
    """Compute the mean loss of `model` over `data` without updating it.

    The token matrix is walked left to right in windows of `seq_len` columns
    with gradient tracking disabled and the model in eval mode.

    Args:
        model: network to score; called as ``model(src, 0, target)`` and
            expected to return ``(prediction, hidden)``.
        data: token matrix whose last dimension is walked in windows; it is
            sliced as ``data[:, a:b]`` by get_batch.
        criterion: loss called as ``criterion(prediction, target)`` on the
            flattened prediction and target.
        batch_size: unused; kept so existing callers keep working.  The row
            count is read from each batch instead.
        seq_len: number of columns per window.
        device: device the windows are moved to before the forward pass.

    Returns:
        Sum of (window loss * seq_len) divided by the number of target
        positions that were actually scored; 0.0 when `data` is too short to
        form a single window.
    """
    epoch_loss = 0
    model.eval()

    # Trim trailing columns so that (num_tokens - 1) is a multiple of seq_len:
    # every window then has a full-length source AND a full-length target.
    num_tokens = data.shape[-1]
    data = data[:, :num_tokens - (num_tokens - 1) % seq_len]

    # The first column is never a target, so only num_tokens - 1 positions
    # are scored.  Dividing by the full column count (as before) biased the
    # reported loss low.
    num_targets = data.shape[-1] - 1
    if num_targets <= 0:
        return 0.0

    with torch.no_grad():
        for idx in range(0, num_targets, seq_len):
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            n_rows = src.shape[0]

            # NOTE(review): the meaning of the constant 0 is defined by the
            # model's forward(), which is not visible from here.
            prediction, _ = model(src, 0, target)
            prediction = prediction.reshape(n_rows * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_targets

In [34]:
n_epochs = 5
seq_len = 12
clip = 0.25

# patience=0: the learning rate is halved as soon as an epoch fails to improve
# the validation loss.
lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

best_valid_loss = float('inf')

for epoch in range(n_epochs):
    train_loss = train(
        model=model,
        data=train_data,
        optimizer=optimizer,
        criterion=criterion,
        batch_size=batch_size,
        seq_len=seq_len,
        clip=clip,
        device=device,
    )
    valid_loss = evaluate(
        model=model,
        data=valid_data,
        criterion=criterion,
        batch_size=batch_size,
        seq_len=seq_len,
        device=device,
    )

    lr_scheduler.step(valid_loss)

    # Checkpoint only when the validation loss reaches a new minimum.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'predictor_weight.pt')

    # Perplexity is exp(loss).
    print(
        f'\tepoch: {epoch+1}',
        f'\tTrain Perplexity: {math.exp(train_loss):.3f}',
        f'\tValid Perplexity: {math.exp(valid_loss):.3f}',
        sep='\n',
    )

                                                               

	epoch: 1
	Train Perplexity: 7.729
	Valid Perplexity: 9.533


                                                               

	epoch: 2
	Train Perplexity: 7.736
	Valid Perplexity: 9.545


                                                               

	epoch: 3
	Train Perplexity: 7.705
	Valid Perplexity: 9.466


                                                             

KeyboardInterrupt: 

## 7. Real-world inference

Here we take the prompt, tokenize it, encode it, and feed it into the model to get the predictions.  We then apply softmax to the output at the last position in the sequence, which represents the prediction for the next word.  Before the softmax, we divide the logits by a temperature value, which alters the model's confidence by sharpening or flattening the probability distribution.

Once we have the softmax distribution, we randomly sample from it to predict the next word. If we get `<unk>`, we sample again.  Once we get `<eos>`, we stop predicting.
    
In the last lines, we decode the predicted indices back to strings.

In [35]:
# Restore the checkpoint stored in 'predictor_weight.pt'.  map_location remaps
# the saved tensors onto the current `device`, so weights written on a GPU
# machine also load on a CPU-only one.
model.load_state_dict(torch.load('predictor_weight.pt', map_location=device))

<All keys matched successfully>

In [45]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=3407):
    """Autoregressively sample a continuation of `prompt` from `model`.

    At every step the whole sequence so far is fed to the model, the logits
    of the last position are divided by `temperature` and soft-maxed, and one
    token id is sampled.  `<unk>` is never emitted and `<eos>` ends the
    generation.

    Args:
        prompt: text to continue.
        max_seq_len: maximum number of tokens to sample.
        temperature: positive divisor applied to the logits before softmax.
        model: called as ``model(src, 0, src * 3)``; must return
            ``(prediction, hidden)`` with prediction indexable as ``[:, -1]``.
        tokenizer: callable turning `prompt` into a list of tokens.
        vocab: mapping token -> index that also provides ``get_itos()``.
        device: device the input indices are moved to.
        seed: if not None, passed to ``torch.manual_seed`` before sampling.

    Returns:
        List of token strings: the prompt tokens followed by the sampled
        tokens (`<eos>` itself is not included).

    Raises:
        ValueError: if `temperature` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError(f'temperature must be positive, got {temperature}')
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()

    indices = [vocab[token] for token in tokenizer(prompt)]
    # Special-token ids do not change between steps; look them up once.
    unk_id, eos_id = vocab['<unk>'], vocab['<eos>']

    with torch.no_grad():
        for _step in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)

            # NOTE(review): the model's forward() is not visible here; the
            # constant 0 and the `src * 3` third argument are passed through
            # exactly as before.
            prediction, _hidden = model(src, 0, src * 3)

            # prediction: [batch size, seq len, vocab size]
            # prediction[:, -1]: [batch size, vocab size] -> scores for the next token
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)
            next_id = torch.multinomial(probs, num_samples=1).item()

            if next_id == unk_id:
                # Redraw with <unk> removed.  This samples from the same
                # distribution as "reject until not <unk>", but cannot spin
                # forever when (almost) all mass sits on <unk>.
                probs[..., unk_id] = 0
                if probs.sum().item() <= 0:
                    break  # the model can only produce <unk>: stop here
                next_id = torch.multinomial(probs, num_samples=1).item()

            if next_id == eos_id:  # end of sequence: stop generating
                break

            indices.append(next_id)  # autoregressive: the output becomes input

    itos = vocab.get_itos()
    return [itos[i] for i in indices]

In [47]:
prompt = 'generate'
max_seq_len = 30
seed = 0

# Softmax temperatures to compare.  Dividing the logits by a smaller value
# sharpens the distribution (likely tokens get an even higher chance of being
# sampled); values closer to 1.0 leave it flatter, i.e. more diverse output.
temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]

for temperature in temperatures:
    generation = generate(
        prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed
    )
    print(f"{temperature}\n{' '.join(generation)}\n")

0.5
<unk> line run_setting str learning latency 50hertz value epoch in renewable how='outer signature dense m1 sgd measure determine index sheet add need client setting image notebook want coordinate function analysis product

0.7
<unk> line run_setting str learning latency 50hertz value epoch in renewable how='outer signature dense m1 sgd measure determine index sheet add need client setting image notebook want coordinate function analysis product

0.75
<unk> line run_setting str learning latency 50hertz value epoch in renewable how='outer signature dense m1 sgd measure determine index sheet add need client setting image notebook want coordinate function analysis product

0.8
<unk> line run_setting str learning latency 50hertz value epoch in renewable how='outer signature dense m1 sgd measure determine index sheet add need client setting image notebook want coordinate function analysis product

1.0
<unk> line run_setting str learning latency 50hertz value epoch in renewable how='outer