# Recurrent Neural Networks and Language Models

In [34]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext, math
import datasets
from tqdm import tqdm
import pickle

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [3]:
#make our work comparable if restarted the kernel
SEED = 122
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Load Data
Source Credit: https://www.kaggle.com/datasets/moxxis/harry-potter-lstm

The dataset is taken from kaggle. The dataset is already clean enough as there are no extra spaces or weird characters.

In [4]:
from datasets import load_dataset_builder, Dataset

harry_potter_dataset = "./data/Harry_Potter_Books.txt"

# Read the data from the file
with open(harry_potter_dataset, 'r') as f:
    data = f.read()

# Creating list of dictionaries
data = data.split(" .")
data = [{"text": row} for row in data]

# Creating dataset object
dataset = Dataset.from_list(data)
dataset

Dataset({
    features: ['text'],
    num_rows: 67785
})

In [None]:
from datasets import DatasetDict

train_test = dataset.train_test_split(test_size=0.2)

# 10% test set and 10% validation set
train_test_valid = train_test['test'].train_test_split(test_size=0.5)

dataset = DatasetDict({
    'train': train_test['train'],
    'test': train_test_valid['test'],
    'validation': train_test_valid['train']})

dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 54228
    })
    test: Dataset({
        features: ['text'],
        num_rows: 6779
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 6778
    })
})

In [11]:
print(dataset['train'][333]['text']) # empty string

'''
If you try to change the index you might notice that sometimes there is no paragraph 
and rather an empty string so we will have to care of that later.
'''




'\nIf you try to change the index you might notice that sometimes there is no paragraph \nand rather an empty string so we will have to care of that later.\n'

# Preprocessing

## Tokenizing

In [17]:
from torchtext.data.utils import get_tokenizer



In [18]:
tokenizer = get_tokenizer('basic_english')

#function to tokenize
tokenize_data = lambda example, tokenizer: {'tokens': tokenizer(example['text'])}  

#map the function to each example
tokenized_dataset = dataset.map(tokenize_data, remove_columns=['text'], fn_kwargs={'tokenizer': tokenizer})
print(tokenized_dataset['train'][33]['tokens'])

Map: 100%|██████████| 54228/54228 [00:02<00:00, 20048.07 examples/s]
Map: 100%|██████████| 6779/6779 [00:00<00:00, 20008.48 examples/s]
Map: 100%|██████████| 6778/6778 [00:00<00:00, 19637.77 examples/s]

['but', 'when', 'the', 'dormitory', 'door', 'closed', 'behind', 'ron', 'harry', 'made', 'no', 'effort', 'to', 'speed', 'up', 'his', 'packing']





## Numericializing

In [20]:
from torchtext.vocab import build_vocab_from_iterator



In [21]:
## numericalizing
vocab = build_vocab_from_iterator(tokenized_dataset['train']['tokens'], 
min_freq=3) 
vocab.insert_token('<unk>', 0)           
vocab.insert_token('<eos>', 1)            
vocab.set_default_index(vocab['<unk>'])   
print(len(vocab))                         
print(vocab.get_itos()[:10])    

11082
['<unk>', '<eos>', 'the', 'and', 'to', 'of', 'a', 'he', 'harry', 'was']


In [36]:
with open('model/vocab_lm.pkl', 'wb') as f:
    pickle.dump(vocab, f)

After loading and splitting the personally chosen dataset a DatasetDictonary is created. Then on that object the preprocessing steps are applied. Firstly, we tokenize the dataset using torchtext's `get_tokenizer`. The `tokenize_data` function is applied to each example where the `text` column is removed and a new `tokens` column containing the tokenized text is added.

Then our vocabulary is made using the `build_vocab_from_iterator` method from torchtext. We use the training dataset, and consider words that has occured at least three times. This is done to make sure that our vocab does not get too big. Then we add `<unk>` to signify unknown and `<eos>` to signify end of sentence. After all this the vocab size came out to be 11082.

## Prepare the batch loader

### Prepare data

In [22]:
def get_data(dataset, vocab, batch_size):
    data = []                                                   
    for example in dataset:
        if example['tokens']:         
            #appends eos so we know it ends....so model learn how to end...                             
            tokens = example['tokens'].append('<eos>')   
            #numericalize          
            tokens = [vocab[token] for token in example['tokens']] 
            data.extend(tokens)                                    
    data = torch.LongTensor(data)                                 
    num_batches = data.shape[0] // batch_size #get the int number of batches...
    data = data[:num_batches * batch_size] #make the batch evenly, and cut out any remaining                      
    data = data.view(batch_size, num_batches)          
    return data #[batch size, bunch of tokens]

In [23]:
batch_size = 128
train_data = get_data(tokenized_dataset['train'], vocab, batch_size)
valid_data = get_data(tokenized_dataset['validation'], vocab, batch_size)
test_data  = get_data(tokenized_dataset['test'], vocab, batch_size)

## Modeling

In [24]:
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
                
        super().__init__()
        self.num_layers = num_layers
        self.hid_dim = hid_dim
        self.emb_dim = emb_dim

        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, 
                    dropout=dropout_rate, batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hid_dim, vocab_size)
        
        self.init_weights()
        
    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hid_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_emb)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.emb_dim,
                    self.hid_dim).uniform_(-init_range_other, init_range_other) 
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hid_dim, 
                    self.hid_dim).uniform_(-init_range_other, init_range_other) 

    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        return hidden, cell
    
    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach()
        cell = cell.detach()
        return hidden, cell

    def forward(self, src, hidden):
        #src: [batch size, seq len]
        embedding = self.dropout(self.embedding(src))
        #embedding: [batch size, seq len, emb_dim]
        output, hidden = self.lstm(embedding, hidden)      
        #output: [batch size, seq len, hid_dim]
        #hidden = h, c = [num_layers * direction, seq len, hid_dim)
        output = self.dropout(output) 
        prediction = self.fc(output)
        #prediction: [batch size, seq_len, vocab size]
        return prediction, hidden

The LSTM contains memory cells and gates that allows it to selectively remember and forget information from previous times steps. The input gate controls the flow of information from the previous memeory cell to the current memory cell. The forget gate controls the flow of information from previous cell to the current cell and allows it to selectively forget or remember information from previous steps. The memory cell stores information that can be selectively modified by the input and forget gates. And the output gate controls the flow of information from the memory cell to the hidden state and output.

Our defined class is firstly converting tokens into embeddings then processing those embeddings with stacked LSTM layers to capture temporal dependencies (relationships between past and future events or states in a time series). Then we apply dropout to embeddings to prevent overfitting by randomly zero-ing inputs during training. Then finally we use a linear layer to map LSTM outputs to vocabulary logits for predicting the next word.

## Training

Follows very basic procedure.  One note is that some of the sequences that will be fed to the model may involve parts from different sequences in the original dataset or be a subset of one (depending on the decoding length). For this reason we will reset the hidden state every epoch, this is like assuming that the next batch of sequences is probably always a follow up on the previous in the original dataset.

In [25]:
vocab_size = len(vocab)
emb_dim = 1024                # 400 in the paper
hid_dim = 1024                # 1150 in the paper
num_layers = 2                # 3 in the paper
dropout_rate = 0.65              
lr = 1e-3                     

In [26]:
model = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 39,500,618 trainable parameters


In [27]:
def get_batch(data, seq_len, idx):
    #data #[batch size, bunch of tokens]
    src    = data[:, idx:idx+seq_len]                   
    target = data[:, idx+1:idx+seq_len+1]  #target simply is ahead of src by 1            
    return src, target

In [28]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    
    epoch_loss = 0
    model.train()
    # drop all batches that are not a multiple of seq_len
    # data #[batch size, bunch of tokens]
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]  #we need to -1 because we start at 0
    num_batches = data.shape[-1]
    
    #reset the hidden every epoch
    hidden = model.init_hidden(batch_size, device)
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad()
        
        #hidden does not need to be in the computational graph for efficiency
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, idx) #src, target: [batch size, seq len]
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden)               

        #need to reshape because criterion expects pred to be 2d and target to be 1d
        prediction = prediction.reshape(batch_size * seq_len, -1)  #prediction: [batch size * seq len, vocab size]  
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [29]:
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [30]:
n_epochs = 50
seq_len  = 50 #<----decoding length
clip    = 0.25

lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

best_valid_loss = float('inf')

for epoch in range(n_epochs):
    train_loss = train(model, train_data, optimizer, criterion, 
                batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size, 
                seq_len, device)

    lr_scheduler.step(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model/best-val-lstm_lm.pt')

    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

                                                           

	Train Perplexity: 647.633
	Valid Perplexity: 455.890


                                                           

	Train Perplexity: 338.461
	Valid Perplexity: 230.998


                                                           

	Train Perplexity: 234.226
	Valid Perplexity: 184.518


                                                           

	Train Perplexity: 195.595
	Valid Perplexity: 162.165


                                                           

	Train Perplexity: 172.565
	Valid Perplexity: 148.446


                                                           

	Train Perplexity: 157.109
	Valid Perplexity: 138.962


                                                           

	Train Perplexity: 147.330
	Valid Perplexity: 133.701


                                                           

	Train Perplexity: 137.422
	Valid Perplexity: 127.962


                                                           

	Train Perplexity: 130.182
	Valid Perplexity: 122.618


                                                           

	Train Perplexity: 121.708
	Valid Perplexity: 119.079


                                                           

	Train Perplexity: 115.909
	Valid Perplexity: 116.534


                                                           

	Train Perplexity: 110.819
	Valid Perplexity: 114.299


                                                           

	Train Perplexity: 106.429
	Valid Perplexity: 112.190


                                                           

	Train Perplexity: 102.441
	Valid Perplexity: 110.643


                                                           

	Train Perplexity: 98.756
	Valid Perplexity: 109.325


                                                           

	Train Perplexity: 95.485
	Valid Perplexity: 108.442


                                                           

	Train Perplexity: 92.564
	Valid Perplexity: 107.949


                                                           

	Train Perplexity: 89.719
	Valid Perplexity: 107.470


                                                           

	Train Perplexity: 87.057
	Valid Perplexity: 106.740


                                                           

	Train Perplexity: 84.797
	Valid Perplexity: 106.361


                                                           

	Train Perplexity: 82.570
	Valid Perplexity: 105.928


                                                           

	Train Perplexity: 80.465
	Valid Perplexity: 105.485


                                                           

	Train Perplexity: 78.679
	Valid Perplexity: 105.689


                                                           

	Train Perplexity: 75.092
	Valid Perplexity: 104.891


                                                           

	Train Perplexity: 73.698
	Valid Perplexity: 104.654


                                                           

	Train Perplexity: 72.465
	Valid Perplexity: 104.666


                                                           

	Train Perplexity: 70.690
	Valid Perplexity: 104.049


                                                           

	Train Perplexity: 69.860
	Valid Perplexity: 104.198


                                                           

	Train Perplexity: 68.926
	Valid Perplexity: 104.091


                                                           

	Train Perplexity: 68.160
	Valid Perplexity: 104.034


                                                           

	Train Perplexity: 67.885
	Valid Perplexity: 104.086


                                                           

	Train Perplexity: 67.737
	Valid Perplexity: 104.050


                                                           

	Train Perplexity: 67.619
	Valid Perplexity: 104.052


                                                           

	Train Perplexity: 67.596
	Valid Perplexity: 104.048


                                                           

	Train Perplexity: 67.605
	Valid Perplexity: 104.045


                                                           

	Train Perplexity: 67.484
	Valid Perplexity: 104.044


                                                           

	Train Perplexity: 67.591
	Valid Perplexity: 104.042


                                                           

	Train Perplexity: 67.678
	Valid Perplexity: 104.042


                                                           

	Train Perplexity: 67.509
	Valid Perplexity: 104.042


                                                           

	Train Perplexity: 67.533
	Valid Perplexity: 104.042


                                                           

	Train Perplexity: 67.635
	Valid Perplexity: 104.042


                                                           

	Train Perplexity: 67.580
	Valid Perplexity: 104.042


                                                           

	Train Perplexity: 67.609
	Valid Perplexity: 104.042


                                                           

	Train Perplexity: 67.463
	Valid Perplexity: 104.041


                                                           

	Train Perplexity: 67.450
	Valid Perplexity: 104.041


                                                           

	Train Perplexity: 67.599
	Valid Perplexity: 104.041


                                                           

	Train Perplexity: 67.606
	Valid Perplexity: 104.041


                                                           

	Train Perplexity: 67.621
	Valid Perplexity: 104.041


                                                           

	Train Perplexity: 67.608
	Valid Perplexity: 104.041


                                                           

	Train Perplexity: 67.534
	Valid Perplexity: 104.041


Firstly, we initialize hyperparameters such as vocab size, embedding dimensions, hidden dimensions, number of layers, dropout rate and learning rate. We then move the model to device of user's choice (GPU in our case). Then adam optimizer is defined to optimize the model's parameters and CrossEntropyLoss criterion is use to compute the loss during training.

We train for 50 epochs. In each epoch, the training data is deivided into batches of specific sequece using the `get_batch` method that we defined. At the start of each epoch hidden state is reset and for each batch the model parameters are zeroed and forward pass is done. The loss is then calculated by using the predicted probabilites for a token compared with actual next token. The gradient is calculated using backpropagation and the model's parameters are updated using the optimizer. We also keep track of loss for each epoch.

At the end of an epoch, the model is put into eval mode and the validation data is processed with same process as stated before and validation loss is calculated. The learning rate scheduler is used to adjust the learning rate based on the validation loss. Finally, the model params are saved if the current validation loss is the best one observed so far. 

## Testing

In [31]:
# Load the best model state from the saved checkpoint
model.load_state_dict(torch.load('model/best-val-lstm_lm.pt',  map_location=device))

# Evaluate the model on the test data
test_loss = evaluate(model, test_data, criterion, batch_size, seq_len, device)

# Print the test perplexity
print(f'Test Perplexity: {math.exp(test_loss):.3f}')

Test Perplexity: 106.514


# Real-world inference

In [32]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)
            
            #prediction: [batch size, seq len, vocab size]
            #prediction[:, -1]: [batch size, vocab size] #probability of last vocab
            
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            while prediction == vocab['<unk>']: #if it is unk, we sample again
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:    #if it is eos, we stop
                break

            indices.append(prediction) #autoregressive, thus output becomes input

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [33]:
prompt = 'Harry Potter is '
max_seq_len = 30
seed = 0

#smaller the temperature, more diverse tokens but comes 
#with a tradeoff of less-make-sense sentence
temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer, 
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.5
harry potter is a most interesting thing to do

0.7
harry potter is a most handsome thing

0.75
harry potter is in danger

0.8
harry potter is in danger

1.0
harry potter is those most handsome than the ministry but we are the potters and i was nearly wondering whether however so



# Web application interface documentation

I made the interface using Dash. It is a simple UI with a query field, search button, some validations and a result section. The demo can be found in the `README.md` file.

The model is integrated with the interface with simple steps. Firstly, the vocab is loaded using `pickle` and the model is setup using pre-trained weights into it. Temperature settings are passed to the language model during text generation to control the diversity. The model completes a sentence based on user's prompts.

The user interaction flow is as follows:
- User enters a prompt into the field
- Clicks the button
- Texts are generated completing the prompt and are showed in proper manner

Also, since the model is trained only on harry potter books - if the user enters anything other than the context relating to harry potter, the results wont make much sense.