# Recurrent Neural Networks and Language Models

In [28]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
from tqdm import tqdm
import pickle

In [29]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [30]:
# seed for reproducability
SEED = 122
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Load Data

Source Credit: https://www.kaggle.com/datasets/shubhammaindola/harry-potter-books

In [31]:
#Cleaning the dataset

import re
import unicodedata
import pandas as pd

INPUT_PATH = "./data/book.txt"        
OUTPUT_TXT = "./data/cleaned_book.txt"
OUTPUT_CSV = "./data/cleaned_book.csv"

SMART_MAP = {
    "\u2018": "'", "\u2019": "'", "\u201C": '"', "\u201D": '"',
    "\u2013": "-", "\u2014": "-", "\u2212": "-",
    "\u2026": "...",
    "\u00A0": " ",
}

def clean_text(s: str) -> str:
    if s is None:
        return ""
    s = str(s)

    # Normalize unicode
    s = unicodedata.normalize("NFKC", s)

    # Replace smart punctuation
    for k, v in SMART_MAP.items():
        s = s.replace(k, v)

    # Remove zero-width/invisible characters
    s = re.sub(r"[\u200B-\u200D\uFEFF]", "", s)

    # Convert newlines/tabs to spaces (we'll re-control splitting ourselves)
    s = re.sub(r"[\r\n\t]+", " ", s)

    # Collapse multiple spaces
    s = re.sub(r"\s{2,}", " ", s).strip()

    return s

# ---------- Option A: Treat EACH LINE as one record ----------
def load_as_lines(path: str):
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        lines = [ln.strip("\n") for ln in f]
    # drop empty lines
    lines = [ln for ln in lines if ln.strip()]
    return lines

# ---------- Option B: Treat PARAGRAPHS (blank-line separated) as one record ----------
def load_as_paragraphs(path: str):
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        raw = f.read()
    # split on 1+ blank lines
    paras = re.split(r"\n\s*\n+", raw.strip())
    paras = [p for p in paras if p.strip()]
    return paras

# Choose one:
records = load_as_paragraphs(INPUT_PATH)  # or: load_as_lines(INPUT_PATH)

# Clean
cleaned = [clean_text(r) for r in records]
cleaned = [c for c in cleaned if c]  # remove empties

# Remove duplicates
seen = set()
unique_cleaned = []
for c in cleaned:
    if c not in seen:
        seen.add(c)
        unique_cleaned.append(c)

# Save cleaned TXT (one record per line)
# Save cleaned TXT (ALL records concatenated into ONE line)
with open(OUTPUT_TXT, "w", encoding="utf-8") as f:
    f.write(" ".join(unique_cleaned))


# Also save CSV
df = pd.DataFrame({"text": unique_cleaned})
df.to_csv(OUTPUT_CSV, index=False)

print("Done.")
print("Records:", len(unique_cleaned))
print("Preview:")
print(df.head(5))


Done.
Records: 1
Preview:
                                                text
0  THE BOY WHO LIVED Mr and Mrs Dursley of number...


In [32]:
from datasets import Dataset

alice_dataset = "./data/cleaned_book.txt"

# Read the data from the file
with open(alice_dataset, 'r') as f:
    data = f.read()

# Creating list of dictionaries
data = data.split(" .")
data = [{"text": row} for row in data]

# Creating dataset object
dataset = Dataset.from_list(data)
dataset

Dataset({
    features: ['text'],
    num_rows: 67785
})

In [33]:
from datasets import DatasetDict

train_test = dataset.train_test_split(test_size=0.2)

# 10% test set and 10% validation set
train_test_valid = train_test['test'].train_test_split(test_size=0.5)

dataset = DatasetDict({
    'train': train_test['train'],
    'test': train_test_valid['test'],
    'validation': train_test_valid['train']})

dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 54228
    })
    test: Dataset({
        features: ['text'],
        num_rows: 6779
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 6778
    })
})

In [34]:
print(dataset['train'][333]['text']) # empty string

'''
If you change the index you might notice that sometimes there is no paragraph shown
rather an empty string therefore, will have to care of that later.
'''

my poor dear boy


'\nIf you change the index you might notice that sometimes there is no paragraph shown\nrather an empty string therefore, will have to care of that later.\n'

Preprocessing

Tokenizing

In [35]:
from torchtext.data.utils import get_tokenizer

In [36]:
import random
i = random.randrange(len(dataset["train"]))
print(i, dataset["train"][i]["text"])


17502 Sso sweet Dudders


In [37]:
tokenizer = get_tokenizer('basic_english')

#function to tokenize
tokenize_data = lambda example, tokenizer: {'tokens': tokenizer(example['text'])}  

#map the function to each example
tokenized_dataset = dataset.map(tokenize_data, remove_columns=['text'], fn_kwargs={'tokenizer': tokenizer})
print(tokenized_dataset['train'][33]['tokens'])

Map:   0%|          | 0/54228 [00:00<?, ? examples/s]

Map:   0%|          | 0/6779 [00:00<?, ? examples/s]

Map:   0%|          | 0/6778 [00:00<?, ? examples/s]

['dyou', 'really', 'think', 'theres', 'a', 'chamber', 'of', 'secrets', '?', 'ron', 'asked', 'hermione']


# Numericializing

In [38]:
from torchtext.vocab import build_vocab_from_iterator

In [39]:
from collections import Counter
from torchtext.vocab import Vocab

counter = Counter(tok for sent in tokenized_dataset["train"]["tokens"] for tok in sent)
vocab = Vocab(counter, min_freq=3, specials=["<unk>", "<eos>"])

print(len(vocab))
print(vocab.itos[:10])   # torchtext 0.6.0 uses .itos instead of get_itos()


11049
['<unk>', '<eos>', 'the', 'and', 'to', 'of', 'a', 'he', 'harry', 'was']


In [40]:
with open('model/vocab_lm.pkl', 'wb') as f:
    pickle.dump(vocab, f)

After loading and splitting the personally chosen dataset a DatasetDictonary is created. Then on that object the preprocessing steps are applied. Firstly, we tokenize the dataset using torchtext's get_tokenizer. The tokenize_data function is applied to each example where the text column is removed and a new tokens column containing the tokenized text is added.

Then our vocabulary is made using the build_vocab_from_iterator method from torchtext. We use the training dataset, and consider words that has occured at least three times. This is done to make sure that our vocab does not get too big. Then we add <unk> to signify unknown and <eos> to signify end of sentence. After all this the vocab size came out to be 928.

# Prepare the batch loader

In [41]:
def get_data(dataset, vocab, batch_size):
    data = []                                                   
    for example in dataset:
        if example['tokens']:         
            #appends eos so we know it ends....so model learn how to end...                             
            tokens = example['tokens'].append('<eos>')   
            #numericalize          
            tokens = [vocab[token] for token in example['tokens']] 
            data.extend(tokens)                                    
    data = torch.LongTensor(data)                                 
    num_batches = data.shape[0] // batch_size #get the int number of batches...
    data = data[:num_batches * batch_size] #make the batch evenly, and cut out any remaining                      
    data = data.view(batch_size, num_batches)          
    return data #[batch size, bunch of tokens]

In [42]:
batch_size = 128
train_data = get_data(tokenized_dataset['train'], vocab, batch_size)
valid_data = get_data(tokenized_dataset['validation'], vocab, batch_size)
test_data  = get_data(tokenized_dataset['test'], vocab, batch_size)
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

PAD_IDX = 0
BATCH_SIZE = 128

def lm_collate(batch):
    batch = [b for b in batch if len(b) >= 2]
    x = [b[:-1] for b in batch]
    y = [b[1:]  for b in batch]
    x = pad_sequence(x, batch_first=True, padding_value=PAD_IDX)
    y = pad_sequence(y, batch_first=True, padding_value=PAD_IDX)
    return x, y

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True,  collate_fn=lm_collate)
valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=lm_collate)
test_loader  = DataLoader(test_data,  batch_size=BATCH_SIZE, shuffle=False, collate_fn=lm_collate)


# Modeling

In [43]:
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
                
        super().__init__()
        self.num_layers = num_layers
        self.hid_dim = hid_dim
        self.emb_dim = emb_dim

        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, 
                    dropout=dropout_rate, batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hid_dim, vocab_size)
        
        self.init_weights()
        
    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hid_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_emb)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.emb_dim,
                    self.hid_dim).uniform_(-init_range_other, init_range_other) 
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hid_dim, 
                    self.hid_dim).uniform_(-init_range_other, init_range_other) 

    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        return hidden, cell
    
    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach()
        cell = cell.detach()
        return hidden, cell

    def forward(self, src, hidden):
        #src: [batch size, seq len]
        embedding = self.dropout(self.embedding(src))
        #embedding: [batch size, seq len, emb_dim]
        output, hidden = self.lstm(embedding, hidden)      
        #output: [batch size, seq len, hid_dim]
        #hidden = h, c = [num_layers * direction, seq len, hid_dim)
        output = self.dropout(output) 
        prediction = self.fc(output)
        #prediction: [batch size, seq_len, vocab size]
        return prediction, hidden

An LSTM architecture incorporates a cell state and a set of gating mechanisms that regulate information flow across time steps, enabling it to model long-range dependencies. The input gate determines how much new information is written to the cell state, while the forget gate controls the extent to which information from the previous cell state is retained or discarded. The cell state functions as a persistent memory that is updated through these gates. The output gate then governs how much of the cell state is exposed as the hidden state, which directly influences the modelâ€™s output at each step.
In our implementation, the pipeline first maps discrete tokens into dense embedding representations. These embeddings are subsequently processed by stacked LSTM layers to capture temporal structure and sequential dependencies. To improve generalisation, dropout is applied to the embeddings during training, reducing overfitting by randomly masking a proportion of input features. Finally, a linear projection layer transforms the LSTM outputs into vocabulary-level logits, enabling next-word prediction.

# Training

In [44]:
vocab_size = len(vocab)
emb_dim = 1024                
hid_dim = 1024               
num_layers = 2                
dropout_rate = 0.65              
lr = 1e-3                     

In [45]:
model = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 39,433,001 trainable parameters


In [46]:
def get_batch(data, seq_len, idx):
    #data #[batch size, bunch of tokens]
    src    = data[:, idx:idx+seq_len]                   
    target = data[:, idx+1:idx+seq_len+1]  #target simply is ahead of src by 1            
    return src, target

In [47]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    
    epoch_loss = 0
    model.train()
    # drop all batches that are not a multiple of seq_len
    # data #[batch size, bunch of tokens]
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]  #we need to -1 because we start at 0
    num_batches = data.shape[-1]
    
    #reset the hidden every epoch
    hidden = model.init_hidden(batch_size, device)
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad()
        
        #hidden does not need to be in the computational graph for efficiency
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, idx) #src, target: [batch size, seq len]
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden)               

        #need to reshape because criterion expects pred to be 2d and target to be 1d
        prediction = prediction.reshape(batch_size * seq_len, -1)  #prediction: [batch size * seq len, vocab size]  
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [48]:
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [50]:
n_epochs = 50
seq_len  = 50 #<----decoding length
clip    = 0.25

lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

best_valid_loss = float('inf')

for epoch in range(n_epochs):
    train_loss = train(model, train_data, optimizer, criterion, 
                batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size, 
                seq_len, device)

    lr_scheduler.step(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model/best-val-lstm_lm.pt')

    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

                                                                                                                        

	Train Perplexity: 312.451
	Valid Perplexity: 220.791


                                                                                                                        

	Train Perplexity: 225.549
	Valid Perplexity: 176.646


                                                                                                                        

	Train Perplexity: 187.804
	Valid Perplexity: 154.635


                                                                                                                        

	Train Perplexity: 164.768
	Valid Perplexity: 141.004


                                                                                                                        

	Train Perplexity: 148.604
	Valid Perplexity: 131.062


                                                                                                                        

	Train Perplexity: 136.536
	Valid Perplexity: 124.534


                                                                                                                        

	Train Perplexity: 126.996
	Valid Perplexity: 119.081


                                                                                                                        

	Train Perplexity: 118.952
	Valid Perplexity: 114.858


                                                                                                                        

	Train Perplexity: 112.176
	Valid Perplexity: 111.804


                                                                                                                        

	Train Perplexity: 106.361
	Valid Perplexity: 109.029


                                                                                                                        

	Train Perplexity: 101.225
	Valid Perplexity: 107.053


                                                                                                                        

	Train Perplexity: 96.841
	Valid Perplexity: 105.192


                                                                                                                        

	Train Perplexity: 92.677
	Valid Perplexity: 103.966


                                                                                                                        

	Train Perplexity: 89.043
	Valid Perplexity: 102.707


                                                                                                                        

	Train Perplexity: 85.887
	Valid Perplexity: 102.211


                                                                                                                        

	Train Perplexity: 82.717
	Valid Perplexity: 101.347


                                                                                                                        

	Train Perplexity: 79.968
	Valid Perplexity: 100.722


                                                                                                                        

	Train Perplexity: 77.475
	Valid Perplexity: 100.839


                                                                                                                        

	Train Perplexity: 73.095
	Valid Perplexity: 100.001


                                                                                                                        

	Train Perplexity: 71.156
	Valid Perplexity: 99.716


                                                                                                                        

	Train Perplexity: 69.635
	Valid Perplexity: 99.706


                                                                                                                        

	Train Perplexity: 67.373
	Valid Perplexity: 99.320


                                                                                                                        

	Train Perplexity: 66.399
	Valid Perplexity: 99.192


                                                                                                                        

	Train Perplexity: 65.612
	Valid Perplexity: 99.240


                                                                                                                        

	Train Perplexity: 64.536
	Valid Perplexity: 99.296


                                                                                                                        

	Train Perplexity: 63.750
	Valid Perplexity: 99.243


                                                                                                                        

	Train Perplexity: 63.524
	Valid Perplexity: 99.233


                                                                                                                        

	Train Perplexity: 63.230
	Valid Perplexity: 99.199


                                                                                                                        

	Train Perplexity: 63.253
	Valid Perplexity: 99.170


                                                                                                                        

	Train Perplexity: 63.219
	Valid Perplexity: 99.147


                                                                                                                        

	Train Perplexity: 63.124
	Valid Perplexity: 99.138


                                                                                                                        

	Train Perplexity: 63.127
	Valid Perplexity: 99.137


                                                                                                                        

	Train Perplexity: 63.051
	Valid Perplexity: 99.139


                                                                                                                        

	Train Perplexity: 63.109
	Valid Perplexity: 99.138


                                                                                                                        

	Train Perplexity: 63.103
	Valid Perplexity: 99.137


                                                                                                                        

	Train Perplexity: 63.008
	Valid Perplexity: 99.137


                                                                                                                        

	Train Perplexity: 63.181
	Valid Perplexity: 99.137


                                                                                                                        

	Train Perplexity: 63.093
	Valid Perplexity: 99.137


                                                                                                                        

	Train Perplexity: 63.105
	Valid Perplexity: 99.137


                                                                                                                        

	Train Perplexity: 63.152
	Valid Perplexity: 99.137


                                                                                                                        

	Train Perplexity: 63.138
	Valid Perplexity: 99.137


                                                                                                                        

	Train Perplexity: 63.083
	Valid Perplexity: 99.136


                                                                                                                        

	Train Perplexity: 63.073
	Valid Perplexity: 99.136


                                                                                                                        

	Train Perplexity: 63.132
	Valid Perplexity: 99.136


                                                                                                                        

	Train Perplexity: 63.134
	Valid Perplexity: 99.136


                                                                                                                        

	Train Perplexity: 63.086
	Valid Perplexity: 99.136


                                                                                                                        

	Train Perplexity: 63.096
	Valid Perplexity: 99.136


                                                                                                                        

	Train Perplexity: 63.096
	Valid Perplexity: 99.136


                                                                                                                        

	Train Perplexity: 63.108
	Valid Perplexity: 99.136


                                                                                                                        

	Train Perplexity: 63.146
	Valid Perplexity: 99.136


# Testing

In [56]:
eval_batch_size = 8
valid_data = get_data(tokenized_dataset['validation'], vocab, eval_batch_size)
test_data  = get_data(tokenized_dataset['test'], vocab, eval_batch_size)

seq_len = min(25, valid_data.shape[1]-1, test_data.shape[1]-1)
print("valid_data:", valid_data.shape, "test_data:", test_data.shape, "seq_len:", seq_len)

test_loss = evaluate(model, test_data, criterion, eval_batch_size, seq_len, device)
print(f"Test Perplexity: {math.exp(test_loss):.3f}")


valid_data: torch.Size([8, 14499]) test_data: torch.Size([8, 14782]) seq_len: 25
Test Perplexity: 99.920


In [52]:
batch = next(iter(valid_loader))
inp, tgt = batch  # (B, T), (B, T)

print("inp shape:", inp.shape, "tgt shape:", tgt.shape)

# Since you are NOT padding, everything is a real token.
print("total target tokens:", tgt.numel())
print("unique target ids (first 50):", torch.unique(tgt)[:50].tolist())
print("min id:", int(tgt.min()), "max id:", int(tgt.max()))


inp shape: torch.Size([128, 905]) tgt shape: torch.Size([128, 905])
total target tokens: 115840
unique target ids (first 50): [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
min id: 0 max id: 11048


# Real World Example

In [53]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)
            
            #prediction: [batch size, seq len, vocab size]
            #prediction[:, -1]: [batch size, vocab size] #probability of last vocab
            
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            while prediction == vocab['<unk>']: #if it is unk, we sample again
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:    #if it is eos, we stop
                break

            indices.append(prediction) #autoregressive, thus output becomes input

    itos = vocab.itos
    tokens = [itos[i] for i in indices]
    return tokens

In [57]:
prompt = 'Harry potter is '
max_seq_len = 30
seed = 0

#smaller the temperature, more diverse tokens but comes 
#with a tradeoff of less-make-sense sentence
temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer, 
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.5
harry potter is a bit of a coincidence

0.7
harry potter is being a lot of time to find in

0.75
harry potter is being a lot of time to find in

0.8
harry potter is being a lot of time to find in

1.0
harry potter is being dressed an ounce of time making this cup

