In [46]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext, datasets, math
from tqdm import tqdm

print("Torch version:", torch.__version__)
print("TorchText version:", torchtext.__version__)
print("CUDA available:", torch.cuda.is_available())

Torch version: 2.2.0+cpu
TorchText version: 0.16.2+cpu
CUDA available: False


In [47]:
from datasets import load_dataset

ds = load_dataset("saracandu/harry-potter-trivia-human")


In [48]:
print(ds)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'text'],
        num_rows: 1023
    })
    test: Dataset({
        features: ['question', 'answer', 'text'],
        num_rows: 256
    })
})


Observation: No validation set like LSTM code in the youtube video.

In [49]:
print(ds['train'][0])

{'question': 'What are the three European wizarding schools that participate in the Triwizard Tournament?', 'answer': 'Hogwarts, Beauxbatons, and Durmstrang.', 'text': '<s>[INST] What are the three European wizarding schools that participate in the Triwizard Tournament? [/INST] Hogwarts, Beauxbatons, and Durmstrang. </s>'}


2. Preprocessing

In [50]:

from torchtext.data.utils import get_tokenizer

# Load the dataset
ds = load_dataset("saracandu/harry-potter-trivia-human")

# Tokenizer for basic English text
tokenizer = get_tokenizer('basic_english')

# Define a function to tokenize the 'question' and 'answer' fields
def tokenize_data(example):
    # Tokenizing both the 'question' and 'answer' fields
    return {'question_tokens': tokenizer(example['question']),
            'answer_tokens': tokenizer(example['answer'])}

# Apply the tokenizer to the dataset
tokenized_dataset = ds.map(lambda x: tokenize_data(x), remove_columns=['question', 'answer'])

# Check tokenized dataset
print(tokenized_dataset['train'][0])


{'text': '<s>[INST] What are the three European wizarding schools that participate in the Triwizard Tournament? [/INST] Hogwarts, Beauxbatons, and Durmstrang. </s>', 'question_tokens': ['what', 'are', 'the', 'three', 'european', 'wizarding', 'schools', 'that', 'participate', 'in', 'the', 'triwizard', 'tournament', '?'], 'answer_tokens': ['hogwarts', ',', 'beauxbatons', ',', 'and', 'durmstrang', '.']}


Observation: The text field seems to have special tokens!!

In [65]:
from torchtext.vocab import build_vocab_from_iterator

# Combine the 'question_tokens' and 'answer_tokens' from the tokenized dataset
# to build a vocabulary that accounts for both question and answer tokens.
def generate_tokens(dataset):
    for example in dataset:
        # Yield tokens from both 'question_tokens' and 'answer_tokens'
        yield example['question_tokens']
        yield example['answer_tokens']

# Build the vocabulary from both the 'question_tokens' and 'answer_tokens'
vocab = build_vocab_from_iterator(generate_tokens(tokenized_dataset['train']), min_freq=3)

# Insert special tokens
vocab.insert_token('<unk>', 0)  # Unknown token index
vocab.insert_token('<eos>', 1)  # End-of-sequence token index
vocab.set_default_index(vocab['<unk>'])  # Default index for unknown tokens

# Check the vocabulary length and the first 10 items in the vocabulary
print(f"Vocabulary size: {len(vocab)}")
print(f"First 10 items in the vocabulary: {vocab.get_itos()[:10]}")


# convert 'question_tokens' and 'answer_tokens' into numerical indices
numericalized_question = [vocab[token] for token in tokenized_dataset['train'][0]['question_tokens']]
numericalized_answer = [vocab[token] for token in tokenized_dataset['train'][0]['answer_tokens']]

print(len(vocab))
torch.save(vocab, 'vocab.pth')  # Save the vocab for webapp

Vocabulary size: 743
First 10 items in the vocabulary: ['<unk>', '<eos>', '?', 'the', 'what', 'of', 'is', '.', 'a', "'"]
743


3. Prepare the batch loader

In [52]:
def get_data(dataset, vocab, batch_size):
    data = []  # List to hold the numericalized tokens
    
    for example in dataset:
        if example['question_tokens'] and example['answer_tokens']:  # all fields are present
            # Append <eos> token to both question and answer tokens
            question_tokens = example['question_tokens'] + ['<eos>']
            answer_tokens = example['answer_tokens'] + ['<eos>']
            
            # Combine question and answer tokens into one sequence
            tokens = question_tokens + answer_tokens
            
            # Numericalize the tokens
            tokens = [vocab[token] for token in tokens]
            
            # Add the numericalized tokens to our data list
            data.extend(tokens)
    
    # Convert the data to a LongTensor
    data = torch.LongTensor(data)
    
    # Number of complete batches that can make from the data
    num_batches = data.shape[0] // batch_size
    
    # Truncate the data to ensure it fits into full batches
    data = data[:num_batches * batch_size]
    
    # Reshape data into the shape [batch_size, num_batches]
    data = data.view(batch_size, num_batches)
    
    return data



In [53]:
"""# Assuming the tokenized dataset has only 'train' and 'test'
batch_size = 128

# Prepare the data for train, validation (use test as validation), and test
train_data = get_data(tokenized_dataset['train'], vocab, batch_size)
valid_data = get_data(tokenized_dataset['test'], vocab, batch_size)  # Using test for validation
# In this case, there is no 'validation' split, so using 'test' as validation
test_data  = valid_data  # set test_data to be the same as valid_data for now

# Check the shape of the train data
print(train_data.shape)  """



"# Assuming the tokenized dataset has only 'train' and 'test'\nbatch_size = 128\n\n# Prepare the data for train, validation (use test as validation), and test\ntrain_data = get_data(tokenized_dataset['train'], vocab, batch_size)\nvalid_data = get_data(tokenized_dataset['test'], vocab, batch_size)  # Using test for validation\n# In this case, there is no 'validation' split, so using 'test' as validation\ntest_data  = valid_data  # set test_data to be the same as valid_data for now\n\n# Check the shape of the train data\nprint(train_data.shape)  "

In [54]:
from sklearn.model_selection import train_test_split
batch_size = 15
# Get the indices for the dataset
train_size = len(tokenized_dataset['train'])
train_indices = list(range(train_size))

# Split the indices into train and validation
train_idx, valid_idx = train_test_split(train_indices, test_size=0.2)

# Select the data corresponding to these indices
train_tokens = tokenized_dataset['train'].select(train_idx)
valid_tokens = tokenized_dataset['train'].select(valid_idx)

# Prepare the data for train, validation, and test
train_data = get_data(train_tokens, vocab, batch_size)
valid_data = get_data(valid_tokens, vocab, batch_size)
test_data = get_data(tokenized_dataset['test'], vocab, batch_size)

print(train_data.shape)
print(valid_data.shape)  
print(test_data.shape)    

torch.Size([15, 861])
torch.Size([15, 213])
torch.Size([15, 266])


4. Modeling

In [55]:

class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
        super().__init__()
        self.hid_dim   = hid_dim
        self.num_layers= num_layers
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm      = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers,
                                 dropout=dropout_rate, batch_first=True)
        self.dropout   = nn.Dropout(dropout_rate)
        self.fc        = nn.Linear(hid_dim, vocab_size)

    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        return hidden, cell
    
    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach()
        cell   = cell.detach()
        return hidden, cell
        
    def forward(self, src, hidden):
        embed = self.embedding(src)  # [batch_size, seq_len, emb_dim]
        output, hidden = self.lstm(embed, hidden)  # LSTM output
        output = self.dropout(output)
        prediction = self.fc(output)  # [batch_size, seq_len, vocab_size]
        return prediction, hidden


5. Training

In [56]:

# Initialize the model
batch_size = 15
lr = 1e-3
vocab_size = len(vocab)
emb_dim = 1024                # 400 in the paper
hid_dim = 1024                # 1150 in the paper
num_layers = 2                # 3 in the paper
dropout_rate = 0.30          

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

# Print the number of trainable parameters
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 18,316,007 trainable parameters


In [57]:
def get_batch(data, seq_len, idx):
    # Given data from get_data() - src and target are offset by 1
    src = data[:, idx:idx+seq_len]                    
    target = data[:, idx+1:idx+seq_len+1]  # Target is shifted by 1 from source
    return src, target

In [58]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    epoch_loss = 0
    model.train()

    # drop all batches that are not a multiple of seq_len
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches - 1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ', leave=False):
        optimizer.zero_grad()
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, idx)  # src, target: [batch_size, seq_len]
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]

        # Forward pass through the model
        prediction, hidden = model(src, hidden)

        # Reshaping prediction for cross-entropy loss (batch_size * seq_len, vocab_size)
        prediction = prediction.reshape(batch_size * seq_len, -1)
        target = target.reshape(-1)

        # Compute the loss
        loss = criterion(prediction, target)

        # Backward pass and optimization
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)  # Prevent gradient explosion
        optimizer.step()

        epoch_loss += loss.item() * seq_len

    return epoch_loss / num_batches

In [59]:
def evaluate(model, data, criterion, batch_size, seq_len, device):
    epoch_loss = 0
    model.eval()

    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches - 1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)

            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size = src.shape[0]

            # Forward pass through the model
            prediction, hidden = model(src, hidden)

            # Reshaping prediction for cross-entropy loss (batch_size * seq_len, vocab_size)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            # Compute the loss
            loss = criterion(prediction, target)

            epoch_loss += loss.item() * seq_len

    return epoch_loss / num_batches

In [60]:
seq_len = 50
clip = 0.25
n_epochs = 30
# Learning rate scheduler
lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=2)

# Training loop
best_valid_loss = float('inf')

for epoch in range(n_epochs):
    # Train the model
    train_loss = train(model, train_data, optimizer, criterion, 
                batch_size, seq_len, clip, device)
    
    # Evaluate the model
    valid_loss = evaluate(model, valid_data, criterion, batch_size, 
                seq_len, device)

    # Step the learning rate scheduler
    lr_scheduler.step(valid_loss)

    # Save the model with the best validation loss
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best-val-lstm_lm.pt')

    # Print training and validation perplexity
    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

                                                         

	Train Perplexity: 138.599
	Valid Perplexity: 76.887


                                                         

	Train Perplexity: 57.460
	Valid Perplexity: 47.726


                                                         

	Train Perplexity: 37.167
	Valid Perplexity: 36.904


                                                         

	Train Perplexity: 27.408
	Valid Perplexity: 30.056


                                                         

	Train Perplexity: 20.908
	Valid Perplexity: 26.142


                                                         

	Train Perplexity: 16.167
	Valid Perplexity: 24.056


                                                         

	Train Perplexity: 13.209
	Valid Perplexity: 22.216


                                                         

	Train Perplexity: 10.935
	Valid Perplexity: 20.525


                                                         

	Train Perplexity: 9.032
	Valid Perplexity: 19.369


                                                         

	Train Perplexity: 7.636
	Valid Perplexity: 18.362


                                                         

	Train Perplexity: 6.492
	Valid Perplexity: 17.655


                                                         

	Train Perplexity: 5.587
	Valid Perplexity: 17.740


                                                         

	Train Perplexity: 4.942
	Valid Perplexity: 17.934


                                                         

	Train Perplexity: 4.308
	Valid Perplexity: 19.015


                                                         

	Train Perplexity: 3.751
	Valid Perplexity: 18.828


                                                         

	Train Perplexity: 3.384
	Valid Perplexity: 18.814


                                                         

	Train Perplexity: 3.113
	Valid Perplexity: 19.504


                                                         

	Train Perplexity: 2.861
	Valid Perplexity: 19.568


                                                         

	Train Perplexity: 2.709
	Valid Perplexity: 19.391


                                                         

	Train Perplexity: 2.567
	Valid Perplexity: 19.762


                                                         

	Train Perplexity: 2.448
	Valid Perplexity: 19.735


                                                         

	Train Perplexity: 2.397
	Valid Perplexity: 19.883


                                                         

	Train Perplexity: 2.346
	Valid Perplexity: 20.061


                                                         

	Train Perplexity: 2.274
	Valid Perplexity: 20.199


                                                         

	Train Perplexity: 2.227
	Valid Perplexity: 20.340


                                                         

	Train Perplexity: 2.205
	Valid Perplexity: 20.494


                                                         

	Train Perplexity: 2.169
	Valid Perplexity: 20.616


                                                         

	Train Perplexity: 2.163
	Valid Perplexity: 20.668


                                                         

	Train Perplexity: 2.148
	Valid Perplexity: 20.770


                                                         

	Train Perplexity: 2.134
	Valid Perplexity: 20.811


Conclutions: 
Training Perplexity is consistently decreasing. This means model is learning the training data well.
Validation Perplexity initially decreases, then stagnates and slightly increases. This means model is memorizing the training data instead of generalizing to unseen data(overfitting).Early Stopping can be solved this issue.

6. Testing

In [61]:
model.load_state_dict(torch.load('best-val-lstm_lm.pt',  map_location=device))
test_loss = evaluate(model, test_data, criterion, batch_size, seq_len, device)
print(f'Test Perplexity: {math.exp(test_loss):.3f}')

Test Perplexity: 13.870


7. Real-world inference

In [63]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)
            
            #prediction: [batch size, seq len, vocab size]
            #prediction[:, -1]: [batch size, vocab size] #probability of last vocab
            
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            while prediction == vocab['<unk>']: #if it is unk, we sample again
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:    #if it is eos, we stop
                break

            indices.append(prediction) #autoregressive, thus output becomes input

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [68]:
generate("Harry Potter is", 20, 2, model, tokenizer, vocab, device, seed=None)

['harry',
 'potter',
 'is',
 'this',
 'department',
 'professor',
 'voldemort',
 'maladies',
 'and',
 'mad-eye',
 'north',
 'headmaster',
 'once',
 'seven',
 ',',
 'and',
 'her',
 'filius',
 'called',
 '.',
 'severus',
 'piece',
 'hospital']