In [57]:
import torch
import torchtext

print("Torch version:", torch.__version__)
print("TorchText version:", torchtext.__version__)


Torch version: 2.2.2+cu121
TorchText version: 0.17.2+cpu


In [58]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext, datasets, math
from tqdm import tqdm

In [59]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [60]:
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [7]:
#LOADING THE DATASET

In [8]:
#Here i will be using the dataset from the hugging face dataset library that has 1 to 7 books of harry potter.
#https://huggingface.co/datasets/WutYee/HarryPotter_books_1to7/viewer/default/train?p=813&row=81316

In [61]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("WutYee/HarryPotter_books_1to7")

In [10]:
#Checking the dataset

In [62]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 81349
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 23118
    })
    test: Dataset({
        features: ['text'],
        num_rows: 23620
    })
})


In [63]:
    print(dataset['train'].shape)

(81349, 1)


In [13]:
#PREPROCESSING THE TEXT DATA

In [14]:
#WE NEED TO TOKENIZE THE DATA and remove the empty strings from the tokenized data.

In [64]:
from torchtext.data.utils import get_tokenizer
from datasets import DatasetDict

# Tokenizer setup
tokenizer = get_tokenizer('basic_english')

# Tokenization function
tokenize_data = lambda example, tokenizer: {'tokens': tokenizer(example['text'])}

# Apply tokenization to all splits of the dataset (train, validation, test)
tokenized_dataset = dataset.map(tokenize_data, remove_columns=['text'], fn_kwargs={'tokenizer': tokenizer})


In [70]:
print(tokenized_dataset['train'][223]['tokens'])

['cloak', ',', 'an', 'emerald', 'one', '.', 'her', 'black', 'hair', 'was', 'drawn', 'into', 'a', 'tight', 'bun', '.', 'she', 'looked']


In [16]:
# Check the first example after tokenization in the train split
print(tokenized_dataset['train'][:20])

{'tokens': [['hp', '1', '-', 'harry', 'potter', 'and', 'the'], ['sorcerer', "'", 's', 'stone'], ['harry', 'potter', 'and', 'the', 'sorcerer', "'", 's', 'stone'], [], [], ['harry', 'potter'], ['&'], ['the', 'sorcerer’s', 'stone'], [], [], ['by', 'j', '.', 'k', '.', 'rowling'], [], [], [], [], [], ['hp', '1', '-', 'harry', 'potter', 'and', 'the'], ['sorcerer', "'", 's', 'stone'], ['chapter', 'one'], []]}


In [65]:
#NOW TO PREPROCESS THE TOKENIZED DATASET

In [71]:
def preprocess_tokens(examples):
    # Clean tokens by removing unwanted characters and making them lowercase
    cleaned_tokens = [
        token.lower()  # Convert to lowercase
        for example in examples['tokens']  # Iterate over individual tokens in the batch
        for token in example  # Iterate over tokens in each example
        if isinstance(token, str) and token.isalpha()  # Remove non-alphabetic tokens
    ]
    
    return {'tokens': cleaned_tokens}  # Return cleaned tokens

# Apply preprocessing with batched=True for efficiency
cleaned_dataset = tokenized_dataset.map(preprocess_tokens, batched=True)

# Remove entries where the 'tokens' list is empty
cleaned_dataset = cleaned_dataset.filter(lambda x: len(x['tokens']) > 0)

# Check if the dataset is now correctly processed
print(cleaned_dataset['train'][:5])  # Print first 5 cleaned examples


Map:   0%|          | 0/81349 [00:00<?, ? examples/s]

Map:   0%|          | 0/23118 [00:00<?, ? examples/s]

Map:   0%|          | 0/23620 [00:00<?, ? examples/s]

Filter:   0%|          | 0/690212 [00:00<?, ? examples/s]

Filter:   0%|          | 0/168502 [00:00<?, ? examples/s]

Filter:   0%|          | 0/185948 [00:00<?, ? examples/s]

{'tokens': ['hp', 'harry', 'potter', 'and', 'the']}


In [75]:
# Checking THE PREPROCESSED DATA
print(cleaned_dataset['train'][:100])

{'tokens': ['hp', 'harry', 'potter', 'and', 'the', 'sorcerer', 's', 'stone', 'harry', 'potter', 'and', 'the', 'sorcerer', 's', 'stone', 'harry', 'potter', 'the', 'stone', 'by', 'j', 'k', 'rowling', 'hp', 'harry', 'potter', 'and', 'the', 'sorcerer', 's', 'stone', 'chapter', 'one', 'the', 'boy', 'who', 'lived', 'm', 'r', 'and', 'mrs', 'dursley', 'of', 'number', 'four', 'privet', 'drive', 'were', 'proud', 'to', 'say', 'that', 'they', 'were', 'perfectly', 'normal', 'thank', 'you', 'very', 'much', 'they', 'were', 'the', 'last', 'people', 'expect', 'to', 'be', 'involved', 'in', 'anything', 'strange', 'or', 'mysterious', 'because', 'they', 'just', 'hold', 'with', 'such', 'nonsense', 'mr', 'dursley', 'was', 'the', 'director', 'of', 'a', 'firm', 'called', 'grunnings', 'which', 'made', 'drills', 'he', 'was', 'a', 'big', 'beefy', 'man']}


In [78]:
# Before cleaning
print("Before cleaning:")
print(tokenized_dataset['train'][:10])

# After cleaning
print("After cleaning:")
print(cleaned_dataset['train'][:10])


Before cleaning:
{'tokens': [['hp', '1', '-', 'harry', 'potter', 'and', 'the'], ['sorcerer', "'", 's', 'stone'], ['harry', 'potter', 'and', 'the', 'sorcerer', "'", 's', 'stone'], [], [], ['harry', 'potter'], ['&'], ['the', 'sorcerer’s', 'stone'], [], []]}
After cleaning:
{'tokens': ['hp', 'harry', 'potter', 'and', 'the', 'sorcerer', 's', 'stone', 'harry', 'potter']}


In [80]:
import torchtext

vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_dataset['train']['tokens'], min_freq=3)

vocab.insert_token('<unk>', 0)
vocab.insert_token('<eos>', 1)

vocab.set_default_index(vocab['<unk>'])

Vocabulary size: 11368
First 10 tokens in vocab: ['<unk>', '<eos>', '.', ',', 'the', '”', 'and', 'to', 'of', 'a', 'he', 'harry', 'was', 'said', 'his', 'in', 'it', 'you', '?', 'had']


In [81]:
print(f"Size of vocabulary: {len(vocab)}")

Size of vocabulary: 11368


In [86]:
print(f"First 20 tokens in vocab: {vocab.get_itos()[:20]}")

First 20 tokens in vocab: ['<unk>', '<eos>', '.', ',', 'the', '”', 'and', 'to', 'of', 'a', 'he', 'harry', 'was', 'said', 'his', 'in', 'it', 'you', '?', 'had']


In [87]:
#Now we need to prepare the batch loader

In [88]:
def get_data(dataset, vocab, batch_size):
    data = []
    for example in dataset:
        if example['tokens']:
            tokens = example['tokens'] + ['<eos>']  # Add <eos> token at the end
            token_indices = [vocab[token] for token in tokens]  # Convert tokens to indices
            data.extend(token_indices)  # Add the indices to the data list
    data = torch.LongTensor(data)  # Convert to tensor
    num_batches = data.shape[0] // batch_size  # Calculate number of batches
    data = data[:num_batches * batch_size]  # Truncate to ensure data is divisible by batch size
    data = data.view(batch_size, -1)  # Reshape to [batch_size, seq_len]
    return data  # Return the data in the required shape


In [89]:
# Define batch size
batch_size = 128

# Prepare the data for training, validation, and testing
train_data = get_data(tokenized_dataset['train'], vocab, batch_size)
valid_data = get_data(tokenized_dataset['validation'], vocab, batch_size)
test_data = get_data(tokenized_dataset['test'], vocab, batch_size)

# Check the shape of the data
print(train_data.shape)


torch.Size([128, 7660])


In [90]:
#MODELING

In [91]:
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
        super().__init__()
        self.num_layers = num_layers
        self.hid_dim    = hid_dim
        self.emb_dim    = emb_dim
        
        self.embedding  = nn.Embedding(vocab_size, emb_dim)
        self.lstm       = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, dropout=dropout_rate, batch_first=True)
        self.dropout    = nn.Dropout(dropout_rate)
        self.fc         = nn.Linear(hid_dim, vocab_size)
        
        self.init_weights()
    
    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hid_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_other)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.emb_dim,
                self.hid_dim).uniform_(-init_range_other, init_range_other) #We
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hid_dim,   
                self.hid_dim).uniform_(-init_range_other, init_range_other) #Wh
    
    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        return hidden, cell
        
    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach() #not to be used for gradient computation
        cell   = cell.detach()
        return hidden, cell
        
    def forward(self, src, hidden):
        #src: [batch_size, seq len]
        embedding = self.dropout(self.embedding(src)) #harry potter is
        #embedding: [batch-size, seq len, emb dim]
        output, hidden = self.lstm(embedding, hidden)
        #ouput: [batch size, seq len, hid dim]
        #hidden: [num_layers * direction, seq len, hid_dim]
        output = self.dropout(output)
        prediction =self.fc(output)
        #prediction: [batch_size, seq_len, vocab_size]
        return prediction, hidden

In [92]:
vocab_size = len(vocab)  # Size of vocabulary
emb_dim = 1024           # Embedding dimension
hid_dim = 1024           # Hidden dimension for LSTM
num_layers = 2           # Number of LSTM layers
dropout_rate = 0.65      # Dropout rate
lr = 1e-3                # Learning rate

# Initialize the model
model = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)

# Optimizer and Loss function
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

# Model summary (Number of parameters)
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 40,086,632 trainable parameters


In [93]:
def get_batch(data, seq_len, idx):
    # Get source and target sequences from the batch
    src = data[:, idx:idx+seq_len]                     
    target = data[:, idx+1:idx+seq_len+1]                
    return src, target

In [94]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    epoch_loss = 0
    model.train()
    
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches - 1) % seq_len]  # Ensure data is multiple of seq_len
    num_batches = data.shape[-1]
    
    hidden = model.init_hidden(batch_size, device)  # Initialize hidden state for each epoch
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ', leave=False):
        optimizer.zero_grad()
        
        hidden = model.detach_hidden(hidden)  # Detach hidden state to avoid backprop through the entire history
        
        src, target = get_batch(data, seq_len, idx)  # Get source and target sequences
        src, target = src.to(device), target.to(device)
        
        batch_size = src.shape[0]  # Number of sequences in the batch
        prediction, hidden = model(src, hidden)  # Get model predictions
        
        # Reshape predictions and target to calculate loss
        prediction = prediction.reshape(batch_size * seq_len, -1)
        target = target.reshape(-1)
        
        loss = criterion(prediction, target)  # Compute loss
        loss.backward()  # Backpropagate gradients
        
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()  # Update parameters
        
        epoch_loss += loss.item() * seq_len  # Accumulate the loss
    return epoch_loss / num_batches  # Return the average loss for the epoch


In [95]:
def evaluate(model, data, criterion, batch_size, seq_len, device):
    epoch_loss = 0
    model.eval()
    
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches - 1) % seq_len]
    num_batches = data.shape[-1]
    
    hidden = model.init_hidden(batch_size, device)
    
    with torch.no_grad():  # Disable gradient computation during evaluation
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size = src.shape[0]
            
            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)
            
            loss = criterion(prediction, target)  # Compute loss
            epoch_loss += loss.item() * seq_len  # Accumulate loss
    return epoch_loss / num_batches  # Return the average loss for evaluation


In [96]:
n_epochs = 50  # Number of epochs
seq_len = 50    # Sequence length (decoding length)
clip = 0.25     # Gradient clipping

# Learning rate scheduler
lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

best_valid_loss = float('inf')  # Initialize the best valid loss as a large value

# Training loop
for epoch in range(n_epochs):
    print(f"Epoch {epoch + 1}/{n_epochs}")  # Display the current epoch number
    # Train the model for one epoch
    train_loss = train(model, train_data, optimizer, criterion, batch_size, seq_len, clip, device)
    
    # Evaluate the model on validation set
    valid_loss = evaluate(model, valid_data, criterion, batch_size, seq_len, device)
    
    # Step the learning rate scheduler
    lr_scheduler.step(valid_loss)
    
    # Save the model if validation loss improves
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best-val-lstm_lm.pt')

    # Print train and validation perplexity for the current epoch
    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')


Epoch 1/50


                                                           

	Train Perplexity: 476.751
	Valid Perplexity: 259.229
Epoch 2/50


                                                           

	Train Perplexity: 211.574
	Valid Perplexity: 128.636
Epoch 3/50


                                                           

	Train Perplexity: 128.877
	Valid Perplexity: 92.365
Epoch 4/50


                                                           

	Train Perplexity: 100.725
	Valid Perplexity: 79.424
Epoch 5/50


                                                           

	Train Perplexity: 86.640
	Valid Perplexity: 74.346
Epoch 6/50


                                                           

	Train Perplexity: 77.617
	Valid Perplexity: 70.838
Epoch 7/50


                                                           

	Train Perplexity: 70.873
	Valid Perplexity: 67.010
Epoch 8/50


                                                           

	Train Perplexity: 65.858
	Valid Perplexity: 63.099
Epoch 9/50


                                                           

	Train Perplexity: 61.493
	Valid Perplexity: 62.536
Epoch 10/50


                                                           

	Train Perplexity: 58.097
	Valid Perplexity: 60.603
Epoch 11/50


                                                           

	Train Perplexity: 55.110
	Valid Perplexity: 60.361
Epoch 12/50


                                                           

	Train Perplexity: 52.536
	Valid Perplexity: 59.270
Epoch 13/50


                                                           

	Train Perplexity: 50.200
	Valid Perplexity: 58.290
Epoch 14/50


                                                           

	Train Perplexity: 48.187
	Valid Perplexity: 58.370
Epoch 15/50


                                                           

	Train Perplexity: 45.398
	Valid Perplexity: 58.112
Epoch 16/50


                                                           

	Train Perplexity: 43.958
	Valid Perplexity: 57.807
Epoch 17/50


                                                           

	Train Perplexity: 42.876
	Valid Perplexity: 57.530
Epoch 18/50


                                                           

	Train Perplexity: 41.879
	Valid Perplexity: 57.592
Epoch 19/50


                                                           

	Train Perplexity: 40.565
	Valid Perplexity: 57.752
Epoch 20/50


                                                           

	Train Perplexity: 39.823
	Valid Perplexity: 57.226
Epoch 21/50


                                                           

	Train Perplexity: 39.450
	Valid Perplexity: 57.368
Epoch 22/50


                                                           

	Train Perplexity: 39.008
	Valid Perplexity: 56.996
Epoch 23/50


                                                           

	Train Perplexity: 38.765
	Valid Perplexity: 56.909
Epoch 24/50


                                                           

	Train Perplexity: 38.714
	Valid Perplexity: 56.808
Epoch 25/50


                                                           

	Train Perplexity: 38.538
	Valid Perplexity: 56.997
Epoch 26/50


                                                           

	Train Perplexity: 38.350
	Valid Perplexity: 56.738
Epoch 27/50


                                                           

	Train Perplexity: 38.239
	Valid Perplexity: 56.665
Epoch 28/50


                                                           

	Train Perplexity: 38.171
	Valid Perplexity: 56.694
Epoch 29/50


                                                           

	Train Perplexity: 38.071
	Valid Perplexity: 56.627
Epoch 30/50


                                                           

	Train Perplexity: 38.068
	Valid Perplexity: 56.723
Epoch 31/50


                                                           

	Train Perplexity: 37.976
	Valid Perplexity: 56.649
Epoch 32/50


                                                           

	Train Perplexity: 37.918
	Valid Perplexity: 56.627
Epoch 33/50


                                                           

	Train Perplexity: 37.935
	Valid Perplexity: 56.622
Epoch 34/50


                                                           

	Train Perplexity: 37.909
	Valid Perplexity: 56.627
Epoch 35/50


                                                           

	Train Perplexity: 37.908
	Valid Perplexity: 56.623
Epoch 36/50


                                                           

	Train Perplexity: 37.922
	Valid Perplexity: 56.618
Epoch 37/50


                                                           

	Train Perplexity: 37.903
	Valid Perplexity: 56.617
Epoch 38/50


                                                           

	Train Perplexity: 37.921
	Valid Perplexity: 56.617
Epoch 39/50


                                                           

	Train Perplexity: 37.911
	Valid Perplexity: 56.617
Epoch 40/50


                                                           

	Train Perplexity: 37.945
	Valid Perplexity: 56.618
Epoch 41/50


                                                           

	Train Perplexity: 37.927
	Valid Perplexity: 56.617
Epoch 42/50


                                                           

	Train Perplexity: 37.905
	Valid Perplexity: 56.617
Epoch 43/50


                                                           

	Train Perplexity: 37.962
	Valid Perplexity: 56.618
Epoch 44/50


                                                           

	Train Perplexity: 37.959
	Valid Perplexity: 56.618
Epoch 45/50


                                                           

	Train Perplexity: 37.933
	Valid Perplexity: 56.618
Epoch 46/50


                                                           

	Train Perplexity: 37.862
	Valid Perplexity: 56.618
Epoch 47/50


                                                           

	Train Perplexity: 37.936
	Valid Perplexity: 56.618
Epoch 48/50


                                                           

	Train Perplexity: 37.940
	Valid Perplexity: 56.618
Epoch 49/50


                                                           

	Train Perplexity: 37.879
	Valid Perplexity: 56.618
Epoch 50/50


                                                           

	Train Perplexity: 37.905
	Valid Perplexity: 56.618


In [97]:
model.load_state_dict(torch.load('best-val-lstm_lm.pt',  map_location=device))
test_loss = evaluate(model, test_data, criterion, batch_size, seq_len, device)
print(f'Test Perplexity: {math.exp(test_loss):.3f}')

Test Perplexity: 70.518


In [98]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    # Set seed for reproducibility
    if seed is not None:
        torch.manual_seed(seed)
    
    model.eval()  # Set the model to evaluation mode
    tokens = tokenizer(prompt)  # Tokenize the input prompt
    indices = [vocab[t] for t in tokens]  # Convert tokens to indices
    batch_size = 1  # Single example batch size
    hidden = model.init_hidden(batch_size, device)  # Initialize hidden state
    
    with torch.no_grad():  # Disable gradient computation during generation
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)  # Input sequence for the model
            prediction, hidden = model(src, hidden)  # Get model predictions
            
            # Softmax on the last token's predictions to get probabilities
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)
            
            # Sample from the probability distribution
            prediction = torch.multinomial(probs, num_samples=1).item()
            
            # If the prediction is <unk>, sample again
            while prediction == vocab['<unk>']:
                prediction = torch.multinomial(probs, num_samples=1).item()
            
            # Stop if <eos> token is predicted
            if prediction == vocab['<eos>']:
                break
            
            indices.append(prediction)  # Add the predicted token to the sequence
        
    # Convert indices back to tokens
    itos = vocab.get_itos()  # Get the reverse vocabulary (index to token mapping)
    generated_tokens = [itos[i] for i in indices]  # Map indices back to tokens
    return generated_tokens


In [99]:
print(vocab.get_itos()[:100])  # Check if common words exist

['<unk>', '<eos>', '.', ',', 'the', '”', 'and', 'to', 'of', 'a', 'he', 'harry', 'was', 'said', 'his', 'in', 'it', 'you', '?', 'had', 'at', 'that', 'i', '!', 'on', 'as', 'him', 'with', '—', 'they', 'ron', "'", 'for', 'her', 'but', '\x91', 'hermione', 'up', 'out', 'she', 'be', 'were', 'not', 'all', 'them', 'from', 'have', 'what', 'back', 'into', 'been', 'there', 'this', 'me', 'is', 'their', 'so', 'one', 'who', 'about', 'could', 'then', 'down', 's', 'now', 'if', 'over', 'we', 'around', 'looked', '“i', 'like', 'very', 'just', 'professor', 'when', 'an', 'got', '—”', 'know', 'dumbledore', 'by', 'hagrid', 'do', 'would', 'your', 'off', 'again', 'no', 'see', 'though', 'more', 'are', 'did', 'my', 'potter', 'get', 'looking', 'weasley', 't']


In [102]:
prompt = "harry looked"
max_seq_len = 30
seed = 0

# Experimenting with a broader range of temperatures
temperatures = [0.3, 0.5, 0.7, 0.9]

for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed)
    sentence = " ".join(generation)

    print(f"Temperature: {temperature}\nGenerated Text: {sentence}\n" + "-"*50)


Temperature: 0.3
Generated Text: harry looked at harry .
--------------------------------------------------
Temperature: 0.5
Generated Text: harry looked at harry .
--------------------------------------------------
Temperature: 0.7
Generated Text: harry looked at it .
--------------------------------------------------
Temperature: 0.9
Generated Text: harry looked up at
--------------------------------------------------


In [101]:
#The language model is built using an LSTM (Long Short-Term Memory), which helps it understand and generate text. 
#It starts by converting words into numerical embeddings, then passes them through the LSTM layers to learn patterns in the text.
#Finally, a fully connected layer predicts the next word.
#During training, the model learns to predict the next word and adjusts its parameters when it makes mistakes.

In [104]:
# Saving the vocabulary to a file
torch.save(vocab, "vocab.pth")