In [None]:
import numpy as np
import torch

# PART 1: Text To Supervised Datset

### Load .txt file

In [None]:
with open('/content/text generator dataset.txt', 'r') as f:
  text = f.read().lower()

In [None]:
# Split text into sentences using '.'
sentences = text.split('.')

In [None]:
# Remove empty sentences and strip extra spaces
sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

In [None]:
print("Total sentences:", len(sentences))

Total sentences: 268


### Tokenize the words

In [None]:
!pip install nltk



In [None]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Tokenize each sentence separately
tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

In [None]:
print(len(tokenized_sentences))

268


In [None]:
print("First tokenized sentence:", tokenized_sentences[0])

First tokenized sentence: ['the', 'sun', 'was', 'shining', 'brightly', 'in', 'the', 'clear', 'blue', 'sky', ',', 'and', 'a', 'gentle', 'breeze', 'rustled', 'the', 'leaves', 'of', 'the', 'tall', 'trees']


In [None]:
from collections import Counter

# Flatten the list of tokenized sentences to get all words
all_tokens = [word for sentence in tokenized_sentences for word in sentence]

In [None]:
# Create vocabulary
word_counts = Counter(all_tokens)
vocab = {word: idx for idx, (word, _) in enumerate(word_counts.items(), start=1)}

In [None]:
# Add a special token for unknown words
vocab["<UNK>"] = len(vocab) + 1

In [None]:
# Convert each sentence to its corresponding indexes
indexed_sentences = [[vocab.get(word, vocab["<UNK>"]) for word in sentence] for sentence in tokenized_sentences]

In [None]:
print("First indexed sentence:", indexed_sentences[0])

First indexed sentence: [1, 2, 3, 4, 5, 6, 1, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 16, 17, 1, 18, 19]


### Create a Supervised Dataset

In [None]:
len(indexed_sentences)

268

In [None]:
input_sequence = []
target_sequence = []

for sentence in indexed_sentences:
  for i in range(1, len(sentence)):
    context = sentence[:i]
    target = sentence[i]
    input_sequence.append(context)
    target_sequence.append(target)


In [None]:
len(target_sequence)

4883

In [None]:
# Find out the maximum length in sentence
max_length = max(len(seq) for seq in input_sequence)

print(max_length)

43


### Zero Padding using PyTorch

In [None]:
# Convert into pytorch tensors
tensor_sentences = [torch.tensor(sentence) for sentence in input_sequence]

In [None]:
from torch.nn.utils.rnn import pad_sequence

padded_sequences = pad_sequence(tensor_sentences, batch_first=True, padding_value=0)

In [None]:
print("Padded sequences shape:", padded_sequences.shape)

Padded sequences shape: torch.Size([4883, 43])


In [None]:
print(padded_sequences[:5])

tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 2, 3, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


### One Hot Encoding to `taget_sequence`

In [None]:
# convert to tensor
target_tensor = torch.tensor(target_sequence)

In [None]:
target_tensor.shape

torch.Size([4883])

In [None]:
import torch.nn.functional as F

# length of vacabulory (add 1 to cover the last word as onehotencoding starts from 0 & vocab starts from 1)
vocab_size = len(vocab) + 1

# Convert tensor to one-hot encoding
one_hot_targets = F.one_hot(target_tensor, num_classes=vocab_size).float()

In [None]:
one_hot_targets[0]

tensor([0., 0., 1.,  ..., 0., 0., 0.])

In [None]:
one_hot_targets.shape

torch.Size([4883, 1613])

In [None]:
X = padded_sequences
y = one_hot_targets

In [None]:
X.shape

torch.Size([4883, 43])

In [None]:
y.shape

torch.Size([4883, 1613])

# PART 2: Model Architecture

**3 layer architecture** <br>
1 -> embedding <br>
2 -> LSTM <br>
3 -> Dense <br>



### create model

In [None]:
import torch.nn as nn

class LSTMNextWordPredictor(nn.Module):

  def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
    super(LSTMNextWordPredictor, self).__init__()

    # 1. Embedding Laye
    self.embedding = nn.Embedding(vocab_size, embedding_dim)

    # 2. LSTM Layer
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)

    # 3. Dense Layer
    self.fc = nn.Linear(hidden_dim, vocab_size)

  def forward(self, x):
    embedded = self.embedding(x)

    lstm_out, _ = self.lstm(embedded)

    output = self.fc(lstm_out[:, -1, :])  # Output of the last time step

    return output

### Define Hyperparameters

In [None]:
# Define hyperparameters
vocab_size = len(vocab) + 1  # Add 1 for padding/unknown
embedding_dim = 100  # Size of word embeddings
hidden_dim = 150  # LSTM hidden state size
num_layers = 3  # Number of LSTM layers
num_epochs = 100 # Number of epochs
learning_rate = 0.0005

In [None]:
# Instantiate model
model = LSTMNextWordPredictor(vocab_size,
                              embedding_dim,
                              hidden_dim,
                              num_layers)

In [None]:
# Print model summary
print(model)

LSTMNextWordPredictor(
  (embedding): Embedding(1613, 100)
  (lstm): LSTM(100, 150, num_layers=3, batch_first=True)
  (fc): Linear(in_features=150, out_features=1613, bias=True)
)


In [None]:
# Define Loss & Optimizer
criterion = nn.CrossEntropyLoss()  # For multi-class classification
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

### Training Loop

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [None]:
model.to(device)

LSTMNextWordPredictor(
  (embedding): Embedding(1613, 100)
  (lstm): LSTM(100, 150, num_layers=3, batch_first=True)
  (fc): Linear(in_features=150, out_features=1613, bias=True)
)

In [None]:
def calculate_accuracy(output, y):
    """
    Computes accuracy by comparing model predictions with actual labels.

    Args:
        output (torch.Tensor): Raw model output logits of shape (batch_size, vocab_size)
        y (torch.Tensor): True class indices of shape (batch_size)

    Returns:
        float: Accuracy percentage
    """
    # Get the predicted class (highest probability in vocab_size dimension)
    predictions = torch.argmax(output, dim=1)  # Shape: [batch_size]

    # Ensure y is in index format, not one-hot
    if y.dim() > 1:
        y = torch.argmax(y, dim=1)

    # Compute the number of correct predictions
    correct = (predictions == y).sum().item()

    # Compute accuracy percentage
    accuracy = (correct / y.size(0)) * 100
    return accuracy

In [None]:
# Training loop
for epoch in range(num_epochs):

    x = X.to(device)
    y = y.to(device)

    optimizer.zero_grad()  # Clear previous gradients

    output = model(x)  # Forward pass

    loss = criterion(output, y.argmax(dim=1)) # Compute loss
    loss.backward()  # Backpropagation

    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)  # Gradient clipping

    optimizer.step()  # Update weights

    # Compute accuracy
    accuracy = calculate_accuracy(output, y.argmax(dim=1))

    # if epoch % 100 == 0:
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Accuracy: {accuracy:.4f}")


Epoch [1/100], Loss: 7.3902, Accuracy: 0.0205
Epoch [2/100], Loss: 7.3804, Accuracy: 0.0410
Epoch [3/100], Loss: 7.3702, Accuracy: 7.2701
Epoch [4/100], Loss: 7.3582, Accuracy: 7.2701
Epoch [5/100], Loss: 7.3431, Accuracy: 7.2701
Epoch [6/100], Loss: 7.3231, Accuracy: 7.2701
Epoch [7/100], Loss: 7.2958, Accuracy: 7.2701
Epoch [8/100], Loss: 7.2584, Accuracy: 7.2701
Epoch [9/100], Loss: 7.2083, Accuracy: 7.2701
Epoch [10/100], Loss: 7.1441, Accuracy: 7.2701
Epoch [11/100], Loss: 7.0672, Accuracy: 7.2701
Epoch [12/100], Loss: 6.9817, Accuracy: 7.2701
Epoch [13/100], Loss: 6.8935, Accuracy: 7.2701
Epoch [14/100], Loss: 6.8079, Accuracy: 7.2701
Epoch [15/100], Loss: 6.7277, Accuracy: 7.2701
Epoch [16/100], Loss: 6.6534, Accuracy: 7.2701
Epoch [17/100], Loss: 6.5839, Accuracy: 7.2701
Epoch [18/100], Loss: 6.5180, Accuracy: 7.2701
Epoch [19/100], Loss: 6.4551, Accuracy: 7.2701
Epoch [20/100], Loss: 6.3953, Accuracy: 7.2701
Epoch [21/100], Loss: 6.3389, Accuracy: 7.2701
Epoch [22/100], Loss: 

# PART 3: Predict the Next Word

### Convert the input text into numerical indices (tokenization).

In [None]:
text = "As you ventured further"

In [None]:
tokenize = word_tokenize(text)

tokenize

['As', 'you', 'ventured', 'further']

In [None]:
indexed_text = [vocab.get(word, vocab["<UNK>"]) for word in tokenize]

indexed_text

[1612, 68, 160, 161]

In [None]:
def predict_next_word(model, word_tokenize, text, vocab, top_k=1):

  model.eval()

  # Toeknize the text
  tokenize = word_tokenize(text.lower())

  # Write the index of the text
  indexed_text = [vocab.get(word, vocab["<UNK>"]) for word in tokenize]

  # Convert to tensor and reshape to match input shape (batch_size=1, seq_length)
  input_tensor = torch.tensor(indexed_text, dtype=torch.long).unsqueeze(0).to(device)

  # Apply padding
  # pad_length = max_length - input_tensor.shape[1]
  # if pad_length > 0:
  #     padding = torch.zeros((1, pad_length), dtype=torch.long).to(device)  # Padding with 0s
  #     input_tensor = torch.cat((input_tensor, padding), dim=1)  # right padding
  print(input_tensor.shape)

  # Get model predictions
  with torch.no_grad():  # No gradient computation needed
      output = model(input_tensor)

  # Get the predicted word index (last time step)
  predicted_indices = torch.topk(output, top_k, dim=1).indices.squeeze(0)  # Get top-k predictions

  # Convert index to word
  index_to_word = {idx: word for word, idx in vocab.items()}  # Reverse tokenizer
  predicted_words = [index_to_word[idx.item()] for idx in predicted_indices]

  return predicted_words if top_k > 1 else predicted_words[0]

In [None]:
text = "Your RNN"

In [None]:
predicted_word = predict_next_word(model, word_tokenize, text, vocab, top_k=1)

print(predicted_word)

torch.Size([1, 2])
the
