# **Text Generation using RNN and LSTM**

**1 Objective**

The aim of this experiment is to explore text generation using Recurrent Neural
Networks (RNNs) and understand the impact of different word representations:
1. One-Hot Encoding
2. Trainable Word Embeddings

Train an RNN model on a dataset of 100 poems and compare the performance
of both encoding techniques.

**2 Dataset**

Use the provided dataset of 100 poems for training your text generation model.
The dataset consists of multiple lines of poetry, which will be used to generate
text sequences.

In [2]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import Counter

# 1. Load Data
# Reads 'poems-100.csv' from the working directory; the 'text' column is used below.
print("Loading data...")
try:
    df = pd.read_csv('poems-100.csv')
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: Please upload 'poems-100.csv' to your Colab files.")
    # Re-raise so execution stops here. Swallowing the error would let the
    # next statement fail with an unrelated NameError (or silently reuse a
    # stale `df` left over from an earlier notebook run).
    raise

# 2. Preprocessing
# Combine all poems into one long lowercase string and split on whitespace.
# Punctuation stays attached to the words because only str.split() is applied.
corpus = " ".join(df['text'].astype(str).tolist()).lower()
words = corpus.split()

# Build Vocabulary: unique words ordered from most to least frequent,
# so index 0 is the most common word in the corpus.
word_counts = Counter(words)
vocab = sorted(word_counts, key=word_counts.get, reverse=True)
vocab_size = len(vocab)

# Create Mappings (Word -> Index and Index -> Word)
word_to_idx = {word: i for i, word in enumerate(vocab)}
idx_to_word = dict(enumerate(vocab))

print(f"Total Words: {len(words)}")
print(f"Vocabulary Size: {vocab_size}")

# 3. Create Sequences for Training
# Sliding window over the corpus: each sample is `seq_length` consecutive word
# indices and the target is the index of the word that immediately follows.
seq_length = 5
num_samples = max(len(words) - seq_length, 0)
X_data = [
    [word_to_idx[w] for w in words[i:i + seq_length]]
    for i in range(num_samples)
]
y_data = [word_to_idx[words[i + seq_length]] for i in range(num_samples)]

# Convert to PyTorch tensors (integer indices; one-hot encoding is done later, per batch)
X_tensor = torch.tensor(X_data, dtype=torch.long)
y_tensor = torch.tensor(y_data, dtype=torch.long)

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Setup complete. Running on: {device}")

Loading data...
Dataset loaded successfully.
Total Words: 24734
Vocabulary Size: 6989
Setup complete. Running on: cpu


# **3 Part 1: One-Hot Encoding Approach**

**3.1 Preprocessing**

• Tokenize the text into words.

• Convert each word into a one-hot vector.

**3.2 Model Architecture**

• Use an RNN and LSTM model.

• The input should be one-hot encoded word sequences.

• Train the model to predict the next word in a sequence.

**3.3 Implementation Steps**

• Tokenize the dataset and create a vocabulary.

• Convert words into one-hot encoded vectors.

• Define an RNN model using PyTorch.

• Train the model using the dataset.

• Generate text using the trained model.

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Pick the compute device: CUDA when PyTorch reports it as available, otherwise CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Training on: {device}")

def train_model(model, X, y, epochs=20, lr=0.01, is_one_hot=False, batch_size=64):
    """Train a next-word model with Adam and cross-entropy loss.

    Relies on the module-level ``device`` (where the model and batches are
    placed) and ``vocab_size`` (width of the one-hot vectors).

    Args:
        model: Module whose forward pass returns ``(scores, hidden)``.
        X: Tensor of input word-index sequences.
        y: Tensor of target word indices, aligned with ``X``.
        epochs: Number of passes over the data.
        lr: Learning rate handed to Adam.
        is_one_hot: When True, each batch of indices is expanded to float
            one-hot vectors before being fed to the model.
        batch_size: Mini-batch size used by the DataLoader.

    Returns:
        The same ``model`` object after training.
    """
    model.to(device)
    loss_fn = nn.CrossEntropyLoss()
    adam = optim.Adam(model.parameters(), lr=lr)

    # Shuffled mini-batches over the (sequence, target) pairs.
    loader = DataLoader(TensorDataset(X, y), batch_size=batch_size, shuffle=True)
    num_batches = len(loader)

    for epoch_no in range(1, epochs + 1):
        model.train()
        running_loss = 0

        for inputs, targets in loader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            if is_one_hot:
                # Encode one batch at a time so the full one-hot dataset is
                # never materialised in memory.
                inputs = torch.nn.functional.one_hot(inputs, num_classes=vocab_size).float()

            adam.zero_grad()
            logits, _ = model(inputs)

            batch_loss = loss_fn(logits, targets)
            batch_loss.backward()
            adam.step()

            running_loss += batch_loss.item()

        # Report the mean per-batch loss on every fifth epoch only.
        if epoch_no % 5 == 0:
            print(f'Epoch {epoch_no}/{epochs}, Loss: {running_loss / num_batches:.4f}')

    return model

def generate_text(model, start_text, length=10, is_one_hot=False):
    """Greedily extend ``start_text`` by ``length`` predicted words.

    Relies on the module-level ``word_to_idx``, ``idx_to_word``,
    ``seq_length``, ``vocab_size`` and ``device``.

    Args:
        model: Trained module whose forward pass returns ``(scores, hidden)``.
        start_text: Seed phrase; it is lowercased and split on whitespace.
        length: Number of words to append.
        is_one_hot: When True, the index window is one-hot encoded before
            being passed to the model.

    Returns:
        The lowercased seed words followed by the generated words, joined
        with single spaces.
    """
    model.eval()
    seed_words = start_text.lower().split()

    # Words missing from the vocabulary fall back to index 0.
    window = [word_to_idx.get(token, 0) for token in seed_words]

    # Fit the window to exactly seq_length entries: left-pad with index 0
    # when the seed is short, otherwise keep only the most recent words.
    shortfall = seq_length - len(window)
    if shortfall > 0:
        window = [0] * shortfall + window
    else:
        window = window[-seq_length:]

    produced = list(seed_words)

    with torch.no_grad():
        for _ in range(length):
            batch = torch.tensor([window], dtype=torch.long).to(device)
            if is_one_hot:
                batch = torch.nn.functional.one_hot(batch, num_classes=vocab_size).float()

            scores, _ = model(batch)

            # Greedy decoding: always take the highest-scoring word.
            next_idx = torch.argmax(scores, dim=1).item()
            produced.append(idx_to_word[next_idx])

            # Slide the window forward by one word.
            window = (window + [next_idx])[1:]

    return " ".join(produced)

Training on: cpu


In [9]:
# --- Model Definitions: One-Hot ---
class RNN_OneHot(nn.Module):
    """Single-layer vanilla RNN next-word predictor fed with one-hot vectors.

    Args:
        vocab_size: Width of each input vector (the RNN input size).
        hidden_dim: Size of the recurrent hidden state.
        output_size: Number of scores produced by the final linear layer.
    """

    def __init__(self, vocab_size, hidden_dim, output_size):
        super().__init__()
        self.rnn = nn.RNN(input_size=vocab_size, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_size)

    def forward(self, x):
        """Score the next word from the last time step of the sequence.

        Args:
            x: Float input batch; with ``batch_first=True`` the RNN expects
                ``(batch, seq_len, vocab_size)``.

        Returns:
            Tuple ``(scores, hidden)``: the linear-layer output for the final
            time step and the RNN's final hidden state.
        """
        step_outputs, final_hidden = self.rnn(x)
        last_step = step_outputs[:, -1, :]
        return self.fc(last_step), final_hidden

class LSTM_OneHot(nn.Module):
    """Single-layer LSTM next-word predictor fed with one-hot vectors.

    Args:
        vocab_size: Width of each input vector (the LSTM input size).
        hidden_dim: Size of the LSTM hidden state.
        output_size: Number of scores produced by the final linear layer.
    """

    def __init__(self, vocab_size, hidden_dim, output_size):
        super().__init__()
        self.lstm = nn.LSTM(input_size=vocab_size, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_size)

    def forward(self, x):
        """Score the next word from the last time step of the sequence.

        Args:
            x: Float input batch; with ``batch_first=True`` the LSTM expects
                ``(batch, seq_len, vocab_size)``.

        Returns:
            Tuple ``(scores, hidden)``: the linear-layer output for the final
            time step and the LSTM's final hidden state. The cell state is
            discarded.
        """
        step_outputs, state = self.lstm(x)
        final_hidden, _cell = state
        last_step = step_outputs[:, -1, :]
        return self.fc(last_step), final_hidden

# --- Training (With Batching) ---
# Hyperparameters for the one-hot models. HIDDEN_DIM is passed as each model's
# hidden_dim; EPOCHS is forwarded to train_model as the number of epochs.
HIDDEN_DIM = 128
EPOCHS = 10

# vocab_size is used both as the input width (one-hot vector size) and as the
# output size (one score per vocabulary word).
# is_one_hot=True makes train_model one-hot encode each batch of indices.
print("Training RNN (One-Hot)...")
rnn_oh = RNN_OneHot(vocab_size, HIDDEN_DIM, vocab_size)
rnn_oh = train_model(rnn_oh, X_tensor, y_tensor, epochs=EPOCHS, is_one_hot=True)

# Same data and settings, with the LSTM variant.
print("\nTraining LSTM (One-Hot)...")
lstm_oh = LSTM_OneHot(vocab_size, HIDDEN_DIM, vocab_size)
lstm_oh = train_model(lstm_oh, X_tensor, y_tensor, epochs=EPOCHS, is_one_hot=True)

Training RNN (One-Hot)...
Epoch 5/10, Loss: 9.1683
Epoch 10/10, Loss: 8.8586

Training LSTM (One-Hot)...
Epoch 5/10, Loss: 0.9805
Epoch 10/10, Loss: 0.0191


# **4 Part 2: Trainable Word Embeddings Approach**

**4.1 Preprocessing**

• Tokenize the text into words.

• Convert each word into an index.

**4.2 Model Architecture**

• Use an embedding layer in the RNN model.

• Train the embedding layer along with the model.

• Predict the next word in a sequence.

**4.3 Implementation Steps**

1. Tokenize the dataset and create a vocabulary.

2. Convert words into indexed sequences.

3. Define an RNN model with an embedding layer using PyTorch.

4. Train the model and compare performance with the one-hot encoding
method.

5. Generate text using the trained model.

In [8]:
# --- Model Definitions: Embeddings ---
class RNN_Embedding(nn.Module):
    """Single-layer vanilla RNN next-word predictor with a trainable embedding.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector (the RNN input size).
        hidden_dim: Size of the recurrent hidden state.
        output_size: Number of scores produced by the final linear layer.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_size):
        super().__init__()
        # Lookup table that turns integer word indices into dense vectors.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.rnn = nn.RNN(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_size)

    def forward(self, x):
        """Score the next word from the last time step of the sequence.

        Args:
            x: Integer word indices; with ``batch_first=True`` the RNN
                expects the embedded batch as ``(batch, seq_len, embed_dim)``.

        Returns:
            Tuple ``(scores, hidden)``: the linear-layer output for the final
            time step and the RNN's final hidden state.
        """
        step_outputs, final_hidden = self.rnn(self.embedding(x))
        last_step = step_outputs[:, -1, :]
        return self.fc(last_step), final_hidden

class LSTM_Embedding(nn.Module):
    """Single-layer LSTM next-word predictor with a trainable embedding.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector (the LSTM input size).
        hidden_dim: Size of the LSTM hidden state.
        output_size: Number of scores produced by the final linear layer.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_size):
        super().__init__()
        # Lookup table that turns integer word indices into dense vectors.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_size)

    def forward(self, x):
        """Score the next word from the last time step of the sequence.

        Args:
            x: Integer word indices; with ``batch_first=True`` the LSTM
                expects the embedded batch as ``(batch, seq_len, embed_dim)``.

        Returns:
            Tuple ``(scores, hidden)``: the linear-layer output for the final
            time step and the LSTM's final hidden state. The cell state is
            discarded.
        """
        step_outputs, state = self.lstm(self.embedding(x))
        final_hidden, _cell = state
        last_step = step_outputs[:, -1, :]
        return self.fc(last_step), final_hidden

# --- Training ---
# Size of each learned word vector. HIDDEN_DIM is not defined in this cell; it
# is reused from the one-hot training cell, which must have been run first.
EMBED_DIM = 50

# is_one_hot=False: the integer index batches are passed to the model as-is
# and the model's own embedding layer converts them to dense vectors.
# NOTE(review): these runs use 20 epochs while the one-hot cell uses
# EPOCHS = 10, so the final losses are not directly comparable epoch-for-epoch.
print("Training RNN (Embeddings)...")
rnn_emb = RNN_Embedding(vocab_size, EMBED_DIM, HIDDEN_DIM, vocab_size)
rnn_emb = train_model(rnn_emb, X_tensor, y_tensor, epochs=20, is_one_hot=False)

# Same data and settings, with the LSTM variant.
print("\nTraining LSTM (Embeddings)...")
lstm_emb = LSTM_Embedding(vocab_size, EMBED_DIM, HIDDEN_DIM, vocab_size)
lstm_emb = train_model(lstm_emb, X_tensor, y_tensor, epochs=20, is_one_hot=False)

Training RNN (Embeddings)...
Epoch 5/20, Loss: 6.0397
Epoch 10/20, Loss: 4.6417
Epoch 15/20, Loss: 4.4079
Epoch 20/20, Loss: 4.1944

Training LSTM (Embeddings)...
Epoch 5/20, Loss: 2.4425
Epoch 10/20, Loss: 0.5441
Epoch 15/20, Loss: 0.4981
Epoch 20/20, Loss: 0.3876


**5 Comparison and Analysis**

• Compare the training time and loss for both methods.

• Evaluate the quality of generated text.

• Discuss the advantages and disadvantages of each approach.

In [12]:
# Test Prompt
# The seed has two words, fewer than seq_length (5), so generate_text left-pads
# the window with index 0. Any seed word missing from the vocabulary is also
# mapped to index 0.
seed_text = "hello india"

print(f"Seed Text: '{seed_text}'\n")
print("-" * 50)

# Generate text with One-Hot models
# Each call uses generate_text's default length, appending 10 greedily chosen words.
print("RNN (One-Hot):    ", generate_text(rnn_oh, seed_text, is_one_hot=True))
print("LSTM (One-Hot):   ", generate_text(lstm_oh, seed_text, is_one_hot=True))

print("-" * 50)

# Generate text with Embedding models
# is_one_hot=False: the raw index window is passed straight to the model.
print("RNN (Embeddings): ", generate_text(rnn_emb, seed_text, is_one_hot=False))
print("LSTM (Embeddings):", generate_text(lstm_emb, seed_text, is_one_hot=False))

Seed Text: 'hello india'

--------------------------------------------------
RNN (One-Hot):     hello india natural natural natural natural natural natural natural natural natural natural
LSTM (One-Hot):    hello india pleasure and the printing-office boy? the well-taken photographs—but your wife
--------------------------------------------------
RNN (Embeddings):  hello india that’s boy the first, not then had did lovely heat
LSTM (Embeddings): hello india winds blow round the make toward the foot of the
