# Character-Level Language Modeling with LSTM in PyTorch

In this notebook, we will:
- Load and preprocess Shakespeare's complete works at the character level (truncated to the first 20,000 characters by default so training stays fast).
- Implement a simple LSTM-based model to predict the next character.
- Train the model and generate new text by sampling from it.

In [None]:
# Load Shakespeare from Project Gutenberg
import requests
import re

url = "https://www.gutenberg.org/cache/epub/100/pg100.txt"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being treated as the corpus.
response = requests.get(url, timeout=30)
response.raise_for_status()
text = response.text

# Extract main content
start_marker = "THE SONNETS"
end_marker = "End of the Project Gutenberg EBook"
start_idx = text.find(start_marker)
if start_idx == -1:
    # str.find returns -1 when the marker is absent; slicing from -1 would
    # silently leave (almost) nothing to train on.
    raise ValueError(f"Start marker {start_marker!r} not found in downloaded text")
# Only look for the end marker after the start of the content.
end_idx = text.find(end_marker, start_idx)
if end_idx == -1:
    # Marker absent (the footer wording may differ between editions of the
    # file): keep everything to the end instead of slicing with -1, which
    # would just drop the final character.
    end_idx = len(text)
text = text[start_idx:end_idx]

# Normalize whitespace: collapse every run of whitespace (including newlines)
# into a single space.
text = re.sub(r'\s+', ' ', text)

# Optionally truncate for faster training (set max_chars to None to use
# the full extracted text).
max_chars = 20000
if max_chars is not None:
    text = text[:max_chars]

# Build vocabulary: one entry per distinct character, in sorted order so the
# index assignment is deterministic.
chars = sorted(set(text))
vocab_size = len(chars)
print(f"Unique characters: {vocab_size}\n{chars}")

# Mappings between characters and integer ids (inverse of each other).
char_to_idx = {ch: i for i, ch in enumerate(chars)}
idx_to_char = {i: ch for ch, i in char_to_idx.items()}

# Encode entire text as a list of integer ids.
encoded = [char_to_idx[c] for c in text]

Unique characters: 72
[' ', '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '—', '‘', '’']


## Create input/output sequences

In [None]:
import torch

seq_length = 100
step = 1

# Start offsets of every training window. The upper bound leaves room for the
# one-character look-ahead needed by the target window.
window_starts = range(0, len(encoded) - seq_length, step)

# Each target window is its input window shifted one position to the right,
# so position t of the target is the character that follows position t of
# the input.
inputs = [encoded[s:s + seq_length] for s in window_starts]
targets = [encoded[s + 1:s + seq_length + 1] for s in window_starts]

# Convert to tensors
X = torch.tensor(inputs, dtype=torch.long)
y = torch.tensor(targets, dtype=torch.long)
print(X.shape, y.shape)

torch.Size([19900, 100]) torch.Size([19900, 100])


## Define the LSTM model

In [None]:
import torch.nn as nn

class CharRNN(nn.Module):
    """Character-level sequence model: embedding -> LSTM -> linear projection.

    Args:
        vocab_size: Number of distinct token ids; also the size of the output
            logit vector at each position.
        hidden_size: Width of both the embedding vectors and the LSTM state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, hidden_size, num_layers=1):
        super(CharRNN, self).__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=hidden_size,
        )
        # batch_first=True: the LSTM takes and returns (batch, seq, feature).
        self.lstm = nn.LSTM(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x, hidden=None):
        """Return per-position logits and the updated LSTM state.

        Args:
            x: Integer token ids, batch dimension first.
            hidden: Optional LSTM state to resume from; ``None`` lets the
                LSTM start from its default initial state.

        Returns:
            A ``(logits, hidden)`` pair, where ``logits`` has one
            ``vocab_size``-wide vector per input position and ``hidden`` is
            the state returned by the LSTM.
        """
        embedded = self.embedding(x)
        lstm_out, hidden = self.lstm(embedded, hidden)
        return self.fc(lstm_out), hidden

## Train the model

In [None]:
from torch.utils.data import TensorDataset, DataLoader

# Model, loss, and optimizer.
model = CharRNN(vocab_size=vocab_size, hidden_size=256)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)

# Create dataset and DataLoader for batching; batches are reshuffled each epoch.
batch_size = 64
dataset = TensorDataset(X, y)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0
    for batch_inputs, batch_targets in loader:
        optimizer.zero_grad()
        logits, _ = model(batch_inputs)
        # CrossEntropyLoss wants (N, classes) logits and (N,) targets, so the
        # batch and sequence dimensions are merged into one.
        loss = criterion(
            logits.reshape(-1, vocab_size),
            batch_targets.reshape(-1),
        )
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Mean of the per-batch losses over the epoch.
    avg_loss = running_loss / len(loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

Epoch 1/10, Loss: 1.4149
Epoch 2/10, Loss: 0.3483
Epoch 3/10, Loss: 0.1639
Epoch 4/10, Loss: 0.1367
Epoch 5/10, Loss: 0.1253
Epoch 6/10, Loss: 0.1188
Epoch 7/10, Loss: 0.1143
Epoch 8/10, Loss: 0.1106
Epoch 9/10, Loss: 0.1080
Epoch 10/10, Loss: 0.1060


## Generate text from the model

In [None]:
import numpy as np

def generate(model, start_seq, length):
    """Sample ``length`` characters from ``model`` as a continuation of ``start_seq``.

    The prompt is fed to the model once to build up the recurrent state. After
    that, each step feeds only the most recently sampled character together
    with the carried-over ``hidden`` state. The state already summarises
    everything seen so far, so re-feeding earlier characters alongside it
    would make the model read the same context more than once.

    Relies on the notebook-level ``char_to_idx`` / ``idx_to_char`` mappings.

    Args:
        model: Module called as ``model(input_ids, hidden)`` and returning
            ``(logits, hidden)`` with logits indexed as
            ``[batch, position, vocab]``.
        start_seq: Non-empty prompt string; every character must be a key of
            ``char_to_idx``.
        length: Number of characters to sample.

    Returns:
        ``start_seq`` followed by the ``length`` sampled characters.

    Raises:
        ValueError: If ``start_seq`` is empty while ``length`` is positive
            (there is nothing to condition the first prediction on).
        KeyError: If ``start_seq`` contains a character missing from
            ``char_to_idx``.
    """
    if length > 0 and not start_seq:
        raise ValueError("start_seq must contain at least one character")

    model.eval()

    # Build inputs on the same device as the model so generation also works
    # when the model has been moved off the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    input_seq = torch.tensor(
        [[char_to_idx[c] for c in start_seq]], dtype=torch.long, device=device
    )
    pieces = [start_seq]
    hidden = None
    with torch.no_grad():
        for _ in range(length):
            output, hidden = model(input_seq, hidden)
            # Only the prediction at the last position is the next character.
            last_logits = output[0, -1]
            p = torch.softmax(last_logits, dim=0).cpu().numpy()
            next_idx = int(np.random.choice(len(p), p=p))
            pieces.append(idx_to_char[next_idx])
            # The history lives in `hidden`; feed just the new character.
            input_seq = torch.tensor([[next_idx]], dtype=torch.long, device=device)
    return "".join(pieces)

print(generate(model, start_seq="To be", length=500))

To be noth lire in this pence on the trese high dead, By boare window, And windows distress wilt abudly die. but not waid but not wail doth sweets wanted looks nature’s etcreding sight: For then in my thie thought beauthe or eyed, Doth rient ten fore-bed, A wan I for recoment of trusbed ear, Mine wellate widow well wayth shame And baren were it wees your life? For beauty o’er in hide on the resposed with adonous sweets want all her fave thine with beauteous rose desired not live betual falst prove no


### Exercises

1. Train on the full corpus.
2. Improve the generation quality:
   - Add temperature sampling.
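   - Use a deeper (multi-layer) LSTM. Note that a bidirectional LSTM is not suitable for next-character generation as-is, because the backward direction would see the characters the model is supposed to predict.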
   - Add dropout or layer normalization.
3. Explore variants:
   - Replace LSTM with GRU.
   - Add attention.