In [1]:
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
import requests

import torch.nn as nn

In [2]:
# Step 1: Download and prepare Shakespeare data
def get_shakespeare_data():
    """Download the Tiny Shakespeare corpus used by the char-rnn project.

    Returns:
        str: The full body of the downloaded text file.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or if the request
            exceeds the timeout.
    """
    url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of returning an HTML error page as "training text".
    response.raise_for_status()
    return response.text

In [3]:
# Step 2: Create a character-level tokenizer
class TextTokenizer:
    """Character-level tokenizer built from the distinct characters of a text.

    Attributes are populated in the constructor:
        chars: sorted list of the unique characters found in the text.
        char_to_idx: character -> integer id (position in ``chars``).
        idx_to_char: integer id -> character.
        vocab_size: number of unique characters.
    """

    def __init__(self, text):
        # The vocabulary is every distinct character, in sorted order.
        self.chars = sorted(set(text))
        # Forward and reverse lookup tables share the same numbering.
        self.char_to_idx = dict(zip(self.chars, range(len(self.chars))))
        self.idx_to_char = dict(enumerate(self.chars))
        self.vocab_size = len(self.chars)

    def encode(self, string):
        """Map each character of ``string`` to its integer id.

        Raises KeyError for a character that is not in the vocabulary.
        """
        lookup = self.char_to_idx
        ids = []
        for symbol in string:
            ids.append(lookup[symbol])
        return ids

    def decode(self, indices):
        """Map a sequence of integer ids back to the string they spell.

        Raises KeyError for an id that is not in the vocabulary.
        """
        lookup = self.idx_to_char
        return ''.join(lookup[token_id] for token_id in indices)

In [4]:
# Step 3: Create a Dataset class
class ShakespeareDataset(Dataset):
    """Map-style dataset of fixed-length character windows for next-char prediction.

    Each item is a pair ``(sequence, target)`` of ``LongTensor``s of length
    ``sequence_length``; ``target`` is ``sequence`` shifted one position to
    the right in the encoded text.

    Args:
        text: Raw text; it is also used to build the tokenizer vocabulary.
        sequence_length: Number of characters per window (must be positive).

    Raises:
        ValueError: If ``sequence_length`` is not positive.
    """

    def __init__(self, text, sequence_length=100):
        if sequence_length <= 0:
            raise ValueError(
                f"sequence_length must be positive, got {sequence_length}"
            )
        self.sequence_length = sequence_length
        self.text = text
        self.tokenizer = TextTokenizer(text)
        self.encoded = self.tokenizer.encode(text)

    def __len__(self):
        # Measure the encoded sequence (the thing that is actually sliced) and
        # clamp at zero: a text shorter than one window yields an empty
        # dataset rather than a negative length, which len() rejects.
        return max(0, len(self.encoded) - self.sequence_length)

    def __getitem__(self, idx):
        """Return the ``(sequence, target)`` pair starting at position ``idx``.

        Negative indices count from the end. Raises IndexError when ``idx`` is
        out of range, which also lets plain ``for item in dataset`` terminate.
        """
        size = len(self)
        if idx < 0:
            idx += size
        if not 0 <= idx < size:
            raise IndexError(
                f"index out of range for dataset with {size} samples"
            )
        # Input window and the same window shifted by one (next character).
        sequence = self.encoded[idx:idx + self.sequence_length]
        target = self.encoded[idx + 1:idx + self.sequence_length + 1]
        return torch.LongTensor(sequence), torch.LongTensor(target)

In [5]:
# Step 4: Define the RNN model
class SimpleRNN(nn.Module):
    """Embedding -> single-layer RNN -> linear head over the vocabulary.

    Args:
        vocab_size: Number of distinct tokens (embedding rows and output width).
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the RNN hidden state.
    """

    def __init__(self, vocab_size, embedding_dim=256, hidden_dim=256):
        super().__init__()
        # Token ids -> dense vectors.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim
        )
        # Recurrent core; batch_first so inputs are (batch, seq, feature).
        self.rnn = nn.RNN(
            input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True
        )
        # Projects every hidden state to one score per vocabulary entry.
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x, hidden=None):
        """Run the network on a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, batch dimension first.
            hidden: Optional initial hidden state passed straight to the RNN.

        Returns:
            Tuple ``(scores, hidden)``: per-position unnormalised scores over
            the vocabulary, and the RNN's final hidden state.
        """
        rnn_out, next_hidden = self.rnn(self.embedding(x), hidden)
        return self.fc(rnn_out), next_hidden

In [6]:
# Step 5: Training function
def train_model(model, dataset, epochs=10, batch_size=64, learning_rate=0.001):
    """Train ``model`` on ``dataset`` with Adam and cross-entropy loss.

    The model is moved to CUDA when available (otherwise CPU) and updated in
    place. Progress is printed every 100 batches and once per epoch.

    Args:
        model: Module whose call returns ``(logits, hidden)``; the class
            dimension of ``logits`` must be last.
        dataset: Map-style dataset yielding ``(inputs, targets)`` pairs of
            integer tensors.
        epochs: Number of full passes over the dataset.
        batch_size: Samples per optimisation step.
        learning_rate: Adam learning rate.

    Raises:
        ValueError: If ``dataset`` is empty.
    """
    # An empty dataset would otherwise surface as an opaque ZeroDivisionError
    # (or a sampler error) further down.
    if len(dataset) == 0:
        raise ValueError("train_model() received an empty dataset")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # Create DataLoader for batch processing
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    num_batches = len(dataloader)

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for batch_idx, (inputs, targets) in enumerate(dataloader):
            inputs, targets = inputs.to(device), targets.to(device)

            # Forward pass
            outputs, _ = model(inputs)
            # Flatten (batch, seq, classes) -> (batch*seq, classes). The class
            # count comes from the logits themselves, so the dataset does not
            # need a `.tokenizer` attribute.
            loss = criterion(
                outputs.reshape(-1, outputs.size(-1)), targets.reshape(-1)
            )

            # Backward pass and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Read the scalar once and reuse it for the total and the log line.
            batch_loss = loss.item()
            total_loss += batch_loss

            if batch_idx % 100 == 0:
                print(f'Epoch {epoch+1}, Batch {batch_idx}, Loss: {batch_loss:.4f}')

        print(f'Epoch {epoch+1} completed, Average Loss: {total_loss/num_batches:.4f}')

In [7]:
# Step 6: Text generation function
def generate_text(model, tokenizer, seed_text, length=200, temperature=0.8):
    """Sample ``length`` characters from ``model``, continuing ``seed_text``.

    The seed is fed through the model once; after that only the newly sampled
    character is fed in and the recurrent hidden state is carried forward.
    The model therefore conditions on everything generated so far, not on a
    window as wide as the seed, and each step processes a single token.

    Args:
        model: Module called as ``model(ids, hidden)`` that returns
            ``(logits, hidden)`` with logits shaped (batch, seq, vocab).
            It is switched to eval mode.
        tokenizer: Object providing ``encode(str) -> list[int]`` and
            ``decode(list[int]) -> str``.
        seed_text: Non-empty prompt; every character must be encodable.
        length: Number of characters to generate.
        temperature: Positive softmax temperature; lower is more conservative.

    Returns:
        str: ``seed_text`` followed by the generated characters.

    Raises:
        ValueError: If ``seed_text`` is empty or ``temperature`` is not positive.
    """
    if not seed_text:
        raise ValueError("seed_text must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Run on whatever device the model lives on; guessing from CUDA
    # availability breaks for a CPU model on a CUDA-capable machine.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device('cpu')

    model.eval()
    pieces = [seed_text]

    with torch.no_grad():
        # Prime the network with the whole seed on the first step.
        step_input = torch.tensor(
            [tokenizer.encode(seed_text)], dtype=torch.long, device=device
        )
        hidden = None
        for _ in range(length):
            output, hidden = model(step_input, hidden)

            # Apply temperature to adjust randomness, then sample.
            probs = (output[0, -1] / temperature).softmax(dim=-1)
            next_char_idx = torch.multinomial(probs, 1).item()

            pieces.append(tokenizer.decode([next_char_idx]))
            # Only the new character is fed next; the context is in `hidden`.
            step_input = torch.tensor(
                [[next_char_idx]], dtype=torch.long, device=device
            )

    return ''.join(pieces)

In [8]:
# Step 7: Main execution
def main():
    """Run the whole pipeline: download, build dataset, train, then sample text."""
    # Fetch the corpus.
    print("Downloading Shakespeare text...")
    corpus = get_shakespeare_data()

    # Wrap it in a dataset; the dataset builds the tokenizer itself.
    print("Creating dataset...")
    char_dataset = ShakespeareDataset(corpus)
    tokenizer = char_dataset.tokenizer

    # The model's output width is the tokenizer's vocabulary size.
    print("Initializing model...")
    rnn_model = SimpleRNN(tokenizer.vocab_size)

    # Fit for five epochs.
    print("Training model...")
    train_model(rnn_model, char_dataset, epochs=5)

    # Sample a continuation of a fixed prompt and show it.
    print("\nGenerating text...")
    sample = generate_text(rnn_model, tokenizer, "To be or not to be")
    print(f"\nGenerated text:\n{sample}")

if __name__ == "__main__":
    main()

Downloading Shakespeare text...
Creating dataset...
Initializing model...
Training model...
Epoch 1, Batch 0, Loss: 4.2126
Epoch 1, Batch 100, Loss: 2.0825
Epoch 1, Batch 200, Loss: 1.8989
Epoch 1, Batch 300, Loss: 1.7842
Epoch 1, Batch 400, Loss: 1.7019
Epoch 1, Batch 500, Loss: 1.6748
Epoch 1, Batch 600, Loss: 1.6143
Epoch 1, Batch 700, Loss: 1.6090
Epoch 1, Batch 800, Loss: 1.6027
Epoch 1, Batch 900, Loss: 1.5451
Epoch 1, Batch 1000, Loss: 1.5328
Epoch 1, Batch 1100, Loss: 1.5413
Epoch 1, Batch 1200, Loss: 1.5318
Epoch 1, Batch 1300, Loss: 1.4999
Epoch 1, Batch 1400, Loss: 1.5014
Epoch 1, Batch 1500, Loss: 1.4462
Epoch 1, Batch 1600, Loss: 1.4818
Epoch 1, Batch 1700, Loss: 1.4752
Epoch 1, Batch 1800, Loss: 1.4756
Epoch 1, Batch 1900, Loss: 1.4748
Epoch 1, Batch 2000, Loss: 1.4046
Epoch 1, Batch 2100, Loss: 1.4100
Epoch 1, Batch 2200, Loss: 1.4579
Epoch 1, Batch 2300, Loss: 1.4027
Epoch 1, Batch 2400, Loss: 1.4286
Epoch 1, Batch 2500, Loss: 1.4242
Epoch 1, Batch 2600, Loss: 1.4664
Ep