In [4]:
import collections
import re
import torch
from torch import nn
import requests
import math
from torch.utils.data import Dataset, DataLoader

In [5]:
def load_and_preproccess(
    url="http://d2l-data.s3-accelerate.amazonaws.com/timemachine.txt", timeout=10.0
):
    """Download a text corpus and normalise it to lowercase letters and spaces.

    Args:
        url: Location of the raw text file to download.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook forever.

    Returns:
        The downloaded text, lowercased, with every run of non-letter
        characters collapsed into a single space. If the download fails the
        same normalisation is applied to the fallback string "Placeholder".
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Without this, a 4xx/5xx error page would silently become the corpus.
        response.raise_for_status()
        raw_text = response.text
    except requests.RequestException as exc:
        # Best-effort fallback is kept, but only for network/HTTP failures;
        # KeyboardInterrupt and programming errors are no longer swallowed.
        print(f"Error downloading text: {exc}")
        raw_text = "Placeholder"
    return re.sub("[^A-Za-z]+", " ", raw_text).lower()

In [6]:
def tokenize_char(text):
    """Split ``text`` into a list of its individual characters."""
    return [char for char in text]

In [None]:
class Vocab:
    """Bidirectional mapping between tokens and integer indices.

    Attributes are built once in ``__init__``:
      * ``token_freq``   - ``(token, count)`` pairs sorted by descending count.
      * ``idx_to_token`` - sorted list of every kept token; position == index.
      * ``token_to_idx`` - inverse lookup of ``idx_to_token``.
    """

    def __init__(self, tokens=(), min_freq=0, reserved=("<unk>",)):
        """Build the vocabulary.

        Args:
            tokens: Flat sequence of tokens, or a list of lists of tokens
                (one inner list per line), which is flattened first.
            min_freq: Tokens seen fewer than this many times are left out.
            reserved: Extra tokens that are always part of the vocabulary.
                "<unk>" is always added as well, because ``__getitem__``
                falls back to it for unknown tokens.
        """
        if tokens and isinstance(tokens[0], list):
            # Flatten tokens that arrived as nested per-line lists.
            tokens = [token for line in tokens for token in line]

        counter = collections.Counter(tokens)
        # Most frequent tokens first.
        self.token_freq = sorted(counter.items(), key=lambda x: x[1], reverse=True)

        # "<unk>" is forced in so that the `unk` property can never KeyError,
        # even when the caller supplies their own `reserved` list.
        unique_tokens = {"<unk>", *reserved}
        unique_tokens.update(
            token for token, freq in self.token_freq if freq >= min_freq
        )

        self.idx_to_token = sorted(unique_tokens)
        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)}

    def __len__(self):
        """Return the number of distinct tokens, reserved ones included."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token, or a (possibly nested) list/tuple of tokens, to indices.

        Tokens that are not in the vocabulary map to the index of "<unk>".
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index, or a sequence of indices, back to tokens.

        Args:
            indices: A single integer-like index, or a list/tuple/tensor of
                them.

        Returns:
            A list of tokens for list/tuple input of any length (including
            empty and single-element) and for other sized containers whose
            length is not 1. A single token for scalar input, and for sized
            non-list containers holding exactly one element, which keeps the
            historical behaviour for one-element tensors.
        """
        if isinstance(indices, (list, tuple)):
            return [self.idx_to_token[int(index)] for index in indices]
        try:
            length = len(indices)
        except TypeError:
            # Plain ints and 0-d tensors/arrays have no usable length.
            length = None
        if length is not None and length != 1:
            return [self.idx_to_token[int(index)] for index in indices]
        return self.idx_to_token[int(indices)]

    @property
    def unk(self):
        """Index of the "<unk>" token."""
        return self.token_to_idx["<unk>"]

In [None]:
def build_corpus_and_vocab(raw_text):
    """Tokenise ``raw_text`` by character and index it.

    Returns a ``(corpus, vocab)`` pair: ``vocab`` is the ``Vocab`` built from
    the character tokens and ``corpus`` is the whole text converted to a 1-D
    ``torch.long`` tensor of vocabulary indices.
    """
    char_tokens = tokenize_char(raw_text)
    vocab = Vocab(char_tokens)
    token_ids = vocab[char_tokens]
    return torch.tensor(token_ids, dtype=torch.long), vocab

In [29]:
class TextDataset(Dataset):
    """Sliding-window next-token dataset over an indexed corpus.

    Item ``i`` is the pair ``(x, y)`` where ``x`` is ``seq_length`` consecutive
    corpus entries starting at ``i`` and ``y`` is the same window shifted one
    position to the right.
    """

    def __init__(self, corpus, seq_length):
        """Store the corpus (a sliceable sequence) and the window length."""
        self.corpus = corpus
        self.seq_length = seq_length

    def __len__(self):
        """Number of complete (x, y) windows; 0 when the corpus is too short."""
        # Clamp at 0: a negative value makes len() raise ValueError.
        return max(0, len(self.corpus) - self.seq_length)

    def __getitem__(self, index):
        """Return the input window at ``index`` and its one-step-shifted target.

        Negative indices count from the end. An out-of-range index raises
        IndexError instead of returning truncated or empty slices; this also
        lets plain ``for x, y in dataset`` iteration terminate.
        """
        size = len(self)
        if index < 0:
            index += size
        if not 0 <= index < size:
            raise IndexError(f"index out of range for dataset of size {size}")

        x = self.corpus[index : index + self.seq_length]
        y = self.corpus[index + 1 : index + self.seq_length + 1]

        return x, y

In [None]:
class RNN_OneHot(nn.Module):
    """Character-level RNN language model fed with one-hot encoded indices.

    Pipeline: token index -> one-hot vector -> linear projection to
    ``hidden_size`` -> ``nn.RNN`` (batch_first) -> linear projection to
    ``vocab_size`` logits for every time step.
    """

    def __init__(self, vocab_size, hidden_size, num_layers=1):
        """Create the layers.

        Args:
            vocab_size: Number of distinct tokens; width of the one-hot input
                and of the output logits.
            hidden_size: Width of the projected input and of the RNN state.
            num_layers: Number of stacked RNN layers.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.vocab_size = vocab_size

        # Linear layer applied to the one-hot vectors before the RNN.
        self.input_layer = nn.Linear(vocab_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, state=None):
        """Run the model on a batch of index sequences.

        Args:
            x: Integer tensor of shape ``(batch, seq_len)`` holding indices in
                ``[0, vocab_size)``.
            state: Optional initial hidden state, passed straight to ``nn.RNN``.

        Returns:
            ``(logits, state)`` where ``logits`` has shape
            ``(batch, seq_len, vocab_size)`` and ``state`` is the final hidden
            state returned by the RNN.

        Raises:
            ValueError: If ``x`` is not 2-dimensional.
        """
        if x.dim() != 2:
            raise ValueError(
                f"expected x of shape (batch, seq_len), got {tuple(x.shape)}"
            )

        # Map each index to a one-hot row, e.g. 0 => [1, 0, 0, ...].
        one_hot = nn.functional.one_hot(x, num_classes=self.vocab_size).float()

        output, state = self.rnn(self.input_layer(one_hot), state)
        return self.fc(output), state

    def init_hidden(self, batch_size, device):
        """Return an all-zero hidden state of shape (num_layers, batch, hidden)."""
        # Allocate directly on the target device instead of CPU-then-copy.
        return torch.zeros(
            self.num_layers, batch_size, self.hidden_size, device=device
        )


In [None]:
from tqdm import tqdm, trange


def train_epoch(model, dataloader, criterion, optimizer, device, max_grad_norm=1.0):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: Module exposing ``init_hidden(batch_size, device)`` and returning
            ``(logits, state)`` when called as ``model(x, hidden)``.
        dataloader: Iterable of ``(x, y)`` batches of token indices.
        criterion: Loss taking ``(N, vocab)`` logits and ``(N,)`` targets.
        optimizer: Optimizer over ``model.parameters()``.
        device: Device the batches and hidden state are placed on.
        max_grad_norm: Gradients are rescaled so that their total norm does
            not exceed this value before each optimizer step.

    Returns:
        The loss averaged over all tokens seen in the epoch, or NaN when the
        dataloader produced no batches.
    """
    total_loss = 0.0
    total_tokens = 0

    # Wrap the dataloader in a progress bar.
    loop = tqdm(dataloader, desc="Training")

    model.train()
    for x, y in loop:
        x, y = x.to(device), y.to(device)

        # A fresh zero state for every batch: this function does not carry
        # hidden state from one batch to the next.
        hidden = model.init_hidden(x.size(0), device)

        output, _ = model(x, hidden)

        # Flatten (batch, seq, vocab) -> (batch * seq, vocab) for the loss.
        loss = criterion(output.reshape(-1, output.size(-1)), y.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()

        # .item() forces a device sync, so read it once and reuse the value.
        batch_loss = loss.item()
        num_tokens = x.numel()
        total_loss += batch_loss * num_tokens
        total_tokens += num_tokens

        loop.set_postfix(loss=batch_loss)

    if total_tokens == 0:
        # No batches at all (e.g. drop_last with too little data): the mean
        # over nothing is reported as NaN rather than a ZeroDivisionError.
        return float("nan")
    return total_loss / total_tokens

In [32]:
def predict(model, vocab, start_text, num_chars, device, temperature=1.0):
    """Generate text by sampling from the model one character at a time.

    Args:
        model: Module exposing ``init_hidden(batch_size, device)`` and returning
            ``(logits, state)`` when called as ``model(x, hidden)``.
        vocab: Object mapping a token list to indices via ``vocab[tokens]`` and
            an index back to a token via ``vocab.to_tokens(idx)``.
        start_text: Prompt; it is lowercased before being tokenised. Must
            contain at least one character.
        num_chars: Number of characters to generate after the prompt.
        device: Device on which tensors are created.
        temperature: Logits are divided by this value before the softmax. A
            value of exactly 0 selects the most likely character at each step
            (greedy decoding) instead of sampling.

    Returns:
        ``start_text`` (as given) followed by the generated characters.

    Raises:
        ValueError: If ``start_text`` is empty, since there is then no
            character to feed the model as its first input.
    """
    tokens = tokenize_char(start_text.lower())
    if not tokens:
        raise ValueError("start_text must contain at least one character")

    model.eval()

    indices = torch.tensor([vocab[tokens]], dtype=torch.long, device=device)
    hidden = model.init_hidden(1, device)

    generated = []
    with torch.no_grad():
        # Warm up the hidden state on every prompt character except the last,
        # in a single forward pass rather than one call per character.
        if indices.size(1) > 1:
            _, hidden = model(indices[:, :-1], hidden)

        # The last prompt character is the first input of the generation loop.
        next_input = indices[:, -1:]
        for _ in range(num_chars):
            output, hidden = model(next_input, hidden)
            logits = output[0, -1]

            if temperature == 0:
                # Greedy decoding; avoids dividing the logits by zero.
                next_input = logits.argmax().view(1, 1)
            else:
                probs = torch.softmax(logits / temperature, dim=0)
                next_input = torch.multinomial(probs, 1).view(1, 1)

            generated.append(vocab.to_tokens(next_input.item()))

    return start_text + "".join(generated)

In [None]:
# Hyperparameters
seq_length = 35  # number of characters in each training window
batch_size = 512
hidden_size = 256
num_layers = 2
num_epochs = 10
learning_rate = 0.001
log_every = 5  # print metrics and a generated sample every this many epochs
seed = 42

# Seed PyTorch's random number generators so that weight initialisation,
# DataLoader shuffling and text sampling repeat across Restart & Run All.
# NOTE: some GPU kernels may still be non-deterministic.
torch.manual_seed(seed)

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load and preprocess data
print("Loading data...")
text = load_and_preproccess()
corpus, vocab = build_corpus_and_vocab(text)

print(f"Corpus length: {len(corpus)}")
print(f"Vocabulary size: {len(vocab)}")

# With drop_last=True at least `batch_size` complete windows are needed to get
# a single batch. Fail here with a clear message instead of deep inside the
# DataLoader or the training loop (e.g. when the download fell back to the
# placeholder text).
num_windows = len(corpus) - seq_length
if num_windows < batch_size:
    raise ValueError(
        f"Corpus of {len(corpus)} tokens is too short for seq_length={seq_length} "
        f"and batch_size={batch_size}; the download probably failed."
    )

# Create dataset and dataloader
dataset = TextDataset(corpus, seq_length)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)

# Initialize model, loss and optimizer
model = RNN_OneHot(len(vocab), hidden_size, num_layers).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
print("\nStarting training...")
for epoch in trange(num_epochs, desc="Epochs"):
    avg_loss = train_epoch(model, dataloader, criterion, optimizer, device)
    # Perplexity is the exponential of the average per-token loss.
    perplexity = math.exp(avg_loss)
    if (epoch + 1) % log_every == 0:
        print(
            f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}, Perplexity: {perplexity:.2f}"
        )

        # Generate sample text
        sample = predict(
            model, vocab, "the time traveller ", 100, device, temperature=0.8
        )
        print(f"Sample: {sample}\n")

# Final text generation
print("\n" + "=" * 80)
print("Final Generated Texts:")
print("=" * 80)

prompts = ["the time traveller ", "it was ", "the machine "]
for prompt in prompts:
    generated = predict(
        model, vocab, prompt, num_chars=50, device=device, temperature=0.8
    )
    print(f"\nPrompt: '{prompt}'")
    print(f"Generated: {generated}")
    print("-" * 80)

Using device: cuda
Loading data...
Corpus length: 173428
Vocabulary size: 28

Starting training...


Training: 100%|██████████| 338/338 [00:07<00:00, 43.90it/s, loss=1.52]
Training: 100%|██████████| 338/338 [00:07<00:00, 44.61it/s, loss=1.27]
Training: 100%|██████████| 338/338 [00:08<00:00, 40.07it/s, loss=1.13]
Training: 100%|██████████| 338/338 [00:07<00:00, 45.73it/s, loss=1.04]
Training: 100%|██████████| 338/338 [00:07<00:00, 44.64it/s, loss=0.962]
Epochs:  50%|█████     | 5/10 [00:38<00:38,  7.73s/it]

Epoch [5/10], Loss: 0.9957, Perplexity: 2.71
Sample: the time traveller disk vanishing the lamp and i shall be hand found that a least we came to a stood looking at himself



Training: 100%|██████████| 338/338 [00:07<00:00, 44.84it/s, loss=0.886]
Training: 100%|██████████| 338/338 [00:08<00:00, 41.16it/s, loss=0.836]
Training: 100%|██████████| 338/338 [00:07<00:00, 44.04it/s, loss=0.792]
Training: 100%|██████████| 338/338 [00:07<00:00, 44.72it/s, loss=0.761]
Training: 100%|██████████| 338/338 [00:07<00:00, 45.06it/s, loss=0.716]
Epochs: 100%|██████████| 10/10 [01:17<00:00,  7.74s/it]


Epoch [10/10], Loss: 0.7413, Perplexity: 2.10
Sample: the time traveller and to the bronze penerally adapted the editor econom covered from for an overcame me it by my ratis


Final Generated Texts:

Prompt: 'the time traveller '
Generated: the time traveller and there and cut off was blown our revivenient pa
--------------------------------------------------------------------------------

Prompt: 'it was '
Generated: it was hiddled me again to my own interminute seeliking a
--------------------------------------------------------------------------------

Prompt: 'the machine '
Generated: the machine lit herments return away of looking at time and pa
--------------------------------------------------------------------------------
