# Generation using RNNs

We are trying to generate syntactically correct Roman Numbers.

For the task, we use a LSTM Network

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import torch.nn.functional as F

In [4]:
# Utility classes and functions

roman_map = {
    1: "I", 4: "IV", 5: "V", 9: "IX", 10: "X",
    40: "XL", 50: "L", 90: "XC", 100: "C",
    400: "CD", 500: "D", 900: "CM", 1000: "M"
}


def decimal_to_roman(num):
    roman = ""
    for value, symbol in sorted(roman_map.items(), key=lambda x: -x[0]):
        while num >= value:
            roman += symbol
            num -= value
    return roman


def roman_to_decimal(roman):
    roman_map = {
        'I': 1, 'V': 5, 'X': 10, 'L': 50,
        'C': 100, 'D': 500, 'M': 1000
    }
    total = 0
    prev_value = 0

    for char in reversed(roman):
        current_value = roman_map[char]
        if current_value >= prev_value:
            total += current_value
        else:
            total -= current_value
        prev_value = current_value

    return total


class RomanDataset(Dataset):
    def __init__(self, max_number=3999):
        self.data = [decimal_to_roman(i) for i in range(1, max_number + 1)]
        self.char_to_idx = {char: idx + 1 for idx, char in enumerate("IVXLCDM")}  # +1 for padding
        self.char_to_idx["<START>"] = len(self.char_to_idx) + 1
        self.char_to_idx["<END>"] = len(self.char_to_idx) + 1
        self.idx_to_char = {idx: char for char, idx in self.char_to_idx.items()}
        self.pad_idx = 0

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        roman = self.data[idx]
        tokens = [self.char_to_idx["<START>"]] + [self.char_to_idx[char] for char in roman] + [
            self.char_to_idx["<END>"]]
        return torch.tensor(tokens, dtype=torch.long)


def collate_fn(batch):
    return pad_sequence(batch, batch_first=True, padding_value=0)


# LSTM model
class RomanGenerator(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1):
        super(RomanGenerator, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden=None):
        x = self.embedding(x)
        output, hidden = self.lstm(x, hidden)
        output = self.fc(output)
        return output, hidden

In [5]:
max_number = 3999
embed_size = 16
hidden_size = 128
num_layers = 1
batch_size = 32
learning_rate = 0.001
epochs = 10

In [6]:
# Dataset and DataLoader
dataset = RomanDataset(max_number=max_number)
vocab_size = len(dataset.char_to_idx) + 1  # +1 per il padding
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Model, loss and optimization
model = RomanGenerator(vocab_size, embed_size, hidden_size, num_layers)
criterion = nn.CrossEntropyLoss(ignore_index=dataset.pad_idx)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [7]:
# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()
        inputs = batch[:, :-1]  # Everything but the last token is the input
        targets = batch[:, 1:]  # Everything but the first token is the output
        outputs, _ = model(inputs)
        loss = criterion(outputs.reshape(-1, vocab_size), targets.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(dataloader):.4f}")

Epoch 1/10, Loss: 1.4240
Epoch 2/10, Loss: 1.0933
Epoch 3/10, Loss: 1.0432
Epoch 4/10, Loss: 1.0215
Epoch 5/10, Loss: 1.0105
Epoch 6/10, Loss: 1.0018
Epoch 7/10, Loss: 0.9978
Epoch 8/10, Loss: 0.9947
Epoch 9/10, Loss: 0.9933
Epoch 10/10, Loss: 0.9905


In [12]:
def generate_sequence(model, start_token, max_length, dataset):
    model.eval()
    generated_sequence = [start_token]
    input_seq = torch.tensor([[start_token]], dtype=torch.long)
    hidden = None

    for _ in range(max_length):
        with torch.no_grad():
            output, hidden = model(input_seq, hidden)
            next_token = torch.argmax(output[:, -1, :], dim=-1).item()
            if next_token == dataset.char_to_idx["<END>"]:
                break
            generated_sequence.append(next_token)
            input_seq = torch.tensor([[next_token]], dtype=torch.long)

    return "".join([dataset.idx_to_char[idx] for idx in generated_sequence if idx > 0])


start_token = dataset.char_to_idx["<START>"]
print("A generated Roman Number:")
print(generate_sequence(model, start_token, max_length=20, dataset=dataset))

A generated Roman Number:
<START>MMMCCCXXXII


In [26]:
# Completes the sequence given as start_tokens
def read_sequence(model, start_tokens, max_length, dataset, temperature=1.0):
    model.eval() # Evaluation state

    # Get the Model ready, by feeding it the current sequence (start tokens)
    generated_sequence = start_tokens[:]
    input_seq = torch.tensor([start_tokens], dtype=torch.long)
    with torch.no_grad():
        _, hidden = model(input_seq)

    # Generate new tokens
    for _ in range(max_length):
        with torch.no_grad():
            output, hidden = model(input_seq, hidden)

            # Get the probability distribution (adapted using temperature)
            logits = output[:, -1, :]
            probabilities = F.softmax(logits / temperature, dim=-1)

            # Print probability distribution
            for i, x in enumerate(probabilities[0].numpy()):
                if i > 0:
                    print(dataset.idx_to_char[i], np.round(x, 2))

            # Pick the next token
            next_token = torch.multinomial(probabilities, num_samples=1).item()
            if next_token == dataset.char_to_idx["<END>"]:
                break
            generated_sequence.append(next_token)
            input_seq = torch.tensor([[next_token]], dtype=torch.long)

            # Print current sequence state
            print([dataset.idx_to_char[idx] for idx in generated_sequence if idx > 0])

    return "".join([dataset.idx_to_char[idx] for idx in generated_sequence if idx > 0])


start_tokens = start_tokens = [
    dataset.char_to_idx["<START>"],
    dataset.char_to_idx["X"],
]

read_sequence(
    model,
    start_tokens,
    max_length=10,
    dataset=dataset,
    temperature=0.8  # Determines how willing it is to pick improbable options
)

I 0.58
V 0.08
X 0.18
L 0.0
C 0.01
D 0.0
M 0.0
<START> 0.0
<END> 0.14
['<START>', 'X', 'V']
I 0.71
V 0.0
X 0.0
L 0.0
C 0.0
D 0.0
M 0.0
<START> 0.0
<END> 0.29


'<START>XV'