Alunos:

*   Andréa Fonseca
*   Fábio Cardoso
*   Eduardo Leite



In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
import random
from tqdm import tqdm

In [None]:
# Seed every random number generator this notebook draws from so that
# Restart Kernel -> Run All reproduces the same data, weights and batch order:
#   * `random` generates the synthetic examples,
#   * torch initialises the model weights and shuffles the training DataLoader.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when one is available, otherwise fall back to the CPU.
# NOTE(review): nothing later in this notebook moves the model or the batches
# to `device`, so as written everything still runs on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
num_letters = 5

# Vocabulary: every ordered pair built from the first `num_letters` uppercase
# letters ("AA", "AB", ..., "EE"), mapped to a unique integer id in row-major
# order, i.e. id = i * num_letters + j for the pair (letter i, letter j).
alphabet = [chr(65 + k) for k in range(num_letters)]
tok2idx = {
    left + right: i * num_letters + j
    for i, left in enumerate(alphabet)
    for j, right in enumerate(alphabet)
}

def tokenizer(text):
    """Convert whitespace-separated letter pairs into a 1-D tensor of token ids.

    Args:
        text: String such as "AB CD"; each whitespace-separated word must be
            a key of ``tok2idx``.

    Returns:
        A ``torch.long`` tensor with one id per word, in input order.

    Raises:
        KeyError: If a word is not present in ``tok2idx``.
    """
    token_ids = list(map(tok2idx.__getitem__, text.split()))
    return torch.tensor(token_ids, dtype=torch.long)

def get_text_from_tensor(tensor):
    """Convert a sequence of token ids back into space-separated text.

    Args:
        tensor: Iterable of integer indices (a 1-D integer tensor, or a list
            of ints / single-element integer tensors).

    Returns:
        The tokens joined by single spaces; an empty string for empty input.

    Raises:
        IndexError: If an index falls outside the vocabulary.
    """
    # Build the id -> token table once per call instead of once per token.
    # Position k holds the token with id k because `tok2idx` is filled in
    # increasing id order.
    idx2tok = list(tok2idx)
    return ' '.join(idx2tok[i] for i in tensor)

In [None]:
# Sanity check: with num_letters = 5, "AB" -> 0*5 + 1 = 1 and "CD" -> 2*5 + 3 = 13.
tokenizer("AB CD")

tensor([ 1, 13])

In [None]:
def generate_examples(N):
    """Generate N random training examples.

    Each example is a tuple ``(first_pair, second_pair, target)`` where both
    pairs are two random uppercase letters drawn from the first
    ``num_letters`` letters of the alphabet, and ``target`` is the first
    letter of ``first_pair`` followed by the second letter of ``second_pair``.

    Args:
        N: Number of examples to generate.

    Returns:
        A list of N ``(str, str, str)`` tuples.
    """
    lowest_code = 65  # ord("A")
    highest_code = 65 + num_letters - 1

    def random_pair():
        # Draw the left letter first, then the right one, so the sequence of
        # RNG calls per pair stays left-to-right.
        left = chr(random.randint(lowest_code, highest_code))
        right = chr(random.randint(lowest_code, highest_code))
        return left + right

    examples = []
    for _ in range(N):
        first_pair = random_pair()   # e.g. "AB"
        second_pair = random_pair()  # e.g. "DE"
        # Target: first letter of the first pair + second letter of the second pair.
        examples.append((first_pair, second_pair, first_pair[0] + second_pair[1]))
    return examples

In [None]:
# Number of synthetic (first_pair, second_pair, target) examples to generate.
N = 3000
examples = generate_examples(N)

In [None]:
# Peek at the first ten (first_pair, second_pair, target) tuples.
examples[:10]

[('DB', 'AB', 'DB'),
 ('CE', 'ED', 'CD'),
 ('AA', 'DC', 'AC'),
 ('AB', 'ED', 'AD'),
 ('EC', 'DE', 'EE'),
 ('CD', 'BD', 'CD'),
 ('DA', 'AC', 'DC'),
 ('EB', 'DB', 'EB'),
 ('AA', 'BA', 'AA'),
 ('BD', 'BD', 'BD')]

In [None]:
class SimpleTokenDataset(Dataset):
    """Dataset over ``(first_pair, second_pair, target)`` string tuples.

    Items are tokenized on access: the source is the two input pairs as a
    2-token sequence and the target is a single token id.
    """

    def __init__(self, examples):
        # Keep a reference to the raw tuples; tokenization happens in __getitem__.
        self.data = examples

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        first_pair, second_pair, target_pair = self.data[idx]
        # Source: both pairs joined by a space so the tokenizer yields two ids.
        src = tokenizer(" ".join((first_pair, second_pair)))
        # Target: a single token, squeezed from shape (1,) down to a scalar tensor.
        tgt = tokenizer(target_pair).squeeze()
        return src, tgt

In [None]:
dataset = SimpleTokenDataset(examples)

# Hold out the last 20% of the examples for validation; the first 80% train.
num_val = 0.2 * N
split_at = N - int(num_val)

train_set = Subset(dataset, range(split_at))
val_set = Subset(dataset, range(split_at, N))

# Only the training batches are shuffled; validation keeps dataset order.
batch_size = 16
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_set, batch_size=batch_size)

In [None]:
class TextGenerator(nn.Module):
    """Embedding -> single-layer LSTM -> linear layer producing vocabulary logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of logits produced per input sequence.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return one logit vector per input sequence of token ids."""
        embedded = self.embedding(x)

        # Per-step outputs and the cell state are discarded; only the final
        # hidden state summarises the sequence.
        _, (final_hidden, _cell_state) = self.lstm(embedded)

        # The hidden state carries a leading layer dimension of size 1
        # (one unidirectional layer); drop it before the projection.
        sequence_summary = final_hidden.squeeze(0)
        return self.fc(sequence_summary)


# Size the embedding table and the output layer from the tokenizer's actual
# vocabulary. A larger hard-coded value would create output classes that no
# token maps to, and an argmax landing on one of them would raise IndexError
# when decoded with get_text_from_tensor.
vocab_size = len(tok2idx)
embedding_dim = 128
hidden_dim = 256
# The model predicts one token id, so it emits one logit per vocabulary entry.
output_dim = vocab_size

model = TextGenerator(vocab_size, embedding_dim, hidden_dim, output_dim)

In [None]:
# Single-label classification over token ids: cross-entropy on the raw logits.
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters with a learning rate of 1e-3.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# NOTE(review): `device` is defined earlier in the notebook, but neither the
# model nor the batches are moved to it here, so this loop runs on whatever
# device the model was created on.
epochs = 5
for epoch in range(epochs):
    # ---- Training pass: one optimizer step per mini-batch ----
    model.train()
    train_loss = 0
    train_batches = tqdm(train_loader, desc=f"Train Epoch {epoch+1}/{epochs}")
    for input_seq, target_seq in train_batches:
        optimizer.zero_grad()
        loss = criterion(model(input_seq), target_seq)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Report the mean of the per-batch losses.
    avg_train_loss = train_loss / len(train_loader)
    print(f'Train Loss: {avg_train_loss:.4f}')

    # ---- Validation pass: no gradient tracking, no parameter updates ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for input_seq, target_seq in val_loader:
            val_loss += criterion(model(input_seq), target_seq).item()

    avg_val_loss = val_loss / len(val_loader)
    print(f'Val Loss: {avg_val_loss:.4f}')

Train Epoch 1/5: 100%|██████████| 150/150 [00:02<00:00, 68.50it/s]


Train Loss: 1.8059
Val Loss: 0.1224


Train Epoch 2/5: 100%|██████████| 150/150 [00:02<00:00, 74.27it/s]


Train Loss: 0.0485
Val Loss: 0.0236


Train Epoch 3/5: 100%|██████████| 150/150 [00:01<00:00, 82.02it/s]


Train Loss: 0.0153
Val Loss: 0.0111


Train Epoch 4/5: 100%|██████████| 150/150 [00:01<00:00, 94.75it/s]


Train Loss: 0.0081
Val Loss: 0.0067


Train Epoch 5/5: 100%|██████████| 150/150 [00:01<00:00, 98.68it/s]

Train Loss: 0.0051
Val Loss: 0.0045





In [None]:
# Inference on a single, unbatched example.
input_text = "DE AE"
inputs = tokenizer(input_text)

# Make sure the model is in evaluation mode and skip autograd bookkeeping:
# no gradients are needed to read off a prediction.
model.eval()
with torch.no_grad():
    output = model(inputs)

# The id with the highest logit is the predicted token; unsqueeze turns the
# scalar into a 1-element tensor so it can be iterated over when decoding.
predicted_token = output.argmax().unsqueeze(0)
predicted_text = get_text_from_tensor(predicted_token)

print(f'Input: {input_text}, Predicted: {predicted_text}')

Input: DE AE, Predicted: DE


In [None]:
def generate_autoregressive_text(model, input_text, seq_len):
    """Greedily generate ``seq_len`` tokens, feeding each prediction back in.

    At every step the model sees a window made of the last token of the
    previous window followed by the token it just predicted.

    Args:
        model: Module mapping a 1-D tensor of token ids to a vector of logits.
        input_text: Seed text of whitespace-separated tokens, e.g. "AB CD".
        seq_len: Number of tokens to generate.

    Returns:
        The generated tokens joined by spaces (the seed is not included).
    """
    inputs = tokenizer(input_text)
    outputs = []

    # Generation is pure inference: use eval mode and disable autograd, then
    # restore whichever mode the caller had the model in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(seq_len):
                output = model(inputs)
                # Greedy decoding: keep the highest-scoring id as a 1-element tensor.
                predicted_token = output.argmax().unsqueeze(0)
                # Slide the window: last input token + the new prediction.
                inputs = torch.cat([inputs[-1].unsqueeze(0), predicted_token])
                outputs.append(predicted_token)
    finally:
        model.train(was_training)

    return get_text_from_tensor(outputs)

# Seed the generator with two tokens and let it produce five more.
input_text, seq_len = "AB CD", 5
predicted_text = generate_autoregressive_text(model, input_text, seq_len=seq_len)

print(f'Input: {input_text}, Predicted: {predicted_text}')

Input: AB CD, Predicted: AD CD AD CD AD
