In [37]:
import re
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import spacy
import datasets
import torchtext
import tqdm
import evaluate
import matplotlib.pyplot as plt

from torchtext.data.metrics import bleu_score

Creem un dataset a partir del txt que serà un diccionari amb les claus dels idiomes (en i de) i com a valor una llista de les frases traduïdes en ordre

In [8]:
def eliminar_ccby(linea):
    indice_ccby = linea.find("CC-BY")
    if indice_ccby != -1:
        return linea[:indice_ccby]
    return linea

def separar_frases(linea):
    patron = r'(.+?[.!?])\s+([^A-Z]?[\s]+)?(.+)?'
    coincidencias = re.match(patron, linea)
    if coincidencias:
        frase1 = coincidencias.group(1)
        frase2 = coincidencias.group(3)
        return frase1.strip(), frase2.strip() if frase2 else None
    else:
        return None, None

def make_dataset(archivo_entrada):
    dataset = []
    with open(archivo_entrada, 'r') as f_in:
        for linea in f_in:
            linea_clean = eliminar_ccby(linea)
            a,b = separar_frases(linea_clean)
            if a!=None and b!=None:
              dataset.append({'en':a,'de':b})
    return dataset

In [9]:
archivo_entrada = 'deu.txt'
data = make_dataset(archivo_entrada)
len(data)

277227

Tenim 277k traduccions per treballar el model

In [92]:
print(data[:10])

[{'en': 'Go.', 'de': 'Geh.'}, {'en': 'Hi.', 'de': 'Hallo!'}, {'en': 'Hi.', 'de': 'Grüß Gott!'}, {'en': 'Run!', 'de': 'Lauf!'}, {'en': 'Run.', 'de': 'Lauf!'}, {'en': 'Wow!', 'de': 'Potzdonner!'}, {'en': 'Wow!', 'de': 'Donnerwetter!'}, {'en': 'Duck!', 'de': 'Kopf runter!'}, {'en': 'Fire!', 'de': 'Feuer!'}, {'en': 'Help!', 'de': 'Hilfe!'}]


Importem splitters d'anglés i alemany

In [10]:
en_nlp = spacy.load("en_core_web_sm")
de_nlp = spacy.load("de_core_news_sm")

In [94]:
string = "What a lovely day it is today!"
[token.text for token in en_nlp.tokenizer(string)]

['What', 'a', 'lovely', 'day', 'it', 'is', 'today', '!']

Para cada par de oraciones añadimos a más su split correspondiente con el incio y fin de frase y tenemos en cuenta las mayúsculas

In [95]:
len(data)

277227

Aquetsa funció només funciona amb un màxim de 1000 elementens del dataset, així que treballarem amb batches per solucionar el problema

In [11]:
def tokenize_example(data, en_nlp, de_nlp, lower, sos_token, eos_token):

    delete = []
    for i in range(len(data)):

        try:
            en_tokens = [token.text for token in en_nlp.tokenizer(data[i]["en"])]
            de_tokens = [token.text for token in de_nlp.tokenizer(data[i]["de"])]
            if lower:
                en_tokens = [token.lower() for token in en_tokens]
                de_tokens = [token.lower() for token in de_tokens]
            en_tokens = [sos_token] + en_tokens + [eos_token]
            de_tokens = [sos_token] + de_tokens + [eos_token]

        except Exception as e:
            delete.append(i)

        else:
            data[i]['en_tokens'] = en_tokens   # només afegirem les dades si no salta cap error, sinó els elements afegits estaran bruts
            data[i]['de_tokens'] = de_tokens

    return data, delete


def process_in_batches(data, batch_size, en_nlp, de_nlp, lower, sos_token, eos_token):

    result = []
    for i in range(0, len(data), batch_size):  # 277 batches de 1000 elements

        batch = data[i:i+batch_size]
        processed_batch, delete = tokenize_example(batch, en_nlp, de_nlp, lower, sos_token, eos_token)
        for frase in delete:
            del(processed_batch[frase])
        result.extend(processed_batch)

    return result

In [12]:
lower = True
sos_token = "<sos>"
eos_token = "<eos>"
batch_size = 1000
dataset = data

inputs = [dataset, batch_size, en_nlp, de_nlp, lower, sos_token, eos_token]

In [13]:
dataset = process_in_batches(*inputs)

In [99]:
len(dataset)

277227

In [100]:
data[1000]

{'en': "It's mine.",
 'de': 'Es ist meins.',
 'en_tokens': ['<sos>', 'it', "'s", 'mine', '.', '<eos>'],
 'de_tokens': ['<sos>', 'es', 'ist', 'meins', '.', '<eos>']}

sos = start of sentence,
eos = end of sentence,
unk = unknown,
pad = padding

In [14]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator



In [15]:
min_freq = 2
unk_token = "<unk>"
pad_token = "<pad>"

special_tokens = [
    unk_token,
    pad_token,
    sos_token,
    eos_token,
]


all_en_tokens, all_de_tokens = [], []
for frase in dataset:
    all_en_tokens.append(frase['en_tokens'])
    all_de_tokens.append(frase['de_tokens'])


en_vocab = build_vocab_from_iterator(
    all_en_tokens,
    min_freq=min_freq,
    specials=special_tokens,
)

de_vocab = build_vocab_from_iterator(
    all_de_tokens,
    min_freq=min_freq,
    specials=special_tokens,
)

Com a primera conclusió podem dir que l'alemany es un idioma més ric en paraules, ja que es fan servir gairebé el doble en un dataset de 277k frases

In [16]:
words_en, words_de = en_vocab.get_itos(), de_vocab.get_itos()
len(words_en), len(words_de)

(12327, 23055)

In [17]:
assert en_vocab[unk_token] == de_vocab[unk_token]
assert en_vocab[pad_token] == de_vocab[pad_token]

unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

en_vocab.set_default_index(unk_index)
de_vocab.set_default_index(unk_index)

In [105]:
"man" in words_en

True

In [18]:
def numericalize_example(data, en_vocab, de_vocab):
    for i in range(len(data)):
        en_ids = en_vocab.lookup_indices(data[i]["en_tokens"])
        de_ids = de_vocab.lookup_indices(data[i]["de_tokens"])
        data[i]['en_ids'] = en_ids
        data[i]['de_ids'] = de_ids
    return data

In [19]:
dataset = numericalize_example(dataset, en_vocab, de_vocab)
dataset[1000]

{'en': "It's mine.",
 'de': 'Es ist meins.',
 'en_tokens': ['<sos>', 'it', "'s", 'mine', '.', '<eos>'],
 'de_tokens': ['<sos>', 'es', 'ist', 'meins', '.', '<eos>'],
 'en_ids': [2, 16, 17, 463, 4, 3],
 'de_ids': [2, 15, 10, 2044, 4, 3]}

L'equivalència token - índex és correcte

In [108]:
print(dataset[100]["en_tokens"])
print(en_vocab.lookup_tokens(dataset[100]["en_ids"]))
dataset[100]["en_tokens"] == en_vocab.lookup_tokens(dataset[100]["en_ids"])

['<sos>', 'no', 'way', '!', '<eos>']
['<sos>', 'no', 'way', '!', '<eos>']


True

In [20]:
for i in range(len(dataset)):
    en_ids = dataset[i]['en_ids']
    de_ids = dataset[i]['de_ids']
    dataset[i]['en_ids'] = torch.tensor(en_ids)
    dataset[i]['de_ids'] = torch.tensor(de_ids)
dataset[100]

{'en': 'No way!',
 'de': 'Das kommt nicht in Frage!',
 'en_tokens': ['<sos>', 'no', 'way', '!', '<eos>'],
 'de_tokens': ['<sos>', 'das', 'kommt', 'nicht', 'in', 'frage', '!', '<eos>'],
 'en_ids': tensor([  2,  77, 150, 188,   3]),
 'de_ids': tensor([  2,  11, 178,   9,  21, 209,  27,   3])}

Ara ja tenim el dataset amb totes les des necessàries per començar el entrenament del model

In [110]:
lista_grande = [i*3 for i in range(100)]

import h5py

# Guardar la lista en un archivo HDF5
with h5py.File('datos.h5', 'w') as f:
    f.create_dataset('lista', data=lista_grande)

# Recuperar la lista del archivo HDF5
with h5py.File('datos.h5', 'r') as f:
    lista_grande_recuperada = f['lista'][:]


lista_grande_recuperada

array([  0,   3,   6,   9,  12,  15,  18,  21,  24,  27,  30,  33,  36,
        39,  42,  45,  48,  51,  54,  57,  60,  63,  66,  69,  72,  75,
        78,  81,  84,  87,  90,  93,  96,  99, 102, 105, 108, 111, 114,
       117, 120, 123, 126, 129, 132, 135, 138, 141, 144, 147, 150, 153,
       156, 159, 162, 165, 168, 171, 174, 177, 180, 183, 186, 189, 192,
       195, 198, 201, 204, 207, 210, 213, 216, 219, 222, 225, 228, 231,
       234, 237, 240, 243, 246, 249, 252, 255, 258, 261, 264, 267, 270,
       273, 276, 279, 282, 285, 288, 291, 294, 297])

In [111]:
# Supongamos que tenemos un vocabulario de tamaño 10 y queremos vectores de embedding de dimensión 5
vocab_size = 10
embedding_dim = 5

# Creamos una capa de embedding con el tamaño de vocabulario y la dimensión del embedding especificados
embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

# Creamos una secuencia de entrada de índices
input_indices = torch.tensor([1, 3, 5, 7, 9])

# Pasamos la secuencia de entrada a través de la capa de embedding
embedded_vectors = embedding_layer(input_indices)

# Mostramos los vectores de embedding resultantes
print("Vectores de embedding:")
print(embedded_vectors)

Vectores de embedding:
tensor([[ 0.3381,  0.2256,  0.9960,  0.0667,  0.3738],
        [ 1.3675, -0.7673,  1.1011,  0.1367,  0.4184],
        [-0.4616,  0.0086,  0.0949, -1.0840,  0.8102],
        [-0.7779, -2.9983,  1.3007,  1.7716,  0.2621],
        [-0.2651, -0.8485, -0.5389, -1.5822,  0.7295]],
       grad_fn=<EmbeddingBackward0>)


Creem una classe básica dicionari per crear el dataloader de la llibreria torch

In [21]:
from torch.utils.data import Dataset

class ListDictDataset(Dataset):
    def __init__(self, list_of_dicts):
        self.list_of_dicts = list_of_dicts

    def __len__(self):
        return len(self.list_of_dicts)

    def __getitem__(self, index):
        return self.list_of_dicts[index]

#dataset = ListDictDataset(train)
type(dataset)

list

Creem un dataloader per processar bé totes les traduccions, on només afegirem els index de les paraules al diccionari corresponent segons l idioma, que és el que ens servirà per fer l embedding de les frases

In [22]:
def get_collate_fn(pad_index):
    def collate_fn(batch):
        batch_en_ids = [example["en_ids"] for example in batch]
        batch_de_ids = [example["de_ids"] for example in batch]
        batch_en_ids = nn.utils.rnn.pad_sequence(batch_en_ids, padding_value=pad_index)  #Fem servir padding com relleno
        batch_de_ids = nn.utils.rnn.pad_sequence(batch_de_ids, padding_value=pad_index)
        batch = {
            "en_ids": batch_en_ids,
            "de_ids": batch_de_ids,
        }
        return batch

    return collate_fn

In [23]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    collate_fn = get_collate_fn(pad_index)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle,
    )
    return data_loader

Hem de tenir en compte també que el dataset, tal i com l'hem importat, està ordenat de frases més simples, d'una sola a paraules, a més complexes. Per tant hem de considerar què és millor per entrenar i validad, respectivament

In [24]:
batch_size = 128
test_size = 0.1

random.shuffle(dataset)
test = int( len(dataset) * test_size )
test_set = dataset[:test]
train_set = dataset[-test:]

train_data_loader = get_data_loader(train_set, batch_size, pad_index, shuffle=True)
test_data_loader = get_data_loader(test_set, batch_size, pad_index, shuffle=True)

In [116]:
train_data_loader

<torch.utils.data.dataloader.DataLoader at 0x7f777dd54b20>

Cla

In [3]:
class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, encoder_hidden_dim, decoder_hidden_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, encoder_hidden_dim, bidirectional=True)
        self.fc = nn.Linear(encoder_hidden_dim * 2, decoder_hidden_dim)
        self.dropout = nn.Dropout(dropout)  # recordem: dropout desactiva un % de neurones i així evita l'overfitting

    def forward(self, src):
        # src = [src length, batch size]
        embedded = self.dropout(self.embedding(src))
        # embedded = [src length, batch size, embedding dim]
        outputs, hidden = self.rnn(embedded)
        # outputs = [src length, batch size, hidden dim * n directions]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # hidden is stacked [forward_1, backward_1, forward_2, backward_2, ...]
        # outputs are always from the last layer
        # hidden [-2, :, : ] is the last of the forwards RNN
        # hidden [-1, :, : ] is the last of the backwards RNN
        # initial decoder hidden is final hidden state of the forwards and backwards
        # encoder RNNs fed through a linear layer
        hidden = torch.tanh(
            self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        )
        # outputs = [src length, batch size, encoder hidden dim * 2]
        # hidden = [batch size, decoder hidden dim]
        return outputs, hidden

In [4]:
class Attention(nn.Module):
    def __init__(self, encoder_hidden_dim, decoder_hidden_dim):
        super().__init__()
        self.attn_fc = nn.Linear(
            (encoder_hidden_dim * 2) + decoder_hidden_dim, decoder_hidden_dim
        )
        self.v_fc = nn.Linear(decoder_hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        # hidden = [batch size, decoder hidden dim]
        # encoder_outputs = [src length, batch size, encoder hidden dim * 2]
        batch_size = encoder_outputs.shape[1]
        src_length = encoder_outputs.shape[0]
        # repeat decoder hidden state src_length times
        hidden = hidden.unsqueeze(1).repeat(1, src_length, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # hidden = [batch size, src length, decoder hidden dim]
        # encoder_outputs = [batch size, src length, encoder hidden dim * 2]
        energy = torch.tanh(self.attn_fc(torch.cat((hidden, encoder_outputs), dim=2)))
        # energy = [batch size, src length, decoder hidden dim]
        attention = self.v_fc(energy).squeeze(2)
        # attention = [batch size, src length]
        return torch.softmax(attention, dim=1)

In [5]:
class Decoder(nn.Module):
    def __init__(
        self,
        output_dim,
        embedding_dim,
        encoder_hidden_dim,
        decoder_hidden_dim,
        dropout,
        attention,
    ):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.GRU((encoder_hidden_dim * 2) + embedding_dim, decoder_hidden_dim)
        self.fc_out = nn.Linear(
            (encoder_hidden_dim * 2) + decoder_hidden_dim + embedding_dim, output_dim
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        # input = [batch size]
        # hidden = [batch size, decoder hidden dim]
        # encoder_outputs = [src length, batch size, encoder hidden dim * 2]
        input = input.unsqueeze(0)
        # input = [1, batch size]
        embedded = self.dropout(self.embedding(input))
        # embedded = [1, batch size, embedding dim]
        a = self.attention(hidden, encoder_outputs)
        # a = [batch size, src length]
        a = a.unsqueeze(1)
        # a = [batch size, 1, src length]
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # encoder_outputs = [batch size, src length, encoder hidden dim * 2]
        weighted = torch.bmm(a, encoder_outputs)
        # weighted = [batch size, 1, encoder hidden dim * 2]
        weighted = weighted.permute(1, 0, 2)
        # weighted = [1, batch size, encoder hidden dim * 2]
        rnn_input = torch.cat((embedded, weighted), dim=2)
        # rnn_input = [1, batch size, (encoder hidden dim * 2) + embedding dim]
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        # output = [seq length, batch size, decoder hid dim * n directions]
        # hidden = [n layers * n directions, batch size, decoder hid dim]
        # seq len, n layers and n directions will always be 1 in this decoder, therefore:
        # output = [1, batch size, decoder hidden dim]
        # hidden = [1, batch size, decoder hidden dim]
        # this also means that output == hidden
        assert (output == hidden).all()
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))
        # prediction = [batch size, output dim]
        return prediction, hidden.squeeze(0), a.squeeze(1)

In [6]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio):
        # src = [src length, batch size]
        # trg = [trg length, batch size]
        # teacher_forcing_ratio is probability to use teacher forcing
        # e.g. if teacher_forcing_ratio is 0.75 we use teacher forcing 75% of the time
        batch_size = src.shape[1]
        trg_length = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        # tensor to store decoder outputs
        outputs = torch.zeros(trg_length, batch_size, trg_vocab_size).to(self.device)
        # encoder_outputs is all hidden states of the input sequence, back and forwards
        # hidden is the final forward and backward hidden states, passed through a linear layer
        encoder_outputs, hidden = self.encoder(src)
        # outputs = [src length, batch size, encoder hidden dim * 2]
        # hidden = [batch size, decoder hidden dim]
        # first input to the decoder is the <sos> tokens
        input = trg[0, :]
        for t in range(1, trg_length):
            # insert input token embedding, previous hidden state and all encoder hidden states
            # receive output tensor (predictions) and new hidden state
            output, hidden, _ = self.decoder(input, hidden, encoder_outputs)
            # output = [batch size, output dim]
            # hidden = [n layers, batch size, decoder hidden dim]
            # place predictions in a tensor holding predictions for each token
            outputs[t] = output
            # decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            # get the highest predicted token from our predictions
            top1 = output.argmax(1)
            # if teacher forcing, use actual next token as next input
            # if not, use predicted token
            input = trg[t] if teacher_force else top1
            # input = [batch size]
        return outputs

Valors amb el que anirem probant per millorar l'eficiència del model

In [25]:
input_dim = len(de_vocab)
output_dim = len(en_vocab)
encoder_embedding_dim = 256
decoder_embedding_dim = 256
encoder_hidden_dim = 512
decoder_hidden_dim = 512
encoder_dropout = 0.5
decoder_dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

attention = Attention(encoder_hidden_dim, decoder_hidden_dim)

encoder = Encoder(
    input_dim,
    encoder_embedding_dim,
    encoder_hidden_dim,
    decoder_hidden_dim,
    encoder_dropout,
)

decoder = Decoder(
    output_dim,
    decoder_embedding_dim,
    encoder_hidden_dim,
    decoder_hidden_dim,
    decoder_dropout,
    attention,
)

model = Seq2Seq(encoder, decoder, device).to(device)

Cada paràmetre representa el weight o bias d'una capa (ih=input, hh=hidden) en el recorregut d'anada per calcular el loss o el de tornada que actualitza el gradient (reverse)

In [122]:
for p in model.named_parameters(): print(p)

('encoder.embedding.weight', Parameter containing:
tensor([[-0.5862, -0.7930,  0.6804,  ..., -1.2047, -1.2520, -1.1182],
        [-0.6090, -0.0700,  0.3288,  ...,  0.7027, -0.0594,  0.4895],
        [ 0.4212,  1.3886, -0.7522,  ..., -1.2896,  0.9501, -0.2751],
        ...,
        [-0.8196, -0.9215,  0.8340,  ..., -0.4374,  1.8711, -0.3074],
        [-0.9383, -1.2537, -0.6361,  ...,  0.2257,  0.8264, -0.5752],
        [ 0.8020, -1.6222, -0.1084,  ..., -0.2139,  0.5719, -0.0358]],
       requires_grad=True))
('encoder.rnn.weight_ih_l0', Parameter containing:
tensor([[ 0.0315, -0.0415,  0.0088,  ...,  0.0140,  0.0187,  0.0234],
        [-0.0193,  0.0391,  0.0118,  ...,  0.0216, -0.0166, -0.0051],
        [ 0.0332, -0.0342, -0.0211,  ..., -0.0093,  0.0144,  0.0405],
        ...,
        [-0.0182, -0.0094, -0.0309,  ...,  0.0391, -0.0112, -0.0130],
        [ 0.0166,  0.0314,  0.0269,  ...,  0.0316, -0.0334, -0.0334],
        [ 0.0073, -0.0037, -0.0428,  ..., -0.0084,  0.0428, -0.0252]],
  

In [26]:
def init_weights(m):
    for name, param in m.named_parameters():
        if "weight" in name:
            nn.init.normal_(param.data, mean=0, std=0.01)  # pesos molt propers a 0, desviació del 1%, perill de vanishing
        else:
            nn.init.constant_(param.data, 0)

model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(23055, 256)
    (rnn): GRU(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn_fc): Linear(in_features=1536, out_features=512, bias=True)
      (v_fc): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(12327, 256)
    (rnn): GRU(1280, 512)
    (fc_out): Linear(in_features=1792, out_features=12327, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [124]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad),model.parameters()

a,b = count_parameters(model)
suma=0
for x in b:
  print(x)

print(f"The model has {a:,} trainable parameters",suma)

Parameter containing:
tensor([[ 7.4209e-03, -4.2855e-03,  1.1764e-02,  ..., -2.9881e-03,
          8.0914e-03, -1.1787e-02],
        [ 5.9011e-03,  1.9764e-02, -9.4549e-05,  ...,  4.4672e-03,
         -5.3793e-03, -1.3888e-02],
        [ 3.8701e-04, -9.1292e-03,  8.3757e-03,  ...,  1.9544e-03,
         -5.8790e-03, -2.5763e-03],
        ...,
        [ 1.8644e-02,  7.1831e-03, -4.7250e-03,  ..., -1.0184e-02,
          7.5142e-03,  3.0610e-03],
        [-9.8726e-03, -4.1172e-03,  2.3252e-03,  ..., -5.8614e-03,
         -1.0822e-02, -8.4239e-03],
        [ 2.7760e-03, -2.0539e-04, -3.1274e-03,  ..., -6.1374e-03,
         -7.8546e-03,  1.0367e-02]], requires_grad=True)
Parameter containing:
tensor([[-0.0093, -0.0040, -0.0056,  ..., -0.0039,  0.0072,  0.0102],
        [ 0.0157, -0.0072, -0.0124,  ..., -0.0117, -0.0030, -0.0086],
        [ 0.0018, -0.0074,  0.0004,  ..., -0.0147,  0.0055, -0.0009],
        ...,
        [ 0.0020, -0.0051, -0.0062,  ..., -0.0061, -0.0083, -0.0151],
        [ 0

In [27]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

Totes les primeres files de cada batch estan plenes de 2, com l'inici de tots els en_ids o de_ids, que representa el eos

In [126]:
loader = train_data_loader
a,b = 0,0

for batch in loader:
  print(batch['en_ids'])

num_batches = len(loader)
print(f'Número de batches: {num_batches}')  #número de batches dins del loader

tensor([[   2,    2,    2,  ...,    2,    2,    2],
        [   5,    5,    6,  ...,    5,   16,    7],
        [  51,   52,  479,  ...,   14, 2111,   45],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]])
tensor([[  2,   2,   2,  ...,   2,   2,   2],
        [  5,  31,  62,  ...,   5,  45, 105],
        [379, 144,  13,  ...,  95,   5,  17],
        ...,
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1]])
tensor([[  2,   2,   2,  ...,   2,   2,   2],
        [ 16,   9,   9,  ...,   5,   6,  18],
        [ 17, 426, 967,  ...,  14,  76, 242],
        ...,
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1]])
tensor([[  2,   2,   2,  ...,   2,   2,   2],
        [197,  28,   9,  ...,   6,   6,   6],
  

In [40]:
def train_seq2seq(
    model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device
):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(data_loader):
        src = batch["de_ids"].to(device)   # device és gpu collab ja que és on fem les probes per no gasta hores de mv azure
        trg = batch["en_ids"].to(device)
        # src = [src length, batch size]
        # trg = [trg length, batch size]
        optimizer.zero_grad()
        output = model(src, trg, teacher_forcing_ratio)
        # output = [trg length, batch size, trg vocab size]
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        # output = [(trg length - 1) * batch size, trg vocab size]
        trg = trg[1:].view(-1)
        # trg = [(trg length - 1) * batch size]
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(data_loader)


def evaluate_seq2seq(model, data_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    all_predictions, all_references = [], []
    
    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            src = batch["de_ids"].to(device)
            trg = batch["en_ids"].to(device)
            # src = [src length, batch size]
            # trg = [trg length, batch size]
            output = model(src, trg, 0)  # turn off teacher forcing
            # output = [trg length, batch size, trg vocab size]
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            # output = [(trg length - 1) * batch size, trg vocab size]
            trg = trg[1:].view(-1)
            # trg = [(trg length - 1) * batch size]
            loss = criterion(output, trg)
            epoch_loss += loss.item()
            
            predictions = output.argmax(dim=-1).tolist()
            references = trg.tolist()
            all_predictions.extend(predictions)
            all_references.extend(references)
    
    avg_loss = epoch_loss / len(data_loader)
    bleu = bleu_score(all_predictions, all_references)
    return avg_loss,bleu

Ara hem de dividir el conjunt de dades que hem processat en train, test per entrenar i validar correctament la xarxa neuronal

In [128]:
n_epochs = 2
clip = 1.0
teacher_forcing_ratio = 0.5

best_valid_loss = float("inf")

for epoch in tqdm.tqdm(range(n_epochs)):  #tqdm per veure el processament del bucle, ja que tardará molt
    train_loss = train_seq2seq(
        model,
        train_data_loader,
        optimizer,
        criterion,
        clip,
        teacher_forcing_ratio,
        device,
    )
    valid_loss,bleu = evaluate_seq2seq(
        model,
        test_data_loader,
        criterion,
        device,
    )
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "tut3-model.pt")
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

 50%|█████     | 1/2 [18:44<18:44, 1124.32s/it]

	Train Loss:   5.078 | Train PPL: 160.429
	Valid Loss:   4.686 | Valid PPL: 108.464


100%|██████████| 2/2 [37:31<00:00, 1125.74s/it]

	Train Loss:   4.029 | Train PPL:  56.208
	Valid Loss:   4.154 | Valid PPL:  63.676





In [135]:
input_dim = len(de_vocab)  #21491
output_dim = len(en_vocab)  #11919
encoder_embedding_dim = 256
decoder_embedding_dim = 256
encoder_hidden_dim = 512
decoder_hidden_dim = 512
encoder_dropout = 0.5
decoder_dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

attention = Attention(encoder_hidden_dim, decoder_hidden_dim)

encoder = Encoder(
    input_dim,
    encoder_embedding_dim,
    encoder_hidden_dim,
    decoder_hidden_dim,
    encoder_dropout,
)

decoder = Decoder(
    output_dim,
    decoder_embedding_dim,
    encoder_hidden_dim,
    decoder_hidden_dim,
    decoder_dropout,
    attention,
)

model = Seq2Seq(encoder, decoder, device).to(device)

In [132]:
print(input_dim,output_dim)

21491 11919


In [29]:
model.load_state_dict(torch.load('tut3-model.pt'))
model.eval()

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(23055, 256)
    (rnn): GRU(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn_fc): Linear(in_features=1536, out_features=512, bias=True)
      (v_fc): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(12327, 256)
    (rnn): GRU(1280, 512)
    (fc_out): Linear(in_features=1792, out_features=12327, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [None]:
test_loss = evaluate_seq2seq(model, test_data_loader, criterion, device)
print(f"| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f} |")

In [31]:
def translate_sentence(
    sentence,
    model,
    en_nlp,
    de_nlp,
    en_vocab,
    de_vocab,
    lower,
    sos_token,
    eos_token,
    device,
    max_output_length=25,
):
    model.eval()
    with torch.no_grad():
        if isinstance(sentence, str):
            de_tokens = [token.text for token in de_nlp.tokenizer(sentence)]
        else:
            de_tokens = [token for token in sentence]
        if lower:
            de_tokens = [token.lower() for token in de_tokens]
        de_tokens = [sos_token] + de_tokens + [eos_token]
        ids = de_vocab.lookup_indices(de_tokens)
        tensor = torch.LongTensor(ids).unsqueeze(-1).to(device)
        encoder_outputs, hidden = model.encoder(tensor)
        inputs = en_vocab.lookup_indices([sos_token])
        attentions = torch.zeros(max_output_length, 1, len(ids))
        for i in range(max_output_length):
            inputs_tensor = torch.LongTensor([inputs[-1]]).to(device)
            output, hidden, attention = model.decoder(
                inputs_tensor, hidden, encoder_outputs
            )
            attentions[i] = attention
            predicted_token = output.argmax(-1).item()
            inputs.append(predicted_token)
            if predicted_token == en_vocab[eos_token]:
                break
        en_tokens = en_vocab.lookup_tokens(inputs)
    return en_tokens, de_tokens, attentions[: len(en_tokens) - 1]

In [138]:
def plot_attention(sentence, translation, attention):
    fig, ax = plt.subplots(figsize=(10, 10))
    attention = attention.squeeze(1).numpy()
    cax = ax.matshow(attention, cmap="bone")
    ax.set_xticks(ticks=np.arange(len(sentence)), labels=sentence, rotation=90, size=15)
    translation = translation[1:]
    ax.set_yticks(ticks=np.arange(len(translation)), labels=translation, size=15)
    plt.show()
    plt.close()

In [32]:
i = 1
sentence = data[i]["de"]
expected_translation = data[i]["en"]
sentence, expected_translation

('Hallo!', 'Hi.')

In [33]:
translation, sentence_tokens, attention = translate_sentence(
    sentence,
    model,
    en_nlp,
    de_nlp,
    en_vocab,
    de_vocab,
    lower,
    sos_token,
    eos_token,
    device,
)

print(translation)
print(expected_translation)

['<sos>', 'let', "'s", 'a', '.', '<eos>']
Hi.


In [35]:
import wandb

# Inicializar wandb
api_key = "1d1e143a3eefd6e21e71cefd94bfa81337fc22e1" # your api key
wandb.login(key = api_key)
wandb.init(project="uab-grup14")
    

# Definir el optimizador y la función de pérdida
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

# Función de entrenamiento
def train_seq2seq(model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(data_loader):
        src = batch["de_ids"].to(device)
        trg = batch["en_ids"].to(device)
        optimizer.zero_grad()
        output = model(src, trg, teacher_forcing_ratio)
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

# Función de evaluación
def evaluate_seq2seq(model, data_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            src = batch["de_ids"].to(device)
            trg = batch["en_ids"].to(device)
            output = model(src, trg, 0)  # turn off teacher forcing
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(data_loader)



VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112232877773446, max=1.0…

In [39]:
n_epochs = 5
clip = 1.0
teacher_forcing_ratio = 0.5

best_valid_loss = float("inf")

for epoch in tqdm.tqdm(range(n_epochs)):
    train_loss = train_seq2seq(
        model,
        train_data_loader,
        optimizer,
        criterion,
        clip,
        teacher_forcing_ratio,
        device,
    )
    valid_loss,bleu = evaluate_seq2seq(
        model,
        test_data_loader,
        criterion,
        device,
    )

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "best_model.pt")

    # Log metrics to wandb
    wandb.log({
        'Train Loss': train_loss,
        'Valid Loss': valid_loss,
        'Bleu': bleu,
        'Train PPL': np.exp(train_loss),
        'Valid PPL': np.exp(valid_loss)
    })

    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")
    print(f"\tBleu: {bleu}")

# Finalizar wandb
wandb.finish()

  0%|          | 0/5 [16:49<?, ?it/s]


TypeError: object of type 'int' has no len()