# Práctica 2: Implementación de un mecanismo de atención en un modelo Seq2Seq con LSTMs

Partiendo del código del modelo seq2seq con feedback para tareas de Traducción Automática Neuronal (NMT) del notebook anterior, se debe implementar el modelo de atención de Bahdanau o Luong.

Objetivos de la práctica:
- Entender el funcionamiento de los modelos Seq2Seq con LSTMs.
- Comprender e implementar mecanismos de atención.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import warnings

from torch.utils.data import DataLoader
from attention.attention_factory import AttentionFactory
from translation import Translation, collate_fn

# Modelo Seq2Seq
from models_definition.seq2seq.encoder import Encoder
from models_definition.seq2seq.decoder import Decoder
from models_definition.seq2seq.seq2seq import Seq2Seq

# Modelo Bahdanau
from models_definition.bahdanau.encoder import Encoder as EncoderBahdanau
from models_definition.bahdanau.decoder import Decoder as DecoderBahdanau
from models_definition.bahdanau.bahdanau import Bahdanau

# Modelo Luong
from models_definition.luong.encoder import Encoder as EncoderLuong
from models_definition.luong.decoder import Decoder as DecoderLuong
from models_definition.luong.luong import Luong

#import wandb

### TODO:
- Modelo Luong
- Modelo Bahdanau:
    - son LSTM bidireccionales con la segunda entrada en orden inverso (reverse): OJO
    - 

Conexión con *Weights & Biases*

In [2]:
# Hyperparameters recorded with the Weights & Biases run. Later cells read
# them back through `config` (learning_rate, epochs, batch_size).
# NOTE(review): `import wandb` is commented out in the import cell, so it has
# to be re-enabled before this cell can run.
run_hyperparameters = dict(
    learning_rate=0.001,
    architecture="LSTM",
    epochs=30,
    batch_size=7,
)

wandb.init(
    project="LSTM-Attention",
    name="Bahdanau",
    config=run_hyperparameters,
)

config = wandb.config

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msusanasrez[0m ([33mdata2023[0m). Use [1m`wandb login --relogin`[0m to force relogin


## 1. Cargar los datos

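Cargar los datos: se indican las rutas de los ficheros paralelos en inglés y en español y se construye con ellos el objeto `Translation`.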

In [2]:
# Paths of the parallel corpus files: English side first, Spanish side second.
archivo_ingles, archivo_espanol = (
    'datasets_practice/mock/mock.en',
    'datasets_practice/mock/mock.es',
)

# Build the project-level Translation object from both files; later cells use
# its vocabularies, tokenizer and train/test datasets.
translation = Translation(archivo_ingles, archivo_espanol)

## 2. Entrenamiento

### 2.1. Entrenamiento Seq2Seq con atención

In [None]:
# Hyperparameters
input_dim = 300  # first size argument handed to both Encoder and Decoder
output_dim = translation.vocab_es.vectors.shape[0]  # one output class per row of the Spanish vector table
hidden_dim = 512
num_layers = 1
num_workers = 0
shuffle = True  # whether the training loader reshuffles the data every epoch

attention = AttentionFactory.initialize_attention("Multi-Layer Perceptron", hidden_dim, hidden_dim)

# Build the model, the optimizer and the loss function.
# `config` comes from the Weights & Biases cell (learning_rate, batch_size).
encoder = Encoder(input_dim, hidden_dim, num_layers)
decoder = Decoder(input_dim, hidden_dim, output_dim, num_layers, attention=attention)
model = Seq2Seq(encoder, decoder)
optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
criterion = nn.CrossEntropyLoss()

# The training loader follows the `shuffle` flag declared above; the test
# loader always keeps a fixed order so evaluation runs are comparable.
train_loader = DataLoader(translation.train_dataset, batch_size=config.batch_size, shuffle=shuffle, num_workers=num_workers, collate_fn=collate_fn)
test_loader = DataLoader(translation.test_dataset, batch_size=config.batch_size, shuffle=False, num_workers=num_workers, collate_fn=collate_fn)

In [None]:
# Silence every Python warning from here on so the training log stays readable.
warnings.filterwarnings("ignore")

for epoch in range(config.epochs):

    # ---- Training pass ----
    model.train()
    total_loss = 0

    for batch_idx, (src, tgt, src_indices, tgt_indices) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(src, tgt)

        tgt_indices = torch.tensor(tgt_indices, dtype=torch.long)
        # Add up the cross-entropy of every target position except position 0
        # (presumably the <sos> token, which is never predicted).
        loss = sum(
            criterion(output[:, t, :], tgt_indices[:, t])
            for t in range(1, tgt.shape[1])
        )

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Progress line every fifth batch.
        if batch_idx % 5 == 0:
            print(f'Epoch [{epoch+1}/{config.epochs}], Step [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

    # ---- Evaluation pass (no gradient tracking, no parameter updates) ----
    model.eval()
    test_loss = 0

    with torch.no_grad():
        for src, tgt, src_indices, tgt_indices in test_loader:
            output = model(src, tgt)

            tgt_indices = torch.tensor(tgt_indices, dtype=torch.long)
            loss = sum(
                criterion(output[:, t, :], tgt_indices[:, t])
                for t in range(1, tgt.shape[1])
            )

            test_loss += loss.item()

    # W&B receives the summed losses; the printed line shows per-batch averages.
    wandb.log({"Train loss": total_loss,"Test_loss": test_loss})

    print(f'Epoch [{epoch+1}/{config.epochs}], Average Train Loss: {total_loss / len(train_loader):.4f}, Average Test Loss: {test_loss / len(test_loader):.4f}')
    print('--------------------------------------------------------------------------------------------------------------')

Epoch [1/30], Step [1/2], Loss: 41.3289
Epoch [1/30], Average Train Loss: 34.4250, Average Test Loss: 26.7123
--------------------------------------------------------------------------------------------------------------
Epoch [2/30], Step [1/2], Loss: 40.3296
Epoch [2/30], Average Train Loss: 32.5693, Average Test Loss: 22.1751
--------------------------------------------------------------------------------------------------------------
Epoch [3/30], Step [1/2], Loss: 34.9326
Epoch [3/30], Average Train Loss: 33.4764, Average Test Loss: 15.7554
--------------------------------------------------------------------------------------------------------------
Epoch [4/30], Step [1/2], Loss: 24.3726
Epoch [4/30], Average Train Loss: 24.7107, Average Test Loss: 15.6595
--------------------------------------------------------------------------------------------------------------
Epoch [5/30], Step [1/2], Loss: 21.8719
Epoch [5/30], Average Train Loss: 20.4075, Average Test Loss: 13.3263
------

In [22]:
# Persist the trained weights. torch.save does not create missing parent
# directories, so make sure the checkpoint folder exists before writing.
# NOTE(review): the file name says "no_attention", but the model built in this
# section is constructed with an attention module; confirm the intended name.
import os

os.makedirs('./training_models', exist_ok=True)
torch.save(model.state_dict(), './training_models/no_attention_Seq2Seq.pth')

In [3]:
# Close the active Weights & Biases run now that this model's training is over.
wandb.finish()

### 2.2. Entrenamiento Bahdanau

In [4]:
# Hyperparameters
input_dim = 300  # first size argument handed to both encoder and decoder
output_dim = translation.vocab_es.vectors.shape[0]  # one output class per row of the Spanish vector table
hidden_dim = 512
num_layers = 1
num_workers = 0
shuffle = True  # whether the training loader reshuffles the data every epoch

# Build the Bahdanau model, the optimizer and the loss function.
# `config` comes from the Weights & Biases cell (learning_rate, batch_size).
encoder = EncoderBahdanau(input_dim, hidden_dim, num_layers)
decoder = DecoderBahdanau(input_dim, hidden_dim, output_dim, num_layers)
model = Bahdanau(encoder, decoder)
optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
criterion = nn.CrossEntropyLoss()

# The training loader follows the `shuffle` flag declared above; the test
# loader always keeps a fixed order so evaluation runs are comparable.
train_loader = DataLoader(translation.train_dataset, batch_size=config.batch_size, shuffle=shuffle, num_workers=num_workers, collate_fn=collate_fn)
test_loader = DataLoader(translation.test_dataset, batch_size=config.batch_size, shuffle=False, num_workers=num_workers, collate_fn=collate_fn)

In [5]:
# Silence every Python warning from here on so the training log stays readable.
warnings.filterwarnings("ignore")

for epoch in range(config.epochs):

    # ---- Training pass ----
    model.train()
    total_loss = 0

    for batch_idx, (src, tgt, src_indices, tgt_indices) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(src, tgt)

        tgt_indices = torch.tensor(tgt_indices, dtype=torch.long)
        # Add up the cross-entropy of every target position except position 0
        # (presumably the <sos> token, which is never predicted).
        loss = sum(
            criterion(output[:, t, :], tgt_indices[:, t])
            for t in range(1, tgt.shape[1])
        )

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Progress line every fifth batch.
        if batch_idx % 5 == 0:
            print(f'Epoch [{epoch+1}/{config.epochs}], Step [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

    # ---- Evaluation pass (no gradient tracking, no parameter updates) ----
    model.eval()
    test_loss = 0

    with torch.no_grad():
        for src, tgt, src_indices, tgt_indices in test_loader:
            output = model(src, tgt)

            tgt_indices = torch.tensor(tgt_indices, dtype=torch.long)
            loss = sum(
                criterion(output[:, t, :], tgt_indices[:, t])
                for t in range(1, tgt.shape[1])
            )

            test_loss += loss.item()

    # W&B receives the summed losses; the printed line shows per-batch averages.
    wandb.log({"Train loss": total_loss,"Test_loss": test_loss})

    print(f'Epoch [{epoch+1}/{config.epochs}], Average Train Loss: {total_loss / len(train_loader):.4f}, Average Test Loss: {test_loss / len(test_loader):.4f}')
    print('--------------------------------------------------------------------------------------------------------------')

Epoch [1/30], Step [1/2], Loss: 41.4318
Epoch [1/30], Average Train Loss: 40.5970, Average Test Loss: 23.3346
--------------------------------------------------------------------------------------------------------------
Epoch [2/30], Step [1/2], Loss: 35.0180
Epoch [2/30], Average Train Loss: 27.0646, Average Test Loss: 13.1916
--------------------------------------------------------------------------------------------------------------
Epoch [3/30], Step [1/2], Loss: 23.0014
Epoch [3/30], Average Train Loss: 20.0131, Average Test Loss: 13.3988
--------------------------------------------------------------------------------------------------------------
Epoch [4/30], Step [1/2], Loss: 16.7200
Epoch [4/30], Average Train Loss: 14.3433, Average Test Loss: 9.8559
--------------------------------------------------------------------------------------------------------------
Epoch [5/30], Step [1/2], Loss: 10.5424
Epoch [5/30], Average Train Loss: 8.9326, Average Test Loss: 8.1712
---------

In [6]:
# Persist the trained Bahdanau weights. torch.save does not create missing
# parent directories, so make sure the checkpoint folder exists before writing.
import os

os.makedirs('./training_models', exist_ok=True)
torch.save(model.state_dict(), './training_models/bahdanau.pth')

In [None]:
# Close the active Weights & Biases run now that this model's training is over.
wandb.finish()

### 2.3. Entrenamiento Luong

In [3]:
# Hyperparameters (kept as plain variables here; this section does not read
# them from the Weights & Biases config).
input_dim = 300  # first size argument handed to both encoder and decoder
output_dim = translation.vocab_es.vectors.shape[0]  # one output class per row of the Spanish vector table
hidden_dim = 512
num_layers = 1
num_workers = 0
shuffle = True  # whether the training loader reshuffles the data every epoch
lr = 0.001
batch_size = 7

# Build the Luong model, the optimizer and the loss function.
encoder = EncoderLuong(input_dim, hidden_dim, num_layers)
decoder = DecoderLuong(input_dim, hidden_dim, output_dim, num_layers)
model = Luong(encoder, decoder)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

# The training loader follows the `shuffle` flag declared above; the test
# loader always keeps a fixed order so evaluation runs are comparable.
train_loader = DataLoader(translation.train_dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers, collate_fn=collate_fn)
test_loader = DataLoader(translation.test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, collate_fn=collate_fn)

In [4]:
# Silence every Python warning from here on so the training log stays readable.
warnings.filterwarnings("ignore")

epochs = 30

for epoch in range(epochs):

    # ---- Training pass ----
    model.train()
    total_loss = 0

    for batch_idx, (src, tgt, src_indices, tgt_indices) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(src, tgt)

        tgt_indices = torch.tensor(tgt_indices, dtype=torch.long)
        # Add up the cross-entropy of every target position except position 0
        # (presumably the <sos> token, which is never predicted).
        loss = sum(
            criterion(output[:, t, :], tgt_indices[:, t])
            for t in range(1, tgt.shape[1])
        )

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Progress line every fifth batch.
        if batch_idx % 5 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Step [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

    # ---- Evaluation pass (no gradient tracking, no parameter updates) ----
    model.eval()
    test_loss = 0

    with torch.no_grad():
        for src, tgt, src_indices, tgt_indices in test_loader:
            output = model(src, tgt)

            tgt_indices = torch.tensor(tgt_indices, dtype=torch.long)
            loss = sum(
                criterion(output[:, t, :], tgt_indices[:, t])
                for t in range(1, tgt.shape[1])
            )

            test_loss += loss.item()

    # Weights & Biases logging is switched off for this run:
    #wandb.log({"Train loss": total_loss,"Test_loss": test_loss})

    print(f'Epoch [{epoch+1}/{epochs}], Average Train Loss: {total_loss / len(train_loader):.4f}, Average Test Loss: {test_loss / len(test_loader):.4f}')
    print('--------------------------------------------------------------------------------------------------------------')

Epoch [1/30], Step [1/2], Loss: 41.4839
Epoch [1/30], Average Train Loss: 34.2284, Average Test Loss: 39.8299
--------------------------------------------------------------------------------------------------------------
Epoch [2/30], Step [1/2], Loss: 39.2797
Epoch [2/30], Average Train Loss: 31.7624, Average Test Loss: 36.1807
--------------------------------------------------------------------------------------------------------------
Epoch [3/30], Step [1/2], Loss: 33.8748
Epoch [3/30], Average Train Loss: 26.7794, Average Test Loss: 31.7229
--------------------------------------------------------------------------------------------------------------
Epoch [4/30], Step [1/2], Loss: 18.6728
Epoch [4/30], Average Train Loss: 23.4287, Average Test Loss: 29.2567
--------------------------------------------------------------------------------------------------------------
Epoch [5/30], Step [1/2], Loss: 26.5393
Epoch [5/30], Average Train Loss: 25.4322, Average Test Loss: 27.5471
------

Nota: crear la carpeta `./training_models` (si todavía no existe) y ejecutar las dos celdas siguientes seguidas, una tras otra.

In [5]:
# Persist the trained Luong weights. torch.save does not create missing parent
# directories, so make sure the checkpoint folder exists before writing.
import os

os.makedirs('./training_models', exist_ok=True)
torch.save(model.state_dict(), './training_models/luong.pth')

## 3. Evaluación 

In [6]:
# Same sizes used when the Luong model was trained; the architecture must
# match the checkpoint for the saved weights to load.
input_dim = 300
hidden_dim = 512
num_layers = 1
num_workers = 0
shuffle = True
output_dim = translation.vocab_es.vectors.shape[0]

# Rebuild an untrained Luong model with that architecture.
encoder = EncoderLuong(input_dim, hidden_dim, num_layers)
decoder = DecoderLuong(input_dim, hidden_dim, output_dim, num_layers)
model = Luong(encoder, decoder)

# Restore the saved weights into it.
saved_weights = torch.load('./training_models/luong.pth')
model.load_state_dict(saved_weights)

# Switch to inference mode; as the last expression of the cell this also
# displays the model summary.
model.eval()

Luong(
  (encoder): Encoder(
    (rnn): LSTM(300, 512, batch_first=True)
  )
  (decoder): Decoder(
    (rnn): LSTM(300, 512, batch_first=True)
    (linear): Linear(in_features=1024, out_features=512, bias=True)
    (fc_out): Linear(in_features=512, out_features=985671, bias=True)
    (attention): BilinearAttention(
      (W): Linear(in_features=512, out_features=512, bias=False)
    )
  )
)

In [14]:
sentence = "head"

# Tokenize the English sentence, append the end-of-sentence marker and look up
# one vector per token; unsqueeze(0) adds a leading batch axis of size 1.
tokens = translation.tokenizer_en(sentence) + ['<eos>']
text_tensor = translation.vocab_en.get_vecs_by_tokens(tokens).unsqueeze(0)

# Encode the source sentence once.
with torch.no_grad():
    encoder_outputs, (hidden, cell) = model.encoder(text_tensor)

outputs = []

# The decoder's first input is the vector of the <sos> token.
sos_index = torch.tensor(translation.vocab_es.stoi['<sos>']).unsqueeze(0)
input_token = translation.vocab_es.vectors[sos_index].unsqueeze(0)

max_decoding_steps = 5  # hard cap on the number of generated tokens
eos_id = translation.vocab_es.stoi['<eos>']

# Greedy decoding: the model's own prediction is fed back at every step
# (no teacher forcing).
for _ in range(max_decoding_steps):
    with torch.no_grad():
        output, (hidden, cell) = model.decoder(input_token, hidden, cell, encoder_outputs)

    # Keep the highest-scoring vocabulary entry.
    best_guess = output.argmax(2).squeeze(0)
    predicted_id = best_guess.item()
    outputs.append(predicted_id)

    # Stop as soon as the end-of-sentence token is produced.
    if predicted_id == eos_id:
        break

    # The predicted word becomes the next decoder input.
    input_token = translation.vocab_es.vectors[best_guess].unsqueeze(0)

# Map the predicted indices back to words.
translated_sentence = [translation.vocab_es.itos[idx] for idx in outputs]

result = ' '.join(translated_sentence)

print(result)

mi <eos>
