# **Configuración del Entrenamiento Seq2Seq con Atención**


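En este cuaderno configuraremos todos los componentes necesarios para entrenar nuestro modelo Seq2Seq con atención de Bahdanau: la capa de atención, el modelo, la carga de datos, los hiperparámetros, el optimizador y la función de pérdida.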

## **Atención**

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Attention(nn.Module):
    """Additive (Bahdanau-style) attention over the encoder outputs.

    Each source position is scored with ``v^T tanh(W_enc h_enc + W_dec h_dec)``;
    the scores are normalised with a softmax over the source axis and used to
    build a weighted sum of the encoder outputs (the context vector).
    """

    def __init__(self, enc_hid_dim, dec_hid_dim):
        """
        Build the three bias-free projections used to score source positions.

        Args:
            enc_hid_dim (int): Hidden size of ONE direction of the encoder; the
                encoder outputs are expected to carry ``2 * enc_hid_dim``
                features (bidirectional encoder).
            dec_hid_dim (int): Hidden size of the decoder.
        """
        super().__init__()

        # Projects every encoder output into the decoder hidden space.
        self.attn_W_enc = nn.Linear(enc_hid_dim * 2, dec_hid_dim, bias=False)
        # Projects the decoder state (the query).
        self.attn_W_dec = nn.Linear(dec_hid_dim, dec_hid_dim, bias=False)
        # Reduces the combined energy to one scalar score per source position.
        self.attn_v = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, decoder_hidden, encoder_outputs):
        """
        Compute the attention weights and the context vector.

        Args:
            decoder_hidden (Tensor): Decoder state from the previous step,
                either ``[batch, dec_hid_dim]`` or the stacked LSTM state
                ``[n_layers, batch, dec_hid_dim]``. For a stacked state only
                the top layer is used as the query.
            encoder_outputs (Tensor): Encoder outputs for every time step,
                ``[batch, src_len, 2 * enc_hid_dim]``.

        Returns:
            tuple:
                context_vector (Tensor): ``[batch, 2 * enc_hid_dim]``.
                attention_weights (Tensor): ``[batch, src_len]``; each row
                sums to 1.
        """
        # A multi-layer LSTM hands over [n_layers, batch, hid]; the original
        # unsqueeze/repeat only worked for a 2-D state and raised otherwise.
        if decoder_hidden.dim() == 3:
            decoder_hidden = decoder_hidden[-1]

        # Project the query once and let broadcasting spread it over src_len
        # instead of materialising src_len copies before the projection.
        query = self.attn_W_dec(decoder_hidden).unsqueeze(1)

        energy = torch.tanh(self.attn_W_enc(encoder_outputs) + query)

        attention_scores = self.attn_v(energy).squeeze(2)

        attention_weights = F.softmax(attention_scores, dim=1)

        # [batch, 1, src_len] x [batch, src_len, feat] -> [batch, 1, feat]
        context_vector = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs)
        context_vector = context_vector.squeeze(1)

        return context_vector, attention_weights




## **model.py con atención**

In [3]:
# This file defines the Encoder, the Decoder and the Seq2Seq wrapper.

import torch
import torch.nn as nn
import random

class Encoder(nn.Module):
    """Bidirectional multi-layer LSTM encoder for the source sequence."""

    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, dropout, pad_idx):
        """
        Build the embedding, the bidirectional LSTM and the bridge layers.

        Args:
            input_dim (int): Size of the source vocabulary.
            emb_dim (int): Embedding dimension.
            hidden_dim (int): Hidden size of each LSTM direction.
            n_layers (int): Number of stacked LSTM layers.
            dropout (float): Dropout probability (embeddings and, when
                ``n_layers > 1``, between LSTM layers).
            pad_idx (int): Index of the padding token in the vocabulary.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)

        # Inter-layer dropout is only meaningful with more than one layer.
        inter_layer_dropout = dropout if n_layers > 1 else 0
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=inter_layer_dropout,
            bidirectional=True,
            batch_first=True,
        )

        # Bridges that shrink the concatenated forward/backward states back to
        # hidden_dim.
        self.fc_hidden = nn.Linear(hidden_dim * 2, hidden_dim)
        self.fc_cell = nn.Linear(hidden_dim * 2, hidden_dim)

        self.dropout = nn.Dropout(dropout)

    def _merge_directions(self, state):
        """Concatenate the forward and backward state of every layer.

        Args:
            state (Tensor): ``[n_layers * 2, batch, hidden_dim]`` final state
                returned by the bidirectional LSTM.

        Returns:
            Tensor: ``[n_layers, batch, 2 * hidden_dim]``.
        """
        batch_size = state.size(1)
        per_layer = state.reshape(self.n_layers, 2, batch_size, self.hidden_dim)
        forward_state, backward_state = per_layer[:, 0], per_layer[:, 1]
        return torch.cat((forward_state, backward_state), dim=2)

    def forward(self, src):
        """
        Encode the source sequence.

        Args:
            src (Tensor): Source token ids, ``[batch_size, src_len]``.

        Returns:
            tuple:
                outputs (Tensor): ``[batch_size, src_len, 2 * hidden_dim]``.
                hidden (Tensor): ``[n_layers, batch_size, hidden_dim]``.
                cell (Tensor): ``[n_layers, batch_size, hidden_dim]``.
        """
        embedded = self.dropout(self.embedding(src))

        outputs, (final_hidden, final_cell) = self.rnn(embedded)

        hidden = torch.tanh(self.fc_hidden(self._merge_directions(final_hidden)))
        cell = torch.tanh(self.fc_cell(self._merge_directions(final_cell)))

        return outputs, hidden, cell

class Decoder(nn.Module):
    """LSTM decoder that produces one target token per call, using attention."""

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, n_layers, dropout, pad_idx, attention):
        """
        Build the embedding, the LSTM and the output projection.

        Args:
            output_dim (int): Size of the target vocabulary.
            emb_dim (int): Embedding dimension.
            enc_hid_dim (int): Hidden size of one encoder direction (the
                context vector is expected to have ``2 * enc_hid_dim`` features).
            dec_hid_dim (int): Hidden size of the decoder LSTM.
            n_layers (int): Number of stacked LSTM layers.
            dropout (float): Dropout probability.
            pad_idx (int): Index of the padding token in the vocabulary.
            attention (callable): Attention layer called as
                ``attention(query, encoder_outputs)`` with a
                ``[batch, dec_hid_dim]`` query; must return
                ``(context, attention_weights)``.
        """
        super().__init__()

        self.output_dim = output_dim
        self.hidden_dim = dec_hid_dim
        self.n_layers = n_layers
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=pad_idx)

        # The LSTM consumes the token embedding concatenated with the context.
        self.rnn = nn.LSTM(emb_dim + (enc_hid_dim * 2), dec_hid_dim, n_layers,
                           dropout=dropout if n_layers > 1 else 0, batch_first=True)

        # The prediction sees embedding, context and LSTM output together.
        self.fc_out = nn.Linear(emb_dim + (enc_hid_dim * 2) + dec_hid_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell, encoder_outputs):
        """
        Run a single decoding step.

        Args:
            input (Tensor): Current input token ids, ``[batch_size]``.
            hidden (Tensor): Previous hidden state,
                ``[n_layers, batch_size, hidden_dim]``.
            cell (Tensor): Previous cell state,
                ``[n_layers, batch_size, hidden_dim]``.
            encoder_outputs (Tensor): Encoder outputs for every source
                position, forwarded unchanged to the attention layer.

        Returns:
            tuple:
                prediction (Tensor): Unnormalised scores over the target
                vocabulary, ``[batch_size, output_dim]``.
                hidden (Tensor): Updated hidden state.
                cell (Tensor): Updated cell state.
        """
        input = input.unsqueeze(1)  # [batch_size, 1]
        embedded = self.dropout(self.embedding(input))  # [batch_size, 1, emb_dim]

        # The attention layer scores one query vector per batch element, so it
        # gets the top layer of the stacked state. Passing the whole
        # [n_layers, batch, hid] tensor (as before) makes it fail.
        context, _attention_weights = self.attention(hidden[-1], encoder_outputs)

        context = context.unsqueeze(1)

        rnn_input = torch.cat((embedded, context), dim=2)

        rnn_output, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))

        # Drop the length-1 time axis before the final projection.
        embedded = embedded.squeeze(1)
        context = context.squeeze(1)
        rnn_output = rnn_output.squeeze(1)

        fc_input = torch.cat((embedded, context, rnn_output), dim=1)

        prediction = self.fc_out(fc_input)

        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time."""

    def __init__(self, encoder, decoder, device):
        """
        Args:
            encoder (Encoder): Encoder instance.
            decoder (Decoder): Decoder instance.
            device (torch.device): Device on which the output tensor is
                allocated (cpu or cuda).
        """
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        # The encoder state is fed directly into the decoder, so the depths
        # must match. The check is skipped when either side has no n_layers.
        both_expose_depth = hasattr(encoder, 'n_layers') and hasattr(decoder, 'n_layers')
        if both_expose_depth:
            assert encoder.n_layers == decoder.n_layers, \
                "El encoder y decoder deben de tener el mismo numero de capas"

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """
        Encode ``src`` and decode step by step along ``trg``.

        Args:
            src (Tensor): Source sequence, ``[batch_size, src_len]``.
            trg (Tensor): Target sequence, ``[batch_size, trg_len]``.
            teacher_forcing_ratio (float): Probability, drawn once per step
                for the whole batch, of feeding the ground-truth token instead
                of the model's own prediction as the next input.

        Returns:
            Tensor: Decoder scores, ``[batch_size, trg_len, output_dim]``.
            Position 0 is left as zeros because decoding starts from the
            first target token.
        """
        n_samples, n_steps = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dim

        outputs = torch.zeros(n_samples, n_steps, vocab_size, device=self.device)

        encoder_outputs, hidden, cell = self.encoder(src)

        # The first decoder input is the first target token of every sample.
        decoder_input = trg[:, 0]

        for step in range(1, n_steps):
            step_scores, hidden, cell = self.decoder(
                decoder_input, hidden, cell, encoder_outputs
            )
            outputs[:, step, :] = step_scores

            use_ground_truth = random.random() < teacher_forcing_ratio
            if use_ground_truth:
                decoder_input = trg[:, step]
            else:
                # Greedy choice: the highest-scoring token becomes the next input.
                decoder_input = step_scores.argmax(1)

        return outputs

## **data_loader.py**

In [4]:
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
import torch

class SummarizationDataset(Dataset):
    """Dataset that yields (article, highlights) pairs as token-id tensors.

    Each text is tokenized, truncated and wrapped with the BOS and EOS ids.
    """

    def __init__(self, dataframe, tokenizer, max_len_article, max_len_highlight, bos_token_id, eos_token_id):
        """
        Args:
            dataframe: Table with an ``'article'`` and a ``'highlights'``
                column, indexed positionally through ``.iloc``.
            tokenizer: Object exposing ``enable_truncation(max_length=...)``
                and ``encode(text)`` whose result has an ``ids`` attribute.
            max_len_article (int): Maximum article length, BOS/EOS included.
            max_len_highlight (int): Maximum summary length, BOS/EOS included.
            bos_token_id (int): Id prepended to every sequence.
            eos_token_id (int): Id appended to every sequence.
        """
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        # Two positions are reserved for the BOS and EOS tokens.
        self.max_len_article = max_len_article - 2
        self.max_len_highlight = max_len_highlight - 2
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

    def __len__(self):
        """Return the number of rows in the underlying dataframe."""
        return len(self.dataframe)

    def _encode(self, text, max_length):
        """Tokenize ``text``, truncate it and wrap it with BOS/EOS.

        The truncation length is set on the shared tokenizer right before
        encoding, because articles and summaries use different limits.
        """
        self.tokenizer.enable_truncation(max_length=max_length)
        token_ids = self.tokenizer.encode(text).ids
        return torch.tensor(
            [self.bos_token_id, *token_ids, self.eos_token_id],
            dtype=torch.long,
        )

    def __getitem__(self, idx):
        """Return ``(article_tensor, highlight_tensor)`` for row ``idx``.

        Both are 1-D ``torch.long`` tensors of the form ``[BOS, ids..., EOS]``.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        row = self.dataframe.iloc[idx]
        article_text = row['article']
        highlight_text = row['highlights']

        article_tensor = self._encode(article_text, self.max_len_article)
        highlight_tensor = self._encode(highlight_text, self.max_len_highlight)

        return article_tensor, highlight_tensor

def collate_fn(batch, pad_idx=1):
    """Pad a list of (source, target) tensor pairs into two batch tensors.

    Args:
        batch: Iterable of ``(src_tensor, tgt_tensor)`` pairs of 1-D tensors,
            as produced by ``SummarizationDataset.__getitem__``.
        pad_idx (int): Value used to fill the padded positions. Defaults to 1,
            the value that was previously hard-coded as the padding index, so
            ``DataLoader(collate_fn=collate_fn)`` keeps working unchanged.
            Use ``functools.partial(collate_fn, pad_idx=...)`` for another id.

    Returns:
        tuple: ``(src_batch_padded, tgt_batch_padded)``, each of shape
        ``[batch_size, longest_sequence_in_batch]``.
    """
    src_batch = [src_sample for src_sample, _ in batch]
    tgt_batch = [tgt_sample for _, tgt_sample in batch]

    src_batch_padded = pad_sequence(src_batch, batch_first=True, padding_value=pad_idx)
    tgt_batch_padded = pad_sequence(tgt_batch, batch_first=True, padding_value=pad_idx)

    return src_batch_padded, tgt_batch_padded

## **Tokenizer**

In [5]:
import os
from tokenizers import Tokenizer

# Load the trained tokenizer from its JSON file.
TOKENIZER_DIR = "cnn_dailymail_bpe_tokenizer"
TOKENIZER_PATH = os.path.join(TOKENIZER_DIR, "tokenizer.json")

tokenizer = Tokenizer.from_file(TOKENIZER_PATH)
print(f"Tokenizador cargado desde: {TOKENIZER_PATH}")

# Source and target share the same tokenizer, hence the same vocabulary size.
INPUT_DIM = OUTPUT_DIM = tokenizer.get_vocab_size()

Tokenizador cargado desde: cnn_dailymail_bpe_tokenizer\tokenizer.json


In [6]:
# Look up the ids of the special tokens in the loaded vocabulary.
PAD_IDX, BOS_IDX, EOS_IDX, UNK_IDX = (
    tokenizer.token_to_id(special_token)
    for special_token in ("<pad>", "<bos>", "<eos>", "<unk>")
)

In [7]:
# Report the vocabulary size and the ids of the special tokens, one per line.
print(
    f"Tamaño del Vocabulario (INPUT_DIM/OUTPUT_DIM): {INPUT_DIM}",
    f"Índice de OOV: {UNK_IDX}",
    f"Índice de Padding: {PAD_IDX}",
    f"Índice de Begin of seq.: {BOS_IDX}",
    f"Índice de End of seq.: {EOS_IDX}",
    sep="\n",
)


Tamaño del Vocabulario (INPUT_DIM/OUTPUT_DIM): 30000
Índice de OOV: 0
Índice de Padding: 1
Índice de Begin of seq.: 2
Índice de End of seq.: 3


## **Hiperparámetros**

In [8]:
import numpy as np

# --- Model size -------------------------------------------------------------
ENC_EMB_DIM = DEC_EMB_DIM = 256
HID_DIM = 512
# Encoder and decoder use the same hidden size.
ENC_HID_DIM = DEC_HID_DIM = HID_DIM
N_LAYERS = 2
ENC_DROPOUT = DEC_DROPOUT = 0.3

# --- Optimisation -----------------------------------------------------------
LEARNING_RATE = 5e-4  # learning rate
BATCH_SIZE = 32
N_EPOCHS = 10         # number of epochs
CLIP = 1              # gradient clipping

# --- Device and reproducibility ---------------------------------------------
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

# Seed every random number generator used in the notebook.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# --- Sequence length limits (in tokens) -------------------------------------
MAX_TOKENS_ARTICLE = 1200
MAX_TOKENS_HIGHLIGHT = 130

print(
    f"Device: {DEVICE}",
    f"Batch Size: {BATCH_SIZE}",
    f"Learning Rate: {LEARNING_RATE}",
    sep="\n",
)

Device: cpu
Batch Size: 32
Learning Rate: 0.0005


## Cargar los datos y crear los DataLoaders

In [10]:
import pandas as pd

# Load the filtered training split from its parquet file.
train_df_filtered = pd.read_parquet("data/train_filtered.parquet") 
print("DataFrame de entrenamiento filtrado cargado.")

# Load the filtered validation split from its parquet file.
validation_df_filtered = pd.read_parquet("data/validation_filtered.parquet") 
print("DataFrame de validacion filtrado cargado.")

DataFrame de entrenamiento filtrado cargado.
DataFrame de validacion filtrado cargado.


In [12]:
from torch.utils.data import DataLoader

# Wrap each split in a SummarizationDataset sharing the same tokenizer,
# length limits and BOS/EOS ids.
train_dataset = SummarizationDataset(
    dataframe=train_df_filtered,
    tokenizer=tokenizer,
    max_len_article=MAX_TOKENS_ARTICLE,
    max_len_highlight=MAX_TOKENS_HIGHLIGHT,
    bos_token_id=BOS_IDX,
    eos_token_id=EOS_IDX,
)
val_dataset = SummarizationDataset(
    dataframe=validation_df_filtered,
    tokenizer=tokenizer,
    max_len_article=MAX_TOKENS_ARTICLE,
    max_len_highlight=MAX_TOKENS_HIGHLIGHT,
    bos_token_id=BOS_IDX,
    eos_token_id=EOS_IDX,
)

# Only the training loader shuffles; both pad their batches with collate_fn.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

print(
    "Datasets y DataLoaders creados.",
    f"  Tamaño Dataset Entrenamiento: {len(train_dataset)}",
    f"  Tamaño Dataset Validación: {len(val_dataset)}",
    f"  Número batches Entrenamiento: {len(train_dataloader)}",
    f"  Número batches Validación: {len(val_dataloader)}",
    sep="\n",
)

# Pull one batch as a sanity check of the padded shapes.
src_batch_test, trg_batch_test = next(iter(train_dataloader))
print(
    f"  Shape src_batch: {src_batch_test.shape}",
    f"  Shape trg_batch: {trg_batch_test.shape}",
    sep="\n",
)

Datasets y DataLoaders creados.
  Tamaño Dataset Entrenamiento: 286766
  Tamaño Dataset Validación: 13353
  Número batches Entrenamiento: 8962
  Número batches Validación: 418
  Shape src_batch: torch.Size([32, 1200])
  Shape trg_batch: torch.Size([32, 130])


## **Modelo con atención**


In [13]:
# Assemble the model: the attention layer is injected into the decoder, and
# encoder plus decoder are wrapped by Seq2Seq.
attn = Attention(enc_hid_dim=ENC_HID_DIM, dec_hid_dim=DEC_HID_DIM)
enc = Encoder(
    input_dim=INPUT_DIM,
    emb_dim=ENC_EMB_DIM,
    hidden_dim=ENC_HID_DIM,
    n_layers=N_LAYERS,
    dropout=ENC_DROPOUT,
    pad_idx=PAD_IDX,
)
dec = Decoder(
    output_dim=OUTPUT_DIM,
    emb_dim=DEC_EMB_DIM,
    enc_hid_dim=ENC_HID_DIM,
    dec_hid_dim=DEC_HID_DIM,
    n_layers=N_LAYERS,
    dropout=DEC_DROPOUT,
    pad_idx=PAD_IDX,
    attention=attn,
)

model = Seq2Seq(encoder=enc, decoder=dec, device=DEVICE)
model = model.to(DEVICE)
print("Modelo Seq2Seq con Atención instanciado y movido a device.")

def count_parameters(model):
    """Return the total number of elements in the trainable parameters.

    Parameters with ``requires_grad`` set to False are not counted.
    """
    total = 0
    for parameter in model.parameters():
        if parameter.requires_grad:
            total += parameter.numel()
    return total
# Report the number of trainable parameters, with a thousands separator.
print(f'El modelo tiene {count_parameters(model):,} parámetros entrenables.')


Modelo Seq2Seq con Atención instanciado y movido a device.
El modelo tiene 86,215,472 parámetros entrenables.


## Inicializar Pesos

In [14]:
def init_weights(m):
    """Initialise every parameter reachable from module ``m``.

    Parameters whose name contains ``'weight'`` are drawn from a normal
    distribution (mean 0, std 0.01); all others (biases) are set to 0.
    Works both when called directly on a model and when used through
    ``model.apply(init_weights)``.

    Args:
        m (nn.Module): Module whose parameters (including those of its
            submodules) are re-initialised in place.
    """
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)

    # The normal initialisation above also overwrites the padding row of every
    # embedding table. That row receives no gradient, so it would stay a random
    # non-zero vector for the whole training run; restore it to zeros.
    for submodule in m.modules():
        if isinstance(submodule, nn.Embedding) and submodule.padding_idx is not None:
            with torch.no_grad():
                submodule.weight[submodule.padding_idx].fill_(0)

# Run init_weights on the model and on each of its submodules; the cell echoes
# the returned module.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(30000, 256, padding_idx=1)
    (rnn): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
    (fc_hidden): Linear(in_features=1024, out_features=512, bias=True)
    (fc_cell): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn_W_enc): Linear(in_features=1024, out_features=512, bias=False)
      (attn_W_dec): Linear(in_features=512, out_features=512, bias=False)
      (attn_v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(30000, 256, padding_idx=1)
    (rnn): LSTM(1280, 512, num_layers=2, batch_first=True, dropout=0.3)
    (fc_out): Linear(in_features=1792, out_features=30000, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
  )
)

## Definimos el Optimizador

In [15]:
import torch.optim as optim

# Adam optimizer over all model parameters, using the configured learning rate.
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
print(f"Optimizador Adam definido con LR={LEARNING_RATE}")

  from .autonotebook import tqdm as notebook_tqdm


Optimizador Adam definido con LR=0.0005


## **Definimos la Función de Pérdida**

In [16]:
# Cross-entropy loss; targets equal to PAD_IDX do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
print(f"Función de pérdida CrossEntropyLoss definida, ignorando índice {PAD_IDX}.")

Función de pérdida CrossEntropyLoss definida, ignorando índice 1.


## Funciones Auxiliares

In [17]:
def epoch_time(start_time, end_time):
    """Split the elapsed time between two timestamps into minutes and seconds.

    Args:
        start_time (float): Start timestamp, in seconds.
        end_time (float): End timestamp, in seconds.

    Returns:
        tuple[int, int]: ``(whole_minutes, remaining_seconds)``, both
        truncated to integers.
    """
    total_seconds = end_time - start_time
    whole_minutes = int(total_seconds / 60)
    remaining_seconds = int(total_seconds - (whole_minutes * 60))
    return whole_minutes, remaining_seconds