In [5]:
import torch
import torch.nn as nn
import random

class Encoder(nn.Module):
    """
    The Encoder module of the sequence-to-sequence model.

    It embeds a batch of source token indices, runs the embeddings through a
    (possibly multi-layer) LSTM and returns the LSTM's final hidden and cell
    states, which together serve as the context vector for the decoder.
    """
    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        """
        Initializes the Encoder module.

        Args:
            input_dim (int): The size of the input (source) vocabulary.
            embedding_dim (int): The dimensionality of the word embeddings.
            hidden_dim (int): The dimensionality of the hidden and cell states of the LSTM.
            n_layers (int): The number of layers in the LSTM.
            dropout (float): The dropout probability. It is applied to the
                             embeddings and, when n_layers > 1, between the
                             stacked LSTM layers.
        """
        super().__init__()

        # Stored so that the Seq2Seq wrapper can check encoder/decoder compatibility.
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        # 1. Embedding layer: token index -> dense vector of size embedding_dim.
        self.embedding = nn.Embedding(input_dim, embedding_dim)

        # 2. LSTM layer.
        # nn.LSTM only applies its `dropout` *between* stacked layers (never after
        # the last one). With a single layer there is nothing to drop, and PyTorch
        # emits a UserWarning if a non-zero value is passed anyway, so we only
        # forward the dropout probability when layers are actually stacked.
        lstm_dropout = dropout if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=lstm_dropout)

        # 3. Dropout layer applied to the embeddings before they enter the LSTM.
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_seq):
        """
        Defines the forward pass of the Encoder.

        Args:
            input_seq (torch.Tensor): The input sequence of token indices.
                                     Shape: [seq_len, batch_size]

        Returns:
            tuple(torch.Tensor, torch.Tensor): The final hidden and cell states of the LSTM,
                                               which serve as the context vector.
                                               - hidden: Shape [n_layers, batch_size, hidden_dim]
                                               - cell:   Shape [n_layers, batch_size, hidden_dim]
        """
        # Look up the embedding of every token and regularize it with dropout.
        # embedded shape: [seq_len, batch_size, embedding_dim]
        embedded = self.dropout(self.embedding(input_seq))

        # Run the LSTM over the whole sequence. No initial state is passed, so
        # PyTorch starts from zero-initialized hidden/cell states.
        # The per-timestep outputs of the top layer are not needed for the
        # context vector, so they are discarded.
        _, (hidden, cell) = self.lstm(embedded)

        # The final hidden and cell states of every layer form the context
        # that initializes the decoder.
        return hidden, cell

# --- Putting It All Together: A Small Example ---

# Define the hyperparameters for our Encoder instance.
INPUT_DIM = 5000  # Example source vocabulary size
EMBEDDING_DIM = 256
HIDDEN_DIM = 512
N_LAYERS = 2  # The paper used 4, but we use 2 for a simpler example
DROPOUT = 0.5

# Instantiate the Encoder model.
encoder = Encoder(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, N_LAYERS, DROPOUT)

print("Encoder Model Architecture:\n", encoder)

# --- Create a dummy input to test the forward pass ---

# A dummy batch of "sentences": random token indices laid out as
# [seq_len, batch_size]. torch.randint returns int64 by default, which is
# the index dtype the embedding layer expects.
batch_size = 32
seq_len = 10
dummy_input_seq = torch.randint(0, INPUT_DIM, (seq_len, batch_size))

print(f"\nShape of dummy input sequence: {dummy_input_seq.shape}")
print(f"Data type of dummy input: {dummy_input_seq.dtype}")
assert dummy_input_seq.dtype == torch.int64, "token indices must be int64"

# Pass the dummy input through the encoder.
# We set the model to evaluation mode to disable dropout for this test pass.
encoder.eval()
with torch.no_grad(): # We don't need to compute gradients for this test
    final_hidden, final_cell = encoder(dummy_input_seq)

# --- Check the outputs ---

# Both context tensors hold one state per layer and per batch element.
expected_state_shape = (N_LAYERS, batch_size, HIDDEN_DIM)

print(f"\nShape of final hidden state (context): {final_hidden.shape}")
print(f"Expected hidden shape: [{N_LAYERS}, {batch_size}, {HIDDEN_DIM}]")

print(f"\nShape of final cell state (context): {final_cell.shape}")
print(f"Expected cell shape: [{N_LAYERS}, {batch_size}, {HIDDEN_DIM}]")

# Enforce the expectations instead of only printing them, so that a
# regression fails loudly under "Restart & Run All".
assert final_hidden.shape == expected_state_shape, "unexpected hidden state shape"
assert final_cell.shape == expected_state_shape, "unexpected cell state shape"

Encoder Model Architecture:
 Encoder(
  (embedding): Embedding(5000, 256)
  (lstm): LSTM(256, 512, num_layers=2, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
)

Shape of dummy input sequence: torch.Size([10, 32])
Data type of dummy input: torch.int64

Shape of final hidden state (context): torch.Size([2, 32, 512])
Expected hidden shape: [2, 32, 512]

Shape of final cell state (context): torch.Size([2, 32, 512])
Expected cell shape: [2, 32, 512]


In [6]:
import torch
import torch.nn as nn

class Decoder(nn.Module):
    """
    The Decoder module of the sequence-to-sequence model.

    It performs a single decoding step: given the previous target token and
    the previous hidden/cell states, it returns the logits for the next token
    together with the updated hidden/cell states.
    """
    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        """
        Initializes the Decoder module.

        Args:
            output_dim (int): The size of the output (target) vocabulary.
            embedding_dim (int): The dimensionality of the word embeddings.
            hidden_dim (int): The dimensionality of the hidden and cell states of the LSTM.
                              This MUST be the same as the encoder's hidden_dim.
            n_layers (int): The number of layers in the LSTM.
                            This MUST be the same as the encoder's n_layers.
            dropout (float): The dropout probability. It is applied to the
                             embeddings and, when n_layers > 1, between the
                             stacked LSTM layers.
        """
        super().__init__()

        # Stored for the Seq2Seq wrapper: output_dim sizes its output buffer,
        # hidden_dim and n_layers are checked against the encoder.
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        # 1. Embedding layer: target token index -> dense vector.
        self.embedding = nn.Embedding(output_dim, embedding_dim)

        # 2. LSTM layer.
        # nn.LSTM only applies its `dropout` *between* stacked layers. With a
        # single layer PyTorch warns about a non-zero value, so the probability
        # is only forwarded when layers are actually stacked.
        lstm_dropout = dropout if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=lstm_dropout)

        # 3. Fully connected layer: top-layer LSTM output -> one logit per
        # token of the target vocabulary.
        self.fc_out = nn.Linear(hidden_dim, output_dim)

        # 4. Dropout layer applied to the embeddings.
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_token, hidden_state, cell_state):
        """
        Defines the forward pass for a single decoding step.

        Args:
            input_token (torch.Tensor): The input token for the current timestep.
                                        Shape: [batch_size]
            hidden_state (torch.Tensor): The hidden state from the previous timestep.
                                         Shape: [n_layers, batch_size, hidden_dim]
            cell_state (torch.Tensor): The cell state from the previous timestep.
                                       Shape: [n_layers, batch_size, hidden_dim]

        Returns:
            tuple: A tuple containing:
                - prediction (torch.Tensor): Raw, unnormalized scores (logits) for each
                                             token in the output vocabulary.
                                             Shape: [batch_size, output_dim]
                - hidden_state (torch.Tensor): The updated hidden state.
                                               Shape: [n_layers, batch_size, hidden_dim]
                - cell_state (torch.Tensor): The updated cell state.
                                             Shape: [n_layers, batch_size, hidden_dim]
        """
        # The LSTM expects a leading sequence dimension; we decode one token
        # at a time, so that dimension has length 1.
        # input_token shape: [batch_size] -> [1, batch_size]
        input_token = input_token.unsqueeze(0)

        # Embed the token and apply dropout.
        # embedded shape: [1, batch_size, embedding_dim]
        embedded = self.dropout(self.embedding(input_token))

        # Advance the LSTM by one step from the previous states.
        # output shape: [1, batch_size, hidden_dim] (top layer only)
        # hidden_state / cell_state shape: [n_layers, batch_size, hidden_dim]
        output, (hidden_state, cell_state) = self.lstm(embedded, (hidden_state, cell_state))

        # Drop the length-1 sequence dimension before the linear layer.
        # output shape: [1, batch_size, hidden_dim] -> [batch_size, hidden_dim]
        assert output.shape[0] == 1, "Output sequence length should be 1"
        output_squeezed = output.squeeze(0)

        # Project onto the target vocabulary.
        # prediction shape: [batch_size, output_dim]
        prediction = self.fc_out(output_squeezed)

        # The returned states are the context for the next decoding step.
        return prediction, hidden_state, cell_state



# --- Define Hyperparameters ---
INPUT_DIM = 5000
OUTPUT_DIM = 5500 # Example target vocabulary size
EMBEDDING_DIM = 256
HIDDEN_DIM = 512
N_LAYERS = 2
DROPOUT = 0.5

# Instantiate both models
encoder = Encoder(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, N_LAYERS, DROPOUT)
decoder = Decoder(OUTPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, N_LAYERS, DROPOUT)

print("Decoder Model Architecture:\n", decoder)

# --- Create dummy inputs to test the one-step forward pass ---
batch_size = 32

# 1. Create a dummy source sequence for the encoder
dummy_input_seq = torch.randint(0, INPUT_DIM, (10, batch_size)) # seq_len=10

# 2. Get the initial context vector from the encoder
# (evaluation mode + no_grad: dropout disabled, no autograd graph built)
encoder.eval()
with torch.no_grad():
    initial_hidden, initial_cell = encoder(dummy_input_seq)

# Every hidden/cell state in this cell is expected to have this shape.
expected_state_shape = (N_LAYERS, batch_size, HIDDEN_DIM)

print(f"\nShape of initial context (hidden): {initial_hidden.shape}")
print(f"Shape of initial context (cell): {initial_cell.shape}")
assert initial_hidden.shape == expected_state_shape, "unexpected encoder hidden shape"
assert initial_cell.shape == expected_state_shape, "unexpected encoder cell shape"


# 3. Create a dummy target token to start the decoding process
# This would typically be the <SOS> (start-of-sequence) token index.
# We'll just use a random token for this example.
dummy_target_token = torch.randint(0, OUTPUT_DIM, (batch_size,)) # One token for each item in the batch

print(f"\nShape of dummy target token: {dummy_target_token.shape}")


# 4. Perform one decoding step
decoder.eval()
with torch.no_grad():
    prediction, next_hidden, next_cell = decoder(dummy_target_token, initial_hidden, initial_cell)

# --- Check the outputs of the decoder ---
print(f"\nShape of prediction logits: {prediction.shape}")
print(f"Expected prediction shape: [{batch_size}, {OUTPUT_DIM}]")

print(f"\nShape of next hidden state: {next_hidden.shape}")
print(f"Expected next hidden shape: [{N_LAYERS}, {batch_size}, {HIDDEN_DIM}]")

print(f"\nShape of next cell state: {next_cell.shape}")
print(f"Expected next cell shape: [{N_LAYERS}, {batch_size}, {HIDDEN_DIM}]")

# Enforce the expectations instead of only printing them, so that a
# regression fails loudly under "Restart & Run All".
assert prediction.shape == (batch_size, OUTPUT_DIM), "unexpected prediction shape"
assert next_hidden.shape == expected_state_shape, "unexpected next hidden shape"
assert next_cell.shape == expected_state_shape, "unexpected next cell shape"

Decoder Model Architecture:
 Decoder(
  (embedding): Embedding(5500, 256)
  (lstm): LSTM(256, 512, num_layers=2, dropout=0.5)
  (fc_out): Linear(in_features=512, out_features=5500, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

Shape of initial context (hidden): torch.Size([2, 32, 512])
Shape of initial context (cell): torch.Size([2, 32, 512])

Shape of dummy target token: torch.Size([32])

Shape of prediction logits: torch.Size([32, 5500])
Expected prediction shape: [32, 5500]

Shape of next hidden state: torch.Size([2, 32, 512])
Expected next hidden shape: [2, 32, 512]

Shape of next cell state: torch.Size([2, 32, 512])
Expected next cell shape: [2, 32, 512]


In [7]:
class Seq2Seq(nn.Module):
    """
    The main Seq2Seq model that encapsulates the Encoder and Decoder.

    The encoder turns the source sequence into a context (hidden, cell); the
    decoder is then unrolled one token at a time over the target length,
    optionally using teacher forcing.
    """
    def __init__(self, encoder, decoder, device):
        """
        Initializes the Seq2Seq model.

        Args:
            encoder (Encoder): The encoder module.
            decoder (Decoder): The decoder module.
            device (torch.device): The device (CPU or GPU) the model is meant to run on.
                                   Kept as an attribute for callers; the forward pass
                                   itself follows the device of its inputs.

        Raises:
            AssertionError: If encoder and decoder disagree on hidden_dim or n_layers.
        """
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        # The encoder's final states are fed directly into the decoder's LSTM,
        # so both must use the same state layout.
        assert encoder.hidden_dim == decoder.hidden_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have the same number of layers!"

    def forward(self, source_seq, target_seq, teacher_forcing_ratio=0.5):
        """
        Defines the forward pass of the Seq2Seq model.

        Args:
            source_seq (torch.Tensor): The source sequence.
                                       Shape: [src_len, batch_size]
            target_seq (torch.Tensor): The target sequence, assumed to hold the
                                       <SOS> token at index 0.
                                       Shape: [trg_len, batch_size]
            teacher_forcing_ratio (float): The probability of using the ground-truth
                                           target token as the next input, instead of
                                           the model's own prediction.

        Returns:
            torch.Tensor: A tensor of predictions (logits) for the entire target sequence.
                          Row 0 is never written and stays all zeros, because the
                          first target token is only used as decoder input.
                          Shape: [trg_len, batch_size, output_dim]
        """
        trg_len = target_seq.shape[0]
        batch_size = target_seq.shape[1]
        trg_vocab_size = self.decoder.output_dim

        # 1. Buffer for the logits of every timestep.
        # It is allocated directly on the device the inputs live on. This avoids
        # a CPU allocation followed by a copy, and keeps the buffer on the same
        # device as the decoder's predictions even if the model was moved after
        # construction (in which case the stored `self.device` would be stale).
        outputs = torch.zeros(
            trg_len, batch_size, trg_vocab_size, device=target_seq.device
        )

        # 2. Encode the source sequence into the initial decoder context.
        hidden, cell = self.encoder(source_seq)

        # 3. The first decoder input is the first target row (the <SOS> tokens).
        # input_token shape: [batch_size]
        input_token = target_seq[0, :]

        # 4. Decode one token at a time, starting at index 1 because index 0
        # was consumed as the initial input.
        for t in range(1, trg_len):
            prediction, hidden, cell = self.decoder(input_token, hidden, cell)
            outputs[t] = prediction

            # One random draw per step decides, for the whole batch, whether
            # the next input is the ground truth (teacher forcing) or the
            # model's own most likely token.
            teacher_force = random.random() < teacher_forcing_ratio

            # argmax is only evaluated when the prediction is actually fed back.
            input_token = target_seq[t] if teacher_force else prediction.argmax(1)

        return outputs

# --- Putting It All Together: A Small Example ---

# Define hyperparameters
INPUT_DIM = 5000
OUTPUT_DIM = 5500
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HIDDEN_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# Define the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Instantiate the sub-models
encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HIDDEN_DIM, N_LAYERS, ENC_DROPOUT)
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HIDDEN_DIM, N_LAYERS, DEC_DROPOUT)

# Instantiate the main Seq2Seq model
model = Seq2Seq(encoder, decoder, device).to(device)

print("\nSeq2Seq Model Architecture:\n", model)

# --- Create dummy inputs to test the full forward pass ---
batch_size = 16
src_len = 12
trg_len = 15 # Target can have a different length

# Dummy source and target sequences
dummy_src_seq = torch.randint(0, INPUT_DIM, (src_len, batch_size)).to(device)
dummy_trg_seq = torch.randint(0, OUTPUT_DIM, (trg_len, batch_size)).to(device)

# Perform a forward pass
# Set the model to training mode to enable dropout
model.train()
outputs = model(dummy_src_seq, dummy_trg_seq) # Use default teacher_forcing_ratio=0.5

# --- Check the output shape ---
expected_output_shape = (trg_len, batch_size, OUTPUT_DIM)
print(f"\nShape of final output tensor: {outputs.shape}")
print(f"Expected output shape: [{trg_len}, {batch_size}, {OUTPUT_DIM}]")
assert outputs.shape == expected_output_shape, "unexpected output shape"

# Check the first output (corresponding to <SOS> input)
# It should be all zeros, as we initialized it that way and start the loop from t=1.
print("\nFirst timestep output (should be zeros):")
print(outputs[0,0,:10])
assert torch.count_nonzero(outputs[0]) == 0, "row 0 of the outputs must stay zero"

# Check a later output
# No seed is set, so the exact values differ from run to run; only the fact
# that the row has been written by the decoder is checked.
print("\nSecond timestep output (should be populated):")
print(outputs[1,0,:10])
assert torch.any(outputs[1] != 0), "row 1 of the outputs should hold decoder logits"

Using device: cuda

Seq2Seq Model Architecture:
 Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(5000, 256)
    (lstm): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5500, 256)
    (lstm): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=5500, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

Shape of final output tensor: torch.Size([15, 16, 5500])
Expected output shape: [15, 16, 5500]

First timestep output (should be zeros):
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0',
       grad_fn=<SliceBackward0>)

Second timestep output (should be populated):
tensor([-0.0694,  0.0140, -0.0068, -0.0345,  0.0522, -0.0241,  0.0562, -0.0546,
         0.0416, -0.0201], device='cuda:0', grad_fn=<SliceBackward0>)
