# Encoder-Decoder

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from collections import Counter
import numpy as np

# Seed both random number generators used in this notebook (PyTorch and NumPy)
# from a single named constant so reruns start from the same state.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm


## Encoder

In [2]:
class Encoder(nn.Module):
    """LSTM encoder.

    Consumes an already-embedded source sequence and returns the per-step
    LSTM outputs together with the final (hidden, cell) state, which serves
    as the fixed-size context handed to the decoder.
    """

    def __init__(self, emb_dim, hidden_dim, n_layers=1, dropout=0.1):
        """
        Args:
            emb_dim: size of each input embedding vector.
            hidden_dim: size of the LSTM hidden state.
            n_layers: number of stacked LSTM layers.
            dropout: dropout probability applied to the input embeddings and,
                when ``n_layers > 1``, passed to the LSTM as its own dropout.
        """
        super().__init__()

        # The LSTM's own dropout argument is only forwarded for stacked
        # layers; a single-layer LSTM gets 0.
        lstm_dropout = dropout if n_layers > 1 else 0
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=lstm_dropout,
            batch_first=True,
        )

        # Dropout applied to the embeddings before they enter the LSTM.
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, embedded):
        """Encode a batch of embedded sequences.

        Args:
            embedded: tensor of shape [batch_size, seq_len, emb_dim].

        Returns:
            A tuple ``(outputs, hidden)`` where ``outputs`` has shape
            [batch_size, seq_len, hidden_dim] and ``hidden`` is the pair
            ``(h_n, c_n)``, each of shape [n_layers, batch_size, hidden_dim].
        """
        # nn.LSTM already returns (outputs, (h_n, c_n)), so pass it through.
        return self.rnn(self.dropout(embedded))

## Decoder

In [5]:
class Decoder(nn.Module):
    """LSTM decoder.

    Advances one time step at a time: given the embedded current token and
    the previous (hidden, cell) state, it produces vocabulary logits for the
    next token and the updated state.
    """

    def __init__(self, output_dim, emb_dim, hidden_dim, n_layers=1, dropout=0.1):
        """
        Args:
            output_dim: target vocabulary size (width of the output logits).
            emb_dim: size of each input embedding vector.
            hidden_dim: size of the LSTM hidden state.
            n_layers: number of stacked LSTM layers.
            dropout: dropout probability applied to the input embeddings and,
                when ``n_layers > 1``, passed to the LSTM as its own dropout.
        """
        super().__init__()

        # Exposed so callers can size their output buffers.
        self.output_dim = output_dim

        # The LSTM's own dropout argument is only forwarded for stacked
        # layers; a single-layer LSTM gets 0.
        lstm_dropout = dropout if n_layers > 1 else 0
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=lstm_dropout,
            batch_first=True,
        )

        # Projects the LSTM output onto the target vocabulary.
        self.fc_out = nn.Linear(in_features=hidden_dim, out_features=output_dim)

        # Dropout applied to the embeddings before they enter the LSTM.
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, embedded, hidden):
        """Run a single decoding step.

        Args:
            embedded: embedded input for the current step,
                shape [batch_size, 1, emb_dim].
            hidden: state from the previous step, the pair ``(h, c)``.

        Returns:
            A tuple ``(prediction, hidden)`` where ``prediction`` holds
            unnormalized vocabulary scores of shape [batch_size, output_dim]
            (no softmax is applied here) and ``hidden`` is the updated state.
        """
        step_output, hidden = self.rnn(self.dropout(embedded), hidden)
        # step_output: [batch_size, 1, hidden_dim] -> drop the length-1 time axis.
        prediction = self.fc_out(step_output.squeeze(1))
        return prediction, hidden

## Encoder - Decoder

In [10]:
class Seq2Seq(nn.Module):
    """Sequence-to-sequence model.

    Bundles the source/target embedding layers with an ``Encoder`` and a
    ``Decoder`` and runs the step-by-step decoding loop with optional
    teacher forcing.
    """

    def __init__(self, input_dim, output_dim, emb_dim, hidden_dim, n_layers, dropout, device):
        """
        Args:
            input_dim: source vocabulary size.
            output_dim: target vocabulary size.
            emb_dim: embedding size shared by both embedding tables.
            hidden_dim: LSTM hidden size for encoder and decoder.
            n_layers: number of LSTM layers for encoder and decoder.
            dropout: dropout probability passed to encoder and decoder.
            device: kept on the instance as ``self.device`` for callers that
                read it; ``forward`` derives the working device from its
                inputs instead.
        """
        super(Seq2Seq, self).__init__()

        # Embedding tables for source and target tokens.
        self.src_embedding = nn.Embedding(input_dim, emb_dim)
        self.trg_embedding = nn.Embedding(output_dim, emb_dim)

        # Encoder and decoder share the same hidden size / depth so the
        # encoder's final state can seed the decoder directly.
        self.encoder = Encoder(emb_dim, hidden_dim, n_layers, dropout)
        self.decoder = Decoder(output_dim, emb_dim, hidden_dim, n_layers, dropout)

        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Encode ``src`` and decode step by step against ``trg``.

        Args:
            src: source token ids, shape [batch_size, src_len].
            trg: target token ids, shape [batch_size, trg_len]; column 0 is
                fed as the first decoder input (typically the <SOS> token).
            teacher_forcing_ratio: probability, drawn once per time step for
                the whole batch, of feeding the ground-truth token instead of
                the model's own prediction as the next input.

        Returns:
            Tensor of shape [batch_size, trg_len, output_dim] holding the
            decoder scores for steps 1..trg_len-1. Position 0 is never
            written and stays all zeros.
        """
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        # Allocate the result buffer directly on the device the inputs live
        # on. This avoids a CPU allocation followed by a copy, and keeps the
        # output co-located with the model even if it was moved with .to()
        # after construction (self.device would then be stale).
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size, device=src.device)

        # Encode the embedded source; only the final state is needed here.
        _, hidden = self.encoder(self.src_embedding(src))

        # First decoder input: column 0 of the target, shape [batch_size, 1].
        decoder_input = trg[:, 0].unsqueeze(1)

        for t in range(1, trg_len):
            # Embedded current token: [batch_size, 1, emb_dim].
            step_scores, hidden = self.decoder(self.trg_embedding(decoder_input), hidden)
            outputs[:, t, :] = step_scores

            # One random draw per step decides the next input for the batch.
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            if teacher_force:
                decoder_input = trg[:, t].unsqueeze(1)
            else:
                # Greedy choice; only computed when it is actually used.
                decoder_input = step_scores.argmax(1).unsqueeze(1)

        return outputs

In [11]:
# Hyperparameters
INPUT_DIM = 1024   # source vocabulary size
OUTPUT_DIM = 1024  # target vocabulary size
EMB_DIM = 256      # embedding dimension
HIDDEN_DIM = 512   # LSTM hidden dimension
N_LAYERS = 2       # number of LSTM layers
DROPOUT = 0.1

# Choose the compute backend at runtime instead of hard-coding "mps", which
# fails on any machine without Apple-Silicon MPS support. MPS is tried first
# so the result is unchanged where it was already available.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [12]:
# Build the model and move its parameters to the selected device.
model = Seq2Seq(INPUT_DIM, OUTPUT_DIM, EMB_DIM, HIDDEN_DIM, N_LAYERS, DROPOUT, device).to(device)

print(f"模型已建立，使用設備: {device}")
print(f"\n模型架構:")
print(model)

# Parameter counts (all parameters vs. those that require gradients).
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\n總參數量: {total_params:,}")
print(f"可訓練參數量: {trainable_params:,}")

# Forward-pass smoke test on random token ids.
batch_size = 32
src_len = 10
trg_len = 12

src = torch.randint(0, INPUT_DIM, (batch_size, src_len)).to(device)
trg = torch.randint(0, OUTPUT_DIM, (batch_size, trg_len)).to(device)

output = model(src, trg, teacher_forcing_ratio=0.5)
print(f"\n輸入形狀: {src.shape}")
print(f"目標形狀: {trg.shape}")
print(f"輸出形狀: {output.shape}")

# Only report success after actually checking the result: the model must
# return one score vector over the target vocabulary per target position.
expected_shape = (batch_size, trg_len, OUTPUT_DIM)
assert tuple(output.shape) == expected_shape, (
    f"unexpected output shape {tuple(output.shape)}, expected {expected_shape}"
)
print("\n✓ 模型測試成功！")

模型已建立，使用設備: mps

模型架構:
Seq2Seq(
  (src_embedding): Embedding(1024, 256)
  (trg_embedding): Embedding(1024, 256)
  (encoder): Encoder(
    (rnn): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.1)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (decoder): Decoder(
    (rnn): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.1)
    (fc_out): Linear(in_features=512, out_features=1024, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

總參數量: 8,406,016
可訓練參數量: 8,406,016

輸入形狀: torch.Size([32, 10])
目標形狀: torch.Size([32, 12])
輸出形狀: torch.Size([32, 12, 1024])

✓ 模型測試成功！


# 近代Encoder-Decoder (2025)

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, T5GemmaConfig
import torch

MODEL_NAME = "google/t5gemma-b-b-prefixlm"

# Choose the compute backend at runtime (mps, cuda, or cpu) instead of
# hard-coding "mps", which fails on machines without Apple-Silicon MPS
# support. MPS is tried first so the result is unchanged where it was
# already available.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [4]:
# Load the published configuration for MODEL_NAME.
model_config = T5GemmaConfig.from_pretrained(MODEL_NAME)
# Workaround noted by the original author: this model currently has a bug and
# will not run unless `num_hidden_layers` is set on the config.
# NOTE(review): the value 3 is unexplained, and the printed model below still
# shows 12 encoder layers — confirm what this attribute actually controls and
# whether the workaround is still needed in the installed transformers version.
model_config.num_hidden_layers = 3

In [5]:
# Load the tokenizer and the pretrained seq2seq model (using the patched
# config from the previous cell), then move the model to the chosen device.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    config=model_config,
)
model = model.to(device)

In [6]:
# Print the model's module hierarchy (encoder/decoder layers and their shapes).
print(model)

T5GemmaForConditionalGeneration(
  (model): T5GemmaModel(
    (encoder): T5GemmaEncoder(
      (embed_tokens): Embedding(256000, 768, padding_idx=0)
      (norm): T5GemmaRMSNorm((768,), eps=1e-06)
      (rotary_emb): T5GemmaRotaryEmbedding()
      (layers): ModuleList(
        (0-11): 12 x T5GemmaEncoderLayer(
          (self_attn): T5GemmaSelfAttention(
            (q_proj): Linear(in_features=768, out_features=768, bias=False)
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=False)
            (o_proj): Linear(in_features=768, out_features=768, bias=False)
          )
          (pre_self_attn_layernorm): T5GemmaRMSNorm((768,), eps=1e-06)
          (post_self_attn_layernorm): T5GemmaRMSNorm((768,), eps=1e-06)
          (mlp): T5GemmaMLP(
            (gate_proj): Linear(in_features=768, out_features=2048, bias=False)
            (up_proj): Linear(in_features=768, out_features=2048, bias=Fals

In [15]:
# Tokenize the prompt into PyTorch tensors and keep only the token ids,
# placed on the same device as the model.
encoded_prompt = tokenizer("Hello, Glad to see you.", return_tensors="pt")
inputs = encoded_prompt.input_ids.to(device)

In [18]:
# Generate with beam search (8 beams, up to 96 tokens) and print the first
# returned sequence, decoded back to text (special tokens are left in).
outputs = model.generate(
    inputs,
    max_length=96,
    num_beams=8,
    early_stopping=True,
)
first_sequence = outputs[0]
print(tokenizer.decode(first_sequence))

<bos>

I'm glad to hear you're doing well.

I'm glad to hear you're doing well.

I'm glad to hear you're doing well.

I'm glad to hear you're doing well.<eos>
