# Transformers

create:
Self-attention
FFN
Add & Norm
Activation Function

In [1]:
import warnings
warnings.simplefilter("ignore", UserWarning)

import torch
import torch.nn as nn
import torch.nn.functional as F

import transformers
import math

  from .autonotebook import tqdm as notebook_tqdm


## Config

In [2]:
class TransformerConfig:
    def __init__(self):
        self.vocab_size = 10_000
        self.max_length = 512
        self.d_model = 512
        self.num_heads = 8 # 在這ipynb會是 Multi-Head Attention(比起單純的Self-Attention複雜一些，但觀念相同)

        self.d_ff = 2048 # Feed Forward

        self.num_enc_layers = 6
        self.num_dec_layers = 6

        self.dropout = 0.1
        self.pad_token_id = 0

config = TransformerConfig()

## Positional Encoding
$$
PE_{pos, 2i} = sin(pos/10,000^{2i/d_{model}})
$$
$$
PE_{pos, 2i+1} = cos(pos/10,000^{2i/d_{model}})
$$

In [3]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length = 512, dropout = 0.1):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_seq_length, d_model) # initilization

        # 列出所有位置
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        # print(position)

        # 計算P 10,000^{2i/d_{model}}
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -math.log(10000.0) / d_model)
        # 套入公式
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        
        self.register_buffer('pe', pe) # 保存位置訊息

    # Encode 之後輸出！
    def forward(self, x):
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

In [4]:
d_model = 512
max_seq_length = 100
batch_size = 2
seq_length = 10

# Self Attention!

Attention Sccores (What we learn on the class)
$$
scores = \frac{Q \cdot K^T}{\sqrt{d_{model}}} \cdot V
$$

In [5]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super(MultiHeadAttention, self).__init__()

        assert d_model % num_heads == 0 # 必須要能整除
        self.d_model = d_model
        self.num_heads = num_heads 
        self.d_k = d_model // num_heads


        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)

        self.W_o = nn.Linear(d_model, d_model)
        self.dropout = nn.DropOut(p=0.1)

    def split_heads(self, x, batch_size):
        x = x.view(batch_size, -1, self.num_heads, self.d_k) # [batch_size, seq_length, num_heads, dim_k]
        return x.transpose(1, 2) # [batch_size, num_heads, seq_length, dim_k]


    def scaled_dot_product_attention(self, Q, K, V, mask = None):
        scores = torch.matmul(Q, K.transpose(-2, -1) / math.sqrt(self.d_k))

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        attention_weights = torch.softmax(scores, dim=1)
        attention_weights = self.dropout(attention_weights)

        output = torch.matmul(attention_weights, V)
        
        return output, attention_weights
    
    def forward(self, query, key, value, mask = None):
        batch_size = query.size(0)
        # 將輸入轉成多頭進行
        Q = self.split_heads(self.W_q(query), batch_size)
        K = self.split_heads(self.W_k(key), batch_size)
        V = self.split_heads(self.W_v(value), batch_size)

        attn_output, attn_weights = self.scaled_dot_product_attention(Q, K, V, mask)

        # 合併多頭，順便轉換位置
        # [batch, num_heads, seq_len, d_k] -> [batch, seq_len, num_heads, d_k]
        attn_output = attn_output.transpose(1, 2).contiguous()

        # [batch, seq_len, num_heads, d_k] -> [batch, seq_len, d_model]
        attn_output = attn_output.view(batch_size, -1, self.d_model)

        output = self.W_o(attn_output)
        return output, attn_weights


## Feed Forward (簡單的DNN)

In [6]:
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        # x: [batch_size, seq_length, d_model]
        x = self.linear1(x)      # [batch, seq_len, d_ff]
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)      # [batch, seq_len, d_model]
        return x

# Encoder (整合成Encoder)

In [7]:
class EncoderLayer(nn.Module):
    """
    單個 Encoder 層 = Multi-Head Attention + Feed-Forward
    每個子層都有 Add & Norm（殘差連接 + 層歸一化）
    """
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(EncoderLayer, self).__init__()
        
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        
        # Layer Normalization
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
    
    def forward(self, x, mask=None):
        # 1. Multi-Head Attention + Add & Norm
        attn_output, _ = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout1(attn_output))  # Add & Norm
        
        # 2. Feed-Forward + Add & Norm
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout2(ff_output))    # Add & Norm
        
        return x

# Decoder (整合成Decoder)

In [8]:
class DecoderLayer(nn.Module):
    """
    單個 Decoder 層 = Masked Self-Attention + Cross-Attention + Feed-Forward
    """
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(DecoderLayer, self).__init__()
        
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.cross_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)
    
    def forward(self, x, encoder_output, src_mask=None, tgt_mask=None):
        # 1. Masked Self-Attention (只能看到之前的詞)
        attn_output, _ = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout1(attn_output))
        
        # 2. Cross-Attention (關注 encoder 的輸出)
        attn_output, _ = self.cross_attn(x, encoder_output, encoder_output, src_mask)
        x = self.norm2(x + self.dropout2(attn_output))
        
        # 3. Feed-Forward
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout3(ff_output))
        
        return x


# Transformers (將Encoder, Decoder 結合！)

In [9]:
class Transformer(nn.Module):
    """
    完整的 Transformer 模型
    """
    def __init__(self, 
                 src_vocab_size,      # 源語言詞彙表大小
                 tgt_vocab_size,      # 目標語言詞彙表大小
                 d_model=512,
                 num_heads=8,
                 num_encoder_layers=6,
                 num_decoder_layers=6,
                 d_ff=2048,
                 max_seq_length=512,
                 dropout=0.1):
        super(Transformer, self).__init__()
        
        self.d_model = d_model
        
        # Embedding 層
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        
        # Positional Encoding
        self.pos_encoding = PositionalEncoding(d_model, max_seq_length, dropout)
        
        # Encoder 層堆疊
        self.encoder_layers = nn.ModuleList([
            EncoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_encoder_layers)
        ])
        
        # Decoder 層堆疊
        self.decoder_layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_decoder_layers)
        ])
        
        # 最後的線性層（投影到詞彙表）
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
        
        self.dropout = nn.Dropout(dropout)
        
        # 初始化權重
        self._init_weights()
    
    def _init_weights(self):
        """初始化權重（使用 Xavier 初始化）"""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
    
    def generate_square_subsequent_mask(self, sz):
        """
        生成 look-ahead mask（防止看到未來的詞）
        上三角矩陣，對角線及以下為 1，上方為 0
        """
        mask = torch.triu(torch.ones(sz, sz), diagonal=1)
        mask = mask.masked_fill(mask == 1, 0)
        return mask.unsqueeze(0).unsqueeze(0)  # [1, 1, sz, sz]
    
    def make_src_mask(self, src):
        """
        創建源序列的 padding mask
        src: [batch_size, src_seq_length]
        """
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        # [batch_size, 1, 1, src_seq_length]
        return src_mask
    
    def make_tgt_mask(self, tgt):
        """
        創建目標序列的 mask（padding mask + look-ahead mask）
        tgt: [batch_size, tgt_seq_length]
        """
        batch_size, tgt_len = tgt.shape
        
        # Padding mask
        tgt_padding_mask = (tgt != 0).unsqueeze(1).unsqueeze(2)
        # [batch_size, 1, 1, tgt_seq_length]
        
        # Look-ahead mask
        tgt_sub_mask = self.generate_square_subsequent_mask(tgt_len).to(tgt.device)
        # [1, 1, tgt_seq_length, tgt_seq_length]
        
        # 結合兩種 mask
        tgt_mask = tgt_padding_mask & tgt_sub_mask
        return tgt_mask
    
    def encode(self, src, src_mask):
        """
        Encoder 前向傳播
        """
        # Embedding + Positional Encoding
        x = self.encoder_embedding(src) * math.sqrt(self.d_model)
        x = self.pos_encoding(x)
        
        # 通過所有 Encoder 層
        for layer in self.encoder_layers:
            x = layer(x, src_mask)
        
        return x
    
    def decode(self, tgt, encoder_output, src_mask, tgt_mask):
        """
        Decoder 前向傳播
        """
        # Embedding + Positional Encoding
        x = self.decoder_embedding(tgt) * math.sqrt(self.d_model)
        x = self.pos_encoding(x)
        
        # 通過所有 Decoder 層
        for layer in self.decoder_layers:
            x = layer(x, encoder_output, src_mask, tgt_mask)
        
        return x
    
    def forward(self, src, tgt):
        """
        完整前向傳播
        Args:
            src: [batch_size, src_seq_length] - 源序列
            tgt: [batch_size, tgt_seq_length] - 目標序列
        Returns:
            output: [batch_size, tgt_seq_length, tgt_vocab_size]
        """
        # 創建 mask
        src_mask = self.make_src_mask(src)
        tgt_mask = self.make_tgt_mask(tgt)
        
        # Encoder
        encoder_output = self.encode(src, src_mask)
        
        # Decoder
        decoder_output = self.decode(tgt, encoder_output, src_mask, tgt_mask)
        
        # 投影到詞彙表
        output = self.fc_out(decoder_output)
        
        return output

# Inference
* written by claude-4.5.sonnet

In [11]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

print("="*60)
print("T5 (Text-to-Text Transfer Transformer) 推理示範")
print("="*60)

# ============================================
# 1. 載入模型和 Tokenizer
# ============================================
print("\n📥 載入模型中...")

# 選擇模型大小（可以換成其他版本）
# - t5-small: 60M 參數（最快，適合測試）
# - t5-base: 220M 參數
# - t5-large: 770M 參數
# - t5-3b: 3B 參數

model_name = "t5-small"  # 建議先用 small 測試
print(f"模型: {model_name}")

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# 如果有 GPU 就用 GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(f"使用設備: {device}")

# 查看模型架構
print(f"\n📊 模型參數量: {sum(p.numel() for p in model.parameters()):,}")
print(f"Encoder 層數: {model.config.num_layers}")
print(f"Decoder 層數: {model.config.num_decoder_layers}")
print(f"注意力頭數: {model.config.num_heads}")
print(f"d_model: {model.config.d_model}")
print(f"d_ff: {model.config.d_ff}")

# ============================================
# 2. 定義推理函數
# ============================================
def generate_text(task_prefix, input_text, max_length=128):
    """
    T5 的推理函數
    
    Args:
        task_prefix: 任務前綴（T5 的特色）
        input_text: 輸入文本
        max_length: 最大生成長度
    """
    # T5 的輸入格式：task_prefix + input_text
    input_text = f"{task_prefix}: {input_text}"
    
    # Tokenization
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
    
    # 生成
    outputs = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=4,           # Beam search
        early_stopping=True,
        temperature=0.7,
    )
    
    # Decode
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return result

T5 (Text-to-Text Transfer Transformer) 推理示範

📥 載入模型中...
模型: t5-small
使用設備: cpu

📊 模型參數量: 60,506,624
Encoder 層數: 6
Decoder 層數: 6
注意力頭數: 8
d_model: 512
d_ff: 2048


In [14]:
# ----------------
# 任務 1: 翻譯
# ----------------
print("\n【任務 1: 英文翻譯成德文】")
input_text = "Hello, how are you?"
task = "translate English to French"
result = generate_text(task, input_text)
print(f"輸入: {input_text}")
print(f"輸出: {result}")


【任務 1: 英文翻譯成德文】
輸入: Hello, how are you?
輸出: Bonjour, comment êtes-vous?


In [15]:
# ----------------
# 任務 2: 摘要
# ----------------
print("\n【任務 2: 文本摘要】")
input_text = """
The Transformer architecture has revolutionized natural language processing. 
It was introduced in the paper 'Attention Is All You Need' by Vaswani et al. in 2017.
The key innovation is the self-attention mechanism, which allows the model to weigh 
the importance of different words in a sentence. This architecture has become the 
foundation for models like BERT, GPT, and T5.
"""
task = "summarize"
result = generate_text(task, input_text, max_length=50)
print(f"輸入: {input_text[:100]}...")
print(f"摘要: {result}")


【任務 2: 文本摘要】
輸入: 
The Transformer architecture has revolutionized natural language processing. 
It was introduced in ...
摘要: the Transformer architecture has revolutionized natural language processing. it was introduced in the paper 'Attention Is All You Need' by Vaswani et al.


In [16]:
# ----------------
# 任務 3: 問答
# ----------------
print("\n【任務 3: 問答】")
context = "The Transformer was introduced in 2017 by Vaswani et al."
question = "When was the Transformer introduced?"
input_text = f"question: {question} context: {context}"
task = ""  # 問答任務不需要前綴
result = generate_text(task, input_text, max_length=20)
print(f"問題: {question}")
print(f"答案: {result}")


【任務 3: 問答】
問題: When was the Transformer introduced?
答案: 2017


In [20]:
# ----------------
# 任務 4: 情感分類
# ----------------
print("\n【任務 4: 情感分類】")
input_text = "This movie is absolutely wonderful! I loved every minute of it."
task = "sentiment"
result = generate_text(task, input_text, max_length=10)
print(f"輸入: {input_text}")
print(f"情感: {result}")


【任務 4: 情感分類】
輸入: This movie is absolutely wonderful! I loved every minute of it.
情感: sentiment: This movie is absolutely wonderful!


In [21]:
# ============================================
# 4. 查看內部結構（對比我們的實作）
# ============================================
print("\n" + "="*60)
print("模型內部結構")
print("="*60)

print("\n【Encoder 結構】")
print(f"層數: {len(model.encoder.block)}")
print(f"第一層結構:")
first_encoder_layer = model.encoder.block[0]
print(f"  - Self-Attention: {first_encoder_layer.layer[0].__class__.__name__}")
print(f"  - Feed-Forward: {first_encoder_layer.layer[1].__class__.__name__}")

print("\n【Decoder 結構】")
print(f"層數: {len(model.decoder.block)}")
print(f"第一層結構:")
first_decoder_layer = model.decoder.block[0]
print(f"  - Self-Attention: {first_decoder_layer.layer[0].__class__.__name__}")
print(f"  - Cross-Attention: {first_decoder_layer.layer[1].__class__.__name__}")
print(f"  - Feed-Forward: {first_decoder_layer.layer[2].__class__.__name__}")


模型內部結構

【Encoder 結構】
層數: 6
第一層結構:
  - Self-Attention: T5LayerSelfAttention
  - Feed-Forward: T5LayerFF

【Decoder 結構】
層數: 6
第一層結構:
  - Self-Attention: T5LayerSelfAttention
  - Cross-Attention: T5LayerCrossAttention
  - Feed-Forward: T5LayerFF


In [23]:
# ============================================
# 5. 手動前向傳播（展示 Encoder-Decoder）
# ============================================
print("\n" + "="*60)
print("手動前向傳播展示")
print("="*60)

input_text = "translate English to French: Hello world"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

print(f"\n輸入 token IDs 形狀: {input_ids.shape}")

# Encoder
with torch.no_grad():
    encoder_outputs = model.encoder(input_ids)
    encoder_hidden_states = encoder_outputs.last_hidden_state

print(f"Encoder 輸出形狀: {encoder_hidden_states.shape}")
print(f"  [batch_size, seq_length, d_model]")

# Decoder（需要目標序列的開始）
decoder_input_ids = torch.tensor([[tokenizer.pad_token_id]]).to(device)

with torch.no_grad():
    decoder_outputs = model.decoder(
        input_ids=decoder_input_ids,
        encoder_hidden_states=encoder_hidden_states
    )
    decoder_hidden_states = decoder_outputs.last_hidden_state

print(f"Decoder 輸出形狀: {decoder_hidden_states.shape}")

# 投影到詞彙表
with torch.no_grad():
    lm_logits = model.lm_head(decoder_hidden_states)

print(f"最終 logits 形狀: {lm_logits.shape}")
print(f"  [batch_size, seq_length, vocab_size]")


手動前向傳播展示

輸入 token IDs 形狀: torch.Size([1, 8])
Encoder 輸出形狀: torch.Size([1, 8, 512])
  [batch_size, seq_length, d_model]
Decoder 輸出形狀: torch.Size([1, 1, 512])
最終 logits 形狀: torch.Size([1, 1, 32128])
  [batch_size, seq_length, vocab_size]


與我們實作的對比

我們的實作 vs T5:

相同點 ✅:
1. Encoder-Decoder 架構
2. Multi-Head Attention
3. Feed-Forward Network
4. Positional Encoding（T5 用的是相對位置編碼）
5. Layer Normalization
6. 殘差連接

差異點 🔍:
1. T5 使用相對位置編碼，我們用絕對位置編碼
2. T5 的 Layer Norm 位置略有不同（Pre-LN vs Post-LN）
3. T5 使用 SentencePiece tokenizer
4. T5 的訓練任務是 Text-to-Text
5. T5 有多種大小的預訓練模型

核心概念完全一樣！我們實作的就是 T5 的基礎架構。

💡 提示:
1. 可以嘗試其他任務前綴：translate, summarize, question, etc.
2. 可以換成其他模型：t5-base, t5-large, flan-t5-base
3. 對於中文任務，可以使用 mT5（多語言版本）