<table style="width:100%">
<tr>
<td style="vertical-align:middle; text-align:left;">
<font size="2">
Supplementary code for the <a href="http://mng.bz/orYv">Build a Large Language Model From Scratch</a> book by <a href="https://sebastianraschka.com">Sebastian Raschka</a><br>
<br>Code repository: <a href="https://github.com/rasbt/LLMs-from-scratch">https://github.com/rasbt/LLMs-from-scratch</a>
</font>
</td>
<td style="vertical-align:middle; text-align:left;">
<a href="http://mng.bz/orYv"><img src="https://sebastianraschka.com/images/LLMs-from-scratch-images/cover-small.webp" width="100px"></a>
</td>
</tr>
</table>

# Chapter 4 Exercise solutions

In [1]:
from importlib.metadata import version

import torch
print("torch version:", version("torch"))

torch version: 2.4.0


# Exercise 4.1: Parameters in the feed forward versus attention module
# 练习 4.1：前馈层与注意力模块中的参数

In [2]:
# Import the TransformerBlock class from the gpt module
from gpt import TransformerBlock

# Configuration of the small GPT-2 model (124M parameters)
GPT_CONFIG_124M = {
    "vocab_size": 50257,      # Vocabulary size
    "context_length": 1024,   # Context length / sequence length
    "emb_dim": 768,          # Embedding dimension
    "n_heads": 12,           # Number of attention heads
    "n_layers": 12,          # Number of transformer layers
    "drop_rate": 0.1,        # Dropout rate
    "qkv_bias": False        # Whether the QKV projections use a bias
}

# Initialize a single transformer block with the configuration above
block = TransformerBlock(GPT_CONFIG_124M)

In [3]:
# Sum the element counts of all parameters in the block's feed forward module
total_params = sum(p.numel() for p in block.ff.parameters())
# Print the feed forward parameter count with thousands separators
print(f"Total number of parameters in feed forward module: {total_params:,}")

Total number of parameters in feed forward module: 4,722,432


In [4]:
# Sum the element counts of all parameters in the block's attention module
# (this rebinds total_params, replacing the feed forward count)
total_params = sum(p.numel() for p in block.att.parameters())
# Print the attention parameter count with thousands separators
print(f"Total number of parameters in attention module: {total_params:,}")

Total number of parameters in attention module: 2,360,064


- The results above are for a single transformer block
- 上述结果是针对单个transformer块的
- Optionally multiply by 12 to capture all transformer blocks in the 124M GPT model
- 可以乘以12来获得124M GPT模型中所有transformer块的参数量

# Exercise 4.2: Initialize larger GPT models
# 练习 4.2：初始化更大的GPT模型

- **GPT2-small** (the 124M configuration we already implemented):
    - "emb_dim" = 768
    - "n_layers" = 12
    - "n_heads" = 12

- **GPT2-medium:**
    - "emb_dim" = 1024
    - "n_layers" = 24
    - "n_heads" = 16

- **GPT2-large:**
    - "emb_dim" = 1280
    - "n_layers" = 36
    - "n_heads" = 20

- **GPT2-XL:**
    - "emb_dim" = 1600
    - "n_layers" = 48
    - "n_heads" = 25

In [5]:
# Base configuration of the small GPT-2 model
# (same keys and values as the configuration used in Exercise 4.1)
GPT_CONFIG_124M = {
    "vocab_size": 50257,      # Vocabulary size
    "context_length": 1024,   # Context length (sequence length)
    "emb_dim": 768,          # Embedding dimension
    "n_heads": 12,           # Number of attention heads
    "n_layers": 12,          # Number of transformer layers
    "drop_rate": 0.1,        # Dropout rate
    "qkv_bias": False        # Whether the QKV transformations use a bias term
}


def get_config(base_config, model_name="gpt2-small"):
    """
    Build the configuration for one of the four GPT-2 model sizes.

    Args:
        base_config: Configuration dictionary used as the starting point.
            It is shallow-copied, so the caller's dictionary is not modified.
        model_name: One of "gpt2-small", "gpt2-medium", "gpt2-large" or
            "gpt2-xl". Defaults to "gpt2-small".

    Returns:
        A copy of ``base_config`` in which "emb_dim", "n_layers" and
        "n_heads" are set to the values of the requested model size.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    # Size-specific settings for every supported model, in lookup order
    size_presets = (
        ("gpt2-small", {"emb_dim": 768, "n_layers": 12, "n_heads": 12}),
        ("gpt2-medium", {"emb_dim": 1024, "n_layers": 24, "n_heads": 16}),
        ("gpt2-large", {"emb_dim": 1280, "n_layers": 36, "n_heads": 20}),
        ("gpt2-xl", {"emb_dim": 1600, "n_layers": 48, "n_heads": 25}),
    )

    config = base_config.copy()  # work on a copy of the base configuration

    # Names are compared with ==, so any value that equals none of the
    # supported names falls through to the error below
    for preset_name, overrides in size_presets:
        if model_name == preset_name:
            config.update(overrides)
            return config

    raise ValueError(f"Incorrect model name {model_name}")


def calculate_size(model):  # based on the chapter code
    """
    Print the parameter counts and the approximate size of a model.

    Three lines are printed: the total number of parameters, that total
    minus the parameters of ``model.out_head`` (the count when weight tying
    is taken into account), and the size in MB of the total at 4 bytes per
    parameter.

    Args:
        model: Object exposing ``parameters()`` and an ``out_head`` attribute
            that also exposes ``parameters()``; each parameter must provide
            ``numel()``.

    Returns:
        None. The results are only printed.
    """
    bytes_per_param = 4          # assumes float32 storage
    bytes_per_mb = 1024 * 1024

    # Count every parameter of the model
    n_params = sum(param.numel() for param in model.parameters())
    print(f"Total number of parameters: {n_params:,}")

    # Leave out the output head, whose weights would be shared under weight tying
    n_head_params = sum(param.numel() for param in model.out_head.parameters())
    n_params_tied = n_params - n_head_params
    print(f"Number of trainable parameters considering weight tying: {n_params_tied:,}")

    # Size of the full (untied) parameter set, converted from bytes to MB
    size_mb = n_params * bytes_per_param / bytes_per_mb
    print(f"Total size of the model: {size_mb:.2f} MB")

In [6]:
# Import the GPT model class
from gpt import GPTModel


# Loop over the different GPT-2 model sizes
for model_abbrev in ("small", "medium", "large", "xl"):
    # Build the full model name
    model_name = f"gpt2-{model_abbrev}"
    # Get the configuration for this model size
    CONFIG = get_config(GPT_CONFIG_124M, model_name=model_name)
    # Instantiate the GPT model
    model = GPTModel(CONFIG)
    # Print the model name
    print(f"\n\n{model_name}:")
    # Compute and print the model size
    calculate_size(model)



gpt2-small:
Total number of parameters: 163,009,536
Number of trainable parameters considering weight tying: 124,412,160
Total size of the model: 621.83 MB


gpt2-medium:
Total number of parameters: 406,212,608
Number of trainable parameters considering weight tying: 354,749,440
Total size of the model: 1549.58 MB


gpt2-large:
Total number of parameters: 838,220,800
Number of trainable parameters considering weight tying: 773,891,840
Total size of the model: 3197.56 MB


gpt2-xl:
Total number of parameters: 1,637,792,000
Number of trainable parameters considering weight tying: 1,557,380,800
Total size of the model: 6247.68 MB


# Exercise 4.3: Using separate dropout parameters
# 练习 4.3：使用独立的 dropout 参数

In [7]:
# Configuration for this exercise: compared with the earlier configurations,
# the single "drop_rate" entry is replaced by three separate dropout rates
GPT_CONFIG_124M = {
    "vocab_size": 50257,         # Vocabulary size
    "context_length": 1024,      # Context length (maximum sequence length)
    "emb_dim": 768,             # Dimension of the token and positional embeddings
    "n_heads": 12,              # Number of attention heads
    "n_layers": 12,             # Number of transformer layers
    "drop_rate_emb": 0.1,       # Dropout rate for the embedding layer
    "drop_rate_attn": 0.1,      # Dropout rate for the multi-head attention layer
    "drop_rate_shortcut": 0.1,  # Dropout rate for the shortcut (residual) connections
    "qkv_bias": False           # Whether the QKV projections use a bias term
}

In [8]:
# 导入必要的PyTorch模块和自定义组件
import torch.nn as nn
from gpt import MultiHeadAttention, LayerNorm, FeedForward


class TransformerBlock(nn.Module):
    """
    Transformer block with separately configurable dropout rates.

    Each of the two sub-layers in ``forward`` follows the same pattern:
    layer norm -> sub-module (attention or feed forward) -> dropout ->
    add the sub-layer's input back (shortcut connection).

    Args:
        cfg: Configuration mapping providing "emb_dim", "context_length",
            "n_heads", "drop_rate_attn", "drop_rate_shortcut" and
            "qkv_bias". It is also passed unchanged to ``FeedForward``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]

        # Multi-head attention gets its own dropout rate
        self.att = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate_attn"],
            qkv_bias=cfg["qkv_bias"],
        )
        # Feed forward network
        self.ff = FeedForward(cfg)
        # One layer norm in front of each sub-module
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        # A single dropout module, applied on both shortcut branches
        self.drop_shortcut = nn.Dropout(cfg["drop_rate_shortcut"])

    def forward(self, x):
        """
        Run the attention and feed forward sub-layers on ``x``.

        Args:
            x: Input tensor; the attention output is noted as having shape
                [batch_size, num_tokens, emb_size].

        Returns:
            The tensor obtained after both shortcut additions.
        """
        # Attention sub-layer: normalize, attend, drop, then add the input back
        residual = x
        x = self.drop_shortcut(self.att(self.norm1(x)))
        x = x + residual

        # Feed forward sub-layer: same pattern with the second layer norm
        residual = x
        x = self.drop_shortcut(self.ff(self.norm2(x)))
        x = x + residual

        return x


class GPTModel(nn.Module):
    """
    GPT model whose embedding dropout rate is configured separately.

    Args:
        cfg: Configuration mapping providing "vocab_size", "emb_dim",
            "context_length", "drop_rate_emb" and "n_layers". It is also
            passed unchanged to every ``TransformerBlock``.
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size, emb_dim = cfg["vocab_size"], cfg["emb_dim"]

        # Token and positional embedding tables
        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        # Dropout applied to the summed embeddings, with its own rate
        self.drop_emb = nn.Dropout(cfg["drop_rate_emb"])

        # Stack of "n_layers" transformer blocks, applied one after another
        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        # Final layer norm followed by a bias-free linear output head
        self.final_norm = LayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """
        Compute logits for a batch of token indices.

        Args:
            in_idx: Two-dimensional tensor of token indices; the first
                dimension is the batch and the second the sequence.

        Returns:
            The output of the linear head applied to the normalized
            transformer output.
        """
        # Only the sequence length is needed; the unpacking still requires
        # in_idx to have exactly two dimensions
        _, num_tokens = in_idx.shape

        token_vectors = self.tok_emb(in_idx)
        # Position indices are created on the same device as the input
        positions = torch.arange(num_tokens, device=in_idx.device)
        position_vectors = self.pos_emb(positions)

        # Combined embeddings, shape [batch_size, num_tokens, emb_size]
        hidden = token_vectors + position_vectors
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)

In [9]:
# Import the PyTorch library (torch is also imported in the first code cell)
import torch

# Set the random seed so that the results are reproducible
torch.manual_seed(123)
# Initialize the GPT model with the 124M-parameter configuration
# (when the cells are run in order, GPTModel and GPT_CONFIG_124M are the
# versions defined in this exercise, with separate dropout rates)
model = GPTModel(GPT_CONFIG_124M)