<table style="width:100%">
<tr>
<td style="vertical-align:middle; text-align:left;">
<font size="2">
Supplementary code for the <a href="http://mng.bz/orYv">Build a Large Language Model From Scratch</a> book by <a href="https://sebastianraschka.com">Sebastian Raschka</a><br>
<br>Code repository: <a href="https://github.com/rasbt/LLMs-from-scratch">https://github.com/rasbt/LLMs-from-scratch</a>
</font>
</td>
<td style="vertical-align:middle; text-align:left;">
<a href="http://mng.bz/orYv"><img src="https://sebastianraschka.com/images/LLMs-from-scratch-images/cover-small.webp" width="100px"></a>
</td>
</tr>
</table>


 # The Main Data Loading Pipeline Summarized
 # 主要数据加载流程总结

 The complete chapter code is located in [ch02.ipynb](./ch02.ipynb).
 
 完整的章节代码位于 [ch02.ipynb](./ch02.ipynb)。
 
 This notebook contains the main takeaway: the data loading pipeline without the intermediate steps.
 
 本笔记本包含了主要内容 - 不含中间步骤的数据加载流程。

 Packages that are being used in this notebook:
 
 本笔记本中使用的包:

In [1]:
# NBVAL_SKIP
# Report the installed versions of the packages this notebook relies on
from importlib.metadata import version

# One (label, distribution name) pair per package, printed in this order
for label, package in (("torch version:", "torch"), ("tiktoken version:", "tiktoken")):
    print(label, version(package))

torch version: 2.5.0
tiktoken version: 0.8.0


In [2]:
# 导入所需的包
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once. A window of ``max_length`` token IDs is then
    moved over the token sequence in steps of ``stride``; each window is one
    input sample, and the same window shifted right by one token is its target.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` whose
            result supports ``len()`` and slicing.
        max_length: Number of token IDs per sample; must be >= 1.
        stride: Distance between the starts of consecutive windows; must be >= 1.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is smaller than 1.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Reject degenerate window settings up front: a negative stride would
        # otherwise yield an empty dataset without any error, and a
        # non-positive max_length would yield empty or malformed chunks.
        if max_length < 1:
            raise ValueError(f"max_length must be a positive integer, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be a positive integer, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text; "<|endoftext|>" is passed as an allowed special token
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Slide a window of max_length tokens over the text. The range stops at
        # len(token_ids) - max_length so the shifted target chunk is always
        # full-length; text with max_length tokens or fewer gives an empty dataset.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair stored at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader over sliding-window samples of ``txt``.

    The text is wrapped in a ``GPTDatasetV1`` using the tiktoken "gpt2"
    encoding, and that dataset is handed to a ``DataLoader``.

    Args:
        txt: Raw text to turn into samples.
        batch_size: Forwarded to ``DataLoader``.
        max_length: Window length forwarded to ``GPTDatasetV1``.
        stride: Window step forwarded to ``GPTDatasetV1``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windowed_dataset = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    return DataLoader(
        windowed_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )


# Read the raw training text from disk
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Load the tiktoken "gpt2" encoding and encode the full text
# NOTE(review): encoded_text is not referenced by the other cells visible in this notebook
tokenizer = tiktoken.get_encoding("gpt2")
encoded_text = tokenizer.encode(raw_text)

# Embedding-table dimensions
vocab_size = 50257  # number of rows in the token embedding table
output_dim = 256    # width of each embedding vector
context_length = 1024  # number of rows in the positional embedding table

# Create the token embedding layer and the positional embedding layer
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

# Create the data loader; stride equals max_length, so consecutive input windows do not overlap
max_length = 4
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=max_length, stride=max_length)

In [4]:
# Iterate over the batches produced by the data loader
for batch in dataloader:
    # Unpack the batch into inputs x and targets y
    x, y = batch

    # Look up the token embeddings for the input token IDs
    token_embeddings = token_embedding_layer(x)
    # Look up the positional embeddings.
    # torch.arange(max_length) produces the consecutive integers 0 .. max_length-1
    # (for example [0, 1, 2, 3] when max_length=4).
    # Each integer is a position index within the sequence and selects that position's embedding row
    pos_embeddings = pos_embedding_layer(torch.arange(max_length))

    # Add token and positional embeddings to obtain the final input embeddings
    input_embeddings = token_embeddings + pos_embeddings

    # Only one batch is needed here, so leave the loop after the first iteration
    break

In [6]:
# Print the shape of the input embeddings; expected [batch_size, max_length, output_dim]
print(input_embeddings.shape)

torch.Size([8, 4, 256])
