In [13]:
# from tokenizers import Tokenizer, models, pre_tokenizers, trainers, decoders, processors

# files = ["train_sampled.txt", "TinyStoriesV2-GPT4-valid.txt"]
# tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
# tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
# trainer = trainers.BpeTrainer(vocab_size=8192, special_tokens=["<pad>", "<unk>", "<bos>", "<eos>","<|endoftext|>"])
# tokenizer.train(files, trainer)

# tokenizer.decoder = decoders.BPEDecoder()
# tokenizer.post_processor = processors.TemplateProcessing(
#     single="<bos> $A <eos>",
#     pair="<bos> $A <eos> $B:1 <eos>:1",
#     special_tokens=[("<bos>", tokenizer.token_to_id("<bos>")), ("<eos>", tokenizer.token_to_id("<eos>"))],
# )

# tokenizer.save("tiny_tokenizer.json")
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors

# Train a byte-level BPE tokenizer and save it to "tiny_tokenizer.json".
# "<unk>" stays configured as the model's fallback token.
tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))

# ByteLevel pre-tokenization keeps whitespace / word-boundary information
# inside the tokens, so the ByteLevel decoder can restore the original spacing.
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer.decoder = decoders.ByteLevel()

trainer = trainers.BpeTrainer(
    vocab_size=8192,
    # Order matters: special tokens receive ids in the order listed here.
    special_tokens=["<pad>", "<unk>", "<bos>", "<eos>", "<|endoftext|>"],
    # Seed the vocabulary with the full byte-level alphabet. Without it, bytes
    # that never occur in the training files get no vocabulary entry and are
    # encoded as "<unk>" later (e.g. for non-English prompts).
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)

tokenizer.train(
    ["train_sampled.txt", "TinyStoriesV2-GPT4-valid.txt", "alpaca_tokenizer_text.txt"],
    trainer,
)

# Wrap every encoded sequence (or sequence pair) in <bos> ... <eos>.
# The ids are looked up after training, once the special tokens exist.
tokenizer.post_processor = processors.TemplateProcessing(
    single="<bos> $A <eos>",
    pair="<bos> $A <eos> $B:1 <eos>:1",
    special_tokens=[
        ("<bos>", tokenizer.token_to_id("<bos>")),
        ("<eos>", tokenizer.token_to_id("<eos>")),
    ],
)

tokenizer.save("tiny_tokenizer.json")








In [14]:
# 
import torch
# from torch.utils.data import Dataset

# class TokenizedDataset(Dataset):
#     def __init__(self, text, tokenizer, block_size):
#         # text: str, tokenizer: 已加载的tokenizer，block_size: int
#         self.token_ids = tokenizer.encode(text).ids
#         self.block_size = block_size
#         self.num_blocks = (len(self.token_ids) - 1) // block_size

#     def __len__(self):
#         return self.num_blocks

#     def __getitem__(self, idx):
#         start = idx * self.block_size
#         end = start + self.block_size + 1
#         chunk = self.token_ids[start:end]
#         x = torch.tensor(chunk[:-1], dtype=torch.long)
#         y = torch.tensor(chunk[1:], dtype=torch.long)
#         return x, y
import torch
from torch.utils.data import Dataset

class TokenizedDataset(Dataset):
    """Fixed-length next-token-prediction samples cut from individual stories.

    The text is split on the literal '<|endoftext|>' marker. Each story is
    tokenized on its own (with the marker appended again) and sliced into
    windows of ``block_size + 1`` token ids that start every ``block_size``
    tokens, so consecutive windows share exactly one token.

    Stories that tokenize to fewer than ``block_size + 1`` ids are skipped
    entirely, and trailing tokens that do not fill a whole window are dropped.

    Args:
        text: Raw corpus containing '<|endoftext|>' separators.
        tokenizer: Object whose ``encode(str)`` result exposes an ``ids`` list.
        block_size: Number of input tokens per sample.
    """

    def __init__(self, text, tokenizer, block_size=256):
        self.block_size = block_size
        self.chunks = []

        # block_size inputs plus one extra token for the shifted target.
        window = block_size + 1
        for story in text.split('<|endoftext|>'):
            ids = tokenizer.encode(story + '<|endoftext|>').ids
            if len(ids) < window:
                continue
            # Stop early enough that every slice is exactly `window` long.
            self.chunks.extend(
                ids[start:start + window]
                for start in range(0, len(ids) - block_size, block_size)
            )

    def __len__(self):
        """Number of stored windows."""
        return len(self.chunks)

    def __getitem__(self, idx):
        """Return ``(inputs, targets)``; targets are the inputs shifted by one."""
        tokens = self.chunks[idx]
        inputs = torch.tensor(tokens[:-1], dtype=torch.long)
        targets = torch.tensor(tokens[1:], dtype=torch.long)
        return inputs, targets



In [15]:
import torch.nn as nn
import torch.nn.functional as F
import torch
class TransformerBlock(nn.Module):
    """Post-norm Transformer block: causal self-attention followed by an MLP.

    Each sub-layer is applied as ``LayerNorm(x + sublayer(x))``.

    Args:
        emb_dim: Width of the token representations.
        n_heads: Number of attention heads (forwarded to nn.MultiheadAttention).
    """

    def __init__(self, emb_dim, n_heads):
        super().__init__()
        self.attn = nn.MultiheadAttention(emb_dim, n_heads, batch_first=True)
        self.norm1 = nn.LayerNorm(emb_dim)
        self.mlp = nn.Sequential(
            nn.Linear(emb_dim, 4 * emb_dim),
            nn.GELU(),
            nn.Linear(4 * emb_dim, emb_dim)
        )
        self.norm2 = nn.LayerNorm(emb_dim)

    def forward(self, x):
        """Apply the block to ``x`` of shape (batch, seq, emb_dim).

        Returns a tensor of the same shape. Position t only attends to
        positions <= t.
        """
        # Unpacking three sizes also rejects inputs that are not 3-D.
        _, T, _ = x.size()

        # Causal mask shared by the whole batch: a single (T, T) boolean matrix
        # where True marks a (query, key) pair that must NOT be attended to,
        # i.e. the strictly upper triangle (key position > query position).
        # Built directly at (T, T) instead of materialising a (B, T, T) copy
        # and discarding all but one slice.
        attn_mask = torch.ones(T, T, device=x.device).triu(diagonal=1).bool()

        attn_out, _ = self.attn(x, x, x, attn_mask=attn_mask, need_weights=False)
        x = x + attn_out
        x = self.norm1(x)
        mlp_out = self.mlp(x)
        x = x + mlp_out
        return self.norm2(x)

class TinyTransformer(nn.Module):
    """Small decoder-style language model.

    Pipeline: token embedding + learned position embedding -> ``n_layers``
    TransformerBlock modules -> LayerNorm -> linear projection to the vocabulary.

    Args:
        vocab_size: Number of token ids (embedding rows and output logits).
        emb_dim: Hidden width.
        n_heads: Attention heads per block.
        n_layers: Number of stacked blocks.
        block_size: Number of positions in the position-embedding table.
    """

    def __init__(self, vocab_size, emb_dim=512, n_heads=16, n_layers=12, block_size=512):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, emb_dim)
        # Learned absolute position table with one row per position.
        self.pos_embedding = nn.Parameter(torch.randn(1, block_size, emb_dim))

        layers = []
        for _ in range(n_layers):
            layers.append(TransformerBlock(emb_dim, n_heads))
        self.blocks = nn.Sequential(*layers)

        self.ln = nn.LayerNorm(emb_dim)
        self.fc = nn.Linear(emb_dim, vocab_size)

    def forward(self, x):
        """Map token ids ``x`` of shape (batch, seq) to logits (batch, seq, vocab_size).

        Only the first ``seq`` rows of the position table are used; if ``seq``
        is larger than the table, the addition fails on a shape mismatch.
        """
        seq_len = x.size(1)
        hidden = self.token_embedding(x) + self.pos_embedding[:, :seq_len, :]
        hidden = self.blocks(hidden)
        return self.fc(self.ln(hidden))

In [16]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
import math

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer from "tiny_tokenizer.json".
from tokenizers import Tokenizer as RawTokenizer
raw_tokenizer = RawTokenizer.from_file("tiny_tokenizer.json")
vocab_size = raw_tokenizer.get_vocab_size()

# Read the raw training and validation text.
with open("train_sampled.txt", "r", encoding="utf-8") as f:
    train_text = f.read()
with open("TinyStoriesV2-GPT4-valid.txt", "r", encoding="utf-8") as f:
    valid_text = f.read()

block_size = 512
batch_size = 64

train_dataset = TokenizedDataset(train_text, raw_tokenizer, block_size)
valid_dataset = TokenizedDataset(valid_text, raw_tokenizer, block_size)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size)

# Pass block_size explicitly so the model's position table always matches the
# dataset's sequence length, instead of relying on the model's default
# happening to equal the value chosen above.
model = TinyTransformer(vocab_size, block_size=block_size).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

def get_loss(logits, targets):
    """Mean cross-entropy over every position of a batch.

    Args:
        logits: Tensor of shape (batch, seq, vocab).
        targets: Tensor of target ids with batch * seq elements.

    Returns:
        Scalar tensor: cross-entropy averaged over all batch * seq positions.
    """
    batch, seq_len, vocab = logits.shape
    n_positions = batch * seq_len
    flat_logits = logits.view(n_positions, vocab)
    flat_targets = targets.view(n_positions)
    return F.cross_entropy(flat_logits, flat_targets)

@torch.no_grad()
def evaluate(val_loader):
    """Compute perplexity of the global ``model`` over ``val_loader``.

    The loss is averaged per token across the whole loader: each batch's mean
    loss is weighted by its number of target tokens, so a smaller final batch
    does not get the same weight as a full one.

    Args:
        val_loader: Iterable yielding ``(x, y)`` batches of inputs and targets.

    Returns:
        ``exp(mean token loss)`` as a float, or NaN when the loader yields
        no tokens (perplexity is undefined in that case).
    """
    # Remember the current mode so evaluation does not silently flip a model
    # that was already in eval mode into train mode.
    was_training = model.training
    model.eval()
    total_loss = 0.0
    total_tokens = 0
    try:
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            n_tokens = y.numel()
            # get_loss returns the batch mean; scale back to a per-batch sum.
            total_loss += get_loss(model(x), y).item() * n_tokens
            total_tokens += n_tokens
    finally:
        # Restore the previous mode even if a batch raised.
        model.train(was_training)

    if total_tokens == 0:
        return float("nan")
    return math.exp(total_loss / total_tokens)  # perplexity


In [21]:
import os
# torch.cuda.empty_cache()
# Checkpoint file, overwritten at the end of every epoch.
save_path = "tiny_transformer_checkpoint.pth"

epochs = 1
for epoch in range(epochs):
    # One optimizer step per mini-batch.
    for x, y in train_loader:
        x = x.to(device)
        y = y.to(device)

        # Clear gradients from the previous step before computing new ones.
        optimizer.zero_grad()
        logits = model(x)
        loss = get_loss(logits, y)
        loss.backward()
        optimizer.step()

    # End of epoch: report validation perplexity, then checkpoint.
    val_ppl = evaluate(valid_loader)
    print(f"Epoch {epoch+1} | Val PPL: {val_ppl:.2f}")

    torch.save(
        dict(
            model_state_dict=model.state_dict(),
            optimizer_state_dict=optimizer.state_dict(),
            epoch=epoch + 1,
            val_ppl=val_ppl,
        ),
        save_path,
    )
    print(f"模型已保存到 {save_path}")


Epoch 1 | Val PPL: 22.17
模型已保存到 tiny_transformer_checkpoint.pth
