In [1]:
# from tokenizers import Tokenizer, models, pre_tokenizers, trainers, decoders, processors

# files = ["train_sampled.txt", "TinyStoriesV2-GPT4-valid.txt"]
# tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
# tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
# trainer = trainers.BpeTrainer(vocab_size=8192, special_tokens=["<pad>", "<unk>", "<bos>", "<eos>","<|endoftext|>"])
# tokenizer.train(files, trainer)

# tokenizer.decoder = decoders.BPEDecoder()
# tokenizer.post_processor = processors.TemplateProcessing(
#     single="<bos> $A <eos>",
#     pair="<bos> $A <eos> $B:1 <eos>:1",
#     special_tokens=[("<bos>", tokenizer.token_to_id("<bos>")), ("<eos>", tokenizer.token_to_id("<eos>"))],
# )

# tokenizer.save("tiny_tokenizer.json")
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors

tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))

# 使用 ByteLevel 分词器可以保留空格等边界信息
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer.decoder = decoders.ByteLevel()

trainer = trainers.BpeTrainer(
    vocab_size=8192,
    special_tokens=["<pad>", "<unk>", "<bos>", "<eos>", "<|endoftext|>"]
)

tokenizer.train( ["./data/train_sampled.txt", "./data/TinyStoriesV2-GPT4-valid.txt", "./data/alpaca_tokenizer_text.txt"], trainer)

tokenizer.post_processor = processors.TemplateProcessing(
    single="<bos> $A <eos>",
    pair="<bos> $A <eos> $B:1 <eos>:1",
    special_tokens=[
        ("<bos>", tokenizer.token_to_id("<bos>")),
        ("<eos>", tokenizer.token_to_id("<eos>"))
    ],
)

tokenizer.save("./data/tiny_tokenizer.json")








In [2]:
# 
import torch
# from torch.utils.data import Dataset

# class TokenizedDataset(Dataset):
#     def __init__(self, text, tokenizer, block_size):
#         # text: str, tokenizer: 已加载的tokenizer，block_size: int
#         self.token_ids = tokenizer.encode(text).ids
#         self.block_size = block_size
#         self.num_blocks = (len(self.token_ids) - 1) // block_size

#     def __len__(self):
#         return self.num_blocks

#     def __getitem__(self, idx):
#         start = idx * self.block_size
#         end = start + self.block_size + 1
#         chunk = self.token_ids[start:end]
#         x = torch.tensor(chunk[:-1], dtype=torch.long)
#         y = torch.tensor(chunk[1:], dtype=torch.long)
#         return x, y
import torch
from torch.utils.data import Dataset

class TokenizedDataset(Dataset):
    def __init__(self, text, tokenizer, block_size=1024):
        self.block_size = block_size
        self.chunks = []

        stories = text.split('<|endoftext|>')
        for story in stories:
            encoded = tokenizer.encode(story + '<|endoftext|>').ids
            if len(encoded) < block_size + 1:
                continue
            for i in range(0, len(encoded) - block_size, block_size):
                chunk = encoded[i:i + block_size + 1]
                self.chunks.append(chunk)

    def __len__(self):
        return len(self.chunks)

    def __getitem__(self, idx):
        chunk = self.chunks[idx]
        x = torch.tensor(chunk[:-1], dtype=torch.long)
        y = torch.tensor(chunk[1:], dtype=torch.long)
        return x, y



In [3]:
import torch.nn as nn
import torch.nn.functional as F
import torch
class TransformerBlock(nn.Module):
    def __init__(self, emb_dim, n_heads):
        super().__init__()
        self.attn = nn.MultiheadAttention(emb_dim, n_heads, batch_first=True)
        self.norm1 = nn.LayerNorm(emb_dim)
        self.mlp = nn.Sequential(
            nn.Linear(emb_dim, 4 * emb_dim),
            nn.GELU(),
            nn.Linear(4 * emb_dim, emb_dim)
        )
        self.norm2 = nn.LayerNorm(emb_dim)

    def forward(self, x):
        B, T, C = x.size()

        # 生成 causal mask，保证第 t 个位置只能看到 <= t 的位置
        mask = torch.tril(torch.ones(T, T, device=x.device)).unsqueeze(0).repeat(B, 1, 1)
        # nn.MultiheadAttention 需要 bool mask，True 表示被遮挡
        attn_mask = ~mask.bool()[0]  # (T, T) bool，True 表示遮挡

        attn_out, _ = self.attn(x, x, x, attn_mask=attn_mask, need_weights=False)
        x = x + attn_out
        x = self.norm1(x)
        mlp_out = self.mlp(x)
        x = x + mlp_out
        return self.norm2(x)

class TinyTransformer(nn.Module):
    def __init__(self, vocab_size, emb_dim=512, n_heads=16, n_layers=12, block_size=1024):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, emb_dim)
        self.pos_embedding = nn.Parameter(torch.randn(1, block_size, emb_dim))
        self.blocks = nn.Sequential(*[
            TransformerBlock(emb_dim, n_heads) for _ in range(n_layers)
        ])
        self.ln = nn.LayerNorm(emb_dim)
        self.fc = nn.Linear(emb_dim, vocab_size)

    def forward(self, x):
        tok_emb = self.token_embedding(x)
        x = tok_emb + self.pos_embedding[:, :x.size(1), :]
        x = self.blocks(x)
        x = self.ln(x)
        logits = self.fc(x)
        return logits

In [4]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
import math

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 加载 tokenizer
from tokenizers import Tokenizer as RawTokenizer
raw_tokenizer = RawTokenizer.from_file("./data/tiny_tokenizer.json")
vocab_size = raw_tokenizer.get_vocab_size()

# 读文本
with open("./data/train_sampled.txt", "r", encoding="utf-8") as f:
    train_text = f.read()
with open("./data/TinyStoriesV2-GPT4-valid.txt", "r", encoding="utf-8") as f:
    valid_text = f.read()

block_size = 1024
batch_size = 64

train_dataset = TokenizedDataset(train_text, raw_tokenizer, block_size)
valid_dataset = TokenizedDataset(valid_text, raw_tokenizer, block_size)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size)

model = TinyTransformer(vocab_size)
if torch.cuda.device_count()>1:
    model = nn.DataParallel(model)
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

def get_loss(logits, targets):
    B, T, V = logits.shape
    logits = logits.view(B * T, V)
    targets = targets.view(B * T)
    return F.cross_entropy(logits, targets)

@torch.no_grad()
def evaluate(val_loader):
    model.eval()
    total_loss = 0
    count = 0
    for x, y in val_loader:
        x, y = x.to(device), y.to(device)
        logits = model(x)
        loss = get_loss(logits, y)
        total_loss += loss.item()
        count += 1
    model.train()
    return math.exp(total_loss / count)  # 计算PPL


    There is an imbalance between your GPUs. You may want to exclude GPU 4 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


In [5]:
# import os
# # torch.cuda.empty_cache()
# save_path = "tiny_transformer_checkpoint.pth"

# epochs = 1
# for epoch in range(epochs):
#     for x, y in train_loader:
#         x, y = x.to(device), y.to(device)
#         logits = model(x)
#         loss = get_loss(logits, y)
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#     val_ppl = evaluate(valid_loader)
#     print(f"Epoch {epoch+1} | Val PPL: {val_ppl:.2f}")

#     torch.save({
#         'model_state_dict': model.state_dict(),
#         'optimizer_state_dict': optimizer.state_dict(),
#         'epoch': epoch + 1,
#         'val_ppl': val_ppl,
#     }, save_path)
#     print(f"模型已保存到 {save_path}")


In [6]:
import os 
import torch
import math
import matplotlib.pyplot as plt
import random
import numpy as np


def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # 避免某些算子非确定性行为（会稍慢一点）
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)  
# 创建目录
save_dir = "checkpoints"
pic_dir = "pic"
os.makedirs(save_dir, exist_ok=True)
os.makedirs(pic_dir, exist_ok=True)

best_model_path = os.path.join(save_dir, "tiny_transformer_best.pth")

best_val_ppl = float('inf')
epochs = 300

train_losses = []
val_ppls = []

for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    count = 0

    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        logits = model(x)
        loss = get_loss(logits, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()
        count += 1

    avg_train_loss = total_train_loss / count
    train_losses.append(avg_train_loss)

    # 验证
    model.eval()
    with torch.no_grad():
        total_val_loss = 0
        count = 0
        for x, y in valid_loader:
            x, y = x.to(device), y.to(device)
            logits = model(x)
            loss = get_loss(logits, y)
            total_val_loss += loss.item()
            count += 1
        avg_val_loss = total_val_loss / count
        val_ppl = math.exp(avg_val_loss)
        val_ppls.append(val_ppl)

    print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val PPL: {val_ppl:.2f}")

    # 保存模型
    epoch_path = os.path.join(save_dir, f"checkpoint_epoch_{epoch+1}.pth")
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epoch': epoch + 1,
        'val_ppl': val_ppl,
    }, epoch_path)

    if val_ppl < best_val_ppl:
        best_val_ppl = val_ppl
        torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'epoch': epoch + 1,
            'val_ppl': val_ppl,
        }, best_model_path)

    # 提前停止条件
    if val_ppl < 30:
        print(f"🎉 提前停止：验证集 PPL 达到 {val_ppl:.2f} < 30")
        break

# 画图（只画已训练轮数）
epochs_ran = range(1, len(train_losses) + 1)

# 图 1：完整曲线
plt.figure(figsize=(10, 5))
plt.plot(epochs_ran, train_losses, label='Train Loss')
plt.plot(epochs_ran, val_ppls, label='Val PPL')
plt.xlabel('Epoch')
plt.ylabel('Value')
plt.title('Training Loss & Validation Perplexity')
plt.legend()
plt.grid(True)

plot_path = os.path.join(pic_dir, 'training_plot.png')
plt.savefig(plot_path)
print(f"已保存训练曲线图像到 {plot_path}")
plt.close()

# 图 2：放大版 PPL<200
plt.figure(figsize=(10, 5))
plt.plot(epochs_ran, val_ppls, label='Val PPL (zoomed)')
plt.axhline(y=200, color='gray', linestyle='--', linewidth=1)
plt.xlabel('Epoch')
plt.ylabel('PPL')
plt.title('Validation PPL (Zoomed View, <200)')
plt.ylim(0, 200)
plt.grid(True)
plt.legend()

zoom_path = os.path.join(pic_dir, 'ppl_zoomed.png')
plt.savefig(zoom_path)
print(f"已保存验证集 PPL 放大图到 {zoom_path}")
plt.close()



  return torch._native_multi_head_attention(


Epoch 1 | Train Loss: 9.1772 | Val PPL: 2878.29
Epoch 2 | Train Loss: 7.9645 | Val PPL: 1913.08
Epoch 3 | Train Loss: 7.5823 | Val PPL: 1538.71
Epoch 4 | Train Loss: 7.3777 | Val PPL: 1358.24
Epoch 5 | Train Loss: 7.2337 | Val PPL: 1130.77
Epoch 6 | Train Loss: 7.0428 | Val PPL: 1670.60
Epoch 7 | Train Loss: 7.4055 | Val PPL: 918.97
Epoch 8 | Train Loss: 6.8034 | Val PPL: 857.83
Epoch 9 | Train Loss: 6.7225 | Val PPL: 773.41
Epoch 10 | Train Loss: 6.6055 | Val PPL: 690.13
Epoch 11 | Train Loss: 6.4791 | Val PPL: 612.50
Epoch 12 | Train Loss: 6.3486 | Val PPL: 542.63
Epoch 13 | Train Loss: 6.2185 | Val PPL: 493.86
Epoch 14 | Train Loss: 6.1232 | Val PPL: 469.58
Epoch 15 | Train Loss: 6.0433 | Val PPL: 441.05
Epoch 16 | Train Loss: 5.9730 | Val PPL: 414.56
Epoch 17 | Train Loss: 5.9076 | Val PPL: 399.88
Epoch 18 | Train Loss: 5.8679 | Val PPL: 386.43
Epoch 19 | Train Loss: 5.8271 | Val PPL: 371.70
Epoch 20 | Train Loss: 5.7793 | Val PPL: 359.52
Epoch 21 | Train Loss: 5.7364 | Val PPL: 35