In [None]:
# If `datasets` is not installed, install it first in a separate cell:
# !pip install -q datasets

import torch
from practice.data_prep import prepare_shakespeare_loaders

# Hyperparameters; adjust as needed.
seq_len = 100
batch_size = 2

# Collect the loader settings in one mapping and forward them as keyword
# arguments to the project helper.
loader_kwargs = dict(
    seq_len=seq_len,
    batch_size=batch_size,
    # Tunable; per the original note the split is made over the contiguous
    # token stream (behavior lives in practice.data_prep -- confirm there).
    val_ratio=0.1,
    seed=42,
    # 0 is the usual choice when running inside Jupyter.
    num_workers=0,
    shuffle_train=True,
)
loaders, tokenizer, vocab_size = prepare_shakespeare_loaders(**loader_kwargs)

print("vocab_size:", vocab_size)

# Pull one batch from the training loader to inspect its shape and contents.
train_loader = loaders["train"]
xb, yb = next(iter(train_loader))

# Per the original notes, both shapes are expected to be
# (batch_size, seq_len) and the dtype torch.long.
for shape_label, tensor in (("xb shape:", xb), ("yb shape:", yb)):
    print(shape_label, tensor.shape)
print("xb dtype:", xb.dtype)

# Decode the first row of the inputs and targets back to text as a sanity
# check that the data looks reasonable.
for header, row in (("sample x[0]:", xb[0]), ("sample y[0]:", yb[0])):
    print(header)
    print(tokenizer.decode(row.tolist()))

# Device placement (used during training): prefer CUDA when available.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
xb = xb.to(device)
yb = yb.to(device)

# From here, vocab_size can be used to initialise Embedding/GRU layers etc.
# For example:
# import torch.nn as nn
# emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=256).to(device)
# gru = nn.GRU(input_size=256, hidden_size=512, batch_first=True).to(device)
