In [None]:
#WikiText序列预测项目
#步骤 1：导入必要的工具包
#步骤 2：导入 wikiText-2 数据集并进行基本处理
#步骤 3：根据模型输入构建批量数据
#步骤 4：构建训练和评估函数
#步骤 5：进行训练和评估（包括验证和测试）

In [5]:
import torch
import torchtext
print(f"PyTorch version: {torch.__version__}")
print(f"TorchText version: {torchtext.__version__}")

PyTorch version: 2.2.0+cu121
TorchText version: 0.17.0+cpu


In [23]:
#从huggingface下载训练集，解压
!unzip archive.zip

Archive:  archive.zip
  inflating: wikitext-2/wiki.test.tokens  
  inflating: wikitext-2/wiki.train.tokens  
  inflating: wikitext-2/wiki.valid.tokens  


In [24]:
#预处理wikitext2数据集
import torch
from transformers import AutoTokenizer

# 加载分词器
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def process_file(filename, split_name):
    print(f"正在处理 {filename}...")

    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # 过滤空行和标题行（以=开头的行）
    texts = []
    for line in lines:
        line = line.strip()
        if line and not line.startswith('=') and len(line) > 5:
            texts.append(line)

    print(f"{split_name}集有效行数: {len(texts)}")

    all_input_ids = []
    all_attention_masks = []
    batch_size = 500

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]

        encoded = tokenizer(
            batch,
            truncation=True,
            max_length=128,
            padding="max_length",
            return_tensors="pt"
        )

        all_input_ids.append(encoded["input_ids"])
        all_attention_masks.append(encoded["attention_mask"])

        if (i // batch_size) % 10 == 0:  # 每10个batch显示一次进度
            print(f"{split_name} - 已处理: {min(i+batch_size, len(texts))}/{len(texts)}")

    return torch.cat(all_input_ids, dim=0), torch.cat(all_attention_masks, dim=0)

# 处理三个文件
print("开始处理WikiText-2数据集...")

train_ids, train_masks = process_file("wikitext-2/wiki.train.tokens", "训练")
val_ids, val_masks = process_file("wikitext-2/wiki.valid.tokens", "验证")
test_ids, test_masks = process_file("wikitext-2/wiki.test.tokens", "测试")

# 保存处理结果
torch.save({
    "train_input_ids": train_ids,
    "train_attention_mask": train_masks,
    "val_input_ids": val_ids,
    "val_attention_mask": val_masks,
    "test_input_ids": test_ids,
    "test_attention_mask": test_masks
}, "wikitext2_processed.pt")

print("\n" + "="*50)
print("WikiText-2 数据预处理完成！")
print("="*50)
print(f"训练集形状: {train_ids.shape}")
print(f"验证集形状: {val_ids.shape}")
print(f"测试集形状: {test_ids.shape}")
print(f"数据已保存到: wikitext2_processed.pt")
print("="*50)

# 显示一些统计信息
print(f"\n数据统计:")
print(f"- 训练样本数: {train_ids.shape[0]:,}")
print(f"- 验证样本数: {val_ids.shape[0]:,}")
print(f"- 测试样本数: {test_ids.shape[0]:,}")
print(f"- 序列长度: {train_ids.shape[1]}")

开始处理WikiText-2数据集...
正在处理 wikitext-2/wiki.train.tokens...
训练集有效行数: 17456
训练 - 已处理: 500/17456
训练 - 已处理: 5500/17456
训练 - 已处理: 10500/17456
训练 - 已处理: 15500/17456
正在处理 wikitext-2/wiki.valid.tokens...
验证集有效行数: 1841
验证 - 已处理: 500/1841
正在处理 wikitext-2/wiki.test.tokens...
测试集有效行数: 2163
测试 - 已处理: 500/2163

WikiText-2 数据预处理完成！
训练集形状: torch.Size([17456, 128])
验证集形状: torch.Size([1841, 128])
测试集形状: torch.Size([2163, 128])
数据已保存到: wikitext2_processed.pt

数据统计:
- 训练样本数: 17,456
- 验证样本数: 1,841
- 测试样本数: 2,163
- 序列长度: 128


In [38]:
#根据模型输入构建批量数据
import torch

def create_batches(data_path='wikitext2_processed.pt', batch_size=20, seq_len=35):
    """
    最简单的批量数据创建
    """
    # 加载数据
    data = torch.load(data_path)

    def process_data(input_ids):
        # 展平所有序列，移除padding
        tokens = []
        for seq in input_ids:
            valid_tokens = seq[seq != 0]  # 移除padding
            tokens.append(valid_tokens)

        # 合并所有token
        all_tokens = torch.cat(tokens, dim=0)

        # 批量化：[总长度] -> [seq_len, batch_size]
        nbatch = all_tokens.size(0) // batch_size
        all_tokens = all_tokens[:nbatch * batch_size]
        return all_tokens.view(batch_size, -1).t()

    # 处理三个数据集
    train_data = process_data(data['train_input_ids'])
    val_data = process_data(data['val_input_ids'])
    test_data = process_data(data['test_input_ids'])

    # 获取词汇表大小
    vocab_size = 30522  # BERT tokenizer的词汇表大小

    print(f"训练数据: {train_data.shape}")
    print(f"验证数据: {val_data.shape}")
    print(f"测试数据: {test_data.shape}")

    return train_data, val_data, test_data, vocab_size

def get_batch(source, i, seq_len=35):
    """
    获取一个批次的数据
    """
    seq_len = min(seq_len, len(source) - 1 - i)
    data = source[i:i+seq_len]           # 输入
    target = source[i+1:i+1+seq_len]     # 目标（下一个词）
    return data, target.reshape(-1)

# 使用示例
if __name__ == "__main__":
    # 创建批量数据
    train_data, val_data, test_data, vocab_size = create_batches()

    # 获取一个批次测试
    data, targets = get_batch(train_data, 0)
    print(f"批次输入: {data.shape}")
    print(f"批次目标: {targets.shape}")

    print("✅ 完成！")

训练数据: torch.Size([84773, 20])
验证数据: torch.Size([9058, 20])
测试数据: torch.Size([10265, 20])
批次输入: torch.Size([35, 20])
批次目标: torch.Size([700])
✅ 完成！


In [39]:
#获取官方源码
# -*- coding: utf-8 -*-

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class TransformerModel(nn.Module):

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            device = src.device
            mask = self._generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask

        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return output



class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


In [41]:
#设置超参数和构建模型
ntoken = vocab_size # vocab_size是因为用的是bert分词生成的字典
ninp = 200 #词嵌入维度
nhead = 2
nhid = 200 #前馈全连接层节点数
nlayers = 2
dropout = 0.2

#设置cuda设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#按照超参数创建模型对象
model = TransformerModel(ntoken=ntoken,ninp=ninp,nhead=nhead,nhid=nhid,nlayers=nlayers,dropout=dropout).to(device)

#准备训练
criterion = nn.CrossEntropyLoss()
lr = 5.0 # 学习率
optimizer = torch.optim.SGD(model.parameters(), lr=lr) #创建优化器
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)#学习率自动调整工具

# 开始训练
for epoch in range(2):
    model.train()
    total_loss = 0

    for batch, i in enumerate(range(0, train_data.size(0) - 1, 35)):
        data, targets = get_batch(train_data, i)

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output.view(-1, ntoken), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()

        if batch % 100 == 0:
            print(f'Epoch {epoch}, Batch {batch}, Loss: {loss.item():.4f}')

    print(f'Epoch {epoch} finished, Avg Loss: {total_loss/batch:.4f}')

print("训练完成！")




Epoch 0, Batch 0, Loss: 10.6197
Epoch 0, Batch 100, Loss: 6.8308
Epoch 0, Batch 200, Loss: 6.4037


KeyboardInterrupt: 

In [42]:
import torch
import torch.nn as nn
import math
from transformers import AutoTokenizer

# 获取正确的词汇表大小
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")# 获取bert分词器
vocab_size = tokenizer.vocab_size  # 30522

# 设置超参数和构建模型
ntoken = vocab_size # BERT分词器的词汇表大小
ninp = 200 #词嵌入维度
nhead = 2
nhid = 200 #前馈全连接层节点数
nlayers = 2
dropout = 0.2

# 设置cuda设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用设备: {device}")

# 按照超参数创建模型对象
model = TransformerModel(ntoken=ntoken,ninp=ninp,nhead=nhead,nhid=nhid,nlayers=nlayers,dropout=dropout).to(device)

# 将数据移到设备上
train_data = train_data.to(device)
val_data = val_data.to(device)
test_data = test_data.to(device)

# 准备训练
criterion = nn.CrossEntropyLoss()
lr = 5.0 # 学习率
optimizer = torch.optim.SGD(model.parameters(), lr=lr) #创建优化器
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)#学习率自动调整工具

def evaluate_model(model, data, device):
    """简单的模型评估函数"""
    model.eval()  # 设置为评估模式
    total_loss = 0.0
    total_batches = 0

    with torch.no_grad():  # 不计算梯度，节省内存
        for i in range(0, data.size(0) - 1, 35):
            # 获取批次数据
            input_data, targets = get_batch(data, i)
            input_data, targets = input_data.to(device), targets.to(device)

            # 前向传播
            output = model(input_data)
            loss = criterion(output.view(-1, ntoken), targets)

            total_loss += loss.item()
            total_batches += 1

    # 计算平均损失和困惑度
    avg_loss = total_loss / total_batches
    perplexity = math.exp(avg_loss)

    return avg_loss, perplexity

def simple_evaluate():
    """简单评估并打印结果"""
    # 验证集评估
    val_loss, val_ppl = evaluate_model(model, val_data, device)
    print(f"验证集 - Loss: {val_loss:.4f}, 困惑度: {val_ppl:.2f}")

    # 测试集评估
    test_loss, test_ppl = evaluate_model(model, test_data, device)
    print(f"测试集 - Loss: {test_loss:.4f}, 困惑度: {test_ppl:.2f}")

    return val_loss, test_loss

# 开始训练
print("开始训练...")
print(f"训练数据形状: {train_data.shape}")
print(f"总批次数: {(train_data.size(0) - 1) // 35}")

for epoch in range(2):
    model.train()
    total_loss = 0
    total_batches = (train_data.size(0) - 1) // 35

    for batch, i in enumerate(range(0, train_data.size(0) - 1, 35)):
        data, targets = get_batch(train_data, i)
        data, targets = data.to(device), targets.to(device)

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output.view(-1, ntoken), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()

        if batch % 100 == 0:
            print(f'Epoch {epoch}, Batch {batch}/{total_batches}, Loss: {loss.item():.4f}')

    avg_loss = total_loss / (batch + 1)
    print(f'Epoch {epoch} finished, Avg Loss: {avg_loss:.4f}')

    # 每个epoch后评估
    print(f"\nEpoch {epoch} 评估结果:")
    val_loss, test_loss = simple_evaluate()
    print("-" * 50)

    # 更新学习率
    scheduler.step()

print("训练完成！")

# 最终评估
print("\n=== 最终评估结果 ===")
simple_evaluate()

# 保存模型（可选）
torch.save(model.state_dict(), 'wikitext_transformer_model.pt')
print("模型已保存到: wikitext_transformer_model.pt")

使用设备: cpu
开始训练...
训练数据形状: torch.Size([84773, 20])
总批次数: 2422
Epoch 0, Batch 0/2422, Loss: 10.6549
Epoch 0, Batch 100/2422, Loss: 6.6222
Epoch 0, Batch 200/2422, Loss: 6.4848
Epoch 0, Batch 300/2422, Loss: 6.3450
Epoch 0, Batch 400/2422, Loss: 5.9682
Epoch 0, Batch 500/2422, Loss: 6.0599
Epoch 0, Batch 600/2422, Loss: 6.2041
Epoch 0, Batch 700/2422, Loss: 5.9127
Epoch 0, Batch 800/2422, Loss: 5.9831
Epoch 0, Batch 900/2422, Loss: 5.7636
Epoch 0, Batch 1000/2422, Loss: 5.6051
Epoch 0, Batch 1100/2422, Loss: 5.5765
Epoch 0, Batch 1200/2422, Loss: 6.1256
Epoch 0, Batch 1300/2422, Loss: 5.9243
Epoch 0, Batch 1400/2422, Loss: 5.3704
Epoch 0, Batch 1500/2422, Loss: 5.7416
Epoch 0, Batch 1600/2422, Loss: 5.4538
Epoch 0, Batch 1700/2422, Loss: 5.3165
Epoch 0, Batch 1800/2422, Loss: 5.2734
Epoch 0, Batch 1900/2422, Loss: 5.2922
Epoch 0, Batch 2000/2422, Loss: 5.4741
Epoch 0, Batch 2100/2422, Loss: 5.6420
Epoch 0, Batch 2200/2422, Loss: 5.2162
Epoch 0, Batch 2300/2422, Loss: 5.6071
Epoch 0, Batch