In [1]:
import torch
from torch import candidate

device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 设置设备
print(torch.__file__)
print(torch.__version__)
print(device)

E:\software\Anaconda\envs\llm-gpt\lib\site-packages\torch\__init__.py
2.1.0+cu118
cuda


In [2]:
import numpy as np
import torch
import torch.nn as nn
from torch.cuda import device

d_k = 64 # K（=Q）维度
d_v = 64 # V维度
# 定义缩放点积注意力类
class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()
    def forward(self, Q, K, V, attn_mask):
        #----------------------维度信息------------------------
        # Q、K、V [batch_size, n_heads, len_q/k/v, dim_k/v] (dim_q=dim_k)
        # attn_mask [batch_size, n_heads, len_q, len_k]
        #-------------------------------------------------------
        # 计算注意力分数（原始权重）[batch_size, n_heads, len_q, dim_q] * [batch_size, n_heads, dim_k, len_q] = [batch_size, n_heads, len_q, len_k]
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k)
        #----------------------维度信息------------------------
        # scores [batch_size, n_heads, len_q, len_k]
        #-------------------------------------------------------
        # 使用注意力掩码，将attn_mask中值为1的位置的权重替换为极小值
        #----------------------维度信息------------------------
        # attn_mask [batch_size, n_heads, len_q, len_k]
        #-------------------------------------------------------
        scores.masked_fill_(attn_mask, -1e9)
        # 用softmax函数对scores进行归一化，得到注意力权重
        weights = nn.Softmax(dim=-1)(scores)  # 创建了一个指定最后一个维度的softmax层，并对scores进行softmax操作
        #----------------------维度信息------------------------
        # weights [batch_size, n_heads, len_q, len_k]
        #-------------------------------------------------------
        # 计算上下文向量（也就是注意力值），是上下文信息的紧凑表示 [batch_size, n_heads, len_q, len_k] * [batch_size, n_heads, len_k, dim_v] = [batch_size, n_heads, len_q, dim_v]
        context = torch.matmul(weights, V)
        #----------------------维度信息------------------------
        # context [batch_size, n_heads, len_q, dim_v]
        #-------------------------------------------------------
        return context, weights # 返回上下文信息和注意力权重

In [3]:
# 定义多头注意力类
d_embedding = 512 # 词嵌入维度
n_heads = 8 # 多头注意力个数
batch_size = 3 # 批次大小
class MultiHeadAttention(nn.Module):
    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        # 确保d_model能被n_heads整除
        assert d_embedding % n_heads == 0
        self.W_Q = nn.Linear(d_embedding, d_k * n_heads)  # Q的线性变换层
        self.W_K = nn.Linear(d_embedding, d_k * n_heads)  # K的线性变换层
        self.W_V = nn.Linear(d_embedding, d_v * n_heads)  # V的线性变换层
        self.linear = nn.Linear(d_v * n_heads, d_embedding)  # 最后的线性变换层
        self.layer_norm = nn.LayerNorm(d_embedding)  # Layer Norm层
    def forward(self, Q, K, V, attn_mask):
        #----------------------维度信息------------------------
        # Q、K、V [batch_size, len_q/k/v, embedding_dim]
        #-------------------------------------------------------
        residual, batch_size = Q, Q.size(0) # 残差连接
        # 将输入进行线性变换和重塑，以便后续处理，[batch_size, len_q/k/v, embedding_dim] -> [batch_size, n_heads, len_q/k/v, d_k/d_v]
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1, 2)  # Q [batch_size, n_heads, len_q, d_k]
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1, 2)  # K [batch_size, n_heads, len_k, d_k]
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1, 2)  # V [batch_size, n_heads, len_v, d_v]
        # 将注意力掩码复制多到头 attn_mask [batch_size, 1, len_q, len_k] -> [batch_size, n_heads, len_q, len_k]
        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)
        #----------------------维度信息------------------------
        # attn_mask [batch_size, n_heads, len_q, len_k]
        #-------------------------------------------------------
        # 使用缩放点积注意力计算上下文和注意力权重
        context, weights = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)
        #----------------------维度信息------------------------
        # context [batch_size, n_heads, len_q, d_v]
        # weights [batch_size, n_heads, len_q, len_k]
        #-------------------------------------------------------
        # 通过调整维度将多个头的上下文向量连接在一起，[batch_size, n_heads, len_q, d_v] -> [batch_size, len_q, n_heads * d_v]
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v)
        #----------------------维度信息------------------------
        # context [batch_size, len_q, n_heads * d_v]
        #-------------------------------------------------------
        # 用一个线性层把连接后的在多头注意力结果转换，原始地嵌入维度，[batch_size, len_q, n_heads * d_v] -> [batch_size, len_q, embedding_dim]
        output = self.linear(context)
        #----------------------维度信息------------------------
        # output [batch_size, len_q, embedding_dim]
        #-------------------------------------------------------
        # 与输入(Q)进行残差连接，并进行层归一化后处理
        output = self.layer_norm(output + residual)
        #----------------------维度信息------------------------
        # output [batch_size, len_q, embedding_dim]
        #-------------------------------------------------------
        return output, weights  # 返回层归一化的输出和和注意力权重

In [4]:
# 定义逐位置前馈网络
class PoswiseFeedForwardNet(nn.Module):
    def __init__(self, d_ff=2048):
        super(PoswiseFeedForwardNet, self).__init__()
        # 定义一维卷积层1，用于将输入映射到更高维度
        self.conv1 = nn.Conv1d(in_channels=d_embedding, out_channels=d_ff, kernel_size=1)
        # 定义一维卷积层2，用于将映射后的向量映射回原始维度
        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_embedding, kernel_size=1)
        # 定义层归一化
        self.layer_norm = nn.LayerNorm(d_embedding)
    def forward(self, inputs):
        #----------------------维度信息------------------------
        # inputs [batch_size, len_q, embedding_dim]
        #-------------------------------------------------------
        residual = inputs  # 保留残差连接
        # 在第一个卷积层1后使用ReLU函数，[batch_size, len_q, embedding_dim] -> [batch_size, embedding_dim, len_q] -> [batch_size, len_q, d_ff]
        output = nn.ReLU()(self.conv1(inputs.transpose(1, 2)))
        #----------------------维度信息------------------------
        # output [batch_size, d_ff, len_q]
        #-------------------------------------------------------
        # 使用卷积2进行降维
        output = self.conv2(output).transpose(1, 2)
        #----------------------维度信息------------------------
        # output [batch_size, len_q, embedding_dim]
        #-------------------------------------------------------
        # 与输入进行残差连接，并进行层归一化
        output = self.layer_norm(output + residual)
        #----------------------维度信息------------------------
        # output [batch_size, len_q, embedding_dim]
        #-------------------------------------------------------
        return output  # 返回层归一化的输出

In [5]:
# 定义填充注意力掩码函数
def get_attn_pad_mask(seq_q, seq_k):
    #----------------------维度信息------------------------
    # seq_q的维度是[batch_size, len_q]
    # seq_k的维度是[batch_size, len_k]
    #-------------------------------------------------------
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    # 生成布尔类型张量
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # <PAD>token的编码值为0
    #----------------------维度信息------------------------
    # pad_attn_mask [batch_size, 1, len_k]
    #-------------------------------------------------------
    # 变形为与注意力分数相同的张量
    pad_attn_mask = pad_attn_mask.expand(batch_size, len_q, len_k)
    #----------------------维度信息------------------------
    # pad_attn_mask [batch_size, len_q, len_k]
    #-------------------------------------------------------
    return pad_attn_mask  # 返回注意力掩码张量

In [6]:
# 生成后续注意力掩码的函数，用于在多头自注意力计算中忽略未来信息
def get_attn_subsequence_mask(seq):
    #----------------------维度信息------------------------
    # seq的维度是[batch_size, seq_len(Q)=seq_len(K)]
    #-------------------------------------------------------
    # 获取输入序列的形状
    attn_shape = [seq.size(0), seq.size(1), seq.size(1)]
    #----------------------维度信息------------------------
    # attn_shape [batch_size, seq_len, seq_len]
    #-------------------------------------------------------
    # 生成一个下三角矩阵
    subsequent_mask = np.triu(np.ones(attn_shape), k=1)
    #----------------------维度信息------------------------
    # subsequent_mask [batch_size, seq_len, seq_len]
    #-------------------------------------------------------
    # 将numpy数组转换为Tensor，并将其转换为布尔类型
    subsequent_mask = torch.from_numpy(subsequent_mask).bool()
    #----------------------维度信息------------------------
    # subsequent_mask [batch_size, seq_len, seq_len]
    #-------------------------------------------------------
    return subsequent_mask  # 返回后续注意力掩码张量

In [7]:
# 定义解码器层类
class DecoderLayer(nn.Module):
    def __init__(self):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention()  # 多头自注意力
        self.feed_forward = PoswiseFeedForwardNet()  # 逐位置前馈网络
        self.norm1 = nn.LayerNorm(d_embedding)  # 第一个层归一化
        self.norm2 = nn.LayerNorm(d_embedding)  # 第二个层归一化
    def forward(self, dec_inputs, self_attn_mask=None):
        # 使用多头注意力处理输入
        dec_outputs, _ = self.self_attn(dec_inputs, dec_inputs, dec_inputs, self_attn_mask)
        # 将注意力输出与输入相加并进行第一个层归一化
        norm1_outputs = self.norm1(dec_outputs + dec_inputs)
        # 将第一个层归一化的输出输入到逐位置前馈网络
        ff_outputs = self.feed_forward(norm1_outputs)
        # 将前馈网络的输出与第一个层归一化的输出相加并进行第二个层归一化
        dec_outputs = self.norm2(ff_outputs + norm1_outputs)
        return dec_outputs  # 返回解码器层的输出

In [8]:
# 定义解码器类
n_layers = 6 # 解码器层数
class Decoder(nn.Module):
    def __init__(self, vocab_size, max_seq_len):
        super(Decoder, self).__init__()
        # 词嵌入层（参数为字典维度）
        self.src_emb = nn.Embedding(vocab_size, d_embedding)
        # 位置编码层 （参数为最大序列长度）
        self.pos_emb = nn.Embedding(max_seq_len, d_embedding)
        # 初始化N个解码器层
        self.layers = nn.ModuleList([DecoderLayer() for _ in range(n_layers)])
    def forward(self, dec_inputs):
        # 创建位置信息
        positions = torch.arange(len(dec_inputs), device=dec_inputs.device).unsqueeze(-1)  # [seq_len, 1]
        # 将输入序列的词嵌入和位置编码相加
        inputs_embedding = self.src_emb(dec_inputs) + self.pos_emb(positions)
        # 生成解码器自注意力掩码
        attn_mask = get_attn_subsequence_mask(inputs_embedding).to(device)
        # 初始化解码器输入，这是第一个解码器层的输入
        dec_outputs = inputs_embedding
        for layer in self.layers:
            # 逐层调用解码器层
            dec_outputs = layer(dec_outputs, attn_mask)
        return dec_outputs  # 返回解码器的输出

In [9]:
# 定义GPT模型
class GPT(nn.Module):
    def __init__(self, vocab_size, max_seq_len):
        super(GPT, self).__init__()
        self.decoder = Decoder(vocab_size, max_seq_len)  # 解码器，用于学习文本生成能力
        self.projection = nn.Linear(d_embedding, vocab_size, bias=False)  # 全连接层，输出预测成果
    def forward(self, dec_inputs):
        # 通过解码器获取输出
        dec_outputs = self.decoder(dec_inputs)
        # 通过全连接层获取预测结果
        dec_logits = self.projection(dec_outputs)
        return dec_logits

In [10]:
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('basic_english')
train_iter = WikiText2(split='train', root='.')
valid_iter = WikiText2(split='valid', root='.')
vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<pad>', '<sos>', '<eos>'])
vocab.set_default_index(vocab['<pad>'])
# 打印词汇表信息
print("词汇表大小:", len(vocab))
print("词汇示例(word to index):", 
      {word: vocab[word] for word in ["<pad>", "<sos>", "<eos>", "the", "apple"]})

词汇表大小: 28785
词汇示例(word to index): {'<pad>': 0, '<sos>': 1, '<eos>': 2, 'the': 3, 'apple': 11505}


In [11]:
from torch.utils.data import Dataset # 导入Dataset
max_seq_len = 256 # 设置序列的最大长度

# 定义一个处理WikiText2数据集的自定义数据集类
class WikiDataset(Dataset):
    def __init__(self, data_iter, vocab, max_len=max_seq_len):
        self.data = []        
        for sentence in data_iter: # 遍历数据集，将文本转换为tokens
            # 对每个句子进行tokenization，并截取长度为max_len-2，为<sos>和<eos>留出空间
            tokens = tokenizer(sentence)[:max_len - 2]
            tokens = [vocab["<sos>"]] + vocab(tokens) + [vocab["<eos>"]] # 添加<sos>和<eos>            
            self.data.append(tokens) # 将处理好的tokens添加到数据集中
    
    def __len__(self): # 定义数据集的长度
        return len(self.data)    
    
    def __getitem__(self, idx): # 定义数据集的索引方法 (即抽取数据条目)        
        source = self.data[idx][:-1] # 获取当前数据，并将<eos>移除，作为source        
        target = self.data[idx][1:] # 获取当前数据，并将<sos>移除，作为target（右移1位）       
        return torch.tensor(source), torch.tensor(target) # 转换为tensor并返回

train_dataset = WikiDataset(train_iter, vocab) # 创建训练数据集
valid_dataset = WikiDataset(valid_iter, vocab) # 创建验证数据集
print(f"Dataset数据条目: {len(train_dataset)}")
sample_source, sample_target = train_dataset[100]
print(f"输入序列张量样例: {sample_source}")
print(f"目标序列张量样例: {sample_target}")
decoded_source = ' '.join(vocab.lookup_tokens(sample_source.tolist()))
decoded_target = ' '.join(vocab.lookup_tokens(sample_target.tolist()))
print(f"输入序列样例文本: {decoded_source}")
print(f"目标序列样例文本: {decoded_target}")

Dataset数据条目: 36718
输入序列张量样例: tensor([    1,  2659,  3478, 17569,  9098])
目标序列张量样例: tensor([ 2659,  3478, 17569,  9098,     2])
输入序列样例文本: <sos> 96 ammunition packing boxes
目标序列样例文本: 96 ammunition packing boxes <eos>


In [12]:
from torch.utils.data import DataLoader # 导入DataLoader
# 定义pad_sequence函数，用于将一批序列填充到相同长度
def pad_sequence(sequences, padding_value=0, length=None):
    # 获取每个序列的长度，如果没有指定长度，则使用最长序列的长度
    max_size = length if length is not None else max([len(seq) for seq in sequences])
    # 将每个序列填充到相同长度
    return [torch.cat([seq, torch.tensor([padding_value] * (max_size - len(seq)))]) for seq in sequences]

# 定义collate_fn函数，用于将一批数据转换为模型的输入和目标
def collate_fn(batch):
    # 从批次中分离源序列和目标序列
    sources, targets = zip(*batch)  # 将源序列和目标序列分离
    # 计算批次中的最大序列长度
    max_len = max(max(len(s) for s in sources), max(len(t) for t in targets))
    # 将源序列和目标序列填充到相同长度
    sources = pad_sequence(sources, vocab["<pad>"], max_len)
    targets = pad_sequence(targets, vocab["<pad>"], max_len)
    return torch.stack(sources), torch.stack(targets) # 返回模型的输入和目标

# 创建一个训练数据加载器，用于加载训练数据
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# 创建一个验证数据加载器，用于加载验证数据
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [13]:
import torch.optim as optim # 导入优化器
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 设置设备
model = GPT(len(vocab), max_seq_len).to(device) # 创建模型并移动到设备
criterion = nn.CrossEntropyLoss(ignore_index=vocab["<pad>"]) # 创建交叉熵损失函数，忽略填充标记
optimizer = optim.Adam(model.parameters(), lr=0.0001) # 创建Adam优化器
epochs = 2 # 设置训练轮数

import os # 导入os模块
min_valid_loss = float('inf') # 初始化最小验证损失
save_path = "best_model.pt"  # 模型保存路径

for epoch in range(epochs):
    epoch_loss = 0
    for batch_idx, (source, target) in enumerate(train_loader):  # 遍历训练数据加载器
        inputs, targets = source.long().to(device), target.long().to(device)  # 将输入和目标移动到设备
        optimizer.zero_grad()  # 梯度清零
        outputs = model(inputs)  # 通过模型获取输出
        loss = criterion(outputs.view(-1, len(vocab)), targets.view(-1))  # 计算损失
        loss.backward()  # 反向传播
        optimizer.step()  # 更新参数
        epoch_loss += loss.item()  # 累加损失
        if batch_idx % 1000 == 0:  # 每1000个批次打印一次损失
            print(f"Epoch {epoch + 1}/{epochs}, Batch {batch_idx}/{len(train_loader)}, Loss {loss.item()}")
    print(f"Epoch {epoch + 1}/{epochs}, Loss {epoch_loss / len(train_loader)}")  # 打印每轮的平均损失
    # 评估模型
    model.eval()  # 设置模型为评估模式
    valid_loss = 0
    with torch.no_grad():  # 关闭梯度计算
        for source, target in valid_loader:  # 遍历验证数据加载器
            inputs, targets = source.long().to(device), target.long().to(device)  # 将输入和目标移动到设备
            outputs = model(inputs)  # 通过模型获取输出
            loss = criterion(outputs.view(-1, len(vocab)), targets.view(-1))  # 计算损失
            valid_loss += loss.item()  # 累加损失
        valid_loss /= len(valid_loader)  # 计算平均损失
        print(f"Epoch {epoch + 1}/{epochs}, Validation Loss {valid_loss}")  # 打印验证损失
        # 保存最优模型
        if valid_loss < min_valid_loss:
            min_valid_loss = valid_loss
            torch.save(model.state_dict(), save_path)
            print(f"New best model saved at epoch {epoch + 1} with validation loss: {valid_loss}")
    model.train()  # 设置模型为训练模式

Epoch 1/2, Batch 0/12240, Loss 10.33438491821289
Epoch 1/2, Batch 1000/12240, Loss 6.7005615234375
Epoch 1/2, Batch 2000/12240, Loss 6.272916316986084
Epoch 1/2, Batch 3000/12240, Loss 6.35912561416626
Epoch 1/2, Batch 4000/12240, Loss 6.1083550453186035
Epoch 1/2, Batch 5000/12240, Loss 5.837934494018555
Epoch 1/2, Batch 6000/12240, Loss 5.301558971405029
Epoch 1/2, Batch 7000/12240, Loss 5.902187824249268
Epoch 1/2, Batch 8000/12240, Loss 5.574525833129883
Epoch 1/2, Batch 9000/12240, Loss 5.52511739730835
Epoch 1/2, Batch 10000/12240, Loss 5.786240577697754
Epoch 1/2, Batch 11000/12240, Loss 6.608101844787598
Epoch 1/2, Batch 12000/12240, Loss 5.677886009216309
Epoch 1/2, Loss 5.321375150438642
Epoch 1/2, Validation Loss 4.74414207259909
New best model saved at epoch 1 with validation loss: 4.74414207259909
Epoch 2/2, Batch 0/12240, Loss 0.9734917879104614
Epoch 2/2, Batch 1000/12240, Loss 1.9096531867980957
Epoch 2/2, Batch 2000/12240, Loss 5.712131023406982
Epoch 2/2, Batch 3000/1

In [26]:
# 定义集束搜索函数
def generate_text_beam_search(model, input_str, max_len=50, beam_width=5):
    model.eval()  # 设置模型为评估模式
    # 将输入字符串中的每个token转换为在词汇表中的索引
    input_tokens = [vocab[token] for token in input_str.split()]
    # 创建一个列表用于存储候选序列
    candidates = [(input_tokens, 0.0)]  # (序列, 分数)
    with torch.no_grad():  # 关闭梯度计算
        for _ in range(max_len):  # 生成最大长度的序列
            new_candidates = []
            for candidate, candidate_score in candidates:  # 遍历候选序列
                inputs = torch.LongTensor(candidate).unsqueeze(0).to(device)  # 将输入移动到设备
                outputs = model(inputs)  # 输出logits形状为[1, len(output_tokens), len(vocab)]
                logits = outputs[0, -1, :]  # 获取最后一个时间步的输出
                scores, next_tokens = torch.topk(logits, beam_width, dim=-1)  # 获取最高的beam_width个分数和索引
                final_results = []  # 存储最终结果
                for score, next_token in zip(scores.squeeze(), next_tokens.squeeze()):
                    new_candidate = candidate + [next_token.item()]  # 添加下一个token
                    new_score = candidate_score - score.item()  # 使用负数，因为需要降序排列
                    if next_token.item() == vocab["<eos>"]:
                        # 如果下一个token是<eos>，则将当前序列添加到最终结果
                        final_results.append((new_candidate, new_score))
                    else:
                        # 否则将当前序列添加到新的候选序列
                        new_candidates.append((new_candidate, new_score))
            # 从新的候选序列中选择最高的beam_width个序列
            candidates = sorted(new_candidates, key=lambda x: x[1])[:beam_width]
    # 选择最终结果中的最高分数序列
    best_candidate = sorted(final_results, key=lambda x: x[1])[0]
    # 将输出的token转换为字符串
    output_str = ' '.join([vocab.get_itos()[token] for token in best_candidate[0] if vocab.get_itos()[token] != "<pad>"])
    return output_str

model.load_state_dict(torch.load(save_path))  # 加载最优模型
input_str = "my name"  # 输入字符串
output_str = generate_text_beam_search(model, input_str)  # 生成文本
print("生成的文本：", output_str)  # 打印生成的文本

生成的文本： my name , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> <unk> , <unk> <eos>
