In [2]:
import torch

# Sanity check: print whether this PyTorch build can use a CUDA device.
print(torch.cuda.is_available())

True


In [3]:
# Quick smoke test of tensor creation: draw a 4x4 matrix of samples from the
# uniform distribution on [0, 1). No seed is set, so values differ per run.
random_torch = torch.rand((4, 4))
print(random_torch)

tensor([[0.2372, 0.2246, 0.5621, 0.8302],
        [0.8773, 0.8323, 0.4185, 0.4526],
        [0.9798, 0.6827, 0.6646, 0.2128],
        [0.9298, 0.8947, 0.8593, 0.8535]])


In [None]:
class TokenEmbedding(torch.nn.Embedding):
    """Lookup table that maps vocabulary indices to ``d_model``-dimensional vectors."""

    def __init__(self, vocab_size, d_model, padding_idx=1):
        """Create the embedding table.

        Args:
            vocab_size: number of rows in the table (size of the vocabulary).
            d_model: width of each embedding vector (the model dimension).
            padding_idx: index of the padding token. ``torch.nn.Embedding``
                initialises that row to zeros and excludes it from gradient
                updates. Defaults to 1, the value that was previously
                hard-coded here, so existing two-argument calls are unchanged.
        """
        super().__init__(vocab_size, d_model, padding_idx=padding_idx)

In [4]:
class PositionalEmbedding(torch.nn.Module):
    """Fixed (non-learned) sinusoidal positional encoding.

    Follows the formula from "Attention Is All You Need":

        PE(pos, 2i)     = sin(pos / 10000 ** (2i / d_model))
        PE(pos, 2i + 1) = cos(pos / 10000 ** (2i / d_model))

    The table is precomputed once for ``max_len`` positions and sliced in
    ``forward``.

    Args:
        max_len: number of positions to precompute (rows of the table).
        d_model: width of each encoding vector (columns of the table).
        device: device on which the table is initially created.
    """

    def __init__(self, max_len, d_model, device):
        super().__init__()
        # Column vector of positions, shape (max_len, 1), so it broadcasts
        # against the row of per-column exponents below.
        pos = torch.arange(0, max_len, device=device).float().unsqueeze(dim=1)
        # Even column indices 0, 2, 4, ...; there are ceil(d_model / 2) of them.
        _2i = torch.arange(0, d_model, step=2, device=device).float()
        # The paper's base is 10000 (the previous value of 1000 did not match it).
        angles = pos / (10000 ** (_2i / d_model))

        encoding = torch.zeros(max_len, d_model, device=device)
        encoding[:, 0::2] = torch.sin(angles)  # even columns
        # For odd d_model there is one fewer odd column than even columns,
        # so trim the angle matrix to the number of odd columns.
        encoding[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Register as a buffer so module.to(...) / .cuda() move the table with
        # the rest of the model. persistent=False keeps it out of state_dict,
        # so checkpoints saved before this change still load. Buffers carry no
        # gradient, and the table is still reachable as self.encoding.
        self.register_buffer("encoding", encoding, persistent=False)

    def forward(self, x):
        """Return the encodings for the first ``seq_len`` positions.

        Args:
            x: 2-D tensor of shape (batch_size, seq_len); only its shape is used.

        Returns:
            Tensor of shape (seq_len, d_model), or fewer rows if ``seq_len``
            exceeds the precomputed ``max_len``.
        """
        batch_size, seq_len = x.size()
        return self.encoding[:seq_len, :]

In [7]:
class TransformerEmbedding(torch.nn.Module):
    """Input embedding layer: token embedding plus positional encoding, then dropout.

    Args:
        vocab_size: vocabulary size passed to ``TokenEmbedding``.
        d_model: embedding width shared by both sub-embeddings.
        max_len: number of positions passed to ``PositionalEmbedding``.
        drop_prob: dropout probability applied to the summed embedding.
        device: device passed to ``PositionalEmbedding`` for its encoding table.
    """

    def __init__(self, vocab_size, d_model, max_len, drop_prob, device):
        super(TransformerEmbedding, self).__init__()
        self.token_embedding = TokenEmbedding(vocab_size, d_model)
        self.positional_embedding = PositionalEmbedding(max_len, d_model, device)
        # Dropout randomly zeroes elements during training to reduce overfitting.
        self.drop_out = torch.nn.Dropout(drop_prob)

    def forward(self, x):
        """Embed a batch of token indices.

        ``x`` must be 2-D (batch_size, seq_len) because the positional
        embedding unpacks its size into exactly two values.
        """
        # Calling the sub-module goes through torch.nn.Module.__call__, which
        # runs its forward pass.
        tokens = self.token_embedding(x)
        positions = self.positional_embedding(x)
        summed = tokens + positions
        return self.drop_out(summed)
    
        