In [1]:
import math
import torch
from torch import nn
from torch.nn.functional import softmax

In [2]:
# Pick the compute device: use CUDA when a GPU is available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


# 位置编码 - position encoding
$PE(pos, 2i) = \sin\left(\dfrac{pos}{10000^{2i/d}}\right)$，$PE(pos, 2i+1) = \cos\left(\dfrac{pos}{10000^{2i/d}}\right)$  
pos表示token在句子中的位置，d代表词嵌入的维度，2i代表在词嵌入维度中的第几维  

In [None]:
# Scratch check of the ratio pos / 10000^(2i/d) used by the sinusoidal encoding.
embedding_size = 512
max_len = 1000
# Numerator: positions 0..max_len-1 as a column vector, shape (max_len, 1).
numerator = torch.arange(max_len, dtype=torch.float32).unsqueeze(-1)
# Denominator: 10000^(2i/d) for each even dimension index, shape (embedding_size/2,).
denominator = torch.pow(10000, torch.arange(0, embedding_size, 2, dtype=torch.float32) / embedding_size)
# Broadcasting (max_len, 1) against (embedding_size/2,) yields (max_len, embedding_size/2).
fraction = torch.div(numerator, denominator)
tuple(t.shape for t in (numerator, denominator, fraction))

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to embeddings, then apply dropout.

    The table ``P`` has shape (1, max_len, embedding_size) with
    ``P[0, pos, 2i] = sin(pos / 10000^(2i/d))`` and
    ``P[0, pos, 2i+1] = cos(pos / 10000^(2i/d))``, where ``d`` is ``embedding_size``.

    Args:
        embedding_size: size of the last axis of the input (may be odd).
        dropout: dropout probability applied after adding the encoding.
        max_len: longest sequence length the precomputed table supports.
    """

    def __init__(self, embedding_size, dropout, max_len=1000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout)

        # Numerator: positions as a column vector, shape (max_len, 1).
        numerator = torch.arange(max_len, dtype=torch.float32).reshape(-1, 1)
        # Denominator: 10000^(2i/d) for each even index, shape (ceil(embedding_size/2),).
        denominator = torch.pow(10000, torch.arange(0, embedding_size, 2, dtype=torch.float32) / embedding_size)
        fraction = numerator / denominator  # (max_len, ceil(embedding_size/2))

        P = torch.zeros((1, max_len, embedding_size))
        P[:, :, 0::2] = torch.sin(fraction)  # even feature indices
        # For an odd embedding_size there is one fewer odd index than even index,
        # so drop the surplus column instead of failing on a shape mismatch.
        P[:, :, 1::2] = torch.cos(fraction[:, : embedding_size // 2])  # odd feature indices

        # Register as a buffer so the table follows module.to(...)/.cuda().
        # persistent=False keeps it out of state_dict, exactly as when P was a plain attribute.
        self.register_buffer("P", P, persistent=False)

    def forward(self, X):
        """Return ``dropout(X + P[:, :seq_size])`` for X of shape (batch, seq_size, embedding_size)."""
        # .to(X.device) is kept so a module left on the CPU still accepts inputs on another device.
        X = X + self.P[:, :X.shape[1], :].to(X.device)
        return self.dropout(X)

In [None]:
# Smoke test: encode one random sequence and confirm the shape is unchanged.
seq_size, embedding_size, dropout = 50, 512, 0.5

PE = PositionalEncoding(embedding_size, dropout)
X = PE(torch.rand(1, seq_size, embedding_size))
X.shape

# 自注意力 - 还没解决掩码问题

In [None]:
def transpose_qkv(X, num_heads):
    """Fold the attention heads into the batch axis so all heads run in one batched matmul.

    Note: a function with this same name is defined again in a later cell of this
    notebook; once that cell runs it replaces this definition.

    Args:
        X: tensor of shape (batch_size, seq_size, num_heads * head_dim).
        num_heads: number of heads the last axis is split into.

    Returns:
        Tensor of shape (batch_size * num_heads, seq_size, head_dim).
    """
    batch_size, seq_size = X.shape[0], X.shape[1]
    # (batch, seq, heads * head_dim) -> (batch, seq, heads, head_dim) -> (batch, heads, seq, head_dim)
    per_head = X.reshape(batch_size, seq_size, num_heads, -1).permute(0, 2, 1, 3)
    # Merge batch and head axes: (batch * heads, seq, head_dim).
    return per_head.reshape(-1, seq_size, per_head.shape[3])

In [None]:
# Scratch cell: inspect transpose(1, 2) and division by a one-element tensor.
batch_size, seq_size, key_size = 1, 2, 3
keys = torch.rand(batch_size, seq_size, key_size)
(
    keys.transpose(1, 2).shape,
    keys,
    keys.transpose(1, 2),
    torch.tensor([1]).shape,
    keys.transpose(1, 2) / torch.tensor([2]),
    math.sqrt(4),
)

In [None]:
def transpose_qkv(X, num_heads):
    """Reshape projected queries/keys/values so every head becomes its own batch entry.

    Args:
        X: tensor of shape (batch_size, seq_size, head_dim * num_heads).
        num_heads: number of attention heads.

    Returns:
        Tensor of shape (batch_size * num_heads, seq_size, head_dim).
    """
    batch_size, seq_size = X.shape[:2]
    heads_last = X.reshape(batch_size, seq_size, num_heads, -1)  # (batch, seq, heads, head_dim)
    heads_first = heads_last.transpose(1, 2)  # (batch, heads, seq, head_dim)
    head_dim = heads_first.shape[3]
    return heads_first.reshape(-1, seq_size, head_dim)  # (batch * heads, seq, head_dim)

def transpose_output(X, num_heads):
    """Undo transpose_qkv: move the heads out of the batch axis and concatenate them.

    Args:
        X: tensor of shape (batch_size * num_heads, seq_size, value_size).
        num_heads: number of attention heads folded into the first axis.

    Returns:
        Tensor of shape (batch_size, seq_size, num_heads * value_size).
    """
    seq_size, value_size = X.shape[1], X.shape[2]
    per_head = X.reshape(-1, num_heads, seq_size, value_size)  # (batch, heads, seq, value)
    heads_last = per_head.transpose(1, 2)  # (batch, seq, heads, value)
    return heads_last.reshape(heads_last.shape[0], seq_size, -1)  # (batch, seq, heads * value)

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Queries, keys and values are all projected from the same input ``X``. The heads
    are folded into the batch axis with ``transpose_qkv`` so that a single ``bmm``
    evaluates every head, then unfolded with ``transpose_output``.

    Sizing conventions used by this notebook:
      * ``query_size * num_heads == key_size * num_heads == value_size * num_heads == output_size``
        is the usual choice.
      * ``output_size`` must equal ``embedding_size`` when the result feeds a residual
        connection (Add & Norm computes ``dropout(Y) + X``).

    Args:
        embedding_size: size of the last axis of the input.
        query_size, key_size, value_size: per-head projection sizes
            (``query_size`` must equal ``key_size`` for Q K^T to be defined).
        output_size: size of the last axis of the result.
        num_heads: number of attention heads.
        dropout: accepted for interface compatibility; not currently applied.
        bias: whether the linear projections carry a bias term.

    Raises:
        ValueError: if ``query_size != key_size``.
    """

    def __init__(self, embedding_size, query_size, key_size, value_size, output_size, num_heads, dropout, bias=False, **kwargs):
        super(MultiHeadAttention, self).__init__(**kwargs)
        if query_size != key_size:
            # Fail early: Q K^T multiplies (seq, query_size) by (key_size, seq).
            raise ValueError(f"query_size ({query_size}) must equal key_size ({key_size})")
        self.key_size = key_size
        self.num_heads = num_heads
        self.W_q = nn.Linear(embedding_size, query_size*num_heads, bias=bias)
        self.W_k = nn.Linear(embedding_size, key_size*num_heads, bias=bias)
        self.W_v = nn.Linear(embedding_size, value_size*num_heads, bias=bias)
        self.W_o = nn.Linear(value_size*num_heads, output_size, bias=bias)

    @staticmethod
    def _mask_scores(scores, valid_lens):
        """Overwrite scores at key positions >= the valid length with a large negative value.

        ``scores`` is (batch*heads, num_queries, num_keys). ``valid_lens`` is either
        (batch*heads,) - one length per sequence - or (batch*heads, num_queries) -
        one length per query.
        """
        if valid_lens.dim() == 1:
            limits = valid_lens.reshape(-1, 1, 1)
        else:
            limits = valid_lens.reshape(scores.shape[0], -1, 1)
        key_positions = torch.arange(scores.shape[-1], device=scores.device)
        keep = key_positions[None, None, :] < limits.to(scores.device)
        # A large finite value (not -inf) keeps softmax finite even if a row is fully masked.
        return scores.masked_fill(~keep, -1e6)

    def forward(self, X, valid_lens):
        """Apply self-attention to ``X``.

        Args:
            X: tensor of shape (batch_size, seq_size, embedding_size).
            valid_lens: ``None`` for no masking, or a tensor of shape (batch_size,)
                or (batch_size, seq_size) giving how many leading key positions
                each sequence / each query may attend to.

        Returns:
            Tensor of shape (batch_size, seq_size, output_size).
        """
        # Project, then move the head axis into the batch axis:
        # (batch, seq, size*heads) -> (batch*heads, seq, size).
        queries = transpose_qkv(self.W_q(X), self.num_heads)
        keys = transpose_qkv(self.W_k(X), self.num_heads)
        values = transpose_qkv(self.W_v(X), self.num_heads)

        # Scaled dot product: (batch*heads, seq, seq). Dividing by a Python float
        # keeps the computation on whatever device the scores already live on.
        scores = torch.bmm(queries, keys.transpose(1, 2)) / math.sqrt(self.key_size)

        if valid_lens is not None:
            # Repeat each row num_heads times so it lines up with the folded batch axis,
            # e.g. [[1, 2], [3, 4]] with repeats=2 -> [[1, 2], [1, 2], [3, 4], [3, 4]].
            valid_lens = torch.repeat_interleave(valid_lens, repeats=self.num_heads, dim=0)
            scores = self._mask_scores(scores, valid_lens)

        # Normalise over the key axis. Without an explicit dim, softmax on a 3-D
        # tensor would normalise over axis 0 (batch*heads).
        weights = nn.functional.softmax(scores, dim=-1)
        Z = torch.bmm(weights, values)  # (batch*heads, seq, value_size)

        # Unfold the heads and concatenate them: (batch, seq, heads*value_size).
        Z_concat = transpose_output(Z, self.num_heads)
        return self.W_o(Z_concat)  # (batch, seq, output_size)

In [None]:
# Smoke test: run multi-head attention on a random batch and inspect the output shape.
embedding_size, seq_size = 512, 50
query_size = key_size = value_size = 32
output_size = 256
num_heads = 8
dropout = 0.5

MHA = MultiHeadAttention(embedding_size, query_size, key_size, value_size, output_size, num_heads, dropout)
X = torch.rand(2, seq_size, embedding_size)
outputs = MHA(X, None)  # no masking
outputs.shape

# Add & Norm - 不改变输出的形状

In [None]:
class AddNorm(nn.Module):
    """Residual connection followed by layer normalization.

    Args:
        normalized_shape: shape passed to ``nn.LayerNorm`` (the trailing
            dimension(s) of the input to normalise over).
        dropout: dropout probability applied to the sub-layer output ``Y``.
    """

    def __init__(self, normalized_shape, dropout, **kwargs):
        super().__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)
        self.ln = nn.LayerNorm(normalized_shape)

    def forward(self, X, Y):
        """Return ``LayerNorm(X + dropout(Y))``; the output has the same shape as ``X``."""
        residual_sum = X + self.dropout(Y)
        return self.ln(residual_sum)

# Feed Forward 逐位前馈神经网络 - （Position-wise Feed-Forward Network, 简称 FFN）

In [None]:
class PositionWiseFFN(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Linear on the last axis.

    In this notebook ``ffn_num_input`` is the ``output_size`` of
    ``MultiHeadAttention`` and ``ffn_num_outputs`` is ``embedding_size``.

    Args:
        ffn_num_input: size of the last axis of the input.
        ffn_num_hiddens: width of the hidden layer.
        ffn_num_outputs: size of the last axis of the output.
    """

    def __init__(self, ffn_num_input, ffn_num_hiddens, ffn_num_outputs, **kwargs):
        super().__init__(**kwargs)
        self.dense1 = nn.Linear(ffn_num_input, ffn_num_hiddens)
        self.relu = nn.ReLU()
        self.dense2 = nn.Linear(ffn_num_hiddens, ffn_num_outputs)

    def forward(self, X):
        """Map (batch_size, seq_size, ffn_num_input) to (batch_size, seq_size, ffn_num_outputs)."""
        hidden = self.relu(self.dense1(X))
        return self.dense2(hidden)

# encoder

In [None]:
class EncoderBlock(nn.Module):
    """One Transformer encoder block: self-attention and a feed-forward network,
    each wrapped in a residual connection plus layer normalization.

    Args:
        embedding_size, query_size, key_size, value_size, output_size, num_heads:
            forwarded to ``MultiHeadAttention``.
        normalized_shape: shape given to both ``AddNorm`` layers.
        ffn_num_hiddens: hidden width of the feed-forward network, which maps
            ``output_size`` back to ``embedding_size``.
        dropout: dropout probability forwarded to the sub-layers.
        use_bias: whether the attention projections use a bias term.
    """

    def __init__(self, embedding_size, query_size, key_size, value_size, output_size, num_heads, normalized_shape, ffn_num_hiddens, dropout, use_bias=False, **kwargs):
        super().__init__(**kwargs)
        self.attention = MultiHeadAttention(
            embedding_size, query_size, key_size, value_size, output_size, num_heads, dropout, bias=use_bias
        )
        self.addnorm1 = AddNorm(normalized_shape, dropout)
        self.ffn = PositionWiseFFN(output_size, ffn_num_hiddens, embedding_size)
        self.addnorm2 = AddNorm(normalized_shape, dropout)

    def forward(self, X, valid_lens):
        """Run attention -> Add & Norm -> feed-forward -> Add & Norm on ``X``."""
        attended = self.attention(X, valid_lens)
        Y = self.addnorm1(X, attended)
        fed_forward = self.ffn(Y)
        return self.addnorm2(Y, fed_forward)

In [None]:
# Smoke test: one encoder block applied to a random batch of two sequences.
embedding_size = output_size = 512  # residual connections need matching sizes
query_size = key_size = value_size = 32
num_heads = 8

ffn_num_hiddens, dropout = 200, 0.5

seq_size = 50

# LayerNorm here normalises over the last two axes (sequence and features).
normalized_shape = [seq_size, output_size]

encoder_blk = EncoderBlock(embedding_size, query_size, key_size, value_size, output_size, num_heads, normalized_shape, ffn_num_hiddens, dropout)
X = torch.rand(2, seq_size, embedding_size)
o = encoder_blk(X, None)
o.shape

EncoderBlock中的参数有：embedding_size, query_size, key_size, value_size, output_size, num_heads, normalized_shape, ffn_num_hiddens   
其中需要注意的有：
1. embedding_size = output_size
2. query_size*num_heads = key_size*num_heads = value_size*num_heads = output_size
3. normalized_shape = \[seq_size, output_size\]

In [None]:
class TransformerEncoder(nn.Module):
    """Transformer encoder: token embedding, positional encoding and a stack of EncoderBlocks.

    Args:
        seq_size: sequence length. Also used as the embedding-table size when
            ``vocab_size`` is not given, which preserves the original behaviour.
        embedding_size, query_size, key_size, value_size, output_size, num_heads,
        normalized_shape, ffn_num_hiddens: forwarded to every ``EncoderBlock``.
        num_layers: number of stacked encoder blocks.
        dropout: dropout probability for the positional encoding and the blocks.
        use_bias: whether the attention projections use a bias term.
        vocab_size: number of distinct token ids the embedding table holds.
            Token ids must be < this value. Defaults to ``seq_size``.
    """

    def __init__(self, seq_size, embedding_size, query_size, key_size, value_size, output_size, num_heads, normalized_shape, ffn_num_hiddens, num_layers, dropout, use_bias=False, vocab_size=None, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.embedding_size = embedding_size
        # The table must be indexed by token id, not by position. Sizing it by
        # seq_size only works while every token id happens to be < seq_size.
        if vocab_size is None:
            vocab_size = seq_size
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.pos_encoding = PositionalEncoding(embedding_size, dropout)
        self.blks = nn.Sequential()
        for i in range(num_layers):  # stack num_layers EncoderBlocks
            self.blks.add_module("block"+str(i),
                                 EncoderBlock(embedding_size, query_size, key_size, value_size, output_size, num_heads, normalized_shape, ffn_num_hiddens, dropout, use_bias))

    def forward(self, X, valid_lens, *args):
        """Encode token ids ``X`` of shape (batch_size, seq_size).

        ``valid_lens`` is passed unchanged to every block; extra positional
        arguments are accepted and ignored.
        """
        # Positional encodings lie in [-1, 1], so the embeddings are scaled by
        # sqrt(embedding_size) before the two are added. The notebook author's
        # rationale: the larger the embedding dimension, the smaller the initial
        # embedding values, so without scaling the positional signal could drown
        # out the token signal.
        X = self.pos_encoding(self.embedding(X) * math.sqrt(self.embedding_size))
        for blk in self.blks:
            X = blk(X, valid_lens)  # pass through each encoder block in order
        return X

In [None]:
# Smoke test: full encoder stack applied to a batch of two all-ones token sequences.
embedding_size = output_size = 512  # residual connections need matching sizes
query_size = key_size = value_size = 32
num_heads = 8

ffn_num_hiddens, num_layers, dropout = 200, 8, 0.5

seq_size = 50

# LayerNorm here normalises over the last two axes (sequence and features).
normalized_shape = [seq_size, output_size]

transencoder = TransformerEncoder(seq_size, embedding_size, query_size, key_size, value_size, output_size, num_heads, normalized_shape, ffn_num_hiddens, num_layers, dropout)
X = torch.ones((2, seq_size), dtype=torch.long)  # token ids
o = transencoder(X, None)
o.shape

# decoder

In [None]:
class DecoderBlock(nn.Module):
    """The i-th block of the Transformer decoder.

    Layout: masked self-attention -> Add & Norm -> encoder-decoder attention ->
    Add & Norm -> position-wise feed-forward -> Add & Norm.

    Args:
        key_size, query_size, value_size: per-head projection sizes.
        num_hiddens: model width, used as both the input and output size of the
            attention layers and as the output size of the feed-forward network.
        norm_shape: shape given to the three ``AddNorm`` layers.
        ffn_num_input: input size of the feed-forward network.
        ffn_num_hiddens: hidden width of the feed-forward network.
        num_heads: number of attention heads.
        dropout: dropout probability forwarded to the sub-layers.
        i: index of this block; selects its slot in ``state[2]``.

    NOTE(review): ``forward`` calls the attention layers as
    ``(queries, keys, values, valid_lens)``, but ``MultiHeadAttention.forward`` in
    this notebook is declared as ``forward(X, valid_lens)``. Those calls will raise
    ``TypeError`` until the attention layer accepts separate key/value inputs.
    """

    def __init__(self, key_size, query_size, value_size, num_hiddens, norm_shape, ffn_num_input, ffn_num_hiddens, num_heads, dropout, i, **kwargs):
        super(DecoderBlock, self).__init__(**kwargs)
        self.i = i
        # Use this constructor's own arguments. The previous version read
        # embedding_size / output_size / normalized_shape from notebook globals and
        # referenced an undefined ffn_num_outputs.
        self.attention1 = MultiHeadAttention(num_hiddens, query_size, key_size, value_size, num_hiddens, num_heads, dropout)
        self.addnorm1 = AddNorm(norm_shape, dropout)
        self.attention2 = MultiHeadAttention(num_hiddens, query_size, key_size, value_size, num_hiddens, num_heads, dropout)
        self.addnorm2 = AddNorm(norm_shape, dropout)
        self.ffn = PositionWiseFFN(ffn_num_input, ffn_num_hiddens, num_hiddens)
        self.addnorm3 = AddNorm(norm_shape, dropout)

    def forward(self, X, state):
        """Decode ``X`` using ``state = [encoder outputs, encoder valid lens, per-block cache]``.

        Returns:
            ``(output, state)`` where ``state[2][self.i]`` has been updated.
        """
        # The encoder outputs are inputs to the decoder's second attention layer.
        encoding_outputs, encoding_valid_lens = state[0], state[1]
        # Training: the whole target sequence is processed at once, so
        # state[2][self.i] starts as None.
        # Prediction: tokens are decoded one at a time, so state[2][self.i] holds
        # this block's inputs for every step decoded so far.
        if state[2][self.i] is None:
            key_values = X
        else:
            key_values = torch.cat((state[2][self.i], X), dim=1)
        # Cache the concatenation so later steps can attend to all earlier tokens.
        state[2][self.i] = key_values

        if self.training:
            batch_size, seq_size, _ = X.shape
            # Shape (batch_size, seq_size); every row is [1, 2, ..., seq_size], so
            # query t may only attend to the first t positions.
            dec_valid_lens = torch.arange(1, seq_size + 1, device=X.device).repeat(batch_size, 1)
        else:
            # At prediction time key_values only contains already generated tokens.
            dec_valid_lens = None

        # Masked self-attention (see the class-level NOTE about the call signature).
        X2 = self.attention1(X, key_values, key_values, dec_valid_lens)
        Y = self.addnorm1(X, X2)
        # Encoder-decoder attention over the encoder outputs,
        # presumably of shape (batch_size, num_steps, num_hiddens) - TODO confirm.
        Y2 = self.attention2(Y, encoding_outputs, encoding_outputs, encoding_valid_lens)
        Z = self.addnorm2(Y, Y2)
        return self.addnorm3(Z, self.ffn(Z)), state

In [None]:
# Toy input: three tokens with four features each, placed on the selected device.
x = torch.tensor(
    [
        [1, 0, 1, 0],  # Input 1
        [0, 2, 0, 2],  # Input 2
        [1, 1, 1, 1],  # Input 3
    ],
    dtype=torch.float32,
).to(device)
x

In [None]:
def initialize_W_KQV(embedding_size, W_K_size, W_Q_size,  W_V_size):
    """Create small random key/query/value projection matrices that track gradients.

    Each matrix is drawn from N(0, 1), scaled by 0.01, and created on the
    module-level ``device``. ``W_K_size`` should equal ``W_Q_size`` so that the
    attention scores Q K^T are well defined.

    Returns:
        list: ``[W_K, W_Q, W_V]`` in that order, each of shape
        (embedding_size, <respective size>) with ``requires_grad=True``.
    """
    def _make_weight(out_features):
        weight = torch.normal(0, 1, (embedding_size, out_features), device=device) * 0.01
        return weight.requires_grad_(True)  # attach gradient tracking

    return [_make_weight(W_K_size), _make_weight(W_Q_size), _make_weight(W_V_size)]

In [None]:
# Projection weights for the 4-feature toy input; all three projections keep 4 features.
parms = initialize_W_KQV(embedding_size=4, W_K_size=4, W_Q_size=4, W_V_size=4)
parms

In [None]:
def get_QKV(X, parms):
    """Project ``X`` into queries, keys and values.

    Args:
        X: input of shape (seq_size, embedding_size).
        parms: ``[W_K, W_Q, W_V]`` - the order returned by ``initialize_W_KQV``.
            (This function previously unpacked the list as W_Q, W_K, W_V, which
            silently swapped the query and key weights.)

    Returns:
        list: ``[Q, K, V]`` where ``Q = X @ W_Q``, ``K = X @ W_K``, ``V = X @ W_V``.
    """
    W_K, W_Q, W_V = parms
    Q = torch.matmul(X, W_Q)
    K = torch.matmul(X, W_K)
    V = torch.matmul(X, W_V)
    return [Q, K, V]

In [None]:
# Project the toy input into query, key and value matrices and display them.
Q, K, V = get_QKV(X=x, parms=parms)
(Q, K, V)

In [None]:
# Attention scores: every query dotted with every key, i.e. Q times K transposed.
attention_scores = Q @ K.transpose(0, 1)
attention_scores  # shape: (seq_size, seq_size)

In [None]:
# Softmax over the scores. The division by sqrt(d_k) is deliberately skipped here:
# it only guards against overly large dot products, which this toy example does not have.
attention_scores_softmax = attention_scores.softmax(dim=-1)  # dim=-1: each row sums to 1
attention_scores_softmax

In [None]:
# Shapes of the value matrix and of the attention weights that will multiply it.
tuple(t.shape for t in (V, attention_scores_softmax))

# test sth

In [None]:
class test(nn.Module):
    """Minimal module wrapping one linear projection, used to inspect nn.Linear shapes.

    Args:
        embedding_size: size of the last axis of the input.
        query_size: size of the last axis of the output.
        bias: whether the projection has a bias term.
    """

    def __init__(self, embedding_size, query_size, bias=False, **kwargs):
        # Zero-argument super() does not look up the global name `test`, so it
        # keeps working even if that name is later rebound.
        super().__init__(**kwargs)
        self.W_q = nn.Linear(embedding_size, query_size, bias=bias)

    def forward(self, X):
        """Return ``X`` projected from embedding_size to query_size features."""
        # nn.Linear stores its weight as (out_features, in_features) and computes
        # X @ W^T. A shape error such as "(50x512 and 128x512)" therefore means
        # the input's last axis does not match in_features.
        queries = self.W_q(X)
        return queries


seq_size = 50
embedding_size = 512
query_size = 128

X = torch.rand(1, seq_size, embedding_size)
# Bind the instance to its own name. `test = test(...)` replaced the class with an
# instance, so running the cell a second time failed.
test_layer = test(embedding_size, query_size)
queries = test_layer(X)
queries.shape