In [2]:
import torch
import torch.nn.functional as F


In [19]:
# 点积注意力
x1 = torch.randn(2,3,4)
x2 = torch.randn(2,5,4)
print("x1", x1.shape)
print("x2", x2.shape)
print("x2转置", x2.transpose(1,2).shape)

raw_weights = torch.bmm(x1, x2.transpose(1,2))
# 按第三个维度进行softmax，固定的第一、二维度下，和为1
attn_weights = F.softmax(raw_weights, dim=2)
print("attn_weights", attn_weights.shape)
attn_output = torch.bmm(attn_weights, x2)
print("attn_output", attn_output.shape)


x1 torch.Size([2, 3, 4])
x2 torch.Size([2, 5, 4])
x2转置 torch.Size([2, 4, 5])
attn_weights torch.Size([2, 3, 5])
attn_output torch.Size([2, 3, 4])


In [3]:
# 缩放注意力
x1 = torch.randn(2,3,4)
x2 = torch.randn(2,5,4)
print("x1", x1.shape)
print("x2", x2.shape)
print("x2转置", x2.transpose(1,2).shape)

raw_weights = torch.bmm(x1, x2.transpose(1,2))
scaling_factor = x1.size(-1) ** 0.5  # 防梯度爆炸，按维度缩放
print("scaling_factor", scaling_factor) # 防梯度爆炸
scaled_weights = raw_weights  / scaling_factor
print("scaled_weights", scaled_weights.shape)
# 按第三个维度进行softmax，固定的第一、二维度下，和为1
attn_weights = F.softmax(scaled_weights, dim=2)
print("attn_weights", attn_weights.shape)
attn_output = torch.bmm(attn_weights, x2)
print("attn_output", attn_output.shape)

x1 torch.Size([2, 3, 4])
x2 torch.Size([2, 5, 4])
x2转置 torch.Size([2, 4, 5])
scaling_factor 2.0
scaled_weights torch.Size([2, 3, 5])
attn_weights torch.Size([2, 3, 5])
attn_output torch.Size([2, 3, 4])


In [28]:
# QKV
#1. 创建 Query、Key 和 Value 张量
q = torch.randn(2, 3, 4) # 形状 (batch_size, seq_len1, feature_dim)
k = torch.randn(2, 4, 4) # 形状 (batch_size, seq_len2, feature_dim)
v = torch.randn(2, 4, 8) # 形状 (batch_size, seq_len2, feature_dim)
# 2. 计算点积，得到原始权重，形状为 (batch_size, seq_len1, seq_len2)
raw_weights = torch.bmm(q, k.transpose(1, 2))
# 3. 将原始权重进行缩放（可选），形状仍为 (batch_size, seq_len1, seq_len2)
scaling_factor = q.size(-1) ** 0.5
scaled_weights = raw_weights / scaling_factor
# 4. 应用 softmax 函数，使结果的值在 0 和 1 之间，且每一行的和为 1
attn_weights = F.softmax(scaled_weights, dim=-1) # 形状仍为 (batch_size, seq_len1, seq_len2)
print(attn_weights.shape)
# 5. 与 Value 相乘，得到注意力分布的加权和 , 形状为 (batch_size, seq_len1, feature_dim)
attn_output = torch.bmm(attn_weights, v)
print(attn_output.shape)

torch.Size([2, 3, 4])
torch.Size([2, 3, 8])


In [6]:
 # 一个形状为 (batch_size, seq_len, feature_dim) 的张量 x
x = torch.randn(2, 3, 4) # 形状 (batch_size, seq_len, feature_dim)
# 定义线性层用于将 x 转换为 Q, K, V 向量
linear_q = torch.nn.Linear(4, 4)
linear_k = torch.nn.Linear(4, 4)
linear_v = torch.nn.Linear(4, 4)
# 通过线性层计算 Q, K, V
Q = linear_q(x) # 形状 (batch_size, seq_len, feature_dim)
K = linear_k(x) # 形状 (batch_size, seq_len, feature_dim)
V = linear_v(x) # 形状 (batch_size, seq_len, feature_dim)
# 计算 Q 和 K 的点积，作为相似度分数 , 也就是自注意力原始权重
raw_weights = torch.bmm(Q, K.transpose(1, 2)) # 形状 (batch_size, seq_len, seq_len)
# 将自注意力原始权重进行缩放
scale_factor = K.size(-1) ** 0.5  # 这里是 4 ** 0.5
scaled_weights = raw_weights / scale_factor # 形状 (batch_size, seq_len, seq_len)
# 对缩放后的权重进行 softmax 归一化，得到注意力权重
attn_weights = F.softmax(scaled_weights, dim=2) # 形状 (batch_size, seq_len, seq_len)
# 将注意力权重应用于 V 向量，计算加权和，得到加权信息
attn_outputs = torch.bmm(attn_weights, V) # 形状 (batch_size, seq_len, feature_dim)
print("attn_output", attn_outputs.shape)

attn_output torch.Size([2, 3, 4])


In [18]:
# 假设有h个头，每个头的维度是d_k
# 输入x的维度是(batch_size, seq_len, d_model)

batch_size, seq_len, dim = 2,3,10
h = 5
dk = dim // h
x = torch.randn(batch_size, seq_len, dim)
print("x.shape", x.shape)
# 1. 线性变换生成多头的Q、K、V
W_q = torch.nn.Linear(dim, h * d_k)
W_k = torch.nn.Linear(dim, h * d_k)
W_v = torch.nn.Linear(dim, h * d_k)

print("w_q.shape", W_q(x).shape)
print("w_q.shape.view", W_q(x).view(batch_size, seq_len, h, d_k).shape)

# 2. 将输出reshape成多头形式
Q = W_q(x).view(batch_size, seq_len, h, d_k).transpose(1, 2)
K = W_k(x).view(batch_size, seq_len, h, d_k).transpose(1, 2)
V = W_v(x).view(batch_size, seq_len, h, d_k).transpose(1, 2)
print("Q.shape", Q.shape)
print("K.shape", Q.shape)
print("V.shape", Q.shape)


# 3. 每个头独立计算注意力
attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / (d_k**0.5)
attention_weights = F.softmax(attention_scores, dim=-1)
print("attention_weights.shape", attention_weights.shape)
head_outputs = torch.matmul(attention_weights, V)
print("head_outputs.shape", head_outputs.shape)


print("head_outputs.transpose", head_outputs.transpose(1, 2).shape)
print("head_outputs.transpose.contiguous", head_outputs.transpose(1, 2).contiguous().shape)
# print("head_outputs.transpose.contiguous.view", head_outputs.transpose(1, 2).contiguous().shape)
# 4. 合并多头的输出
multi_head_output = head_outputs.transpose(1, 2).contiguous().view(batch_size, seq_len, h * d_k)
print("multi_head_output", multi_head_output.shape)

# 5. 最后通过一个线性层整合所有头的信息
final_output = torch.nn.Linear(h * d_k, d_model)(multi_head_output)

x.shape torch.Size([2, 3, 10])
w_q.shape torch.Size([2, 3, 30])
w_q.shape.view torch.Size([2, 3, 5, 6])
Q.shape torch.Size([2, 5, 3, 6])
K.shape torch.Size([2, 5, 3, 6])
V.shape torch.Size([2, 5, 3, 6])
attention_weights.shape torch.Size([2, 5, 3, 3])
head_outputs.shape torch.Size([2, 5, 3, 6])
head_outputs.transpose torch.Size([2, 3, 5, 6])
head_outputs.transpose.contiguous torch.Size([2, 3, 5, 6])
multi_head_output torch.Size([2, 3, 30])


In [17]:
# 1. 创建一个多头注意力层
multihead_attn = torch.nn.MultiheadAttention(embed_dim=16, num_heads=8)
# 2. 生成一个形状为 (batch_size, seq_len, feature_dim) 的张量 x
x = torch.randn(2, 3, 16) # 形状 (batch_size, seq_len, feature_dim)
# 3. 调用多头注意力层，得到输出和注意力权重
output, attn_weights = multihead_attn(x, x, x)
print("output", output.shape)
print("attn_weights", attn_weights.shape)

output torch.Size([2, 3, 16])
attn_weights torch.Size([3, 2, 2])
