## Attention

### 计算两个向量相似度


In [18]:
import math
import torch

# Toy scaled dot-product attention: one query vector scored against two keys.
Q = torch.tensor([2.0, 3.0, 1.0])
K1 = torch.tensor([1.0, 2.0, 1.0])  # key for 'apple'
K2 = torch.tensor([1.0, 1.0, 2.0])  # key for 'orange'

# Scaling factor: square root of the vector dimensionality d_k.
d_k = Q.shape[0]
scale_factor = math.sqrt(d_k)

# Raw similarity of the query to each key (inner product of 1-D tensors).
dot_product1 = Q @ K1
dot_product2 = Q @ K2

# Shrink the scores by sqrt(d_k) before they go through the softmax.
scaled_dot_product1 = dot_product1 / scale_factor
scaled_dot_product2 = dot_product2 / scale_factor

# Normalise the two scaled scores into attention weights that sum to 1.
weights = torch.softmax(
    torch.stack((scaled_dot_product1, scaled_dot_product2)), dim=0)

print("权重:", weights)

权重: tensor([0.7604, 0.2396])


#### 机器翻译 Attention-Seq2Seq

传统的 Seq2Seq 模型将整个源句压缩为一个固定长度的上下文向量，处理长句子时容易丢失信息；注意力机制在每个解码步对编码器各位置的输出动态分配权重，从而缓解这一问题。


In [19]:
import torch
import torch.nn as nn


class AttentionSeq2Seq(nn.Module):
    """LSTM encoder-decoder that attends over the encoder outputs at every step.

    Both LSTMs are built with PyTorch's default sequence-first layout, so
    ``forward`` expects batched inputs shaped ``(seq_len, batch, features)``.

    Args:
        input_dim: Feature size of each source time step.
        hidden_dim: Hidden size shared by the encoder and decoder LSTMs.
        output_dim: Feature size of each emitted output step.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Encodes the source sequence into one hidden vector per time step.
        self.encoder = nn.LSTM(input_dim, hidden_dim)
        # Consumes the attention context vector (size hidden_dim) each step.
        self.decoder = nn.LSTM(hidden_dim, hidden_dim)
        # Scores one (decoder state, encoder output) pair with a single scalar.
        self.attention = nn.Linear(hidden_dim * 2, 1)
        # Projects each decoder output to output_dim features.
        self.output_layer = nn.Linear(hidden_dim, output_dim)

    def forward(self, src, tgt):
        """Encode ``src`` and run one attention-decoder step per target position.

        Args:
            src: Source batch, ``(src_len, batch, input_dim)``.
            tgt: Target batch. Only ``tgt.size(0)`` is read (the number of
                decoding steps); its values are never fed to the decoder.

        Returns:
            Tensor of shape ``(tgt.size(0), 1, batch, output_dim)``. The
            singleton axis is the length-1 sequence axis of each decoder
            step, kept so the stacked result has the same layout as before.
        """
        # encoder_output: (src_len, batch, hidden); hidden/cell: (1, batch, hidden)
        encoder_output, (hidden, cell) = self.encoder(src)

        steps = tgt.size(0)
        if steps == 0:
            # torch.stack rejects an empty list; return an empty result instead.
            return encoder_output.new_zeros(
                (0, 1, encoder_output.size(1), self.output_layer.out_features))

        output = []
        for _ in range(steps):
            # Repeat the current decoder state along the source axis so it can
            # be paired with every encoder position (hidden alone has length 1
            # there and cannot be concatenated with encoder_output otherwise).
            query = hidden.expand_as(encoder_output)
            scores = torch.tanh(self.attention(
                torch.cat((query, encoder_output), dim=2)))
            # Normalise over source positions (axis 0), independently for
            # each batch element.
            attention_weights = torch.softmax(scores, dim=0)

            # Context vector: attention-weighted sum over source positions,
            # giving (batch, hidden).
            weighted = torch.sum(encoder_output * attention_weights, dim=0)

            # One decoder step on a length-1 sequence holding the context.
            out, (hidden, cell) = self.decoder(
                weighted.unsqueeze(0), (hidden, cell))
            output.append(self.output_layer(out))

        return torch.stack(output)