## attention 1


In [2]:
import torch 
import torch.nn as nn
import torch.nn.functional as F


In [17]:
class SelfAttentionv1(nn.Module):
    def __init__(self, hidden_dim):
        super(SelfAttentionv1, self).__init__()
        self.hidden_dim = hidden_dim # 隐藏层维度
        self.q_prj = nn.Linear(hidden_dim, hidden_dim) # 线性变换
        self.k_prj = nn.Linear(hidden_dim, hidden_dim) # 线性变换
        self.v_prj = nn.Linear(hidden_dim, hidden_dim) # 线性变换

    def forward(self, x):
        # x: [batch_size, seq_len, hidden_dim]
        Q = self.q_prj(x)
        K = self.k_prj(x)
        V = self.v_prj(x)

        # Q: [batch_size, seq_len, hidden_dim]
        # K: [batch_size, seq_len, hidden_dim]
        # V: [batch_size, seq_len, hidden_dim]

        q_k = torch.matmul(Q, K.transpose(1, 2))
        # q_k: [batch_size, seq_len, seq_len]
        # K.transpose(1, 2): [batch_size, hidden_dim, seq_len]

        attention_weights = F.softmax(q_k / torch.sqrt(torch.tensor(self.hidden_dim)), dim=-1)
        # attention_weights: [batch_size, seq_len, seq_len]
        attention_scroes = torch.matmul(attention_weights, V)
        # attention_scroes: [batch_size, seq_len, hidden_dim]
        print('注意力分数为：\n', attention_scroes)
        print('注意力为：\n', attention_weights)
        return attention_scroes

    


X = torch.rand(3, 2, 4)
# X.shape: [batch_size, seq_len, hidden_dim]
net = SelfAttentionv1(4) # 初始化 输入参数为隐藏层维度hidden_dim
net(X)

    

注意力分数为：
 tensor([[[ 0.8778,  0.1924,  0.2765, -0.4848],
         [ 0.8742,  0.1992,  0.2778, -0.4821]],

        [[ 1.0152,  0.1694,  0.2651, -0.4668],
         [ 1.0153,  0.1681,  0.2641, -0.4667]],

        [[ 1.0385,  0.0987,  0.2702, -0.6029],
         [ 1.0400,  0.1005,  0.2747, -0.6045]]], grad_fn=<UnsafeViewBackward0>)
注意力为：
 tensor([[[0.4380, 0.5620],
         [0.4558, 0.5442]],

        [[0.4680, 0.5320],
         [0.4648, 0.5352]],

        [[0.5236, 0.4764],
         [0.5137, 0.4863]]], grad_fn=<SoftmaxBackward0>)


tensor([[[ 0.8778,  0.1924,  0.2765, -0.4848],
         [ 0.8742,  0.1992,  0.2778, -0.4821]],

        [[ 1.0152,  0.1694,  0.2651, -0.4668],
         [ 1.0153,  0.1681,  0.2641, -0.4667]],

        [[ 1.0385,  0.0987,  0.2702, -0.6029],
         [ 1.0400,  0.1005,  0.2747, -0.6045]]], grad_fn=<UnsafeViewBackward0>)

## attention -2(效率提升) -适用于小网络情况

In [18]:
class SelfAttentionv2(nn.Module):
    def __init__ (self, dim):
        super(SelfAttentionv2, self).__init__( )
        self.dim = dim
        self.proj = nn.Linear(dim, dim * 3)
        self.outputs = nn.Linear(dim, dim)

    def forward(self, x):
        # x: [batch_size, seq_len, hidden_dim]
        QKV = self.proj(x)
        Q, K, V = torch.split(QKV, self.dim, dim=-1)
        # Q, K, V: [batch_size, seq_len, hidden_dim]
        attention_weights = F.softmax(torch.matmul(Q, K.transpose(1, 2) / torch.sqrt(torch.tensor(self.dim))), dim=-1)
        attention_scores = torch.matmul(attention_weights, V)

        outputs = self.outputs(attention_scores)
        print('注意力分数为：\n', attention_scores)
        print('注意力为：\n', attention_weights)
        return outputs
    
X = torch.rand(3, 2, 4)
net = SelfAttentionv2(4)
net(X)


注意力分数为：
 tensor([[[-0.7213,  0.4493, -0.5222, -0.0127],
         [-0.7342,  0.4358, -0.5476, -0.0024]],

        [[-0.7223,  0.4778, -0.5426, -0.1256],
         [-0.7189,  0.4836, -0.5337, -0.1305]],

        [[-0.5797,  0.5654, -0.2956, -0.0695],
         [-0.5816,  0.5637, -0.2993, -0.0679]]], grad_fn=<UnsafeViewBackward0>)
注意力为：
 tensor([[[0.5430, 0.4570],
         [0.5207, 0.4793]],

        [[0.4644, 0.5356],
         [0.4475, 0.5525]],

        [[0.5131, 0.4869],
         [0.5042, 0.4958]]], grad_fn=<SoftmaxBackward0>)


tensor([[[ 0.5543,  0.6297,  0.1301, -0.1723],
         [ 0.5677,  0.6364,  0.1352, -0.1872]],

        [[ 0.5232,  0.6501,  0.1275, -0.1786],
         [ 0.5182,  0.6480,  0.1257, -0.1729]],

        [[ 0.4433,  0.5613,  0.0788, -0.0422],
         [ 0.4453,  0.5622,  0.0795, -0.0443]]], grad_fn=<ViewBackward0>)

## 加入细节（dropout的加入）

In [29]:
class SelfAttentionv2(nn.Module):
    def __init__ (self, dim):
        super(SelfAttentionv2, self).__init__( )
        self.dim = dim
        self.att_drop = nn.Dropout(0.1)
        self.proj = nn.Linear(dim, dim * 3)
        self.outputs = nn.Linear(dim, dim)

    def forward(self, x, attention_mask=None):
        # atten_mask: [batch_size, seq_len]
        # x: [batch_size, seq_len, hidden_dim]
        QKV = self.proj(x)
        Q, K, V = torch.split(QKV, self.dim, dim=-1)
        # Q, K, V: [batch_size, seq_len, hidden_dim]
        attention_weights = torch.matmul(Q, K.transpose(1, 2) / torch.sqrt(torch.tensor(self.dim)))
        if attention_mask is not None:
            attention_mask = attention_weights.masked_fill(attention_mask == 0, float('-1e20'))
        attention_scores = F.softmax(attention_weights, dim=-1)
        # print('注意力分数为（不带drop）：\n', attention_scores)
        attention_scores_drop = self.att_drop(attention_scores)
        outputs_drop = torch.matmul(attention_scores_drop, V)

        outputs = self.outputs(outputs_drop)
        # print('注意力分数为（带drop）：\n', attention_scores_drop)
        # print('注意力为（带drop）：\n', attention_scores)
        return outputs
    
X = torch.rand(3, 4, 4)
b = torch.tensor(
    [
        [1, 1, 1, 0],
        [1, 1, 0, 0],
        [1, 0, 0, 0],
    ]
)
print(b.shape)
mask = b.unsqueeze(dim=1).repeat(1, 4, 1)

net = SelfAttentionv2(4)
net(X, mask)


torch.Size([3, 4])


tensor([[[-0.3588,  0.5376,  0.0166,  0.4375],
         [-0.3394,  0.4976, -0.0449,  0.3910],
         [-0.3584,  0.5382,  0.0190,  0.4383],
         [-0.3584,  0.5377,  0.0172,  0.4387]],

        [[-0.3928,  0.4786,  0.0060,  0.4454],
         [-0.4350,  0.4744, -0.0084,  0.4409],
         [-0.4663,  0.5095,  0.0474,  0.4613],
         [-0.3782,  0.4911, -0.0161,  0.4294]],

        [[-0.3991,  0.6046, -0.0762,  0.3775],
         [-0.3918,  0.5564, -0.0970,  0.3846],
         [-0.3964,  0.6046, -0.0763,  0.3767],
         [-0.3934,  0.5569, -0.0966,  0.3850]]], grad_fn=<ViewBackward0>)

## attention-3 完整版

In [33]:
class SelfAttentionv3(nn.Module):
    def __init__(self, dim):
        super(SelfAttentionv3, self).__init__()
        self.dim = dim  # 隐藏层维度
        self.att_drop = nn.Dropout(0.1) # dropout
        self.q_prj = nn.Linear(dim, dim) # Q矩阵
        self.k_prj = nn.Linear(dim, dim) # K矩阵
        self.v_prj = nn.Linear(dim, dim) # V矩阵
        self.outputs_prj = nn.Linear(dim, dim) # 输出矩阵

    def forward(self, x, attention_mask=None):
        # x: [batch_size, seq_len, hidden_dim]
        Q = self.q_prj(x)
        K = self.k_prj(x)
        V = self.v_prj(x)
        # Q, K, V: [batch_size, seq_len, hidden_dim]

        # 计算注意力分数(未经过softmax)
        atten_weights = torch.matmul(Q, K.transpose(1, 2) / torch.sqrt(torch.tensor(self.dim)))

        # mask
        if attention_mask is not None:
            # mask: [batch_size, seq_len]
            # 如果有mask，将mask为0的位置的权重设置为负无穷
            atten_weights = atten_weights.masked_fill(attention_mask == 0, float('-1e20'))
        print('atten_weights为：\n', atten_weights)
        atten_scores = F.softmax(atten_weights, dim=-1) # softmax
        print('注意力分数为：\n', atten_scores)

        # dropout
        atten_scores_drop = self.att_drop(atten_scores)

        # 输出
        outputs_drop = torch.matmul(atten_scores_drop, V)
        outputs = self.outputs_prj(outputs_drop)

        return outputs
X = torch.rand(3, 4, 4)
b = torch.tensor(
    [
        [1, 1, 1, 0],
        [1, 1, 0, 0],
        [1, 0, 0, 0],
    ]
)
mask = b.unsqueeze(dim=1).repeat(1, 4, 1)
# print(mask)
net = SelfAttentionv3(4)
net(X, mask)



        

atten_weights为：
 tensor([[[ 6.0511e-02,  3.8119e-02,  8.7185e-02, -1.0000e+20],
         [ 6.9878e-02,  4.0051e-02,  8.0080e-02, -1.0000e+20],
         [ 2.0244e-02,  8.3548e-03,  4.3131e-02, -1.0000e+20],
         [ 3.0784e-02,  2.3479e-02,  6.2891e-02, -1.0000e+20]],

        [[ 1.2752e-02,  1.2739e-01, -1.0000e+20, -1.0000e+20],
         [-1.3652e-01, -8.6551e-02, -1.0000e+20, -1.0000e+20],
         [ 3.4274e-02,  7.9726e-02, -1.0000e+20, -1.0000e+20],
         [-4.1723e-02,  2.5581e-02, -1.0000e+20, -1.0000e+20]],

        [[ 4.3268e-02, -1.0000e+20, -1.0000e+20, -1.0000e+20],
         [ 1.4062e-02, -1.0000e+20, -1.0000e+20, -1.0000e+20],
         [-7.5964e-03, -1.0000e+20, -1.0000e+20, -1.0000e+20],
         [-4.3503e-02, -1.0000e+20, -1.0000e+20, -1.0000e+20]]],
       grad_fn=<MaskedFillBackward0>)
注意力分数为：
 tensor([[[0.3328, 0.3254, 0.3418, 0.0000],
         [0.3355, 0.3256, 0.3389, 0.0000],
         [0.3321, 0.3282, 0.3398, 0.0000],
         [0.3305, 0.3281, 0.3413, 0.0000]],



tensor([[[-0.2996, -0.2629,  0.2752, -0.1491],
         [-0.1614, -0.4810,  0.4685,  0.0148],
         [-0.1612, -0.4807,  0.4680,  0.0142],
         [-0.2221, -0.3747,  0.3709, -0.0697]],

        [[-0.3277, -0.2888,  0.2875, -0.1256],
         [-0.3262, -0.2918,  0.2892, -0.1245],
         [-0.1795, -0.4096,  0.3289, -0.1212],
         [-0.1785, -0.4099,  0.3288, -0.1215]],

        [[-0.2251, -0.4599,  0.4430, -0.0071],
         [-0.2251, -0.4599,  0.4430, -0.0071],
         [-0.2251, -0.4599,  0.4430, -0.0071],
         [-0.2251, -0.4599,  0.4430, -0.0071]]], grad_fn=<ViewBackward0>)

## Multiheadattention

In [None]:
import torch    
import torch.nn as nn
import torch.nn.functional as F

In [43]:
class Multiheadattention(nn.Module):
    def __init__(self, dim, num_heads):
        super(Multiheadattention, self).__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.q_prj = nn.Linear(dim, dim)
        self.k_prj = nn.Linear(dim, dim)
        self.v_prj = nn.Linear(dim, dim)
        self.dropout = nn.Dropout(0.1)
        self.outputs = nn.Linear(dim, dim)

    def forward(self, x, attention_mask=None):
        batch_size, seq_len, _ = x.size()
        # x: [batch_size, seq_len, hidden_dim]
        Q = self.q_prj(x)
        K = self.k_prj(x)
        V = self.v_prj(x)
        # Q, K, V: [batch_size, seq_len, hidden_dim]

        # shape 变成 （batch_size, num_head, seq_len, head_dim）
        Q_heads = Q.view(batch_size, seq_len, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        K_heads = K.view(batch_size, seq_len, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        V_heads = V.view(batch_size, seq_len, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        # Q_heads, K_heads, V_heads: [batch_size, num_heads, seq_len, head_dim]

        # 计算注意力分数
        atten_weights = torch.matmul(
            Q_heads, K_heads.permute(0, 1, 3, 2) / torch.sqrt(torch.tensor(self.head_dim))
            )
        # atten_weights: [batch_size, num_heads, seq_len, seq_len]

        # mask
        if attention_mask is not None:
            atten_weights = atten_weights.masked_fill(attention_mask == 0 , float('-inf'))
        print('atten_weights为：\n', atten_weights)
        # softmax
        atten_scores = F.softmax(atten_weights, dim=-1)
        # atten_scores: [batch_size, num_heads, seq_len, seq_len]
        print('atten_scores为：\n', atten_scores)
        
        # dropout
        atten_scores_drop  = self.dropout(atten_scores) 
        # atten_scores_drop: [batch_size, num_heads, seq_len, seq_len]
        outputs = atten_scores @ V_heads
        # outputs: [batch_size, num_heads, seq_len, head_dim]

        # shape 变成 （batch_size, seq_len, hidden_dim）
        outputs = outputs.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_len, self.dim)

        outputs = self.outputs(outputs)
    
        return outputs

X = torch.rand(3, 4, 128)
b = (torch.tensor([
    [1, 1, 1, 0],
    [1, 1, 0, 0],
    [1, 0, 0, 0],
])
.unsqueeze(dim=1)
.unsqueeze(dim=2)
.expand(3, 8, 4, 4)
)
mask = b
print(b.shape)
net = Multiheadattention(128, 8)  # 8个头

net(X, mask)

        











        

torch.Size([3, 8, 4, 4])
atten_weights为：
 tensor([[[[-0.0115,  0.0934,  0.1705,    -inf],
          [ 0.0662,  0.0352,  0.1450,    -inf],
          [ 0.0251,  0.0474,  0.1311,    -inf],
          [ 0.0045,  0.0365,  0.0454,    -inf]],

         [[ 0.0339, -0.1279, -0.0885,    -inf],
          [-0.0651, -0.2255, -0.1660,    -inf],
          [ 0.0286, -0.1330, -0.1031,    -inf],
          [-0.0301, -0.2029, -0.1718,    -inf]],

         [[-0.1100, -0.0994, -0.1472,    -inf],
          [-0.0950, -0.0587, -0.0683,    -inf],
          [-0.0690, -0.0657, -0.0617,    -inf],
          [-0.0596, -0.1162, -0.0576,    -inf]],

         [[-0.1514, -0.1138, -0.2064,    -inf],
          [-0.2150, -0.2159, -0.2233,    -inf],
          [-0.1203, -0.0865, -0.1552,    -inf],
          [-0.2066, -0.2073, -0.2831,    -inf]],

         [[ 0.0709, -0.0083,  0.0465,    -inf],
          [ 0.0079,  0.0728, -0.0163,    -inf],
          [ 0.0654,  0.1308,  0.0179,    -inf],
          [ 0.1569,  0.0594,  0.0484, 

tensor([[[ 0.3772, -0.0779, -0.2052,  ..., -0.0576, -0.0062,  0.0177],
         [ 0.3744, -0.0813, -0.2066,  ..., -0.0566, -0.0094,  0.0183],
         [ 0.3736, -0.0817, -0.2060,  ..., -0.0572, -0.0072,  0.0196],
         [ 0.3753, -0.0803, -0.2048,  ..., -0.0594, -0.0069,  0.0200]],

        [[ 0.2413, -0.1107, -0.1947,  ..., -0.0364, -0.0559, -0.0388],
         [ 0.2421, -0.1127, -0.1933,  ..., -0.0397, -0.0544, -0.0376],
         [ 0.2460, -0.1129, -0.1936,  ..., -0.0401, -0.0508, -0.0384],
         [ 0.2465, -0.1129, -0.1923,  ..., -0.0393, -0.0514, -0.0363]],

        [[ 0.3836, -0.0500, -0.1790,  ..., -0.1062,  0.1555, -0.0078],
         [ 0.3836, -0.0500, -0.1790,  ..., -0.1062,  0.1555, -0.0078],
         [ 0.3836, -0.0500, -0.1790,  ..., -0.1062,  0.1555, -0.0078],
         [ 0.3836, -0.0500, -0.1790,  ..., -0.1062,  0.1555, -0.0078]]],
       grad_fn=<ViewBackward0>)