# 0.Intro
1. https://www.cnblogs.com/miners/p/15101283.html
2.

In [8]:
import torch
from torch.nn.functional import softmax

# 1. Input
- input data： 3个输入，每个长度为4

In [3]:
# Three input tokens (rows), each a 4-dimensional embedding (columns).
x = torch.tensor(
    [
        [1, 0, 1, 0],  # Input 1
        [0, 2, 0, 2],  # Input 2
        [1, 1, 1, 1],  # Input 3
    ],
    dtype=torch.float32,
)
x.shape

torch.Size([3, 4])

# 2. 手动设一个weight matrix
- 下面的3个`w`是实际要学习的权重，有了这三个w，才可以获取key，query和value。
- 这3个`W`矩阵的shape取决于input data：输入是`N*T`（这里N=3，T=4），所以`W`必须是`T*d`（这里d=3），这样@运算后才能得到3个`N*d`的qkv矩阵（本例中d恰好等于N，所以看起来是`N*N`）

In [5]:
# Hand-picked projection matrices, each of shape (4, 3): one row per input
# feature, one column per projected feature. They stand in for the weights a
# real model would learn, and turn the inputs into keys, queries and values.
w_key = torch.tensor(
    [
        [0, 0, 1],
        [1, 1, 0],
        [0, 1, 0],
        [1, 1, 0],
    ],
    dtype=torch.float32,
)

w_query = torch.tensor(
    [
        [1, 0, 1],
        [1, 0, 0],
        [0, 0, 1],
        [0, 1, 1],
    ],
    dtype=torch.float32,
)

w_value = torch.tensor(
    [
        [0, 2, 0],
        [0, 3, 0],
        [1, 0, 3],
        [1, 1, 0],
    ],
    dtype=torch.float32,
)

# 3. 矩阵乘法计算qkv
1. Note: 通常在神经网络的初始化过程中，这些参数`w`都是比较小的，一般会在Gaussian, Xavier and Kaiming distributions随机采样完成。
2. 在我们实际的应用中，有可能会在点乘后，加上一个bias的向量
3. qkv都是`N*d`的矩阵（N是输入个数，d是投影后的维度；本例中N=d=3，所以恰好是`3*3`）

In [6]:
# Project every input row into key, query and value space:
# (3, 4) inputs times (4, 3) weights gives one 3-dimensional vector per input.
keys = torch.matmul(x, w_key)
querys = torch.matmul(x, w_query)
values = torch.matmul(x, w_value)

print("Keys: \n", keys)
print("Querys: \n", querys)
print("Values: \n", values)

Keys: 
 tensor([[0., 1., 1.],
        [4., 4., 0.],
        [2., 3., 1.]])
Querys: 
 tensor([[1., 0., 2.],
        [2., 2., 2.],
        [2., 1., 3.]])
Values: 
 tensor([[1., 2., 3.],
        [2., 8., 0.],
        [2., 6., 3.]])


# 4. 计算attn scores: q和k相乘
1. 为了获取input1的attention score，我们使用点乘来处理所有的key和query
2. 比如：[1,0,2]表示input1的query，必须乘keys的转置（`keys.T`）才可以：

   ```
               [0, 4, 2]
   [1, 0, 2] x [1, 4, 3] = [2, 4, 4]
               [1, 0, 1]
   ```


In [7]:
# Row i holds the dot product of query i with every key, so the queries go on
# the left. Swapping the operands (keys @ querys.T) yields the transpose of this
# matrix; with this toy data the result happens to be symmetric.
attn_scores = torch.matmul(querys, keys.T)
# Expected result:
# tensor([[ 2.,  4.,  4.],  # attention scores from Query 1
#         [ 4., 16., 12.],  # attention scores from Query 2
#         [ 4., 12., 10.]]) # attention scores from Query 3
attn_scores

tensor([[ 2.,  4.,  4.],
        [ 4., 16., 12.],
        [ 4., 12., 10.]])

- 通常会给attn scores添加softmax

In [13]:
# Normalise each query's scores (each row) into weights that sum to 1.
attn_scores_softmax = softmax(attn_scores, dim=-1)
print(attn_scores_softmax)
# Exact values:
# tensor([[6.3379e-02, 4.6831e-01, 4.6831e-01],
#         [6.0337e-06, 9.8201e-01, 1.7986e-02],
#         [2.9539e-04, 8.8054e-01, 1.1917e-01]])

# From here on the exact weights are replaced by rounded ones so that the
# remaining arithmetic is easy to follow by hand.
attn_scores_softmax = torch.tensor(
    [
        [0.0, 0.5, 0.5],  # attention scores from Query 1
        [0.0, 1.0, 0.0],  # attention scores from Query 2
        [0.0, 0.9, 0.1],  # attention scores from Query 3
    ]
)
print(attn_scores_softmax)

tensor([[6.3379e-02, 4.6831e-01, 4.6831e-01],
        [6.0337e-06, 9.8201e-01, 1.7986e-02],
        [2.9539e-04, 8.8054e-01, 1.1917e-01]])
tensor([[0.0000, 0.5000, 0.5000],
        [0.0000, 1.0000, 0.0000],
        [0.0000, 0.9000, 0.1000]])


# 5.value乘score
1. 得到3个weighted values
2. 比如：[0,0.5,0.5]是input 1的attn score，我需要这3个attn score分别乘到value上去，以表示input 1对于3个value的“看法”，得到的矩阵就是
   ```
   1: 0.0 * [1, 2, 3] = [0.0, 0.0, 0.0]
   2: 0.5 * [2, 8, 0] = [1.0, 4.0, 0.0]
   3: 0.5 * [2, 6, 3] = [1.0, 3.0, 1.5]
   ```
3. 上述矩阵相加，就是input 1的new representation，**实际上是value矩阵在attn下加权求和的过程**

In [21]:
# Insert singleton axes so the two operands broadcast against each other:
#   values.unsqueeze(1)                  -> (3, 1, 3)  [value j, 1,       feature]
#   attn_scores_softmax.T.unsqueeze(-1)  -> (3, 3, 1)  [value j, query i, 1      ]
# The product is (3, 3, 3); entry [j, i, :] is values[j] scaled by the weight
# that query i assigns to key j.
weighted_values = values.unsqueeze(1) * attn_scores_softmax.T.unsqueeze(-1)
print(weighted_values)

tensor([[[0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000]],

        [[1.0000, 4.0000, 0.0000],
         [2.0000, 8.0000, 0.0000],
         [1.8000, 7.2000, 0.0000]],

        [[1.0000, 3.0000, 1.5000],
         [0.0000, 0.0000, 0.0000],
         [0.2000, 0.6000, 0.3000]]])


- 把所有的weighted values进行element-wise的相加
- **其中的结果向量[2.0, 7.0, 1.5]就是output 1，即query 1与所有key交互后得到的新representation**。

In [22]:
# Collapse the first axis (the one indexing the value vectors): row i of the
# result is the attention-weighted sum of all values as seen by query i.
outputs = torch.sum(weighted_values, dim=0)
print(outputs)

tensor([[2.0000, 7.0000, 1.5000],
        [2.0000, 8.0000, 0.0000],
        [2.0000, 7.8000, 0.3000]])


- 因此是把n个1×T的encoding，变成n个1×d的encoding（d是value投影后的维度，本例中n=3，T=4，d=3）
- 需要的参数是3个T×d的权重矩阵，即3×T×d，再加上可选的bias