### Transformer模型Encoder原理精讲及其PyTorch逐行实现
https://www.bilibili.com/video/BV1cP4y1V7GF?spm_id_from=333.337.search-card.all.click&vd_source=afe449886875b2cd7aa123878846a9f3


In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES']='2,3,4,5'
import torch
print(torch.cuda.device_count())

4


In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES']='2,3,4,5'
import torch
print('CUDA_VISIBLE_DEVICES Count:',torch.cuda.device_count())


import torch
import numpy
import torch.nn as nn
import torch.nn.functional as F

# 关于word embedding， 以序列建模为例
# 考虑source和target序列
# 构建序列，序列的字符在词表中以索引表示

src = ['I have a hd ,','but it is a big dog .']
tgt = ['wo you yi ge hd ,','dan shi ta shi yi zhi gou .']
src_len = 8  # for pad and position embedding 
tgt_len = 10  # for pad and position embedding 
max_pos_len = 20
# word_pad = ['P', 'S', 'E']

import numpy as np
# 从原始的句子生成字典和序号输入
def sentence2input(sentence):
    '''
    input: sentence
    output: sequence, word2idx, idx2word
    '''
    source_split_words = [s.split(' ') for s in sentence]
    word = []
    for i in [j.split(' ') for j in sentence]:
        word.extend(i)
    # generate vocabulary =================
    vocab = np.array(word)
    vocab = np.unique(vocab)
    idx2word = dict(enumerate(vocab,start=1))  # 从1开始因为pad填充0
    word2idx = {v: k for k, v in idx2word.items()}
    # word2idx = {w:i for i, w in enumerate(vocab)} 
    sequence = []
    for n in source_split_words:
        bs = [word2idx[w] for w in n]
        sequence.append(bs)
    return sequence, word2idx, idx2word


def pad(input,max_len,pad_value=0):
    '''
    input: sentence
    output: sequence, word2idx, idx2word
    '''
    import copy
    pad_ = []
    for i in input:
        # print(i)
        ii = copy.deepcopy(i)
        if len(i)<max_len:
            error_len = max_len - len(i)
            for _ in range(error_len):
                ii.append(pad_value)
        pad_.append(ii[:max_len])
    pad_ = torch.IntTensor(pad_)
    return pad_

src_input, src_vocab_word2idx, enc_vocab_idx2word = sentence2input(src)
tgt_input, tgt_vocab_word2idx, tgt_vocab_idx2word = sentence2input(tgt)
src_vocab_len = len(src_vocab_word2idx)  # 后面会用到
tgt_vocab_len = len(tgt_vocab_word2idx)  # 后面会用到
print(src_input)
print(tgt_input)
print('end!')
src_input_T = pad(src_input,src_len,pad_value=0)
tgt_input_T = pad(tgt_input,tgt_len,pad_value=0)
print(src_input_T)
print(tgt_input_T)

CUDA_VISIBLE_DEVICES Count: 4
[[3, 8, 4, 9, 1], [6, 11, 10, 4, 5, 7, 2]]
[[9, 11, 10, 4, 6, 1], [3, 7, 8, 7, 10, 12, 5, 2]]
end!
tensor([[ 3,  8,  4,  9,  1,  0,  0,  0],
        [ 6, 11, 10,  4,  5,  7,  2,  0]], dtype=torch.int32)
tensor([[ 9, 11, 10,  4,  6,  1,  0,  0,  0,  0],
        [ 3,  7,  8,  7, 10, 12,  5,  2,  0,  0]], dtype=torch.int32)


In [3]:
# 构造word embedding
model_dim = 8 # 512

src_embedding_table = nn.Embedding(
                                   +1,model_dim)  # 初始化一个embedding类，shape：num_embeddings: int, embedding_dim: int
tgt_embedding_table = nn.Embedding(tgt_vocab_len+1,model_dim)  # 调用的是nn.Embedding类的forword方法，直接调用类后面一个括号就是调用该类中的forward方法
print(src_embedding_table.weight)

print(src_input_T)
src_embedding = src_embedding_table(src_input_T)
tgt_embedding = tgt_embedding_table(tgt_input_T)
print(src_embedding)

Parameter containing:
tensor([[ 2.3609,  0.1995, -0.4773, -0.0060, -0.1113,  0.6070,  0.7216, -1.2232],
        [ 0.0999,  0.3541, -0.0295,  0.8658, -0.1582,  0.9843,  1.1086, -0.8304],
        [ 0.9446, -0.4625, -0.0565,  1.5416,  0.4657, -0.8963, -1.9402, -0.3984],
        [ 0.8807,  1.0814, -0.0042,  0.4982,  0.6170, -1.7120,  0.0210,  1.2159],
        [ 0.2230,  0.0111, -1.2859, -0.9706, -0.3395,  0.5946,  2.0974, -0.8544],
        [ 0.6314,  1.2585, -0.2734, -0.2422,  0.3816, -0.7744,  0.2272,  0.6160],
        [-1.5049, -0.1470, -1.6718, -2.2119,  1.1717, -2.1795,  0.1091,  1.5841],
        [-0.8387, -0.6542, -1.2333,  0.0528,  1.3309,  0.5169, -0.6209,  0.9065],
        [-0.3801, -0.8342, -1.5328, -0.9427,  1.2614,  0.0968, -0.5540,  0.8890],
        [-0.7521, -0.0358,  0.6511, -1.1870,  0.6755, -0.7864,  0.7283,  0.1648],
        [-0.5014, -1.1260, -0.3892,  0.3321, -0.2281,  0.3304,  2.7161, -0.1130],
        [ 0.0074,  0.2132, -1.6190, -0.5002, -1.2446, -2.4929, -1.0454, -0.8

In [4]:
# 构造position embeddings
# src_len = 8 是input长度
# 实例化nn.Embedding类
# src_pos_embedding_table = nn.Embedding(src_len+1,model_dim)  # 初始化一个embedding类，shape：num_embeddings: int, embedding_dim: int
# tgt_pos_embedding_table = nn.Embedding(tgt_len+1,model_dim)  # 调用的是nn.Embedding类的forword方法，直接调用类后面一个括号就是调用该类中的forward方法

### 构造position embeddings
$$ PE(pos, 2i) = sin(pos/10000^{2i/d_model})  $$
$$ PE(pos, 2i+1) = cos(pos/1000^{2i/d_model})  $$

 where $pos$ is the position and $i$ is the dimension. 


In [4]:
# 构造position embeddings
# src_len = 8 是input长度

# 构造全长 position embedding table
# max_pos_len = 20
pos_mat = torch.arange(max_pos_len).reshape(-1,1)  # tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])
i_mat = torch.pow(1000, torch.arange(0,model_dim,2).reshape(1,-1)/model_dim)  # tensor([[  1.0000,   5.6234,  31.6228, 177.8279]])
pos_emb_table = torch.zeros(max_pos_len,model_dim)
pos_emb_table[:,0::2] = torch.sin(pos_mat/i_mat)
pos_emb_table[:,1::2] = torch.cos(pos_mat/i_mat)
pos_embedding = nn.Embedding(max_pos_len,model_dim)
# pos_embedding.weight  # nn.Embedding.weight随机初始化方式是标准正态分布，即均值μ=0，方差σ=1的正态分布。
pos_embedding.weight = nn.Parameter(pos_emb_table,requires_grad=False)  # 这里是修改nn.Embedding类的初始化权重.weight，改为计算出的pos_emb_table
#  torch.nn.Parameter是继承自torch.Tensor的子类，其主要作用是作为nn.Module中的可训练参数使用。它与torch.Tensor的区别就是nn.Parameter会自动被认为是module的可训练参数，即加入到parameter()这个迭代器中去；而module中非nn.Parameter()的普通tensor是不在parameter中的。

# 获取 position count
src_pos = [list(range(src_len)) for _ in src]  # 遍历样本src，src_len=8
src_pos = torch.IntTensor(src_pos)
print(src_pos)

tgt_pos = [list(range(tgt_len)) for _ in src]  # 遍历样本src，tgt_len=10
tgt_pos = torch.IntTensor(tgt_pos)
print(tgt_pos)

src_pos_embedding = pos_embedding(src_pos)  # src 和tgt 输入到一个全长的  position embedding table中
tgt_pos_embedding = pos_embedding(tgt_pos)
src_pos_embedding.size(), tgt_pos_embedding.size()

tensor([[0, 1, 2, 3, 4, 5, 6, 7],
        [0, 1, 2, 3, 4, 5, 6, 7]], dtype=torch.int32)
tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]], dtype=torch.int32)


(torch.Size([2, 8, 8]), torch.Size([2, 10, 8]))

In [44]:
# 构造encode 的 self attention mask
# mask shape [batchsize, src_len,model_dim]
# mask shape [batchsize, tgt_len,model_dim]
src_real_len = [len(i.split(' ')) for i in src]
valid_encoder_pos = torch.cat([torch.unsqueeze(F.pad(torch.zeros(L,dtype=torch.int32) ,pad=[0,src_len-L],value=1),0) for L in src_real_len],0)
print(valid_encoder_pos) # 这是使用unsqueeze的方法 把1维变2维
valid_encoder_pos = torch.unsqueeze(valid_encoder_pos,2) #需要再次unsqueeze 把2维变3维，这样才能使用bmm函数，保留batch size
# print(valid_encoder_pos) 
valid_encoder_pos_metr = (torch.bmm(valid_encoder_pos, valid_encoder_pos.transpose(1,2)))  # 模拟QK.T相乘
mask_encode_self_attention = valid_encoder_pos_metr.to(torch.bool)
mask_encode_self_attention

# 下面是reshape的方法
# src_real_len = [len(i.split(' ')) for i in src]
# valid_encoder_pos = torch.cat([F.pad(torch.zeros(L,dtype=torch.int32) ,pad=[0,src_len-L],value=1) for L in src_real_len],0)
# valid_encoder_pos = valid_encoder_pos.reshape(len(src),-1)  # 这是使用reshpe的方法
# print(valid_encoder_pos)
# mask_encode_self_attention = valid_encoder_pos.to(torch.bool)
# mask_encode_self_attention


tensor([[0, 0, 0, 0, 0, 0, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 1]], dtype=torch.int32)


tensor([[[False, False, False, False, False, False, False, False],
         [False, False, False, False, False, False, False, False],
         [False, False, False, False, False, False, False, False],
         [False, False, False, False, False, False, False, False],
         [False, False, False, False, False, False, False, False],
         [False, False, False, False, False, False, False, False],
         [False, False, False, False, False, False,  True,  True],
         [False, False, False, False, False, False,  True,  True]],

        [[False, False, False, False, False, False, False, False],
         [False, False, False, False, False, False, False, False],
         [False, False, False, False, False, False, False, False],
         [False, False, False, False, False, False, False, False],
         [False, False, False, False, False, False, False, False],
         [False, False, False, False, False, False, False, False],
         [False, False, False, False, False, False, False, F

In [65]:
# 构造encode 的 self-ttention mask
# mask shape [batchsize, src_len,model_dim]
# mask shape [batchsize, tgt_len,model_dim]
src_real_len = [len(i.split(' ')) for i in src]
valid_encoder_pos = torch.cat([torch.unsqueeze(F.pad(torch.ones(L,dtype=torch.int32) ,pad=[0,src_len-L],value=0),0) for L in src_real_len],0)
print(valid_encoder_pos) # 这是使用unsqueeze的方法 把1维变2维
valid_encoder_pos = torch.unsqueeze(valid_encoder_pos,2) #需要再次unsqueeze 把2维变3维，这样才能使用bmm函数，保留batch size
# print(valid_encoder_pos) 


invalid_encoder_pos = 1 - valid_encoder_pos



invalid_encoder_pos_matrix = (torch.bmm(invalid_encoder_pos, invalid_encoder_pos.transpose(1,2)))  # 模拟QK.T相乘
mask_encode_self_attention = invalid_encoder_pos_matrix.to(torch.bool)
mask_encode_self_attention

# 下面是reshape的方法
# src_real_len = [len(i.split(' ')) for i in src]
# valid_encoder_pos = torch.cat([F.pad(torch.zeros(L,dtype=torch.int32) ,pad=[0,src_len-L],value=1) for L in src_real_len],0)
# valid_encoder_pos = valid_encoder_pos.reshape(len(src),-1)  # 这是使用reshpe的方法
# print(valid_encoder_pos)
# mask_encode_self_attention = valid_encoder_pos.to(torch.bool)
# mask_encode_self_attention


tensor([[1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0]], dtype=torch.int32)


tensor([[[False, False, False, False, False, False, False, False],
         [False, False, False, False, False, False, False, False],
         [False, False, False, False, False, False, False, False],
         [False, False, False, False, False, False, False, False],
         [False, False, False, False, False, False, False, False],
         [False, False, False, False, False, False, False, False],
         [False, False, False, False, False, False,  True,  True],
         [False, False, False, False, False, False,  True,  True]],

        [[False, False, False, False, False, False, False, False],
         [False, False, False, False, False, False, False, False],
         [False, False, False, False, False, False, False, False],
         [False, False, False, False, False, False, False, False],
         [False, False, False, False, False, False, False, False],
         [False, False, False, False, False, False, False, False],
         [False, False, False, False, False, False, False, F

In [32]:
# mask 实施 测试

score = torch.randn(len(src),src_len,src_len)
score
masked_score = score.masked_fill(mask_encode_self_attention, -np.inf)
masked_score
F.softmax(masked_score,-1)

tensor([[[0.0289, 0.0186, 0.1601, 0.2390, 0.3970, 0.1229, 0.0101, 0.0236],
         [0.0646, 0.0754, 0.1508, 0.1151, 0.2201, 0.0957, 0.0513, 0.2271],
         [0.2041, 0.0515, 0.2022, 0.1600, 0.1166, 0.0529, 0.1599, 0.0527],
         [0.0988, 0.0744, 0.1625, 0.1116, 0.0330, 0.1326, 0.3431, 0.0440],
         [0.0885, 0.0339, 0.1148, 0.4421, 0.0137, 0.0439, 0.0954, 0.1676],
         [0.0755, 0.1598, 0.0075, 0.0681, 0.1574, 0.1667, 0.1825, 0.1825],
         [0.0793, 0.0506, 0.1599, 0.4487, 0.0338, 0.2278, 0.0000, 0.0000],
         [0.0672, 0.0669, 0.3448, 0.0479, 0.1743, 0.2989, 0.0000, 0.0000]],

        [[0.0370, 0.4836, 0.0765, 0.0292, 0.1137, 0.0568, 0.0910, 0.1122],
         [0.1186, 0.0192, 0.0872, 0.2018, 0.0471, 0.1275, 0.0802, 0.3184],
         [0.0174, 0.4050, 0.0492, 0.1051, 0.1201, 0.1937, 0.0527, 0.0567],
         [0.0169, 0.0319, 0.2156, 0.2948, 0.1078, 0.1528, 0.1535, 0.0266],
         [0.1405, 0.0512, 0.0181, 0.1362, 0.2283, 0.0783, 0.3102, 0.0371],
         [0.3372, 0.028

In [69]:
# 构造 intra attention mask 
src_real_len = [len(i.split(' ')) for i in src]
tgt_real_len = [len(i.split(' ')) for i in tgt]
valid_encoder_pos = torch.unsqueeze(torch.cat([torch.unsqueeze(F.pad(torch.ones(L,dtype=torch.int32) ,pad=[0,src_len-L],value=0),0) for L in src_real_len],0),2)
print(valid_encoder_pos.size())
valid_decoder_pos = torch.unsqueeze(torch.cat([torch.unsqueeze(F.pad(torch.ones(L,dtype=torch.int32) ,pad=[0,tgt_len-L],value=0),0) for L in tgt_real_len],0),2)
print(valid_decoder_pos.size())
valid_cross_pos_matrix = torch.bmm(valid_decoder_pos, valid_encoder_pos.transpose(1,2))  #注意这里 tgt-decode是Q，src-encoed是K，计算公式是Q和K转置矩阵乘
valid_cross_pos_matrix.size()
invalid_cross_pos_matrix = 1 - valid_cross_pos_matrix
mask_cross_self_attention = invalid_cross_pos_matrix.to(torch.bool)
mask_cross_self_attention

torch.Size([2, 8, 1])
torch.Size([2, 10, 1])


tensor([[[False, False, False, False, False, False,  True,  True],
         [False, False, False, False, False, False,  True,  True],
         [False, False, False, False, False, False,  True,  True],
         [False, False, False, False, False, False,  True,  True],
         [False, False, False, False, False, False,  True,  True],
         [False, False, False, False, False, False,  True,  True],
         [False, False, False, False, False, False,  True,  True],
         [False, False, False, False, False, False,  True,  True],
         [ True,  True,  True,  True,  True,  True,  True,  True],
         [ True,  True,  True,  True,  True,  True,  True,  True]],

        [[False, False, False, False, False, False, False,  True],
         [False, False, False, False, False, False, False,  True],
         [False, False, False, False, False, False, False,  True],
         [False, False, False, False, False, False, False,  True],
         [False, False, False, False, False, False, False,  

In [86]:
# 构造 decode attention mask -- 有点不一样
# 构造下三角矩阵 为什么？ 因为decoder的输入要把后面的数据mask掉 每次输入的数据向后移一位 即看到的范围越来越大
m_ = [torch.unsqueeze(F.pad(torch.tril(torch.ones((L,L))),(0,(tgt_len-L),0,(tgt_len-L))),0)  for L in tgt_real_len]
print(m_[0].size())
valide_decoder_tri_matrix = torch.cat(m_)
invalide_decoder_tri_matrix = 1 - valide_decoder_tri_matrix
invalide_decoder_tri_matrix = invalide_decoder_tri_matrix.to(torch.bool)

torch.Size([1, 10, 10])


In [87]:
invalide_decoder_tri_matrix

tensor([[[False,  True,  True,  True,  True,  True,  True,  True,  True,  True],
         [False, False,  True,  True,  True,  True,  True,  True,  True,  True],
         [False, False, False,  True,  True,  True,  True,  True,  True,  True],
         [False, False, False, False,  True,  True,  True,  True,  True,  True],
         [False, False, False, False, False,  True,  True,  True,  True,  True],
         [False, False, False, False, False, False,  True,  True,  True,  True],
         [False, False, False, False, False, False, False,  True,  True,  True],
         [False, False, False, False, False, False, False, False,  True,  True],
         [ True,  True,  True,  True,  True,  True,  True,  True,  True,  True],
         [ True,  True,  True,  True,  True,  True,  True,  True,  True,  True]],

        [[False,  True,  True,  True,  True,  True,  True,  True,  True,  True],
         [False, False,  True,  True,  True,  True,  True,  True,  True,  True],
         [False, False, Fa

In [94]:
# mask 实施 测试

score = torch.randn(len(src),tgt_len,tgt_len)
score
masked_score = score.masked_fill(invalide_decoder_tri_matrix, -1e9)
# print(masked_score)
F.softmax(masked_score,-1)[0]



tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.4672, 0.5328, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.1145, 0.2677, 0.6178, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.1166, 0.2938, 0.0396, 0.5500, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.2554, 0.0477, 0.1712, 0.2855, 0.2401, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.2036, 0.0844, 0.2609, 0.1679, 0.0591, 0.2242, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0604, 0.1446, 0.2196, 0.0840, 0.3627, 0.0892, 0.0395, 0.0000, 0.0000,
         0.0000],
        [0.1445, 0.0139, 0.1103, 0.1852, 0.1359, 0.1485, 0.2378, 0.0239, 0.0000,
         0.0000],
        [0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
         0.1000],
        [0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
         0.1000]])

In [None]:
# 构造scaled self attention 函数
def scale_dot_product_attention(Q,K,V,attention_mask):  # Q K 是 batch的， 同时，也是batch * multi-head的
    #  shape of Q K V : [batch_size,num_head,src_len,model_dim/num_head]
    score = torch.bmm(Q, K.transpose(-1,-2))/torch.sqrt(model_dim)
    masked_score = score.masked_fill(attention_mask, -1e9)
    prob = F.softmax(masked_score,-1)
    context = torch.bmm(prob,V)
    return context



In [95]:
transformer = nn.Transformer(nhead=2, num_encoder_layers=6)
src_ = torch.rand((10,32,512))
tgt_ = torch.rand((20,32,512))
out = transformer(src_,tgt_)
out.size()

In [97]:
# # 4 大类
# TransformerEncoder
#     TransformerEncoderLayer

# TransformerDecoder
#     TransformerDe  coderLayer

torch.Size([20, 32, 512])

In [28]:
# Transformer Masked loss
# 使用机器翻译任务 模拟
import torch 
import torch.nn as nn
import torch.nn.functional as F 

# 两个句子，每个句子3个单词，那就是一共6个单词
Logits = torch.randn(2,3,4) # bs=2,seqlen=3,vocab_size=class=4 
Logits = Logits.transpose(1,2)  # 在torch 官方crossentropy调用的时候需要的维度是bs，C，d 没有经过softmax之前
lable = torch.randint(0,4,(2,3))
Logits.size(),lable.size()
# 这里的交叉熵是6个单词的平均的交叉熵
F.cross_entropy(Logits,lable) 
# 返回每一个单词的交叉熵
F.cross_entropy(Logits,lable,reduction='none') 
# 构造mask len 即假设tgt的有效长度只有2，有1位是padding的，那就需要把这一位mask掉再进行loss掉统计
tgt_len = torch.Tensor([2,3]).to(torch.int32) # 假设两个样本长度为2，3

valid_mask = torch.cat([torch.unsqueeze(F.pad(torch.ones(l),(0,max(tgt_len)-l)),0) for l in tgt_len])
F.cross_entropy(Logits,lable,reduction='none') * valid_mask

# 或者
lable[0,2] = -100
F.cross_entropy(Logits,lable,reduction='none')

tensor([[2.3064, 1.4692, 0.0000],
        [2.3875, 0.3449, 2.5509]])

In [13]:
Logits,lable
# tensor([[[ 1.5035,  1.1617,  0.8024,  0.4093],
#          [ 0.0355, -0.5075, -1.2482, -0.7190],
#          [ 0.5320,  1.7259,  0.4466,  0.6156]],

# tensor([[3, 1, 1],
#          [2, 2, 3]]))

(tensor([[[-4.1631e-01, -1.1203e+00,  7.8211e-01],
          [-1.0408e+00, -2.0775e+00,  1.8497e+00],
          [ 8.0159e-01, -3.4229e-01,  2.1815e+00],
          [ 8.0410e-01, -4.7673e-01,  1.9605e+00]],
 
         [[ 4.2155e-01,  6.3033e-01, -1.6153e+00],
          [-5.7698e-02, -9.6715e-01, -1.7739e+00],
          [-1.4661e-01, -4.5681e-02,  1.0823e-03],
          [-4.9995e-01,  8.1735e-01, -4.4952e-01]]]),
 tensor([[3, 1, 1],
         [2, 2, 3]]))

In [7]:
lable

tensor([[2, 3, 0],
        [1, 1, 3]])

tensor([[0.8964, 2.6555, 0.0000],
        [1.5174, 1.7465, 1.1464]])

tensor([[0.8964, 2.6555, 0.0000],
        [1.5174, 1.7465, 0.0000]])