In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# word embedding 以序列建模为例
# 考虑source sentence 和 target sentence
# 构建序列，序列的字符以其在词表中的索引的形式表示
torch.manual_seed(1)
batch_size = 2

# 词表大小
max_num_src_words = 8
max_num_tgt_words = 8
model_embedding_dim = 8 #原文为512


# 序列的最大长度   超参数 
max_src_seq_len = 5
max_tgt_seq_len = 5
max_position_len = 5

# 生成batch_size个句子的长度信息
src_len = torch.randint(2, 6, (batch_size,))
tgt_len = torch.randint(2, 6, (batch_size,))
src_len,tgt_len

(tensor([3, 5]), tensor([2, 2]))

In [3]:
# 单词索引构成源句子和目标句子，构建batch, 做了padding， 默认值为0
src_seq =[torch.unsqueeze(F.pad(torch.randint(1, max_num_src_words, (L,)), (0, max(src_len)- L)), 0) \
          for L in src_len]
tgt_seq =[torch.unsqueeze(F.pad(torch.randint(1, max_num_tgt_words, (L,)), (0, max(tgt_len)- L)), 0) \
          for L in tgt_len]

src_seq = torch.cat(src_seq, dim=0)
tgt_seq = torch.cat(tgt_seq, dim=0)

print(src_seq, src_seq.shape)
print(tgt_seq, tgt_seq.shape)

tensor([[4, 7, 1, 0, 0],
        [6, 1, 4, 5, 7]]) torch.Size([2, 5])
tensor([[6, 6],
        [3, 6]]) torch.Size([2, 2])


In [9]:
# 构造word embedding 
src_embedding_table = nn.Embedding(max_num_src_words + 1, model_embedding_dim)
tgt_embedding_table = nn.Embedding(max_num_tgt_words + 1, model_embedding_dim)
print(tgt_embedding_table.weight)

src_embedding = src_embedding_table(src_seq)
tgt_embedding = tgt_embedding_table(tgt_seq)
print(tgt_embedding)

Parameter containing:
tensor([[-0.8011, -1.5776, -0.9171, -0.2311,  1.0470, -1.5918,  0.0814,  1.0832],
        [-0.5794, -0.5948,  0.0714,  0.3420,  0.8866, -0.8954,  0.1360,  0.6579],
        [-0.9102, -0.1423,  0.2989,  1.4571,  0.2304, -0.1479, -0.6358,  0.3535],
        [-0.0321, -0.5684, -1.4244, -1.3247, -2.0823, -0.6323,  1.5993,  1.9342],
        [ 0.5931,  1.8194, -0.8792, -1.1781,  0.2504,  0.3679, -0.5098, -0.0992],
        [-0.5083,  1.2397,  0.4237,  0.2669,  0.0768, -1.0789,  0.7933, -0.6170],
        [-1.5175, -1.0135,  1.3760,  1.4397, -2.2479,  1.3209, -0.5391,  0.3898],
        [ 1.3822, -1.0321,  0.7139, -1.5632, -0.3591, -0.8194,  1.6901,  0.2580],
        [-0.0032,  0.1425,  0.9438, -0.2258,  0.4633, -1.6465, -0.8888,  1.7225]],
       requires_grad=True)
tensor([[[-1.5175, -1.0135,  1.3760,  1.4397, -2.2479,  1.3209, -0.5391,
           0.3898],
         [-1.5175, -1.0135,  1.3760,  1.4397, -2.2479,  1.3209, -0.5391,
           0.3898]],

        [[-0.0321, -0.56

False

In [5]:
# 构造position embedding
pos_mat = torch.arange(max_position_len).reshape(-1, 1)
i_mat = torch.pow(10000, torch.arange(0, 8, 2).reshape(1, -1) / model_embedding_dim)
pe_embedding_table = torch.zeros(max_position_len, model_embedding_dim)
pe_embedding_table[:, 0::2] = torch.sin(pos_mat / i_mat)
pe_embedding_table[:, 1::2] = torch.cos(pos_mat / i_mat)

# 位置索引
src_pos = torch.cat([torch.unsqueeze(torch.arange(max(src_len)), 0) for _ in src_len])
tgt_pos = torch.cat([torch.unsqueeze(torch.arange(max(tgt_len)), 0) for _ in src_len])

pe_embedding = nn.Embedding(max_position_len, model_embedding_dim)
pe_embedding.weight = nn.Parameter(pe_embedding_table, requires_grad=False)

src_pe_embedding = pe_embedding(src_pos)
tgt_pe_embedding = pe_embedding(tgt_pos)

In [6]:
i_mat , pos_mat, pos_mat / i_mat, pe_embedding.weight, src_pos, tgt_pos, tgt_pe_embedding

(tensor([[   1.,   10.,  100., 1000.]]),
 tensor([[0],
         [1],
         [2],
         [3],
         [4]]),
 tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
         [1.0000e+00, 1.0000e-01, 1.0000e-02, 1.0000e-03],
         [2.0000e+00, 2.0000e-01, 2.0000e-02, 2.0000e-03],
         [3.0000e+00, 3.0000e-01, 3.0000e-02, 3.0000e-03],
         [4.0000e+00, 4.0000e-01, 4.0000e-02, 4.0000e-03]]),
 Parameter containing:
 tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0000e+00,  0.0000e+00,  1.0000e+00],
         [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
           9.9995e-01,  1.0000e-03,  1.0000e+00],
         [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
           9.9980e-01,  2.0000e-03,  1.0000e+00],
         [ 1.4112e-01, -9.8999e-01,  2.9552e-01,  9.5534e-01,  2.9995e-02,
           9.9955e-01,  3.0000e-03,  1.0000e+00],
         [-7.5680e-01, -6.5364e-01,  3.8942e-01,  9.2106e-01,  3.99

In [33]:
# 构造encoder的self-attention mask
# mask的shape: [batch_size, max_src_len, max_src_len],值为1或为-inf
valid_encoder_pos = torch.unsqueeze(torch.cat([torch.unsqueeze(F.pad(torch.ones(L), (0, max(src_len) - L)), 0) \
                               for L in src_len]), 2)
valid_encoder_pos_matrix = torch.bmm(valid_encoder_pos, valid_encoder_pos.transpose(1, 2))
invalid_encoder_pos_matrix = 1 - valid_encoder_pos_matrix
mask_encoder_self_attention = invalid_encoder_pos_matrix.to(torch.bool)

print(invalid_encoder_pos_matrix)
print(mask_encoder_self_attention)

tensor([[[0., 0., 0., 1., 1.],
         [0., 0., 0., 1., 1.],
         [0., 0., 0., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]],

        [[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]]])
tensor([[[False, False, False,  True,  True],
         [False, False, False,  True,  True],
         [False, False, False,  True,  True],
         [ True,  True,  True,  True,  True],
         [ True,  True,  True,  True,  True]],

        [[False, False, False, False, False],
         [False, False, False, False, False],
         [False, False, False, False, False],
         [False, False, False, False, False],
         [False, False, False, False, False]]])


In [38]:
# score = torch.randn(batch_size, max(src_len), max(src_len))
# 
# masked_score = score.masked_fill(mask_encoder_self_attention, -1e9)
# prob = torch.softmax(masked_score, dim=-1)
# 
# print(score)
# print(masked_score)
# print(prob)

tensor([[[ 1.0757, -0.5536, -1.6160,  0.0934, -1.3898],
         [-0.3105,  1.0693,  1.4394,  1.3694,  0.4539],
         [-0.0498,  0.3745,  1.4389,  1.4151, -0.1589],
         [-0.7360, -2.0311, -1.1064,  0.1879,  0.1146],
         [ 1.1904,  0.5201, -0.3884, -0.0316, -1.1085]],

        [[-1.8874, -0.0863,  1.4791,  0.9428, -0.1244],
         [-1.5083, -0.7200,  0.7923, -0.3385,  1.4161],
         [ 0.4738,  0.0827, -1.3034,  0.5190,  1.3395],
         [-0.2388, -0.0680,  1.1349,  0.8658,  0.6334],
         [-0.5392,  0.0182,  1.4142, -1.7438,  0.1129]]])
tensor([[[ 1.0757e+00, -5.5361e-01, -1.6160e+00, -1.0000e+09, -1.0000e+09],
         [-3.1046e-01,  1.0693e+00,  1.4394e+00, -1.0000e+09, -1.0000e+09],
         [-4.9785e-02,  3.7450e-01,  1.4389e+00, -1.0000e+09, -1.0000e+09],
         [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
         [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09]],

        [[-1.8874e+00, -8.6326e-02,  1.4791e+00,  9.42

In [11]:
# # scaled的重要性
# alpha1 = 0.1
# alpha2 = 10
# score = torch.randn(5)
# prob1 = F.softmax(score*alpha1, dim=-1)
# prob2 = F.softmax(score*alpha2, dim=-1)
# print(score)
# print(prob1, prob2)
# 
# def softmax_func(score):
#     return F.softmax(score, dim=-1)
# 
# jaco_mat1 = torch.autograd.functional.jacobian(softmax_func, score*alpha1)
# jaco_mat2 = torch.autograd.functional.jacobian(softmax_func, score*alpha2)
# print(jaco_mat1)
# print(jaco_mat2)

In [11]:
model = nn.Transformer(batch_first=True)
print(model)

Transformer(
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
    (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, o