In [36]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

In [15]:
# Word-embedding walkthrough, using sequence modelling as the example.
# We consider a batch of source sentences and a batch of target sentences;
# every token of a sequence is represented by its index in the vocabulary.
torch.manual_seed(1)  # fixed seed so the sampled lengths are reproducible
batch_size = 2

# Vocabulary sizes
max_num_src_words = 8
max_num_tgt_words = 8
model_embedding_dim = 8  # 512 in the original

# Maximum sequence lengths (hyper-parameters)
max_src_seq_len = 5
max_tgt_seq_len = 5
max_position_len = 5

# Sample one length per sentence, between 2 and the configured maximum.
# torch.randint's upper bound is exclusive, hence the "+ 1"; deriving the
# bound from the hyper-parameters keeps the two in sync when they are tuned.
src_len = torch.randint(2, max_src_seq_len + 1, (batch_size,))
tgt_len = torch.randint(2, max_tgt_seq_len + 1, (batch_size,))
src_len, tgt_len

(tensor([3, 5]), tensor([2, 2]))

In [17]:
# Build the source and target batches out of word indices.
# Shorter sentences are right-padded with 0 up to the longest sentence of
# their own batch.
def _random_padded_batch(lengths, vocab_size):
    """Return a [len(lengths), max(lengths)] batch of random word indices.

    Each row holds `n` indices drawn from [1, vocab_size) followed by zeros.
    """
    longest = int(lengths.max())
    rows = []
    for n in lengths.tolist():
        tokens = torch.randint(1, vocab_size, (n,))
        rows.append(F.pad(tokens, (0, longest - n)))
    return torch.stack(rows, dim=0)


src_seq = _random_padded_batch(src_len, max_num_src_words)
tgt_seq = _random_padded_batch(tgt_len, max_num_tgt_words)

print(src_seq, src_seq.shape)
print(tgt_seq, tgt_seq.shape)

tensor([[7, 4, 5, 0, 0],
        [2, 4, 3, 5, 7]]) torch.Size([2, 5])
tensor([[3, 6],
        [6, 3]]) torch.Size([2, 2])


In [18]:
# Build the word-embedding tables and look up both batches.
# Each table has (vocabulary size + 1) rows; the extra row presumably
# accounts for index 0, which the batches use as padding.
src_embedding_table = nn.Embedding(
    num_embeddings=max_num_src_words + 1,
    embedding_dim=model_embedding_dim,
)
tgt_embedding_table = nn.Embedding(
    num_embeddings=max_num_tgt_words + 1,
    embedding_dim=model_embedding_dim,
)

# Index lookup: [batch, seq] -> [batch, seq, model_embedding_dim]
src_embedding = src_embedding_table(src_seq)
tgt_embedding = tgt_embedding_table(tgt_seq)

print(tgt_embedding_table.weight)
print(tgt_embedding)

Parameter containing:
tensor([[ 0.2053,  0.3051,  0.5357, -0.4312,  0.1573,  1.2540,  1.3275, -0.4954],
        [-1.9804,  1.7986,  0.1018,  0.3400, -0.6447, -0.2870,  3.3212, -0.4021],
        [-0.3030, -1.7618,  0.6348, -0.8044, -1.0371, -1.0669, -0.2085, -0.2155],
        [ 2.2952,  0.6749,  1.7133, -1.7943, -1.5208,  0.9196, -0.5484, -0.3472],
        [ 0.4730, -0.4286,  0.5514, -1.5474,  0.7575, -0.4068, -0.1277,  0.2804],
        [ 1.7460,  1.8550, -0.7064,  2.5571,  0.7705, -1.0739, -0.2015, -0.5603],
        [-0.6240, -0.9773,  0.8748,  0.9873,  0.2505, -0.7930,  0.5231,  1.2236],
        [-0.1095,  0.3126,  1.5038,  0.5038, -0.5685,  0.8376,  1.7837, -0.1954],
        [-1.1435, -0.6512, -0.1032,  0.6937, -0.5413,  0.8952, -0.8825,  0.5318]],
       requires_grad=True)
tensor([[[ 2.2952,  0.6749,  1.7133, -1.7943, -1.5208,  0.9196, -0.5484,
          -0.3472],
         [-0.6240, -0.9773,  0.8748,  0.9873,  0.2505, -0.7930,  0.5231,
           1.2236]],

        [[-0.6240, -0.97

In [19]:
# Build the sinusoidal position embedding.
# pos_mat: positions 0 .. max_position_len - 1 as a column, shape [max_position_len, 1].
pos_mat = torch.arange(max_position_len).reshape(-1, 1)
# i_mat: 10000 ** (k / model_embedding_dim) for every even feature index k,
# shape [1, model_embedding_dim / 2]. The index range follows
# model_embedding_dim so the table stays consistent if the dimension changes
# (the even/odd split below assumes an even model_embedding_dim).
i_mat = torch.pow(10000, torch.arange(0, model_embedding_dim, 2).reshape(1, -1) / model_embedding_dim)
pe_embedding_table = torch.zeros(max_position_len, model_embedding_dim)
pe_embedding_table[:, 0::2] = torch.sin(pos_mat / i_mat)  # even columns: sine
pe_embedding_table[:, 1::2] = torch.cos(pos_mat / i_mat)  # odd columns: cosine

# Position indices: one row 0 .. max_len - 1 per sentence of the batch.
# Each batch iterates over its own length tensor, so the row count always
# matches that batch.
src_pos = torch.cat([torch.unsqueeze(torch.arange(max(src_len)), 0) for _ in src_len])
tgt_pos = torch.cat([torch.unsqueeze(torch.arange(max(tgt_len)), 0) for _ in tgt_len])

# Wrap the fixed table in an embedding layer; the randomly initialised weight
# is replaced by the sinusoidal table and excluded from gradient updates.
pe_embedding = nn.Embedding(max_position_len, model_embedding_dim)
pe_embedding.weight = nn.Parameter(pe_embedding_table, requires_grad=False)

src_pe_embedding = pe_embedding(src_pos)
tgt_pe_embedding = pe_embedding(tgt_pos)

In [6]:
# Display the intermediate tensors of the position-embedding construction.
(
    i_mat,
    pos_mat,
    pos_mat / i_mat,
    pe_embedding.weight,
    src_pos,
    tgt_pos,
    tgt_pe_embedding,
)

(tensor([[   1.,   10.,  100., 1000.]]),
 tensor([[0],
         [1],
         [2],
         [3],
         [4]]),
 tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
         [1.0000e+00, 1.0000e-01, 1.0000e-02, 1.0000e-03],
         [2.0000e+00, 2.0000e-01, 2.0000e-02, 2.0000e-03],
         [3.0000e+00, 3.0000e-01, 3.0000e-02, 3.0000e-03],
         [4.0000e+00, 4.0000e-01, 4.0000e-02, 4.0000e-03]]),
 Parameter containing:
 tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0000e+00,  0.0000e+00,  1.0000e+00],
         [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
           9.9995e-01,  1.0000e-03,  1.0000e+00],
         [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
           9.9980e-01,  2.0000e-03,  1.0000e+00],
         [ 1.4112e-01, -9.8999e-01,  2.9552e-01,  9.5534e-01,  2.9995e-02,
           9.9955e-01,  3.0000e-03,  1.0000e+00],
         [-7.5680e-01, -6.5364e-01,  3.8942e-01,  9.2106e-01,  3.99

In [33]:
# Build the encoder self-attention mask.
# Shape: [batch_size, max_src_len, max_src_len]; boolean, True wherever the
# row position or the column position (or both) is padding.
longest_src = int(src_len.max())
# Per sentence: 1.0 for real tokens followed by 0.0 for padding.
valid_rows = [F.pad(torch.ones(n), (0, longest_src - n)) for n in src_len.tolist()]
# [batch_size, max_src_len, 1]
valid_encoder_pos = torch.stack(valid_rows, dim=0).unsqueeze(2)
# Outer product per sentence: 1.0 only where both positions are real tokens.
valid_encoder_pos_matrix = torch.bmm(valid_encoder_pos, valid_encoder_pos.transpose(1, 2))
invalid_encoder_pos_matrix = 1 - valid_encoder_pos_matrix
mask_encoder_self_attention = invalid_encoder_pos_matrix.to(torch.bool)

print(invalid_encoder_pos_matrix)
print(mask_encoder_self_attention)

tensor([[[0., 0., 0., 1., 1.],
         [0., 0., 0., 1., 1.],
         [0., 0., 0., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]],

        [[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]]])
tensor([[[False, False, False,  True,  True],
         [False, False, False,  True,  True],
         [False, False, False,  True,  True],
         [ True,  True,  True,  True,  True],
         [ True,  True,  True,  True,  True]],

        [[False, False, False, False, False],
         [False, False, False, False, False],
         [False, False, False, False, False],
         [False, False, False, False, False],
         [False, False, False, False, False]]])


In [38]:
# Disabled demo: apply the encoder self-attention mask to random scores.
# Positions flagged in mask_encoder_self_attention are filled with -1e9 before
# the softmax. A row that is masked everywhere holds identical values, so its
# softmax is a uniform distribution rather than all zeros.
# score = torch.randn(batch_size, max(src_len), max(src_len))
# 
# masked_score = score.masked_fill(mask_encoder_self_attention, -1e9)
# prob = torch.softmax(masked_score, dim=-1)
# 
# print(score)
# print(masked_score)
# print(prob)

tensor([[[ 1.0757, -0.5536, -1.6160,  0.0934, -1.3898],
         [-0.3105,  1.0693,  1.4394,  1.3694,  0.4539],
         [-0.0498,  0.3745,  1.4389,  1.4151, -0.1589],
         [-0.7360, -2.0311, -1.1064,  0.1879,  0.1146],
         [ 1.1904,  0.5201, -0.3884, -0.0316, -1.1085]],

        [[-1.8874, -0.0863,  1.4791,  0.9428, -0.1244],
         [-1.5083, -0.7200,  0.7923, -0.3385,  1.4161],
         [ 0.4738,  0.0827, -1.3034,  0.5190,  1.3395],
         [-0.2388, -0.0680,  1.1349,  0.8658,  0.6334],
         [-0.5392,  0.0182,  1.4142, -1.7438,  0.1129]]])
tensor([[[ 1.0757e+00, -5.5361e-01, -1.6160e+00, -1.0000e+09, -1.0000e+09],
         [-3.1046e-01,  1.0693e+00,  1.4394e+00, -1.0000e+09, -1.0000e+09],
         [-4.9785e-02,  3.7450e-01,  1.4389e+00, -1.0000e+09, -1.0000e+09],
         [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
         [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09]],

        [[-1.8874e+00, -8.6326e-02,  1.4791e+00,  9.42

In [11]:
# # Why scaling matters
# Disabled demo: the same random scores are multiplied by a small factor
# (alpha1) and a large factor (alpha2) before the softmax, and both the
# resulting probabilities and the softmax Jacobians are printed for comparison.
# alpha1 = 0.1
# alpha2 = 10
# score = torch.randn(5)
# prob1 = F.softmax(score*alpha1, dim=-1)
# prob2 = F.softmax(score*alpha2, dim=-1)
# print(score)
# print(prob1, prob2)
# 
# def softmax_func(score):
#     return F.softmax(score, dim=-1)
# 
# jaco_mat1 = torch.autograd.functional.jacobian(softmax_func, score*alpha1)
# jaco_mat2 = torch.autograd.functional.jacobian(softmax_func, score*alpha2)
# print(jaco_mat1)
# print(jaco_mat2)