In [59]:
import torch
import numpy
import torch.nn as nn
import torch.nn.functional as F

In [139]:
# Word-embedding walkthrough using a sequence-modelling example.
# We consider a batch of source sentences and a batch of target sentences;
# every token in a sequence is represented by its index in the vocabulary.
torch.manual_seed(1)
batch_size = 2

# Vocabulary sizes.
max_num_src_words = 8
max_num_tgt_words = 8
model_embedding_dim = 8  # 512 in the original

# Maximum sequence lengths (hyper-parameters).
max_src_seq_len = 5
max_tgt_seq_len = 5
max_position_len = 5

# Draw one random true length per sentence. torch.randint's upper bound is
# exclusive, so `max_*_seq_len + 1` yields lengths in [2, max_*_seq_len].
# Tying the bound to the hyper-parameter (rather than a literal 6) keeps a
# drawn length from ever exceeding the length the sequences are padded to.
src_len = torch.randint(2, max_src_seq_len + 1, (batch_size,))
tgt_len = torch.randint(2, max_tgt_seq_len + 1, (batch_size,))
src_len, tgt_len

(tensor([3, 5]), tensor([2, 2]))

In [154]:
# Build the source and target batches from random word indices.
# Real tokens are drawn from [1, vocab_size); every sentence is right-padded
# with F.pad's default fill value 0 up to the maximum sequence length.
src_seq = torch.stack([
    F.pad(torch.randint(1, max_num_src_words, (n_tokens,)), (0, max_src_seq_len - n_tokens))
    for n_tokens in src_len.tolist()
], dim=0)
tgt_seq = torch.stack([
    F.pad(torch.randint(1, max_num_tgt_words, (n_tokens,)), (0, max_tgt_seq_len - n_tokens))
    for n_tokens in tgt_len.tolist()
], dim=0)

print(src_seq, src_seq.shape)
print(tgt_seq, tgt_seq.shape)

tensor([[4, 6, 4, 0, 0],
        [6, 2, 6, 4, 7]]) torch.Size([2, 5])
tensor([[7, 3, 0, 0, 0],
        [2, 7, 0, 0, 0]]) torch.Size([2, 5])


In [155]:
# Word-embedding lookup tables, one per language.
# Each table has (vocabulary size + 1) rows of width model_embedding_dim.
# No padding_idx is set, so index 0 is looked up like any other index.
src_embedding_table = nn.Embedding(
    num_embeddings=max_num_src_words + 1,
    embedding_dim=model_embedding_dim,
)
tgt_embedding_table = nn.Embedding(
    num_embeddings=max_num_tgt_words + 1,
    embedding_dim=model_embedding_dim,
)
print(tgt_embedding_table.weight)

# Map every word index in the batches to its embedding vector.
src_embedding, tgt_embedding = src_embedding_table(src_seq), tgt_embedding_table(tgt_seq)
print(tgt_embedding)

Parameter containing:
tensor([[ 0.4890, -0.1593,  0.1896,  0.6536,  1.0295, -0.4674,  1.2100, -0.9293],
        [-1.0647, -0.8734,  0.1444, -0.1146,  0.4031,  0.7834,  1.6241, -1.9684],
        [ 0.4186,  0.7187, -0.4918,  0.6325,  0.4409, -2.0395,  1.0629,  0.7760],
        [ 0.0096, -0.2610,  0.4011,  0.1672, -0.6525,  2.3191, -0.3156,  1.2293],
        [ 0.0074,  0.3208,  0.8575, -0.0639,  0.1633,  1.1105,  0.6586, -0.1845],
        [ 0.2699,  0.4034, -1.3365,  1.0815, -0.2198, -0.7506,  0.9105,  1.4996],
        [ 1.0353, -0.1632,  0.2580,  0.8273, -0.2423, -0.1800,  0.9584,  0.3781],
        [-1.5045, -0.1863, -0.9321,  0.3055,  1.4964,  0.2790, -0.5353,  1.1382],
        [ 1.3459, -0.4643, -0.3743,  0.9706, -1.7283, -0.6890, -0.4400,  0.3497]],
       requires_grad=True)
tensor([[[-1.5045, -0.1863, -0.9321,  0.3055,  1.4964,  0.2790, -0.5353,
           1.1382],
         [ 0.0096, -0.2610,  0.4011,  0.1672, -0.6525,  2.3191, -0.3156,
           1.2293],
         [ 0.4890, -0.1593

In [148]:
# Build the sinusoidal position-embedding table:
#   table[pos, 2i]   = sin(pos / 10000^(2i / model_embedding_dim))
#   table[pos, 2i+1] = cos(pos / 10000^(2i / model_embedding_dim))
pos_mat = torch.arange(max_position_len).reshape(-1, 1)
# The even column indices come from model_embedding_dim (previously a literal 8),
# so the table stays consistent when the embedding width is changed.
i_mat = torch.pow(10000, torch.arange(0, model_embedding_dim, 2).reshape(1, -1) / model_embedding_dim)
pos_angles = pos_mat / i_mat
pe_embedding_table = torch.zeros(max_position_len, model_embedding_dim)
pe_embedding_table[:, 0::2] = torch.sin(pos_angles)
# For an odd width there is one fewer odd column than even columns; the slice
# is a no-op for even widths.
pe_embedding_table[:, 1::2] = torch.cos(pos_angles[:, : model_embedding_dim // 2])

# Position indices, one row per sentence in the batch.
# They span the padded length (max_*_seq_len) rather than the longest drawn
# length, so the position embeddings have the same (batch, seq_len, dim) shape
# as the word embeddings looked up from the padded src_seq / tgt_seq.
assert max(max_src_seq_len, max_tgt_seq_len) <= max_position_len, \
    "position table is too short for the padded sequence length"
src_pos = torch.arange(max_src_seq_len).unsqueeze(0).repeat(len(src_len), 1)
# The target batch size comes from tgt_len (the original iterated over src_len here).
tgt_pos = torch.arange(max_tgt_seq_len).unsqueeze(0).repeat(len(tgt_len), 1)

# Construct the module first and then overwrite its weight with the fixed table;
# this keeps the random-number consumption of nn.Embedding's initialisation
# unchanged for any later cell that draws random numbers.
pe_embedding = nn.Embedding(max_position_len, model_embedding_dim)
pe_embedding.weight = nn.Parameter(pe_embedding_table, requires_grad=False)

src_pe_embedding = pe_embedding(src_pos)
tgt_pe_embedding = pe_embedding(tgt_pos)

In [152]:
# Display the intermediate tensors of the position-embedding construction for inspection.
i_mat , pos_mat, pos_mat / i_mat, pe_embedding.weight, src_pos, tgt_pos, tgt_pe_embedding

(tensor([[   1.,   10.,  100., 1000.]]),
 tensor([[0],
         [1],
         [2],
         [3],
         [4]]),
 tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
         [1.0000e+00, 1.0000e-01, 1.0000e-02, 1.0000e-03],
         [2.0000e+00, 2.0000e-01, 2.0000e-02, 2.0000e-03],
         [3.0000e+00, 3.0000e-01, 3.0000e-02, 3.0000e-03],
         [4.0000e+00, 4.0000e-01, 4.0000e-02, 4.0000e-03]]),
 Parameter containing:
 tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0000e+00,  0.0000e+00,  1.0000e+00],
         [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
           9.9995e-01,  1.0000e-03,  1.0000e+00],
         [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
           9.9980e-01,  2.0000e-03,  1.0000e+00],
         [ 1.4112e-01, -9.8999e-01,  2.9552e-01,  9.5534e-01,  2.9995e-02,
           9.9955e-01,  3.0000e-03,  1.0000e+00],
         [-7.5680e-01, -6.5364e-01,  3.8942e-01,  9.2106e-01,  3.99

In [169]:
# Show how scaling the scores before softmax changes the resulting distribution:
# one small and one large scale factor are applied to the same random scores.
alpha1, alpha2 = 0.1, 10
score = torch.randn(5)
prob1 = (score * alpha1).softmax(dim=-1)
prob2 = (score * alpha2).softmax(dim=-1)
print(score)
print(prob1, prob2)

tensor([-1.3961, -0.4079,  0.2204,  0.4656,  2.0270])
tensor([0.1697, 0.1873, 0.1995, 0.2044, 0.2390]) tensor([1.3599e-15, 2.6620e-11, 1.4253e-08, 1.6562e-07, 1.0000e+00])


In [170]:
def softmax_func(score):
    """Return the softmax of `score` taken over its last dimension."""
    probabilities = score.softmax(dim=-1)
    return probabilities

# Jacobian of softmax evaluated at the two differently scaled score vectors.
jaco_mat1, jaco_mat2 = (
    torch.autograd.functional.jacobian(softmax_func, score * scale)
    for scale in (alpha1, alpha2)
)
print(jaco_mat1, jaco_mat2, sep="\n")



tensor([[ 0.1409, -0.0318, -0.0339, -0.0347, -0.0406],
        [-0.0318,  0.1522, -0.0374, -0.0383, -0.0448],
        [-0.0339, -0.0374,  0.1597, -0.0408, -0.0477],
        [-0.0347, -0.0383, -0.0408,  0.1626, -0.0489],
        [-0.0406, -0.0448, -0.0477, -0.0489,  0.1819]])
tensor([[ 1.3599e-15, -3.6201e-26, -1.9383e-23, -2.2523e-22, -1.3599e-15],
        [-3.6201e-26,  2.6620e-11, -3.7942e-19, -4.4088e-18, -2.6620e-11],
        [-1.9383e-23, -3.7942e-19,  1.4253e-08, -2.3606e-15, -1.4253e-08],
        [-2.2523e-22, -4.4088e-18, -2.3606e-15,  1.6562e-07, -1.6562e-07],
        [-1.3599e-15, -2.6620e-11, -1.4253e-08, -1.6562e-07,  2.3842e-07]])
