<a href="https://colab.research.google.com/github/Dominique-Yiu/ColabCode/blob/main/Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformer Realization
**Transformer Architecture** \
1. Encoder
- Input words embedding
  - Turn the sparse one-hot vector into a dense, continuous vector with a linear layer (an FFN without bias).
- Position encoding
- Multi-head self-attention
- Feed-forward network
2. Decoder
- Output words embedding
- Masked multi-head self-attention
- Multi-head cross-attention
- Feed-forward network
- Softmax

In [2]:
import torch
import numpy
import torch.nn as nn
import torch.nn.functional as F

In [28]:
# Reproducibility: seed the global RNG so the random token ids and the randomly
# initialised embedding tables are the same on every "Restart & Run All".
torch.manual_seed(0)

batch_size = 2
# vocabulary size (number of distinct word ids) for the source / target side
max_num_src_words = 8
max_num_tgt_words = 8
# feature dimension of every embedding vector
model_dim = 8
# the max length of sequence
max_src_seq_len = 5
max_tgt_seq_len = 5
# upper bound on position indices
max_position_len = 5
# Number of real (non-padding) tokens in each sentence of the batch.
# Fixed by hand so the examples keep the same shape; a random alternative
# would be torch.randint(2, 5, (batch_size,)).
src_len = torch.tensor([2, 4], dtype=torch.int32)
tgt_len = torch.tensor([4, 3], dtype=torch.int32)


def _random_padded_batch(lengths, vocab_size):
    """Draw one random sentence per entry of `lengths` and right-pad with 0.

    Args:
        lengths: 1-D integer tensor; entry k is the number of real tokens
            in sentence k.
        vocab_size: exclusive upper bound for the random token ids; real
            tokens are drawn from [1, vocab_size), so 0 only ever appears
            as padding.

    Returns:
        Integer tensor of shape [len(lengths), max(lengths)].
    """
    longest = int(lengths.max())
    rows = [
        F.pad(torch.randint(1, vocab_size, (int(n),)), (0, longest - int(n)))
        for n in lengths
    ]
    return torch.stack(rows)


# generate the src/tgt sentences, padded to the longest sentence of the batch
# with the default value 0
src_seq = _random_padded_batch(src_len, max_num_src_words)
tgt_seq = _random_padded_batch(tgt_len, max_num_tgt_words)

"""Word Embedding"""
# Build the word-embedding lookup tables. Each table has vocab size + 1 rows,
# so ids 0..max_num_*_words are valid lookups (0 is the padding id).
src_embedding_table = nn.Embedding(max_num_src_words + 1, model_dim)
tgt_embedding_table = nn.Embedding(max_num_tgt_words + 1, model_dim)
# Look up every token id: [batch, seq_len] -> [batch, seq_len, model_dim]
src_embedding = src_embedding_table(src_seq)
tgt_embedding = tgt_embedding_table(tgt_seq)

print(src_embedding_table.weight)
print(src_seq)
print(src_embedding)

Parameter containing:
tensor([[-1.3421, -0.5922,  0.7105, -0.4137, -0.7792,  0.8271, -1.1737,  1.1708],
        [-0.7755, -0.8535, -0.8674,  1.2121, -0.1808,  1.3193,  0.4921,  0.3922],
        [-0.3492,  0.6076,  0.5111,  1.8655, -0.8650, -0.2841,  0.5962,  0.1457],
        [ 0.7635, -0.8373, -2.0204, -0.5166,  0.4622, -1.5357,  0.7542,  0.3448],
        [-0.6157, -0.9285,  1.5628, -0.3538,  0.3983,  0.9700,  1.7446, -0.3886],
        [-0.3686,  1.4301, -0.6616, -1.0094, -0.7734, -0.6186, -1.6067, -0.7960],
        [-2.1911,  0.5905,  0.1929,  0.6911, -0.0636,  0.0418,  0.7898, -0.2072],
        [-1.5851,  1.5669, -0.3050,  0.9676,  0.7270, -0.2039,  0.9162,  0.7299],
        [ 0.3286, -0.9448,  1.1110,  1.0238, -1.2310,  0.6511, -1.2549,  0.1557]],
       requires_grad=True)
tensor([[7, 4, 0, 0],
        [2, 3, 7, 1]])
tensor([[[-1.5851,  1.5669, -0.3050,  0.9676,  0.7270, -0.2039,  0.9162,
           0.7299],
         [-0.6157, -0.9285,  1.5628, -0.3538,  0.3983,  0.9700,  1.7446,
 

In [29]:
"""Position Embedding"""
# Build the sinusoidal position-embedding table:
#   PE(pos, 2i)   = sin(pos / 10000^(2i / model_dim))
#   PE(pos, 2i+1) = cos(pos / 10000^(2i / model_dim))
pos_mat = torch.arange(max_position_len).reshape((-1, 1))
# Even channel indices 0, 2, 4, ... are derived from model_dim (previously a
# hard-coded 8), so the table stays correct when model_dim changes.
i_mat = torch.pow(10000, torch.arange(0, model_dim, 2).reshape((1, -1)) / model_dim)
# pos_angles[p, i] = p / 10000^(2i / model_dim), shape [max_position_len, ceil(model_dim / 2)]
pos_angles = pos_mat / i_mat
pe_embedding_table = torch.zeros(max_position_len, model_dim)
pe_embedding_table[:, 0::2] = torch.sin(pos_angles)
# With an odd model_dim there is one fewer odd column than even columns,
# so trim the angles to the number of odd columns.
pe_embedding_table[:, 1::2] = torch.cos(pos_angles[:, : model_dim // 2])

# Wrap the fixed table in a frozen embedding layer (weights are not trained).
pe_embedding = nn.Embedding.from_pretrained(pe_embedding_table, freeze=True)

# Every position that will be looked up must have a row in the table.
assert int(max(src_len)) <= max_position_len, "source sequence longer than max_position_len"
assert int(max(tgt_len)) <= max_position_len, "target sequence longer than max_position_len"

# Position ids 0..max_len-1, repeated for every sentence: [batch, max_len]
src_pos = torch.arange(int(max(src_len)), dtype=torch.int32).repeat(len(src_len), 1)
tgt_pos = torch.arange(int(max(tgt_len)), dtype=torch.int32).repeat(len(tgt_len), 1)

# Look up the encodings: [batch, max_len] -> [batch, max_len, model_dim]
src_pos_embedding = pe_embedding(src_pos)
tgt_pos_embedding = pe_embedding(tgt_pos)
print(src_pos_embedding)
print(tgt_pos_embedding)

tensor([[[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0000e+00,  0.0000e+00,  1.0000e+00],
         [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
           9.9995e-01,  1.0000e-03,  1.0000e+00],
         [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
           9.9980e-01,  2.0000e-03,  1.0000e+00],
         [ 1.4112e-01, -9.8999e-01,  2.9552e-01,  9.5534e-01,  2.9995e-02,
           9.9955e-01,  3.0000e-03,  1.0000e+00]],

        [[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0000e+00,  0.0000e+00,  1.0000e+00],
         [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
           9.9995e-01,  1.0000e-03,  1.0000e+00],
         [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
           9.9980e-01,  2.0000e-03,  1.0000e+00],
         [ 1.4112e-01, -9.8999e-01,  2.9552e-01,  9.5534e-01,  2.9995e-02,
           9.9955e-01,  3.0000e-03,  1.0000e+00]

$Attention(Q,K,V)=softmax(\frac{QK^T}{\sqrt{d_k}})V$

In [41]:
import numpy as np
"""Encoder: Self-Attention Mask"""
# Build the encoder self-attention mask.
# Mask shape: [batch_size, max_src_len, max_src_len]; True marks a
# (query, key) pair in which at least one position is padding.
longest_src = int(max(src_len))

# One row per sentence: 1.0 for real tokens, 0.0 for right-padding.
valid_rows = [
    F.pad(torch.ones(int(n)), (0, longest_src - int(n))).unsqueeze(0)
    for n in src_len
]
# [batch, max_src_len, 1]
valid_encoder_pos = torch.cat(valid_rows).unsqueeze(2)
# Outer product per sentence: entry (i, j) is 1 only if positions i and j are both real.
valid_encoder_pos_matrix = torch.bmm(
    valid_encoder_pos,
    valid_encoder_pos.transpose(1, 2),
)
invalid_encoder_pos_matrix = 1 - valid_encoder_pos_matrix
mask_encoder_self_attention = invalid_encoder_pos_matrix.to(torch.bool)

# Random stand-in for the attention scores, used only to demonstrate the mask.
score = torch.randn(batch_size, longest_src, longest_src)
# A large finite negative is used as the fill value; rows whose query is
# padding are fully masked and therefore softmax to a uniform distribution.
masked_score = score.masked_fill(mask_encoder_self_attention, -1e9)
prob = masked_score.softmax(dim=-1)

print(score)
print(masked_score)
print(prob)

tensor([[[ 2.3577, -0.2677,  0.8554, -1.4181],
         [-0.5159,  0.1013,  1.0361, -0.9863],
         [-0.6121,  1.4148, -1.6303, -1.5475],
         [-0.0495, -0.0552,  0.1952, -1.0923]],

        [[-0.2355, -1.3529,  0.3448,  0.3238],
         [ 0.0567,  0.5248, -1.5022, -0.0481],
         [ 1.2487, -0.4582, -1.1756, -0.1060],
         [ 0.7508,  2.8218, -0.1943, -0.6140]]])
tensor([[[ 2.3577e+00, -2.6770e-01, -1.0000e+09, -1.0000e+09],
         [-5.1594e-01,  1.0129e-01, -1.0000e+09, -1.0000e+09],
         [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
         [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09]],

        [[-2.3549e-01, -1.3529e+00,  3.4482e-01,  3.2380e-01],
         [ 5.6668e-02,  5.2479e-01, -1.5022e+00, -4.8116e-02],
         [ 1.2487e+00, -4.5824e-01, -1.1756e+00, -1.0598e-01],
         [ 7.5077e-01,  2.8218e+00, -1.9434e-01, -6.1401e-01]]])
tensor([[[0.9325, 0.0675, 0.0000, 0.0000],
         [0.3504, 0.6496, 0.0000, 0.0000],
         [0.2500, 0.2500, 

以上主要实现了词向量、位置编码，以及编码器自注意力的掩码。