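# Build a Transformer from Scratch

Shape and smoke tests for the components loaded from `transformer.py`, from the embedding layers up to the full `Transformer`.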

In [1]:
%run -i transformer.py

In [2]:
# Seed torch's global RNG so the random tensors drawn in the cells below are
# repeatable when the notebook is run top to bottom.
SEED = 42
torch.manual_seed(SEED)

<torch._C.Generator at 0x117b5feb0>

## 1. Unit Tests

In [3]:
# Shared sizes for the unit tests below.
max_seq_len = 4  # handed to every block that takes a max_seq_len argument
input_dim = 2    # width of the raw inputs given to WordEmbedding / Encoders
embed_dim = 6    # width of the embedded representation

### 1.1 Test WordEmbedding

In [4]:
# Smoke test: WordEmbedding should widen the last axis from input_dim to
# embed_dim and leave the leading axes alone.
# Presumably the layout is (batch, seq, feature) — confirm against transformer.py.
x = torch.rand(2, input_dim).unsqueeze(0)  # prepend a size-1 leading axis
assert x.shape == (1, 2, input_dim)

we = WordEmbedding(input_dim=input_dim, embed_dim=embed_dim)
xe = we(x)
# Sizes come from the config cell so the check stays valid if they are changed.
assert xe.shape == (1, 2, embed_dim), f"unexpected shape: {tuple(xe.shape)}"

### 1.2 Test PositionEmbedding

In [5]:
# PositionEmbedding applied to the embedded input `xe` must keep its shape.
# Reuse the shared max_seq_len from the config cell instead of repeating the
# literal 4, matching how the attention/encoder cells pass it.
pe = PositionEmbedding(max_seq_len=max_seq_len, model_dim=embed_dim)
xpe = pe(xe)
assert xpe.shape == (1, 2, embed_dim), f"unexpected shape: {tuple(xpe.shape)}"

In [6]:
# Value check on the smallest non-trivial table (2 positions, 2 channels).
# The literals equal (sin 0, cos 0) and (sin 1, cos 1) rounded to 4 decimals.
expected = torch.tensor([[[0.0000, 1.0000], [0.8415, 0.5403]]])
# NOTE: rebinds `pe` to a fresh 2x2 instance (the earlier one used embed_dim).
pe = PositionEmbedding(max_seq_len=2, model_dim=2)
actual = pe.pe
# The third positional argument of torch.allclose is rtol, which scales with
# the reference magnitude and gives no slack near zero. The references here are
# rounded to 4 decimals, so an explicit absolute tolerance is the right bound.
assert torch.allclose(actual, expected, rtol=0.0, atol=1e-4), actual
actual

tensor([[[0.0000, 1.0000],
         [0.8415, 0.5403]]])

### 1.3 Test MultiHeadAttention

In [7]:
# Smoke test for MultiHeadAttention on a random (1, 2, embed_dim) input.
# torch.rand already yields the default float dtype, so the former .float()
# cast was a no-op and has been dropped.
x = torch.rand(2, embed_dim).unsqueeze(0)
print(f"x shape: {x.shape}")
print(f"x: {x}")

mha = MultiHeadAttention(
    input_dim=embed_dim,
    n_heads=3,
    max_seq_len=max_seq_len,
)
y = mha(x, is_masked=False)
print(f"y shape: {y.shape}")
# Assert the shape like the other component tests do, rather than relying on
# reading the printed output.
assert y.shape == x.shape, f"unexpected shape: {tuple(y.shape)}"
y

x shape: torch.Size([1, 2, 6])
x: tensor([[[0.2969, 0.8317, 0.1053, 0.2695, 0.3588, 0.1994],
         [0.5472, 0.0062, 0.9516, 0.0753, 0.8860, 0.5832]]])
y shape: torch.Size([1, 2, 6])


tensor([[[ 0.1410, -0.2415, -0.0318,  0.3922, -0.4950, -0.0911],
         [ 0.1424, -0.2383, -0.0315,  0.3907, -0.4926, -0.0913]]],
       grad_fn=<ViewBackward0>)

### 1.4 Test SelfAttentionBlock

In [8]:
# SelfAttentionBlock on the (1, 2, 6) tensor `x` from the attention test:
# the output is expected to have the same shape.
sab = SelfAttentionBlock(input_dim=embed_dim, n_heads=3, max_seq_len=max_seq_len)
y = sab(x)
assert tuple(y.shape) == (1, 2, 6)
y

tensor([[[ 0.4230,  0.5208,  0.1785,  0.0954,  0.3441,  0.2695],
         [ 0.6548, -0.3133,  1.0914, -0.0742,  1.1456,  0.7610]]],
       grad_fn=<AddBackward0>)

### 1.5 Test CrossAttentionBlock

In [9]:
# CrossAttentionBlock takes `x` plus a second tensor `h` of the same shape
# (presumably the encoder-side memory — confirm against transformer.py).
cab = CrossAttentionBlock(input_dim=embed_dim, n_heads=3, max_seq_len=max_seq_len)
# Drawn after the block is constructed, so the order of RNG use is unchanged.
h = torch.rand(x.size())
y = cab(x, h)
assert tuple(y.shape) == (1, 2, 6)
y

tensor([[[ 0.4657,  0.6793,  0.2087,  0.4324,  0.3082,  0.9980],
         [ 0.8305, -0.1967,  1.0616,  0.1902,  0.8449,  1.2522]]],
       grad_fn=<AddBackward0>)

### 1.6 Test FeedForwardBlock

In [10]:
# FeedForwardBlock with a hidden layer twice as wide as the model; input and
# output widths are both embed_dim, so the shape of `x` must be preserved.
ffn = FeedForwardBlock(input_dim=embed_dim, hidden_dim=embed_dim * 2, output_dim=embed_dim)
y = ffn(x)
assert tuple(y.shape) == (1, 2, 6)
y


tensor([[[ 0.1941,  0.3956,  0.2387,  0.4547,  0.7300,  0.1099],
         [ 0.6936, -0.4001,  0.9455,  0.0020,  1.0205,  0.8613]]],
       grad_fn=<AddBackward0>)

### 1.7 Test Encoder

In [11]:
# A single Encoder layer applied to the already-embedded (1, 2, 6) tensor `x`;
# the output shape should match the input shape.
encoder = Encoder(
    input_dim=embed_dim, ffn_hidden_dim=embed_dim * 2, n_heads=3, max_seq_len=max_seq_len
)
y = encoder(x)
assert tuple(y.shape) == (1, 2, 6)
y

tensor([[[ 0.0229,  0.6373,  0.9636,  0.5993,  1.0467,  0.6053],
         [ 0.2535, -0.0047,  1.0777, -0.2200,  1.0148,  1.1037]]],
       grad_fn=<AddBackward0>)

### 1.8 Test Decoder

In [12]:
# A single Decoder layer. `h` is the random tensor created in the
# CrossAttentionBlock cell, so that cell must have run first.
decoder = Decoder(
    input_dim=embed_dim, ffn_hidden_dim=embed_dim * 2, n_heads=3, max_seq_len=max_seq_len
)
y = decoder(x, h)
assert tuple(y.shape) == (1, 2, 6)
y

tensor([[[ 0.4443,  0.4832,  0.6520, -0.1377,  0.9540,  0.2160],
         [ 0.6955, -0.1804,  1.1963, -0.3065,  1.6590,  0.4834]]],
       grad_fn=<AddBackward0>)

### 1.9 Test Encoders

In [13]:
# Two-layer encoder stack. Unlike the single Encoder above, it is given both
# input_dim and embed_dim, so it is fed raw input_dim-wide tensors.
encoders = Encoders(
    n_layers=2,
    input_dim=input_dim,
    embed_dim=embed_dim,
    ffn_hidden_dim=2 * embed_dim,
    n_heads=3,
    # Shared config value instead of a repeated literal 4, consistent with the
    # component cells above.
    max_seq_len=max_seq_len,
    dropout_rate=0.1,
)

In [14]:
# NOTE: rebinds `x` to a raw (1, 2, 2) input; the Decoders and Transformer
# cells below use this new `x`, not the (1, 2, 6) tensor from earlier cells.
x = torch.unsqueeze(torch.rand(2, 2).float(), dim=0)
y = encoders(x)
assert tuple(y.shape) == (1, 2, 6)
y

tensor([[[-1.4552,  0.7041, -0.2659,  0.0997, -0.7425,  1.6597],
         [ 1.0727, -1.0704, -0.2171,  0.3087, -1.3805,  1.2866]]],
       grad_fn=<NativeLayerNormBackward0>)

### 1.10 Test Decoders

In [15]:
# One-layer decoder stack fed the raw input `x` plus an embed_dim-wide tensor `h`.
decoders = Decoders(
    n_layers=1,
    input_dim=input_dim,
    embed_dim=embed_dim,
    ffn_hidden_dim=2 * embed_dim,
    n_heads=3,
    max_seq_len=max_seq_len,  # shared config value instead of a repeated literal 4
    dropout_rate=0.1,
)
# Random stand-in for the second input. It is created after the model so the
# order of RNG draws is unchanged. torch.rand already returns the default
# float dtype, so the former .float() cast was redundant.
h = torch.rand(2, embed_dim).unsqueeze(0)
y = decoders(x, h)
# The output comes back at the raw input width (presumably projected back to
# input_dim — confirm against transformer.py).
assert y.shape == (1, 2, input_dim), f"unexpected shape: {tuple(y.shape)}"
y

tensor([[[ 0.3198,  0.5929],
         [-0.4275,  0.0638]]], grad_fn=<ViewBackward0>)

### 1.11 Test Transformer

In [18]:
# End-to-end check: the full model is called with the raw (1, 2, input_dim)
# tensor `x` created in the Encoders test cell.
transformer = Transformer(
    n_layers=1,
    input_dim=input_dim,
    embed_dim=embed_dim,
    ffn_hidden_dim=2 * embed_dim,
    n_heads=3,
    max_seq_len=max_seq_len,  # shared config value instead of a repeated literal 4
    dropout_rate=0.1,
)
y = transformer(x)
# The output has the same width as the raw input (presumably input_dim —
# confirm against transformer.py).
assert y.shape == (1, 2, input_dim), f"unexpected shape: {tuple(y.shape)}"
y

tensor([[[ 0.3497,  0.3230],
         [-0.1306,  0.2206]]], grad_fn=<ViewBackward0>)