# Embedding layer

    Map each token index of the input sequence to its embedding vector, then scale the result by sqrt(d_model).

In [3]:
import torch 
import torch.nn as nn
import math 
import copy 

class Embedding(nn.Module):
    """Token-embedding lookup whose output is multiplied by sqrt(dmodel).

    Args:
        vocab_size: Number of rows in the embedding table.
        dmodel: Size of each embedding vector.
    """

    def __init__(self, vocab_size, dmodel=512) -> None:
        super().__init__()
        self.vocab_size, self.dmodel = vocab_size, dmodel
        self.embed_layer = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=dmodel,
        )

    def forward(self, x):
        """Return the embeddings of the indices in ``x`` scaled by sqrt(dmodel)."""
        scale = math.sqrt(self.dmodel)
        looked_up = self.embed_layer(x)
        return looked_up * scale

# Positional Encoding

    
Étant donné que notre modèle ne contient ni récurrence ni convolution, afin que le modèle puisse utiliser l'ordre de la séquence, nous devons injecter des informations sur la position relative ou absolue des tokens dans la séquence. Pour ce faire, nous ajoutons des encodages positionnels aux embeddings d'entrée au bas des piles de l'encodeur et du décodeur. Les encodages positionnels ont la même dimension $d_{\text{model}}$ que les embeddings, afin que les deux puissent être additionnés.

In [11]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    A table of shape ``(max_seq_len, d_model)`` is precomputed once and stored
    as the buffer ``pe``: even columns hold ``sin(pos * freq)`` and odd columns
    hold ``cos(pos * freq)``, with ``freq = 10000 ** (-2i / d_model)``.

    Args:
        max_seq_len: Number of positions to precompute.
        d_model: Size of each encoding vector.
    """

    def __init__(self, max_seq_len, d_model = 512) -> None:
        super().__init__()
        self.d_model = d_model
        pos = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        frequency = torch.pow(10000, -torch.arange(0, d_model, 2, dtype=torch.float) / d_model)
        angles = pos * frequency
        pe = torch.zeros((max_seq_len, d_model))
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer odd column than even columns,
        # so drop the surplus frequency before filling the cosine half.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, embed_vect):
        """Return ``embed_vect`` plus the encodings of its first positions.

        Args:
            embed_vect: Tensor whose last two dimensions are
                ``(seq_len, d_model)`` with ``seq_len <= max_seq_len``.

        Raises:
            ValueError: If the sequence is longer than the precomputed table.
        """
        seq_len = embed_vect.size(-2)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_seq_len {self.pe.size(0)}"
            )
        # Only the first seq_len rows apply; adding the whole table would fail
        # (or broadcast incorrectly) for sequences shorter than max_seq_len.
        return embed_vect + self.pe[:seq_len]

# Attention Layer

- First, we create three vectors (Query, Key, Value) from the encoder input vector by linear projection. After splitting into heads, each per-head vector has a smaller dimension than the embedding vector.

- We compute the dot products of the query with all keys and divide each by $\sqrt{d_k}$, where $d_k$ is the dimension of the key vector. In our case $d_k = d_v = d_{\text{model}}/h = 64$ (with $d_{\text{model}} = 512$ and $h = 8$).

- We apply the softmax function to the scaled scores and multiply the result by the value matrix.

- Then we reshape the output back to $d_{\text{model}}$ dimensions and pass it through a linear layer.

In [None]:
class MultiHeadAttention(nn.Module):
    def __init__(self, dmodel = 512, n_head = 8, dropout_rate = 0.2) -> None: 
        super().__init__()
        self.dmodel = dmodel
        self.n_head = n_head
        self.dropout = nn.Dropout(p = dropout_rate)
        self.head_dim = int(dmodel / n_head)
        self.softmax = nn.Softmax(dim = -1)
        self.w_key = nn.Linear(dmodel, dmodel)
        self.w_query = nn.Linear(dmodel, dmodel)
        self.w_value = nn.Linear(dmodel, dmodel)
        self.w_output_project = nn.Linear(dmodel, dmodel)

    def attention(self, key, query, value, mask = None):
        attention_score = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_dim)

TypeError: matmul(): argument 'input' (position 1) must be Tensor, not int