In [1]:
import tensorflow as tf
import numpy as np

In [2]:
tf.__version__

'2.9.1'

In [3]:
input_embeddings = [[
    "Salut", "comment", "ca", "va", "?",
]]

output_embeddings = [[
    "<START>", "Hi", "how", "are", "you", "?",
]]
print(input_embeddings)
print(output_embeddings)

[['Salut', 'comment', 'ca', 'va', '?']]
[['<START>', 'Hi', 'how', 'are', 'you', '?']]


In [4]:
def get_vocabulary(sequences):

    token_to_info = {}

    for sequence in sequences:
        for word in sequence:
            if word not in token_to_info:
                token_to_info[word] = len(token_to_info)
    return token_to_info

input_voc = get_vocabulary(input_embeddings)
output_voc = get_vocabulary(output_embeddings)

input_voc["<START>"] = len(input_voc)
input_voc["<END>"] = len(input_voc)
input_voc["<PAD>"] = len(input_voc)

output_voc["<END>"] = len(output_voc)
output_voc["<PAD>"] = len(output_voc)

print(input_voc)
print(output_voc)

{'Salut': 0, 'comment': 1, 'ca': 2, 'va': 3, '?': 4, '<START>': 5, '<END>': 6, '<PAD>': 7}
{'<START>': 0, 'Hi': 1, 'how': 2, 'are': 3, 'you': 4, '?': 5, '<END>': 6, '<PAD>': 7}


In [5]:
def sequences_to_int(sequences, voc):
    for sequence in sequences:
        for s, word in enumerate(sequence):
            sequence[s] = voc[word]
    return(np.array(sequences))

input_seq = sequences_to_int(input_embeddings, input_voc)
output_seq = sequences_to_int(output_embeddings, output_voc)

print(input_seq)
print(output_seq)

[[0 1 2 3 4]]
[[0 1 2 3 4 5]]


In [6]:
class EmbeddingLayer(tf.keras.layers.Layer):

    def __init__(self, nb_token, **kwargs):
        self.nb_token = nb_token
        super(**kwargs).__init__()

    def build(self, input_shape):
        self.word_embedding = tf.keras.layers.Embedding(
            self.nb_token, 256,
        )
        super().build(input_shape)

    def call(self, x):
        embed = self.word_embedding(x)
        return embed


class ScaledDotProductAttention(tf.keras.layers.Layer):

    def __init__(self, **kwargs):
        super(**kwargs).__init__()

    def build(self, input_shape):
        self.query_layer = tf.keras.layers.Dense(256)
        self.value_layer = tf.keras.layers.Dense(256)
        self.key_layer = tf.keras.layers.Dense(256)
        super().build(input_shape)

    def call(self, x):
        Q = self.query_layer(x)
        K = self.key_layer(x)
        V = self.value_layer(x)
        QK = tf.matmul(Q, K, transpose_b=True)
        QK = QK / tf.math.sqrt(256.)
        softmax_QK = tf.nn.softmax(QK, axis=-1)
        attention = tf.matmul(softmax_QK, V)
        # print("Shape Q", Q.shape)
        # print("Shape K", K.shape)
        # print("Shape V", V.shape)
        # print("Shape QK", QK.shape)
        # print("Shape softmax", softmax_QK.shape)
        # print("Shape attention", attention.shape)
        return attention

def test():
    layer_input = tf.keras.Input(shape=(5))
    embedding = EmbeddingLayer(nb_token=5)(layer_input)
    attention = ScaledDotProductAttention()(embedding)
    model = tf.keras.Model(layer_input, attention)
    #model.summary()
    return model

m_test = test()
out = m_test(input_seq)
print(out.shape)

(1, 5, 256)


In [8]:
class MultiHeadAttention(tf.keras.layers.Layer):

    def __init__(self, dim=256, nb_head=8, **kwargs):
        self.head_dim = 256 // 8
        self.nb_head = nb_head
        super(**kwargs).__init__()

    def build(self, input_shape):
        self.query_layer = tf.keras.layers.Dense(256)
        self.value_layer = tf.keras.layers.Dense(256)
        self.key_layer = tf.keras.layers.Dense(256)
        self.out_proj = tf.keras.layers.Dense(256)
        super().build(input_shape)

    def call(self, x):
        Q = self.query_layer(x)
        K = self.key_layer(x)
        V = self.value_layer(x)
        batch_size = tf.shape(Q)[0]
        seq_len = tf.shape(Q)[1]

        Q = tf.reshape(Q, [batch_size, seq_len, self.nb_head, self.head_dim])
        K = tf.reshape(K, [batch_size, seq_len, self.nb_head, self.head_dim])
        V = tf.reshape(V, [batch_size, seq_len, self.nb_head, self.head_dim])

        Q = tf.transpose(Q, [0, 2, 1, 3])
        K = tf.transpose(K, [0, 2, 1, 3])
        V = tf.transpose(V, [0, 2, 1, 3])

        Q = tf.reshape(Q, [batch_size * self.nb_head, seq_len, self.head_dim])
        K = tf.reshape(K, [batch_size * self.nb_head, seq_len, self.head_dim])
        V = tf.reshape(V, [batch_size * self.nb_head, seq_len, self.head_dim])

        # Scaled dot product attention
        QK = tf.matmul(Q, K, transpose_b=True)
        QK = QK / tf.math.sqrt(256.)
        softmax_QK = tf.nn.softmax(QK, axis=-1)
        attention = tf.matmul(softmax_QK, V)

        attention = tf.reshape(attention, [batch_size, self.nb_head, seq_len, self.head_dim])

        attention = tf.transpose(attention, [0, 2, 1, 3])

        # Concat
        attention = tf.reshape(attention, [batch_size, seq_len, self.nb_head * self.head_dim])

        out_attention = self.out_proj(attention)

        return out_attention

def test():
    layer_input = tf.keras.Input(shape=(5))
    embedding = EmbeddingLayer(nb_token=5)(layer_input)
    multi_attention = MultiHeadAttention()(embedding)
    model = tf.keras.Model(layer_input, multi_attention)
    model.summary()
    return model

m_test = test()
out = m_test(input_seq)
print(out.shape)

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 5)]               0         
                                                                 
 embedding_layer_2 (Embeddin  (None, 5, 256)           1280      
 gLayer)                                                         
                                                                 
 multi_head_attention (Multi  (None, 5, 256)           263168    
 HeadAttention)                                                  
                                                                 
Total params: 264,448
Trainable params: 264,448
Non-trainable params: 0
_________________________________________________________________
(1, 5, 256)


In [10]:
class EncoderLayer(tf.keras.layers.Layer):

    def __init__(self, **kwargs):
        super(**kwargs).__init__()

    def build(self, input_shape):
        self.multi_head_attention = MultiHeadAttention()
        self.norm = tf.keras.layers.LayerNormalization()
        self.dense_out = tf.keras.layers.Dense(256)
        super().build(input_shape)

    def call(self, x):
        attention = self.multi_head_attention(x)
        post_attention = self.norm(x + attention)
        x = self.dense_out(post_attention)
        enc_output = self.norm(x + post_attention)
        return enc_output

def test():
    layer_input = tf.keras.Input(shape=(5))
    embedding = EmbeddingLayer(nb_token=5)(layer_input)
    enc_output = EncoderLayer()(embedding)
    model = tf.keras.Model(layer_input, enc_output)
    model.summary()
    return model

m_test = test()
out = m_test(input_seq)
print(out.shape)

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 5)]               0         
                                                                 
 embedding_layer_4 (Embeddin  (None, 5, 256)           1280      
 gLayer)                                                         
                                                                 
 encoder_layer_2 (EncoderLay  (None, 5, 256)           329472    
 er)                                                             
                                                                 
Total params: 330,752
Trainable params: 330,752
Non-trainable params: 0
_________________________________________________________________
(1, 5, 256)
