## Join Transformer Encoder and Decoder

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from numpy import array
from tensorflow import cast, float32, linalg, math, maximum, newaxis, ones
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Input

### Padding mask

In [3]:
def padding_mask(input):
    """Flag the zero entries of `input` (the values treated as padding).

    Args:
        input: Array or tensor of values; entries equal to 0 are flagged.

    Returns:
        A float32 tensor with the same shape as `input`, holding 1.0 where
        the entry equals 0 and 0.0 everywhere else, e.g.
        [1, 2, 3, 4, 0, 0, 0] -> [0., 0., 0., 0., 1., 1., 1.].
    """
    # Boolean comparison first, then convert True/False to 1.0/0.0
    is_padding = math.equal(input, 0)
    return cast(is_padding, float32)

In [4]:
# Example sequence whose trailing zeros should be flagged by the mask.
# Named `sample_tokens` rather than `input` so the builtin input() is not
# shadowed for the rest of the kernel session.
sample_tokens = array([1, 2, 3, 4, 0, 0, 0])
print(padding_mask(sample_tokens))

tf.Tensor([0. 0. 0. 0. 1. 1. 1.], shape=(7,), dtype=float32)


In [5]:
def lookahead_mask(shape):
    """Build a square mask that flags every position after the current one.

    Args:
        shape: Side length of the square mask (an integer).

    Returns:
        A (shape, shape) tensor with 1.0 strictly above the main diagonal
        (the future entries to mask out) and 0.0 on and below it.
    """
    # Lower triangle (diagonal included) of an all-ones matrix
    visible = linalg.band_part(ones((shape, shape)), -1, 0)
    # Invert it so the 1.0 entries mark what must be hidden
    return 1 - visible

In [6]:
%run 07_Transformer_Encoder.ipynb #Magic command to import from the ipynb
%run 08_Transformer_Decoder.ipynb

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 5, 512)]     0           []                               
                                                                                                  
 multi_head_attention_6 (MultiH  (None, 5, 512)      131776      ['input_1[0][0]',                
 eadAttention)                                                    'input_1[0][0]',                
                                                                  'input_1[0][0]']                
                                                                                                  
 dropout_13 (Dropout)           (None, 5, 512)       0           ['multi_head_attention_6[0][0]'] 
                                                                                              

In [7]:

class TransformerModel(Model):
    """Joins an encoder and a decoder and projects the result onto the target vocabulary.

    The model builds its own padding and look-ahead masks from the raw inputs,
    runs the encoder, feeds the encoder output to the decoder, and applies a
    final dense layer with `dec_vocab_size` units (no activation) to the
    decoder output.

    Args:
        enc_vocab_size: Vocabulary size for the encoder.
        dec_vocab_size: Vocabulary size for the decoder; also the number of
            units in the final dense layer.
        enc_seq_length: Maximum length of the input sequence.
        dec_seq_length: Maximum length of the target sequence.
        h: Number of self-attention heads.
        d_k: Dimensionality of the linearly projected queries and keys.
        d_v: Dimensionality of the linearly projected values.
        d_model: Dimensionality of the model sub-layers' outputs.
        d_ff_inner: Dimensionality of the inner fully connected layer.
        n: Number of layers; the same value is passed to both the encoder
            and the decoder.
        rate: Dropout rate passed to both the encoder and the decoder.
        **kwargs: Forwarded unchanged to the Keras `Model` constructor.
    """

    def __init__(self, enc_vocab_size, dec_vocab_size, enc_seq_length, dec_seq_length,
                 h, d_k, d_v, d_model, d_ff_inner, n, rate, **kwargs):
        super().__init__(**kwargs)

        # Set up the encoder
        self.encoder = Encoder(enc_vocab_size, enc_seq_length, h, d_k, d_v, d_model,
                               d_ff_inner, n, rate)
        # Set up the decoder
        self.decoder = Decoder(dec_vocab_size, dec_seq_length, h, d_k, d_v, d_model,
                               d_ff_inner, n, rate)
        # Final dense layer: one output unit per entry of the decoder vocabulary
        self.model_last_layer = Dense(dec_vocab_size)

    def padding_mask(self, input):
        """Flag zero entries of a 2-D input and add two broadcast axes.

        Args:
            input: 2-D tensor/array; entries equal to 0 are treated as padding.

        Returns:
            A float32 tensor of shape (input.shape[0], 1, 1, input.shape[1])
            holding 1.0 where the entry equals 0 and 0.0 elsewhere.
        """
        mask = math.equal(input, 0)
        mask = cast(mask, float32)
        # The two singleton axes let the mask broadcast against higher-rank
        # tensors (presumably the per-head attention scores - confirm in the
        # attention implementation).
        return mask[:, newaxis, newaxis, :]

    def lookahead_mask(self, shape):
        """Return a (shape, shape) mask with 1.0 strictly above the diagonal.

        Args:
            shape: Side length of the square mask (an integer).

        Returns:
            Tensor in which future entries are marked with 1.0 and the current
            and earlier entries with 0.0.
        """
        # band_part(..., -1, 0) keeps the lower triangle incl. the diagonal;
        # subtracting from 1 leaves ones only on the future positions.
        mask = 1 - linalg.band_part(ones((shape, shape)), -1, 0)
        return mask

    def call(self, encoder_input, decoder_input, training=None):
        """Run the encoder, then the decoder, then the final dense layer.

        Args:
            encoder_input: 2-D input to the encoder; zeros are masked as padding.
            decoder_input: 2-D input to the decoder; zeros are masked as padding.
                Its second dimension is read from the static shape, so it
                must be known (not None) when this method runs.
            training: Flag forwarded unchanged to the encoder and decoder.
                Defaults to None so the model can also be called without an
                explicit training flag.

        Returns:
            The final dense layer applied to the decoder output.
        """
        # Padding mask for the encoder input; it is also handed to the decoder
        # together with the encoder output.
        enc_padding_mask = self.padding_mask(encoder_input)

        # Create and combine padding and look-ahead masks to be fed into the decoder
        dec_in_padding_mask = self.padding_mask(decoder_input)
        dec_in_lookahead_mask = self.lookahead_mask(decoder_input.shape[1])
        # Element-wise maximum: a position is masked if either mask flags it
        dec_in_combined_mask = maximum(dec_in_padding_mask, dec_in_lookahead_mask)

        # Feed the input into the encoder
        encoder_output = self.encoder(encoder_input, enc_padding_mask, training)

        # Feed the encoder output into the decoder
        decoder_output = self.decoder(decoder_input, encoder_output,
                                      dec_in_combined_mask, enc_padding_mask, training)

        # Pass the decoder output through a final dense layer
        model_output = self.model_last_layer(decoder_output)

        return model_output

        

In [8]:
# --- Attention settings ---
# Number of self-attention heads
h = 8
# Dimensionality of the linearly projected queries and keys
d_k = 64
# Dimensionality of the linearly projected values
d_v = 64

# --- Network size ---
# Dimensionality of the model sub-layers' outputs
d_model = 512
# Dimensionality of the inner fully connected layer
d_ff = 2048
# Number of layers in a stack
n = 6
# Frequency of dropping the input units in the dropout layers
dropout_rate = 0.1

# --- Data settings ---
# Vocabulary sizes for the encoder and the decoder
enc_vocab_size = 20
dec_vocab_size = 20
# Maximum lengths of the input and the target sequences
enc_seq_length = 5
dec_seq_length = 5

In [9]:
# from model import TransformerModel

# Instantiate the joined encoder-decoder model from the settings defined above.
# Keyword arguments make the mapping from setting to constructor parameter explicit.
training_model = TransformerModel(
    enc_vocab_size=enc_vocab_size,
    dec_vocab_size=dec_vocab_size,
    enc_seq_length=enc_seq_length,
    dec_seq_length=dec_seq_length,
    h=h,
    d_k=d_k,
    d_v=d_v,
    d_model=d_model,
    d_ff_inner=d_ff,
    n=n,
    rate=dropout_rate,
)

In [10]:
# Arguments shared by the single encoder layer and the single decoder layer
layer_hparams = (h, d_k, d_v, d_model, d_ff, dropout_rate)

# Build one encoder layer and print the summary of its graph
encoder = EncoderLayer(enc_seq_length, *layer_hparams)
encoder.build_graph().summary()

# Build one decoder layer and print the summary of its graph.
# Kept after the encoder summary so Keras' automatic layer numbering is unchanged.
decoder = DecoderLayer(dec_seq_length, *layer_hparams)
decoder.build_graph().summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 5, 512)]     0           []                               
                                                                                                  
 multi_head_attention_44 (Multi  (None, 5, 512)      131776      ['input_3[0][0]',                
 HeadAttention)                                                   'input_3[0][0]',                
                                                                  'input_3[0][0]']                
                                                                                                  
 dropout_81 (Dropout)           (None, 5, 512)       0           ['multi_head_attention_44[0][0]']
                                                                                            