In [None]:
import import_ipynb
from Decoder import Decoder
from EncoderTransformer import Encoder

In [1]:
from tensorflow import cast,math,linalg,float32,ones,maximum,newaxis
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense

In [5]:
class TransformerModel(Model):
    """Encoder-decoder Transformer built from the project's Encoder and Decoder.

    The model builds the padding and look-ahead masks from the raw token
    inputs, runs the encoder, feeds its output to the decoder, and projects
    the decoder output through a final Dense layer with ``dec_vocab_size``
    units.

    Mask convention used throughout this class: a value of 1.0 marks a
    position that must be masked out, 0.0 marks a position that is kept.
    """

    def __init__(self, enc_vocab_size, dec_vocab_size, enc_seq_len, dec_seq_len,
                 h, d_k, d_v, d_model, d_ff_inner, n, rate, **kwargs):
        """Create the encoder, the decoder and the output projection.

        Args:
            enc_vocab_size: Vocabulary size handed to the Encoder.
            dec_vocab_size: Vocabulary size handed to the Decoder; also the
                number of units of the final Dense layer.
            enc_seq_len: Sequence length handed to the Encoder.
            dec_seq_len: Sequence length handed to the Decoder.
            h, d_k, d_v, d_model, d_ff_inner, n, rate: Forwarded unchanged, in
                this order, to both the Encoder and the Decoder constructors.
            **kwargs: Forwarded to ``tensorflow.keras.Model.__init__``.
        """
        super().__init__(**kwargs)

        # Encoder and decoder share every hyperparameter except vocabulary
        # size and sequence length.
        self.encoder = Encoder(enc_vocab_size, enc_seq_len, h, d_k, d_v, d_model, d_ff_inner, n, rate)
        self.decoder = Decoder(dec_vocab_size, dec_seq_len, h, d_k, d_v, d_model, d_ff_inner, n, rate)

        # Final projection onto the decoder vocabulary (no activation).
        self.model_last_layer = Dense(dec_vocab_size)

    def padding_mask(self, input):
        """Return a float32 mask that is 1.0 wherever ``input`` equals 0.

        Two singleton axes are inserted between the first and the last axis so
        the mask can broadcast against the attention weights it is applied to
        later on.

        Note: the parameter name shadows the ``input`` builtin; it is kept so
        that existing keyword callers continue to work.
        """
        mask = cast(math.equal(input, 0), float32)
        return mask[:, newaxis, newaxis, :]

    def lookahead_mask(self, shape):
        """Return a ``(shape, shape)`` mask with 1.0 strictly above the diagonal.

        ``band_part(..., -1, 0)`` keeps the lower triangle (diagonal included);
        subtracting it from 1 therefore marks every future entry with 1.0.
        """
        return 1 - linalg.band_part(ones((shape, shape)), -1, 0)

    def call(self, encoder_input, decoder_input, training=None):
        """Run the full encoder-decoder forward pass.

        Args:
            encoder_input: Encoder token tensor; entries equal to 0 are
                treated as padding.
            decoder_input: Decoder token tensor; entries equal to 0 are
                treated as padding. Its second axis sizes the look-ahead mask.
            training: Forwarded to the encoder and the decoder. Defaults to
                ``None`` so the model can be invoked without it, in which case
                Keras resolves the mode for the sub-layers.

        Returns:
            The output of the final Dense layer applied to the decoder output.
        """
        # Padding mask for the encoder inputs; reused in the decoder to mask
        # the encoder outputs.
        enc_padding_mask = self.padding_mask(encoder_input)

        # Prefer the static sequence length; fall back to the dynamic one when
        # it is unknown at trace time (ones((None, None)) would raise).
        dec_len = decoder_input.shape[1]
        if dec_len is None:
            from tensorflow import shape as dynamic_shape
            dec_len = dynamic_shape(decoder_input)[1]

        # Combine decoder padding and look-ahead masks: a position is masked
        # if either mask marks it (element-wise maximum of 0/1 values).
        dec_in_padding_mask = self.padding_mask(decoder_input)
        dec_in_lookahead_mask = maximum(dec_in_padding_mask, self.lookahead_mask(dec_len))

        # Feed the input into the encoder.
        encoder_output = self.encoder(
            input_seq=encoder_input,
            padding_mask=enc_padding_mask,
            training=training,
        )

        # Feed the encoder output into the decoder.
        decoder_output = self.decoder(
            output_target=decoder_input,
            encoder_output=encoder_output,
            lookahead_mask=dec_in_lookahead_mask,
            padding_mask=enc_padding_mask,
            training=training,
        )

        # Project the decoder output through the final dense layer.
        return self.model_last_layer(decoder_output)

In [4]:
# Vocabulary sizes (encoder, decoder)
enc_vocab_size, dec_vocab_size = 20, 20

# Maximum lengths of the input and target sequences
enc_seq_length, dec_seq_length = 5, 5

# Attention configuration
h = 8      # number of self-attention heads
d_k = 64   # size of the linearly projected queries and keys
d_v = 64   # size of the linearly projected values

# Layer sizes
d_model = 512  # size of the model sub-layers' outputs
d_ff = 2048    # size of the inner fully connected layer
n = 6          # number of stacked layers (the model passes it to both encoder and decoder)

# Fraction of input units dropped by the dropout layers
dropout_rate = 0.1

# Build the model; keywords mirror TransformerModel.__init__'s parameter names
training_model = TransformerModel(
    enc_vocab_size=enc_vocab_size,
    dec_vocab_size=dec_vocab_size,
    enc_seq_len=enc_seq_length,
    dec_seq_len=dec_seq_length,
    h=h,
    d_k=d_k,
    d_v=d_v,
    d_model=d_model,
    d_ff_inner=d_ff,
    n=n,
    rate=dropout_rate,
)
