In [1]:
import import_ipynb
from EncoderTransformer import AddNormalization,FeedForward,PositionEmbeddingFixedWeights,MultiHeadAttention
from tensorflow.keras.layers import Dropout,Layer

importing Jupyter notebook from EncoderTransformer.ipynb
tf.Tensor(
[[[ 0.5888975  -0.03546252  1.2934693  ...  0.29983702 -0.12287652
    1.1293975 ]
  [-0.33865193  0.5251558   0.98935276 ...  0.5499459  -0.5448272
    0.9404903 ]
  [ 0.96026725 -0.0517008   0.673421   ...  0.51295304 -0.5779533
    1.6848341 ]
  [ 0.10532728  0.25390226  1.3362669  ...  0.4487259  -0.67684835
    1.2439642 ]
  [ 0.28498533  0.07896406  1.1497618  ... -0.5809479  -1.048168
    0.06464329]]

 [[ 0.49799728  0.24435085  1.6806055  ...  1.3838269   0.3301044
    0.8125566 ]
  [ 0.6332804   0.00798275  1.9614557  ...  1.4987017  -0.27502206
    1.915403  ]
  [ 1.5618489   1.0401852   1.0593464  ...  1.318754    1.032027
    1.4514806 ]
  [-0.1379486  -0.147596    2.0032861  ...  1.5441874  -0.32821053
    1.0109472 ]
  [ 0.53216666  0.6764732   1.3017755  ...  0.7820213  -0.66310406
   -0.00645708]]

 [[ 0.00663272  0.08982807  1.0020491  ...  1.9915534  -0.82418656
    0.46473923]
  [-0.12742728  0.8488

In [2]:
class DecoderLayer(Layer):
    """One decoder block made of three sub-layers.

    The sub-layers run in this order:
      1. multi-head self-attention over the decoder input (uses ``lookahead_mask``),
      2. multi-head attention whose queries come from step 1 and whose keys/values
         are the encoder output (uses ``padding_mask``),
      3. a feed-forward network.

    Every sub-layer output goes through its own ``Dropout`` and is then combined
    with the sub-layer input by an ``AddNormalization`` layer.

    Args:
        h: number of attention heads, forwarded to ``MultiHeadAttention``.
        d_k: dimensionality of the projected queries/keys, forwarded to ``MultiHeadAttention``.
        d_v: dimensionality of the projected values, forwarded to ``MultiHeadAttention``.
        d_model: dimensionality of the sub-layer outputs.
        d_ff: dimensionality of the inner feed-forward layer.
        rate: dropout rate shared by the three ``Dropout`` layers.
        **kwargs: forwarded to the Keras ``Layer`` constructor.
    """

    def __init__(self,h,d_k,d_v,d_model,d_ff,rate,**kwargs):
        super(DecoderLayer,self).__init__(**kwargs)
        # sub-layer 1: self-attention
        self.multihead_attention1=MultiHeadAttention(h,d_k,d_v,d_model)
        self.dropout1=Dropout(rate)
        self.add_norm1=AddNormalization()
        # sub-layer 2: attention over the encoder output
        self.multihead_attention2=MultiHeadAttention(h,d_k,d_v,d_model)
        self.dropout2=Dropout(rate)
        self.add_norm2=AddNormalization()
        # sub-layer 3: feed-forward network
        self.feed_forward=FeedForward(d_ff,d_model)
        self.dropout3=Dropout(rate)
        self.add_norm3=AddNormalization()

    def call(self,x,encoder_output,lookahead_mask,padding_mask,training):
        """Run the three sub-layers on ``x``.

        Args:
            x: decoder-side input; expected shape (batch_size, seq_len, d_model).
            encoder_output: tensor used as keys and values of the second attention.
            lookahead_mask: mask passed to the self-attention (may be ``None``).
            padding_mask: mask passed to the encoder-decoder attention (may be ``None``).
            training: forwarded to the dropout layers to toggle dropout.

        Returns:
            Output of the final ``AddNormalization``; expected shape
            (batch_size, seq_len, d_model).
        """
        # --- sub-layer 1: self-attention -> dropout -> add & norm
        multihead_output1=self.multihead_attention1(queries=x,keys=x,values=x,mask=lookahead_mask)
        # expected shape: (batch_size, seq_len, d_model)
        multihead_output1=self.dropout1(multihead_output1,training=training)
        addnorm_output1=self.add_norm1(x,multihead_output1)
        # expected shape: (batch_size, seq_len, d_model)

        # --- sub-layer 2: attention over encoder output -> dropout -> add & norm
        multihead_output2=self.multihead_attention2(queries=addnorm_output1,keys=encoder_output,values=encoder_output,mask=padding_mask)
        multihead_output2=self.dropout2(multihead_output2,training=training)
        addnorm_output2=self.add_norm2(addnorm_output1,multihead_output2)

        # --- sub-layer 3: feed-forward -> dropout -> add & norm
        feedforward_output=self.feed_forward(addnorm_output2)
        # expected shape: (batch_size, seq_len, d_model)

        # dropout3 was constructed in __init__ but previously never applied, so the
        # feed-forward sub-layer was the only one left unregularised.
        feedforward_output=self.dropout3(feedforward_output,training=training)

        return self.add_norm3(addnorm_output2,feedforward_output)
    

# Implementing the decoder: a stack of n DecoderLayer blocks

class Decoder(Layer):
    """Decoder stack: positional embedding, dropout, then ``n`` ``DecoderLayer`` blocks.

    Args:
        vocab_size: vocabulary size handed to ``PositionEmbeddingFixedWeights``.
        sequence_length: sequence length handed to ``PositionEmbeddingFixedWeights``.
        h, d_k, d_v, d_model, d_ff: forwarded unchanged to every ``DecoderLayer``.
        n: number of ``DecoderLayer`` blocks in the stack.
        rate: dropout rate for the embedding dropout and for every block.
        **kwargs: forwarded to the Keras ``Layer`` constructor.
    """

    def __init__(self,vocab_size,sequence_length,h,d_k,d_v,d_model,d_ff,n,rate,**kwargs):
        super().__init__(**kwargs)
        self.pos_encoding = PositionEmbeddingFixedWeights(sequence_length, vocab_size, d_model)
        self.dropout = Dropout(rate)

        # Build the complete stack first, then attach it to the layer in one assignment.
        blocks = []
        for _ in range(n):
            blocks.append(DecoderLayer(h, d_k, d_v, d_model, d_ff, rate))
        self.decoder_layer = blocks

    def call(self,output_target,encoder_output,lookahead_mask,padding_mask,training):
        """Embed ``output_target`` and pass it through every decoder block in order.

        Both masks, ``encoder_output`` and ``training`` are handed unchanged to
        each block; only the running activations change from block to block.
        """
        # Positional encoding of the target sequence;
        # expected shape: (number of sentences, seq_len, d_model)
        embedded = self.pos_encoding(output_target)

        # Dropout on the embeddings before the first block
        hidden = self.dropout(embedded, training=training)

        # Feed the activations through each decoder layer in turn
        for block in self.decoder_layer:
            hidden = block(
                x=hidden,
                encoder_output=encoder_output,
                lookahead_mask=lookahead_mask,
                padding_mask=padding_mask,
                training=training,
            )

        return hidden

### Testing the decoder on random inputs

In [3]:
from numpy import random

# Hyperparameters for the decoder smoke test.
# The original notebook describes these as the "same values as in the paper".

# --- attention dimensions
h = 8                 # number of self-attention heads
d_k = d_v = 64        # size of the projected queries/keys (d_k) and values (d_v)

# --- model dimensions
d_model = 512         # size of every sub-layer output
d_ff = 2048           # size of the inner fully connected layer
n = 6                 # number of layers in the decoder stack

# --- regularisation and batching
dropout_rate = 0.1    # fraction of input units dropped by the dropout layers
batch_size = 64       # batch size from the training process

# --- dummy data dimensions
dec_vocab_size = 20   # vocabulary size for the decoder
input_seq_length = 5  # maximum length of the input sequence

In [4]:
# Dummy decoder input. The embedding layer is constructed with a vocabulary size,
# so it presumably looks tokens up by integer id (confirm in EncoderTransformer).
# The previous random.random(...) produced floats in [0, 1), which would all
# collapse to token id 0 in such a lookup, so integer ids in [0, dec_vocab_size)
# are drawn instead.
input_seq = random.randint(0, dec_vocab_size, size=(batch_size, input_seq_length))

# Random stand-in for the encoder output: (batch_size, input_seq_length, d_model)
enc_output = random.random((batch_size, input_seq_length, d_model))

decoder = Decoder(dec_vocab_size, input_seq_length, h, d_k, d_v, d_model, d_ff, n, dropout_rate)
decoder_output = decoder(
    output_target=input_seq,
    encoder_output=enc_output,
    lookahead_mask=None,
    padding_mask=None,
    training=True,
)

# The decoder is expected to keep the (batch_size, seq_len, d_model) shape.
assert tuple(decoder_output.shape) == (batch_size, input_seq_length, d_model)
print(decoder_output)

tf.Tensor(
[[[ 0.81869984  0.07329611 -0.7121857  ...  0.06149567 -1.5820209
   -0.9657264 ]
  [ 0.42286608 -0.49725017 -0.6564052  ... -1.3056955  -1.1364115
   -1.321313  ]
  [ 0.6034274   0.9116654  -1.2240113  ... -0.69463056 -1.0881586
   -1.1272588 ]
  [ 0.70999557 -0.0203667  -0.37315777 ... -1.2273759  -0.68237084
   -0.9260234 ]
  [ 0.06351195  0.1670516  -0.34705114 ... -0.4679266  -1.1768199
   -1.0989532 ]]

 [[ 0.6111519   0.5872731  -0.9024712  ... -0.6064365  -1.3971554
   -0.42652026]
  [ 0.3366668   0.06065558 -0.47282937 ... -0.94821805 -0.4994914
   -0.97895104]
  [ 0.56306094  0.61369336 -0.8127428  ... -0.6496656  -1.0926722
   -0.6118529 ]
  [ 0.818286    0.58568794 -0.15268405 ... -1.1520269  -1.4175295
   -0.61413664]
  [ 0.4942264   1.0535638  -0.09177745 ... -0.5824398  -1.1441345
   -1.1041598 ]]

 [[ 0.28710374 -0.3347942  -0.6578675  ... -0.9090593  -1.2015926
   -0.38640565]
  [ 0.38105392  0.47729585  0.44782373 ... -1.1570584  -1.3183422
   -0.82421714]
