In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import nn
from tensorflow.keras.activations import softmax
from keras import layers
from tensorflow.keras.layers import Dense,LayerNormalization ## alternative for nn.linear
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

2023-12-04 19:25:06.106679: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-04 19:25:06.142171: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-04 19:25:06.142200: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-04 19:25:06.143052: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-04 19:25:06.148904: I tensorflow/core/platform/cpu_feature_guar

In [2]:
X_data = np.load("../data/data_mfcc.npy")
X_data = np.transpose(X_data, (0, 2, 1))
X_data=X_data[:100]
print(X_data.shape)
data = pd.read_csv(
    "../data/LJSpeech-1.1/metadata.csv",
    sep="|",
    header=None,
    names=["ID", "Text1", "Text2"],
)
texts = data["Text1"].to_list()
ID = data["ID"].to_list()
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
num_classes = len(tokenizer.word_index) + 1  # Add 1 for the padding token
sequences = tokenizer.texts_to_sequences(texts)
Y_data = pad_sequences(sequences, padding="post", maxlen=30)
Y_data=Y_data[:100]
print(num_classes)
print(Y_data.shape)
# print(X_data[0])

(100, 500, 20)
14518
(100, 30)


In [3]:
# Assuming X_train is your original array with shape (100, 500, 20)
original_shape = X_data.shape
target_shape = (100, 500, 30)

# Calculate the padding needed for each dimension
pad_width = [(0, 0)] * len(original_shape)  # Initialize with zero padding for existing dimensions

for i in range(len(original_shape)):
    pad_width[i] = (0, target_shape[i] - original_shape[i])

# Pad the array
X_data = np.pad(X_data, pad_width=pad_width, mode='constant', constant_values=0)

# Now, X_train_padded will have the shape (100, 500, 30)
print(X_data.shape)
# print(X_data[0])


(100, 500, 30)


In [4]:
def create_self_attention_mask(sequence_length):
    # Create a lower triangular matrix with ones
    mask = 1 - tf.linalg.band_part(tf.ones((sequence_length, sequence_length)), -1, 0)
    # Add a large negative value to the upper triangle
    mask = mask * -1e9
    return mask

In [5]:
def scaled_dot_product(q, k, v, mask):
    d_k = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_qk = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(d_k)

    if mask is not None:
        scaled_qk += (mask * -1e9)

    attention_weights = tf.nn.softmax(scaled_qk, axis=-1)
    output = tf.matmul(attention_weights, v)
    return output, attention_weights

In [6]:
class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, d_model, max_sequence_length):
        super(PositionalEncoding, self).__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model

    
    def call(self, inputs):
        even_i = tf.range(0, self.d_model, 2, dtype=tf.float32)
        denominator = tf.pow(10000.0, even_i / self.d_model)
        position = tf.reshape(
            tf.range(self.max_sequence_length, dtype=tf.float32),
            (1, self.max_sequence_length, 1),
        )
        even_PE = tf.sin(position / denominator)
        odd_PE = tf.cos(position / denominator)
        stacked = tf.stack([even_PE, odd_PE], axis=2)
        PE = tf.reshape(stacked, (1, self.max_sequence_length, -1))
        return PE

In [7]:
class ConvolutionalLayer():
    def __init__(self, input_shape, filters=32, kernel_size=3, **kwargs):
        super(ConvolutionalLayer, self).__init__(**kwargs)
        self.filters = filters
        self.kernel_size = kernel_size

        # Extract the number of filters from the input shape
        if isinstance(input_shape, tuple):
            self.filters = input_shape[-1]

        self.conv1 = layers.Conv1D(filters=self.filters, kernel_size=self.kernel_size, padding="same")
        self.batch_norm1 = layers.BatchNormalization()
        self.relu1 = layers.ReLU()

        self.conv2 = layers.Conv1D(filters=self.filters, kernel_size=self.kernel_size, padding="same")
        self.batch_norm2 = layers.BatchNormalization()
        self.relu2 = layers.ReLU()

    
    def call(self, inputs, training=None, mask=None):
        conv1_out = self.relu1(self.batch_norm1(self.conv1(inputs), training=training))
        conv2_out = self.relu2(self.batch_norm2(self.conv2(conv1_out), training=training))
        print("CNN output shape is ",gap_out.shape)
        return gap_out

In [8]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.qkv_layer = tf.keras.layers.Dense(3 * d_model, use_bias=False)
        self.linear_layer = tf.keras.layers.Dense(d_model, activation='relu')

    def split_heads(self, x, batch_size):
        if len(x.shape) == 2:
            # Expand dimensions to simulate batch_size=1 and sequence_length=30
            x = tf.expand_dims(tf.expand_dims(x, axis=0), axis=1)
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.head_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, x, mask):
        print("MultiHeadAttention input shape",x.shape)
        batch_size, _, _ = x.shape

        qkv = self.qkv_layer(x)
        q, k, v = tf.split(qkv, 3, axis=-1)
        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        values, attention = scaled_dot_product(q, k, v, mask)

        values = tf.transpose(values, perm=[0, 2, 1, 3])
        values = tf.reshape(values, (batch_size, -1, self.num_heads * self.head_dim))
        out = self.linear_layer(values)
        print("MultiHeadAttention output shape is ",out.shape)
        return out


In [9]:
class PositionwiseFeedForward(tf.keras.layers.Layer):
    def __init__(self, d_model, hidden, drop_prob=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = tf.keras.layers.Dense(hidden)
        self.linear2 = tf.keras.layers.Dense(d_model)
        self.relu = tf.keras.layers.ReLU()
        self.dropout = tf.keras.layers.Dropout(rate=drop_prob)

    
    def call(self, x):
        print("Input shape for positonal encoding",x.shape)
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        print("output shape from postional encoding",x.shape)
        return x

In [10]:
class MultiHeadCrossAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadCrossAttention, self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.kv_layer = tf.keras.layers.Dense(2 * d_model)
        self.q_layer = tf.keras.layers.Dense(d_model)
        self.linear_layer = tf.keras.layers.Dense(d_model)

    
    def call(self, x, y, mask):
        batch_size, sequence_length, d_model = tf.shape(x)
        kv = self.kv_layer(x)
        q = self.q_layer(y)
        kv = tf.reshape(kv, (batch_size, sequence_length, self.num_heads, 2 * self.head_dim))
        q = tf.reshape(q, (batch_size, sequence_length, self.num_heads, self.head_dim))
        kv = tf.transpose(kv, perm=[0, 2, 1, 3])
        q = tf.transpose(q, perm=[0, 2, 1, 3])
        k, v = tf.split(kv, 2, axis=-1)
        
        values, attention = scaled_dot_product(q, k, v, mask)
        values = tf.transpose(values, perm=[0, 2, 1, 3])
        values = tf.reshape(values, (batch_size, sequence_length, d_model))
        out = self.linear_layer(values)
        return out


In [11]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm1 = LayerNormalization(epsilon=1e-5)
        self.dropout1 = tf.keras.layers.Dropout(rate=drop_prob)
        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = LayerNormalization(epsilon=1e-5)
        self.dropout2 = tf.keras.layers.Dropout(rate=drop_prob)

    
    def call(self, x, self_attention_mask, training=None):
        residual_x = x
        x = self.attention(x, mask=self_attention_mask)
        x = self.dropout1(x, training=training)
        x = self.norm1(x + residual_x)

        residual_x = x
        x = self.ffn(x)
        x = self.dropout2(x, training=training)
        x = self.norm2(x + residual_x)

        return x

In [12]:
class SequentialEncoder(tf.keras.layers.Layer):
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob, num_layers, max_sequence_length):
        super(SequentialEncoder, self).__init__()
        self.layers = [EncoderLayer(d_model, ffn_hidden, num_heads, drop_prob) for _ in range(num_layers)]

    
    def call(self, x, training=True, mask=None):
        for layer in self.layers:
            x = layer(x, training, mask)
        return x


In [13]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, 
                 d_model, 
                 ffn_hidden, 
                 num_heads, 
                 drop_prob, 
                 num_layers,
                 max_sequence_length):
        super(Encoder, self).__init__()
        self.convolutional_layer = ConvolutionalLayer(input_shape=(max_sequence_length, 30))  # Assuming input shape (max_sequence_length, 20)
        self.positional_encoding = PositionalEncoding(d_model=d_model, max_sequence_length=max_sequence_length)
        self.layers = SequentialEncoder(d_model, ffn_hidden, num_heads, drop_prob, num_layers, max_sequence_length)
    
    
    def call(self, x, self_attention_mask):
        # Assuming x is the output from the convolutional layer
        print("Encoder input shape is",x.shape)
        x = self.convolutional_layer.call(x)
        x = self.positional_encoding(x)
        x = self.layers(x, self_attention_mask)
        return x


In [14]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super(DecoderLayer, self).__init__()
        self.self_attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.layer_norm1 = LayerNormalization(epsilon=1e-5)
        self.dropout1 = tf.keras.layers.Dropout(rate=drop_prob)

        self.encoder_decoder_attention = MultiHeadCrossAttention(d_model=d_model, num_heads=num_heads)
        self.layer_norm2 = LayerNormalization(epsilon=1e-5)
        self.dropout2 = tf.keras.layers.Dropout(rate=drop_prob)

        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.layer_norm3 = LayerNormalization(epsilon=1e-5)
        self.dropout3 = tf.keras.layers.Dropout(rate=drop_prob)

    
    def call(self, x, y, self_attention_mask, cross_attention_mask, training=None):
        print("Shape of x before encoder_decoder_attention:", x.shape)
        print("Shape of y before encoder_decoder_attention:", y.shape)

        _y = y
        y = self.self_attention(y, mask=self_attention_mask)
        y = self.dropout1(y, training=training)
        y = self.layer_norm1(y + _y)

        _y = y
        print("Shape of x before encoder_decoder_attention:", x.shape)
        y = self.encoder_decoder_attention(x, y, mask=cross_attention_mask)
        y = self.dropout2(y, training=training)
        y = self.layer_norm2(y + _y)

        _y = y
        y = self.ffn(y)
        y = self.dropout3(y, training=training)
        y = self.layer_norm3(y + _y)
        return y


In [15]:
class SequentialDecoder(tf.keras.layers.Layer):
    def __init__(self, layers):
        super(SequentialDecoder, self).__init__()
        self.layers = layers
    
    def call(self, x, y, self_attention_mask, cross_attention_mask, training=None):
        for layer in self.layers:
            y = layer(x, y, self_attention_mask, cross_attention_mask, training=training)
        return y

In [16]:
class Transformer(tf.keras.Model):
    def __init__(self, 
                 d_model, 
                 ffn_hidden, 
                 num_heads, 
                 drop_prob, 
                 num_layers,
                 max_sequence_length):
        super(Transformer, self).__init__()
        
        self.encoder = Encoder(d_model, ffn_hidden, num_heads, drop_prob, num_layers, max_sequence_length)
        self.decoder = SequentialDecoder([DecoderLayer(d_model, ffn_hidden, num_heads, drop_prob) for _ in range(num_layers)])
        self.final_layer = tf.keras.layers.Dense(units=max_sequence_length, activation='softmax')
    
    def call(self, inputs, training=None):
        x = inputs['input_x']
        y = inputs['input_y']
        self_attention_mask_encoder = inputs['self_attention_mask_encoder']
        self_attention_mask_decoder = inputs['self_attention_mask_decoder']
        encoder_decoder_attention_mask = inputs['encoder_decoder_attention_mask']

        print("Input shape:", x.shape)

        encoder_output = self.encoder(x, self_attention_mask_encoder)

        print("Encoder output shape:", encoder_output.shape)

        decoder_output = self.decoder(encoder_output, y, self_attention_mask_decoder, encoder_decoder_attention_mask, training=training)

        print("Decoder output shape:", decoder_output.shape)

        output = self.final_layer(decoder_output)
        return output

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_val, Y_train, Y_val = train_test_split(X_data, Y_data, test_size=0.2, random_state=42)
print(X_train.shape)
print(X_val.shape)
print(Y_train.shape)
print(Y_val.shape)


(80, 500, 30)
(20, 500, 30)
(80, 30)
(20, 30)


In [18]:
d_model = 30
ffn_hidden = 120
num_heads = 1
drop_prob = 0.1
num_layers = 2
max_sequence_length = 500
transformer_model = Transformer(
    d_model=d_model,
    ffn_hidden=ffn_hidden,
    num_heads=num_heads,
    drop_prob=drop_prob,
    num_layers=num_layers,
    max_sequence_length=max_sequence_length,
)

2023-12-04 19:25:09.482781: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:274] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error
2023-12-04 19:25:09.482822: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:129] retrieving CUDA diagnostic information for host: Pc
2023-12-04 19:25:09.482827: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:136] hostname: Pc
2023-12-04 19:25:09.482994: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:159] libcuda reported version is: 535.129.3
2023-12-04 19:25:09.483009: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:163] kernel reported version is: 535.129.3
2023-12-04 19:25:09.483012: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:241] kernel version seems to match DSO: 535.129.3


In [19]:
transformer_model.compile(optimizer='adam', loss='mean_squared_error')
# transformer_model.build(input_shape=(100,500,30))  # Replace your_input_shape with the actual input shape
# transformer_model.summary()


In [20]:
# Assuming max_sequence_length is the actual length of your input sequences
max_sequence_length = X_train.shape[1]

# Create self-attention mask for encoder
self_attention_mask_encoder = create_self_attention_mask(max_sequence_length)
self_attention_mask_encoder = tf.expand_dims(self_attention_mask_encoder, axis=0)  # Add batch dimension
self_attention_mask_encoder = tf.tile(self_attention_mask_encoder, [X_train.shape[0], 1, 1])  # Tile to match the number of samples

# Create self-attention mask for decoder
self_attention_mask_decoder = create_self_attention_mask(500)  # Assuming 30 is the length of your target sequence
self_attention_mask_decoder = tf.expand_dims(self_attention_mask_decoder, axis=0)  # Add batch dimension
self_attention_mask_decoder = tf.tile(self_attention_mask_decoder, [X_train.shape[0],1, 1])  # Tile to match the number of samples

# Create encoder-decoder attention mask
encoder_decoder_attention_mask = create_self_attention_mask(500)  # Assuming 30 is the length of your target sequence
encoder_decoder_attention_mask = tf.expand_dims(encoder_decoder_attention_mask, axis=0)  # Add batch dimension
encoder_decoder_attention_mask = tf.tile(encoder_decoder_attention_mask, [X_train.shape[0],1, 1])  # Tile to match the number of samples
print(self_attention_mask_encoder.shape)
print(self_attention_mask_decoder.shape)
print(encoder_decoder_attention_mask.shape)

(80, 500, 500)
(80, 500, 500)
(80, 500, 500)


In [21]:

# Assuming you have modified your Transformer model to accept these masks separately
transformer_model.fit(
    {
        'input_x': X_train,
        'input_y': Y_train,
        'self_attention_mask_encoder': self_attention_mask_encoder,
        'self_attention_mask_decoder': self_attention_mask_decoder,
        'encoder_decoder_attention_mask': encoder_decoder_attention_mask
    },
    Y_train,
    epochs=1,
    batch_size=30
)


Input shape: (None, 500, 30)
Encoder input shape is (None, 500, 30)
CNN output shape is  (None, 30)
MultiHeadAttention input shape (1, 500, 30)
MultiHeadAttention output shape is  (1, None, 30)
Input shape for positonal encoding (1, 500, 30)
output shape from postional encoding (1, 500, 30)
MultiHeadAttention input shape (1, 500, 30)
MultiHeadAttention output shape is  (1, None, 30)
Input shape for positonal encoding (1, 500, 30)
output shape from postional encoding (1, 500, 30)
Encoder output shape: (1, 500, 30)
Shape of x before encoder_decoder_attention: (1, 500, 30)
Shape of y before encoder_decoder_attention: (None, 30)
MultiHeadAttention input shape (None, 30)


ValueError: in user code:

    File "/home/abhi/anaconda3/envs/test/lib/python3.9/site-packages/keras/src/engine/training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "/home/abhi/anaconda3/envs/test/lib/python3.9/site-packages/keras/src/engine/training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/abhi/anaconda3/envs/test/lib/python3.9/site-packages/keras/src/engine/training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "/home/abhi/anaconda3/envs/test/lib/python3.9/site-packages/keras/src/engine/training.py", line 1150, in train_step
        y_pred = self(x, training=True)
    File "/home/abhi/anaconda3/envs/test/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/tmp/__autograph_generated_filed0t3bt1m.py", line 18, in tf__call
        decoder_output = ag__.converted_call(ag__.ld(self).decoder, (ag__.ld(encoder_output), ag__.ld(y), ag__.ld(self_attention_mask_decoder), ag__.ld(encoder_decoder_attention_mask)), dict(training=ag__.ld(training)), fscope)
    File "/tmp/__autograph_generated_fileabcr8nya.py", line 23, in tf__call
        ag__.for_stmt(ag__.ld(self).layers, None, loop_body, get_state, set_state, ('y',), {'iterate_names': 'layer'})
    File "/tmp/__autograph_generated_fileabcr8nya.py", line 21, in loop_body
        y = ag__.converted_call(ag__.ld(layer), (ag__.ld(x), ag__.ld(y), ag__.ld(self_attention_mask), ag__.ld(cross_attention_mask)), dict(training=ag__.ld(training)), fscope)
    File "/tmp/__autograph_generated_file0o2eepzw.py", line 13, in tf__call
        y = ag__.converted_call(ag__.ld(self).self_attention, (ag__.ld(y),), dict(mask=ag__.ld(self_attention_mask)), fscope)
    File "/tmp/__autograph_generated_filev5lv2sn9.py", line 11, in tf__call
        (batch_size, _, _) = ag__.ld(x).shape

    ValueError: Exception encountered when calling layer 'transformer' (type Transformer).
    
    in user code:
    
        File "/tmp/ipykernel_38863/2657846362.py", line 28, in call  *
            decoder_output = self.decoder(encoder_output, y, self_attention_mask_decoder, encoder_decoder_attention_mask, training=training)
        File "/home/abhi/anaconda3/envs/test/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
        File "/tmp/__autograph_generated_fileabcr8nya.py", line 23, in tf__call
            ag__.for_stmt(ag__.ld(self).layers, None, loop_body, get_state, set_state, ('y',), {'iterate_names': 'layer'})
        File "/tmp/__autograph_generated_fileabcr8nya.py", line 21, in loop_body
            y = ag__.converted_call(ag__.ld(layer), (ag__.ld(x), ag__.ld(y), ag__.ld(self_attention_mask), ag__.ld(cross_attention_mask)), dict(training=ag__.ld(training)), fscope)
        File "/tmp/__autograph_generated_file0o2eepzw.py", line 13, in tf__call
            y = ag__.converted_call(ag__.ld(self).self_attention, (ag__.ld(y),), dict(mask=ag__.ld(self_attention_mask)), fscope)
        File "/tmp/__autograph_generated_filev5lv2sn9.py", line 11, in tf__call
            (batch_size, _, _) = ag__.ld(x).shape
    
        ValueError: Exception encountered when calling layer 'sequential_decoder' (type SequentialDecoder).
        
        in user code:
        
            File "/tmp/ipykernel_38863/576445799.py", line 8, in call  *
                y = layer(x, y, self_attention_mask, cross_attention_mask, training=training)
            File "/home/abhi/anaconda3/envs/test/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler  **
                raise e.with_traceback(filtered_tb) from None
            File "/tmp/__autograph_generated_file0o2eepzw.py", line 13, in tf__call
                y = ag__.converted_call(ag__.ld(self).self_attention, (ag__.ld(y),), dict(mask=ag__.ld(self_attention_mask)), fscope)
            File "/tmp/__autograph_generated_filev5lv2sn9.py", line 11, in tf__call
                (batch_size, _, _) = ag__.ld(x).shape
        
            ValueError: Exception encountered when calling layer 'decoder_layer' (type DecoderLayer).
            
            in user code:
            
                File "/tmp/ipykernel_38863/2717270461.py", line 22, in call  *
                    y = self.self_attention(y, mask=self_attention_mask)
                File "/home/abhi/anaconda3/envs/test/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler  **
                    raise e.with_traceback(filtered_tb) from None
                File "/tmp/__autograph_generated_filev5lv2sn9.py", line 11, in tf__call
                    (batch_size, _, _) = ag__.ld(x).shape
            
                ValueError: Exception encountered when calling layer 'multi_head_attention_2' (type MultiHeadAttention).
                
                in user code:
                
                    File "/tmp/ipykernel_38863/2633592716.py", line 19, in call  *
                        batch_size, _, _ = x.shape
                
                    ValueError: not enough values to unpack (expected 3, got 2)
                
                
                Call arguments received by layer 'multi_head_attention_2' (type MultiHeadAttention):
                  • x=tf.Tensor(shape=(None, 30), dtype=int32)
                  • mask=tf.Tensor(shape=(None, 500, 500), dtype=float32)
            
            
            Call arguments received by layer 'decoder_layer' (type DecoderLayer):
              • x=tf.Tensor(shape=(1, 500, 30), dtype=float32)
              • y=tf.Tensor(shape=(None, 30), dtype=int32)
              • self_attention_mask=tf.Tensor(shape=(None, 500, 500), dtype=float32)
              • cross_attention_mask=tf.Tensor(shape=(None, 500, 500), dtype=float32)
              • training=True
        
        
        Call arguments received by layer 'sequential_decoder' (type SequentialDecoder):
          • x=tf.Tensor(shape=(1, 500, 30), dtype=float32)
          • y=tf.Tensor(shape=(None, 30), dtype=int32)
          • self_attention_mask=tf.Tensor(shape=(None, 500, 500), dtype=float32)
          • cross_attention_mask=tf.Tensor(shape=(None, 500, 500), dtype=float32)
          • training=True
    
    
    Call arguments received by layer 'transformer' (type Transformer):
      • inputs={'input_x': 'tf.Tensor(shape=(None, 500, 30), dtype=float32)', 'input_y': 'tf.Tensor(shape=(None, 30), dtype=int32)', 'self_attention_mask_encoder': 'tf.Tensor(shape=(None, 500, 500), dtype=float32)', 'self_attention_mask_decoder': 'tf.Tensor(shape=(None, 500, 500), dtype=float32)', 'encoder_decoder_attention_mask': 'tf.Tensor(shape=(None, 500, 500), dtype=float32)'}
      • training=True
