In [None]:
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras import layers
import numpy as np
import pandas as pd

In [None]:
# Longest transcript, in tokens, kept per utterance (shorter ones are padded).
MAX_TOKENS = 30

# Precomputed MFCC features with the last two axes swapped.
# NOTE(review): presumably (samples, coefficients, frames) on disk, turned into
# (samples, frames, coefficients) for the Conv1D front end -- confirm.
X_data = np.load("../data/data_mfcc.npy")
X_data = np.transpose(X_data, (0, 2, 1))

# quoting=3 is csv.QUOTE_NONE: the transcripts contain literal double quotes,
# and with the default quoting pandas treats them as field quoting, which can
# swallow the "|" separators and merge rows.
data = pd.read_csv(
    "../data/LJSpeech-1.1/metadata.csv",
    sep="|",
    header=None,
    names=["ID", "Text1", "Text2"],
    quoting=3,
)
texts = data["Text1"].to_list()
ID = data["ID"].to_list()

# Word-level vocabulary fitted on the transcripts.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
num_classes = len(tokenizer.word_index) + 1  # Add 1 for the padding token
sequences = tokenizer.texts_to_sequences(texts)
Y_data = pad_sequences(sequences, padding="post", maxlen=MAX_TOKENS)
print(num_classes)
print(Y_data.shape)

# Fail here, rather than deep inside fit(), if features and labels disagree.
assert len(Y_data) == len(X_data), (
    f"{len(X_data)} feature rows but {len(Y_data)} transcripts"
)

# Flattened to one token id per row; the training cell reshapes it again.
Y_data = Y_data.reshape((-1, 1))

In [None]:
class ConvolutionalLayer(layers.Layer):
    """Two Conv1D -> BatchNorm -> ReLU stages followed by global average pooling.

    Args:
        input_shape: Shape of one input sample. When it is a tuple, its last
            entry replaces ``filters``, so both convolutions emit as many
            channels as the input has features.
        filters: Number of convolution filters used when ``input_shape`` is
            not a tuple.
        kernel_size: Kernel width shared by both convolutions.
        **kwargs: Forwarded to ``layers.Layer`` (for example ``name``).
    """

    def __init__(self, input_shape, filters=32, kernel_size=3, **kwargs):
        # Deriving from layers.Layer is what makes instances callable and lets
        # Keras track the sub-layers' weights; a plain class offers neither.
        super().__init__(**kwargs)
        self.filters = filters
        self.kernel_size = kernel_size

        # A tuple input shape takes precedence over the `filters` argument.
        if isinstance(input_shape, tuple):
            self.filters = input_shape[-1]

        self.conv1 = layers.Conv1D(
            filters=self.filters, kernel_size=self.kernel_size, padding="same"
        )
        self.batch_norm1 = layers.BatchNormalization()
        self.relu1 = layers.ReLU()

        self.conv2 = layers.Conv1D(
            filters=self.filters, kernel_size=self.kernel_size, padding="same"
        )
        self.batch_norm2 = layers.BatchNormalization()
        self.relu2 = layers.ReLU()

        self.global_avg_pooling = layers.GlobalAveragePooling1D()

    def call(self, inputs, training=None, mask=None):
        """Run both convolution stages and average the result over the steps axis.

        Args:
            inputs: Tensor accepted by ``Conv1D``.
            training: Forwarded to the batch-normalisation layers.
            mask: Accepted for signature compatibility; not used.

        Returns:
            The pooled output of the second stage. GlobalAveragePooling1D
            removes the steps axis, so the result has one axis fewer than
            ``inputs``.
        """
        conv1_out = self.relu1(self.batch_norm1(self.conv1(inputs), training=training))
        conv2_out = self.relu2(self.batch_norm2(self.conv2(conv1_out), training=training))
        return self.global_avg_pooling(conv2_out)

In [None]:
# class PositionalEncodingLayer(layers.Layer):
#     def __init__(self, position, model_dim, **kwargs):
#         super(PositionalEncodingLayer, self).__init__(**kwargs)
#         self.position = position
#         self.model_dim = model_dim

#     def build(self, input_shape):
#         super(PositionalEncodingLayer, self).build(input_shape)

#     def call(self, inputs):
#         position = tf.range(start=0, limit=self.position, delta=1, dtype=tf.float32)
#         position = position / tf.cast(self.position, dtype=tf.float32)

#         inputs *= tf.math.sqrt(tf.cast(self.model_dim, dtype=tf.float32))
#         position_encoding = tf.expand_dims(position, 1) - tf.range(
#             start=0, limit=self.model_dim, delta=2, dtype=tf.float32
#         ) / tf.cast(self.model_dim, dtype=tf.float32)
#         position_encoding = tf.expand_dims(position_encoding, 0)

#         return inputs + position_encoding

#     def compute_output_shape(self, input_shape):
#         return input_shape
    


In [None]:
# Replace PositionalEncodingLayer with Embedding layer for positional embedding(input)
class PositionalEmbeddingLayer(layers.Layer):
    """Adds a learned embedding of each step index to the inputs.

    Args:
        position: Number of rows in the embedding table, i.e. how many
            distinct step indices can be looked up.
        model_dim: Width of each positional vector. It is added to ``inputs``,
            so it has to be broadcast-compatible with the inputs' last axis.
        **kwargs: Forwarded to ``layers.Layer`` (for example ``name``).
    """

    def __init__(self, position, model_dim, **kwargs):
        super().__init__(**kwargs)
        self.positional_embedding = layers.Embedding(
            input_dim=position, output_dim=model_dim
        )

    def call(self, inputs):
        """Return ``inputs`` plus the embedding of indices 0..steps-1.

        ``steps`` is the size of axis 1 of ``inputs``.
        """
        # Embedding lookups take integer indices; build them as int32 instead
        # of relying on an implicit float-to-int cast inside the layer.
        positions = tf.range(tf.shape(inputs)[1], dtype=tf.int32)
        return inputs + self.positional_embedding(positions)

In [None]:

# Positional embedding layer (output)
class PositionEmbedding(tf.keras.layers.Layer):
    """Trainable table of position vectors, tiled and cropped to the input's shape.

    The layer returns the (tiled, cropped) table itself; ``inputs`` only
    supplies the target shape and is not added to the result.

    Args:
        sequence_length: Number of rows in the trainable table.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, sequence_length, **kwargs):
        super().__init__(**kwargs)
        self.sequence_length = sequence_length

    def build(self, input_shape):
        # One trainable row per position, as wide as the input's last axis.
        self.position_embeddings = self.add_weight(
            shape=(self.sequence_length, input_shape[-1]),
            initializer=tf.keras.initializers.RandomNormal(),
            trainable=True,
        )

    def call(self, inputs):
        table_shape = tf.shape(self.position_embeddings)
        num_rows = table_shape[0]
        num_cols = table_shape[-1]
        target_shape = tf.shape(inputs)

        # Slice covering the full table, starting at row 0.
        whole_table = tf.slice(self.position_embeddings, (0, 0), (num_rows, num_cols))

        # Repeat the table along axis 0 often enough to cover the inputs' first axis.
        repeats = target_shape[0] // num_rows + 1
        tiled = tf.tile(whole_table, [repeats, 1])

        # NOTE(review): the begin offset has two entries, so `inputs` is
        # presumably expected to be rank 2 -- confirm against the caller.
        return tf.slice(tiled, (0, 0), target_shape)


In [None]:
class TransformerEncoderBlock(keras.layers.Layer):
    """A stack of ``num_blocks`` identically configured encoder blocks.

    Args:
        head_size: Passed to every ``TransformerEncoderBlockSingle``.
        num_heads: Passed to every ``TransformerEncoderBlockSingle``.
        ff_dim: Passed to every ``TransformerEncoderBlockSingle``.
        dropout: Passed to every ``TransformerEncoderBlockSingle``.
        num_blocks: How many blocks to chain.
        **kwargs: Forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, head_size, num_heads, ff_dim, dropout, num_blocks=1, **kwargs):
        super().__init__(**kwargs)
        self.head_size = head_size
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.dropout = dropout
        self.num_blocks = num_blocks

        # Instantiate the stack up front, one block per requested level.
        stack = []
        for _ in range(num_blocks):
            stack.append(self.build_encoder_block())
        self.encoder_blocks = stack

    def build_encoder_block(self):
        """Create one encoder block from this layer's hyper-parameters."""
        return TransformerEncoderBlockSingle(
            head_size=self.head_size,
            num_heads=self.num_heads,
            ff_dim=self.ff_dim,
            dropout=self.dropout,
        )

    def build(self, input_shape):
        super().build(input_shape)
        # Every block is built with the same input shape as the stack itself.
        for block in self.encoder_blocks:
            block.build(input_shape)

    def call(self, inputs):
        """Feed ``inputs`` through each block in order and return the last output."""
        outputs = inputs
        for block in self.encoder_blocks:
            outputs = block(outputs)
        return outputs


class TransformerEncoderBlockSingle(keras.layers.Layer):
    """One post-norm encoder block: self-attention, then a two-layer feed-forward net.

    Both sub-blocks are wrapped in a residual connection followed by layer
    normalisation.

    Args:
        head_size: ``key_dim`` of the multi-head attention.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        dropout: Dropout rate used inside attention and after each sub-block.
        **kwargs: Forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, head_size, num_heads, ff_dim, dropout, **kwargs):
        super().__init__(**kwargs)

        self.head_size = head_size
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.dropout = dropout

        # Multi-head self-attention layer
        self.self_attention = layers.MultiHeadAttention(
            key_dim=self.head_size,
            num_heads=self.num_heads,
            dropout=self.dropout,
        )

        # Feed-forward part. The output projection is created in build(),
        # because its width has to equal the input feature width.
        self.ffn_hidden = layers.Dense(self.ff_dim, activation="relu")
        self.ffn_output = None

        # Layer Normalization
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)

        # Dropout
        self.dropout1 = layers.Dropout(self.dropout)
        self.dropout2 = layers.Dropout(self.dropout)

    def build(self, input_shape):
        # The feed-forward output is added back onto the block input, so it
        # must have the input's feature width. Projecting to `head_size`
        # only works when the two happen to be equal.
        self.ffn_output = layers.Dense(input_shape[-1])
        super().build(input_shape)

    def call(self, inputs, training=None):
        """Apply the block.

        Args:
            inputs: Tensor used as query and value of the self-attention.
            training: Forwarded to the attention and dropout layers.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # Attention and Normalization
        attn_output = self.self_attention(inputs, inputs, training=training)
        attn_output = self.dropout1(attn_output, training=training)
        x = self.layernorm1(attn_output + inputs)

        # Feed Forward Part
        ffn_output = self.ffn_output(self.ffn_hidden(x))
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(ffn_output + x)

In [None]:
class TransformerDecoder(keras.layers.Layer):
    """A stack of decoder blocks followed by a softmax projection.

    Args:
        num_blocks: Number of ``TransformerDecoderBlock`` instances to chain.
        head_size: Passed to every decoder block.
        num_heads: Passed to every decoder block.
        ff_dim: Passed to every decoder block.
        dropout: Passed to every decoder block.
        output_dim: Size of the last axis of the returned probabilities.
        **kwargs: Forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, num_blocks, head_size, num_heads, ff_dim, dropout, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.num_blocks = num_blocks
        self.decoder_blocks = [
            TransformerDecoderBlock(
                head_size=head_size,
                num_heads=num_heads,
                ff_dim=ff_dim,
                dropout=dropout,
            )
            for _ in range(num_blocks)
        ]
        self.output_dim = output_dim
        # Created once here so the projection weights belong to this layer and
        # are trained. Building the Dense inside call() would create a fresh,
        # untracked layer on every invocation.
        self.output_projection = layers.Dense(self.output_dim, activation="softmax")

    def call(self, inputs, encoder_outputs, mask=None, training=None):
        """Decode ``inputs`` against ``encoder_outputs``.

        Args:
            inputs: Decoder-side tensor fed to the first block.
            encoder_outputs: Tensor every block attends to.
            mask: Forwarded to every block.
            training: Forwarded to every block.

        Returns:
            Softmax probabilities whose last axis has size ``output_dim``.
        """
        x = inputs
        for decoder_block in self.decoder_blocks:
            x = decoder_block(x, encoder_outputs, mask=mask, training=training)
        return self.output_projection(x)




class TransformerDecoderBlock(keras.layers.Layer):
    """One post-norm decoder block.

    Runs self-attention, then attention over the encoder outputs, then a
    two-layer feed-forward net. Each sub-block has a residual connection
    followed by layer normalisation.

    Args:
        head_size: ``key_dim`` of both attention layers.
        num_heads: Number of heads in both attention layers.
        ff_dim: Width of the hidden feed-forward layer.
        dropout: Dropout rate used throughout the block.
        **kwargs: Forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, head_size, num_heads, ff_dim, dropout, **kwargs):
        super().__init__(**kwargs)
        self.head_size = head_size
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.dropout = dropout

        # Self Attention Layer
        self.self_attention = layers.MultiHeadAttention(
            key_dim=head_size, num_heads=num_heads, dropout=dropout
        )

        # Encoder-Decoder Attention Layer
        self.encoder_decoder_attention = layers.MultiHeadAttention(
            key_dim=head_size, num_heads=num_heads, dropout=dropout
        )

        # Feed-forward part. The output projection is created in build(),
        # because its width has to equal the decoder input's feature width.
        self.ffn_hidden = layers.Dense(ff_dim, activation="relu")
        self.ffn_dropout = layers.Dropout(dropout)
        self.ffn_output = None

        # Layer Normalization
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = layers.LayerNormalization(epsilon=1e-6)

        # Dropout
        self.dropout1 = layers.Dropout(dropout)
        self.dropout2 = layers.Dropout(dropout)
        self.dropout3 = layers.Dropout(dropout)

    def build(self, input_shape):
        # `input_shape` is the shape of the first call argument (`inputs`).
        # The feed-forward output is added back onto a tensor of that width,
        # so projecting to `head_size` only works when the two are equal.
        self.ffn_output = layers.Dense(input_shape[-1])
        super().build(input_shape)

    def call(self, inputs, encoder_outputs, mask=None, training=None):
        """Apply the block.

        Args:
            inputs: Decoder-side tensor; query and value of the self-attention.
            encoder_outputs: Value tensor of the encoder-decoder attention.
            mask: Used as ``attention_mask`` of the self-attention only.
            training: Forwarded to the attention and dropout layers.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # Self Attention
        self_attn_output = self.self_attention(
            inputs,
            inputs,
            attention_mask=mask,
            return_attention_scores=False,
            training=training,
        )
        self_attn_output = self.dropout1(self_attn_output, training=training)
        attn_output1 = self.layernorm1(self_attn_output + inputs)

        # Encoder-Decoder Attention
        enc_dec_attn_output = self.encoder_decoder_attention(
            attn_output1,
            encoder_outputs,
            return_attention_scores=False,
            training=training,
        )
        enc_dec_attn_output = self.dropout2(enc_dec_attn_output, training=training)
        attn_output2 = self.layernorm2(enc_dec_attn_output + attn_output1)

        # Feed Forward
        ffn_output = self.ffn_output(
            self.ffn_dropout(self.ffn_hidden(attn_output2), training=training)
        )
        return self.layernorm3(ffn_output + attn_output2)

In [None]:
# Per-sample shape: axes 1 and 2 of the feature array (axis 0 indexes samples).
input_shape = tuple(X_data.shape[1:3])
print(input_shape)


# conv_output = Convolutional_Layer(input_shape=(input_shape))

# # Apply positional encoding to the tensor output of the convolutional layer
# positional_encoding = PositionalEncodingLayer(position=500, model_dim=64)(conv_output)

In [None]:
class MyTransformerModel(keras.Model):
    """Convolutional front end + transformer encoder/decoder over a fixed number of output slots.

    The decoder is driven by learned positional queries (zeros plus a
    positional embedding), so the forward pass depends only on the batch of
    features it is given, never on notebook-level arrays.
    NOTE(review): a teacher-forced decoder would additionally need the target
    tokens as a model input; confirm which design is intended.

    Args:
        input_shape: Per-sample feature shape, passed to ``ConvolutionalLayer``.
        position: Size of the positional embedding table; must be at least
            ``target_length``.
        model_dim: Feature width of the decoder queries.
        num_blocks: Number of encoder blocks and of decoder blocks.
        target_length: Number of output positions predicted per sample.
        output_dim: Number of classes per output position. When ``None`` the
            notebook-level ``num_classes`` is used, as before.
        **kwargs: Forwarded to ``keras.Model``.
    """

    def __init__(
        self,
        input_shape,
        position,
        model_dim,
        num_blocks,
        target_length=30,
        output_dim=None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.model_dim = model_dim
        self.target_length = target_length

        if output_dim is None:
            # Backward-compatible fallback to the vocabulary size computed in
            # the data-loading cell.
            output_dim = num_classes

        # Define the Convolutional Layer
        self.convolutional_layer = ConvolutionalLayer(input_shape=input_shape)

        # Define the Positional Embedding Layer
        self.positional_embedding = PositionalEmbeddingLayer(position, model_dim)

        # Define the Transformer Encoder Block
        self.transformer_encoder = TransformerEncoderBlock(
            head_size=32, num_heads=4, ff_dim=128, dropout=0.1, num_blocks=num_blocks
        )

        # Define the Transformer Decoder Block
        self.transformer_decoder = TransformerDecoder(
            num_blocks=num_blocks,
            head_size=32,
            num_heads=4,
            ff_dim=128,
            dropout=0.1,
            output_dim=output_dim,
        )

    def call(self, inputs, training=None, mask=None):
        """Map a batch of features to per-position class probabilities.

        Args:
            inputs: Batch of input features.
            training: Forwarded to the convolutional layer and the decoder.
            mask: Accepted for signature compatibility; not used.

        Returns:
            The decoder's softmax output for ``target_length`` positions.
        """
        conv_output = self.convolutional_layer(inputs, training=training)

        # Attention needs a (batch, steps, features) memory. If the front end
        # pooled the steps axis away, treat its output as one memory step.
        if len(conv_output.shape) == 2:
            conv_output = tf.expand_dims(conv_output, axis=1)

        # Apply Transformer Encoder
        transformer_encoder_output = self.transformer_encoder(conv_output)

        # Decoder queries are built from the batch itself. Reading the
        # module-level Y_data here would feed the labels of the whole dataset
        # into every forward pass, with a mismatched batch axis.
        query_shape = tf.stack(
            [tf.shape(inputs)[0], self.target_length, self.model_dim]
        )
        decoder_queries = self.positional_embedding(
            tf.zeros(query_shape, dtype=tf.float32)
        )

        # Apply Transformer Decoder
        return self.transformer_decoder(
            decoder_queries,
            encoder_outputs=transformer_encoder_output,
            training=training,
        )


In [None]:

# Create an instance of the model
# Three encoder blocks and three decoder blocks.
num_blocks = 3
# Per-sample shape: axes 1 and 2 of the feature array.
input_shape = tuple(X_data.shape[1:3])
model = MyTransformerModel(
    num_blocks=num_blocks,
    model_dim=64,
    position=500,
    input_shape=input_shape,
)


# Call the model on a batch of data to build it


# Print the model summary
# model.summary()


In [None]:
# Sparse categorical cross-entropy takes integer class ids as targets.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [None]:
# Labels as (samples, tokens, 1). The sample count is taken from the features
# instead of a hard-coded 13100, so a mismatch between the two arrays fails
# here with a clear reshape error.
Y_data = Y_data.reshape((X_data.shape[0], 30, -1))
model.fit(X_data, Y_data, epochs=1, batch_size=32)