In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import nn
import keras
from tensorflow.keras.activations import softmax
from keras import layers
from tensorflow.keras.layers import Dense,LayerNormalization ,Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Load the pre-computed feature array and swap its last two axes.
# Alternative feature file used previously: "../../data/data_mfcc.npy"
X_data = np.load("../data/data_64_30.npy").transpose(0, 2, 1)
# For a quick smoke test, keep only the first samples: X_data = X_data[:100]
print(X_data.shape)


In [None]:
# The metadata file is pipe-separated and has no header row; the three columns
# are named ID / Text1 / Text2 here.
# quoting=3 (csv.QUOTE_NONE) turns off quote handling so that quote characters
# inside a transcript are read literally. With the default quoting, an
# unbalanced quote can swallow following lines into one field, which silently
# drops rows and desynchronises the labels from the feature array.
data = pd.read_csv(
    "../data/LJSpeech-1.1/metadata.csv",
    sep="|",
    header=None,
    names=["ID", "Text1", "Text2"],
    quoting=3,
)
texts = data["Text1"].to_list()
ID = data["ID"].to_list()

# Fit the tokenizer on the transcripts and map each one to integer ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
# num_classes = len(tokenizer.word_index) + 1  # Add 1 for the padding token
sequences = tokenizer.texts_to_sequences(texts)

# Pad at the end (padding="post") to a fixed length of 30 ids per transcript.
# NOTE(review): `truncating` is left at the pad_sequences default, so longer
# transcripts are cut by that default rule - confirm this is the intended side.
Y_data = pad_sequences(sequences, padding="post", maxlen=30)
# For a quick smoke test, keep only the first samples: Y_data = Y_data[:100]
print(Y_data.shape)

In [None]:
def create_self_attention_mask(sequence_length):
    """Build a binary look-ahead (causal) mask for self-attention.

    Args:
        sequence_length: Number of positions in the sequence.

    Returns:
        A float tensor of shape (sequence_length, sequence_length) holding 1.0
        in the strict upper triangle (positions that must NOT be attended to)
        and 0.0 on and below the diagonal.
    """
    # band_part(..., -1, 0) keeps the lower triangle including the diagonal,
    # i.e. the positions each query is allowed to see.
    allowed = tf.linalg.band_part(tf.ones((sequence_length, sequence_length)), -1, 0)
    # Return the mask in binary form: scaled_dot_product multiplies the mask by
    # -1e9 itself, so scaling here as well would turn the penalty into +1e18
    # and push all attention onto the positions that should be hidden.
    return 1 - allowed

In [None]:
def scaled_dot_product(q, k, v, mask):
    """Scaled dot-product attention.

    Args:
        q: Query tensor with shape (..., seq_q, depth).
        k: Key tensor with shape (..., seq_k, depth).
        v: Value tensor with shape (..., seq_k, depth_v).
        mask: Optional tensor broadcastable to the attention logits, with
            non-zero entries marking positions to block, or None. A rank-3
            mask is interpreted as (batch, seq_q, seq_k) when the logits are
            rank 4 (batch, heads, seq_q, seq_k).

    Returns:
        A pair ``(output, attention_weights)``: the weighted sum of ``v`` and
        the softmax-normalised attention weights over the last axis.
    """
    d_k = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_qk = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(d_k)

    if mask is not None:
        # Accept boolean / integer masks as well as float ones.
        mask = tf.cast(mask, scaled_qk.dtype)
        # Without an explicit heads axis, broadcasting would line the mask's
        # batch axis up with the heads axis of the logits.
        if mask.shape.rank == 3 and scaled_qk.shape.rank == 4:
            mask = mask[:, tf.newaxis, :, :]
        # Large negative logits become ~0 probability after the softmax.
        scaled_qk += (mask * -1e9)

    attention_weights = tf.nn.softmax(scaled_qk, axis=-1)
    output = tf.matmul(attention_weights, v)
    return output, attention_weights

In [None]:
class ConvolutionalLayer(layers.Layer):
    """Two stacked Conv1D -> BatchNormalization -> ReLU stages.

    Derives from the Keras ``Layer`` base class so that instances are callable
    and the convolution / batch-norm sublayers are tracked (and therefore
    trained and saved) by whatever layer or model owns this object.

    Args:
        input_shape: When this is a tuple, its last entry replaces ``filters``.
        filters: Number of Conv1D output channels; only used when
            ``input_shape`` is not a tuple.
        kernel_size: Kernel width shared by both convolutions.
        **kwargs: Forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, input_shape, filters=32, kernel_size=3, **kwargs):
        super(ConvolutionalLayer, self).__init__(**kwargs)
        self.filters = filters
        self.kernel_size = kernel_size

        # A tuple input shape dictates the channel count: its last entry wins
        # over the `filters` argument.
        if isinstance(input_shape, tuple):
            self.filters = input_shape[-1]

        # padding="same" keeps the length of the convolved axis unchanged.
        self.conv1 = layers.Conv1D(filters=self.filters, kernel_size=self.kernel_size, padding="same")
        self.batch_norm1 = layers.BatchNormalization()
        self.relu1 = layers.ReLU()

        self.conv2 = layers.Conv1D(filters=self.filters, kernel_size=self.kernel_size, padding="same")
        self.batch_norm2 = layers.BatchNormalization()
        self.relu2 = layers.ReLU()

    def call(self, inputs, training=None, mask=None):
        """Run both convolution stages.

        Args:
            inputs: Tensor accepted by ``Conv1D``.
            training: Forwarded to both batch-normalisation layers.
            mask: Accepted for signature compatibility; not used.

        Returns:
            The output of the second ReLU.
        """
        conv1_out = self.relu1(self.batch_norm1(self.conv1(inputs), training=training))
        conv2_out = self.relu2(self.batch_norm2(self.conv2(conv1_out), training=training))
        print("CNN output shape is ",conv2_out.shape)
        return conv2_out

In [None]:
class CNN(tf.keras.Model):
    """Stand-alone model that wraps a single ``ConvolutionalLayer``.

    Args:
        input_shape: Shape tuple handed to ``ConvolutionalLayer``. Defaults to
            the previously hard-coded ``(500, 40)``.
    """

    def __init__(self, input_shape=(500, 40)):
        super(CNN, self).__init__()
        self.cnn_layer = ConvolutionalLayer(input_shape=input_shape)

    def call(self, inputs, training=None):
        """Apply the convolutional block to ``inputs`` and return its output."""
        # Invoke the forward pass through `.call`, the same way Encoder does,
        # and forward the training flag so batch normalisation can switch
        # between training and inference behaviour.
        cnn_output = self.cnn_layer.call(inputs, training=training)
        return cnn_output


In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head self-attention with a single fused Q/K/V projection.

    Args:
        d_model: Width of the query/key/value projections and of the output.
        num_heads: Number of attention heads; must divide ``d_model``.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        if d_model % num_heads != 0:
            # Otherwise head_dim * num_heads != d_model and the head reshape
            # would fail or scramble features.
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        # One bias-free projection yields queries, keys and values together.
        self.qkv_layer = tf.keras.layers.Dense(3 * d_model, use_bias=False)
        self.linear_layer = tf.keras.layers.Dense(d_model, activation='relu')

    def split_heads(self, x, batch_size):
        """Reshape ``x`` to (batch_size, num_heads, seq_len, head_dim).

        Args:
            x: Tensor of shape (batch, seq_len, d_model), or an unbatched
                (seq_len, d_model) tensor which is treated as a batch of one.
            batch_size: Batch size used for the reshape (int or scalar tensor).
        """
        if len(x.shape) == 2:
            # Unbatched input: add a leading batch axis of size one.
            x = tf.expand_dims(x, axis=0)
            batch_size = 1
        # Use the actual batch size; a fixed constant here only works for one
        # specific batch size.
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.head_dim))

        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape (batch, seq_len, features).
            mask: Optional attention mask forwarded to ``scaled_dot_product``.

        Returns:
            Tensor of shape (batch, seq_len, d_model).
        """
        print("MultiHeadAttention input shape",x.shape)
        # The static batch dimension can be None while Keras traces the model,
        # so read the batch size from the runtime shape instead.
        batch_size = tf.shape(x)[0]

        qkv = self.qkv_layer(x)
        q, k, v = tf.split(qkv, 3, axis=-1)
        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        values, attention = scaled_dot_product(q, k, v, mask)

        # (batch, heads, seq, head_dim) -> (batch, seq, heads * head_dim)
        values = tf.transpose(values, perm=[0, 2, 1, 3])
        values = tf.reshape(values, (batch_size, -1, self.num_heads * self.head_dim))
        out = self.linear_layer(values)
        print("MultiHeadAttention output shape is ",out.shape)
        return out


In [None]:
class PositionwiseFeedForward(tf.keras.layers.Layer):
    """Feed-forward block: Dense(hidden) -> ReLU -> Dropout -> Dense(d_model).

    Args:
        d_model: Output width of the second dense layer.
        hidden: Output width of the first dense layer.
        drop_prob: Dropout rate applied after the ReLU.
    """

    def __init__(self, d_model, hidden, drop_prob=0.1):
        super().__init__()
        self.linear1 = tf.keras.layers.Dense(hidden)
        self.linear2 = tf.keras.layers.Dense(d_model)
        self.relu = tf.keras.layers.ReLU()
        self.dropout = tf.keras.layers.Dropout(rate=drop_prob)

    def call(self, x):
        """Apply the feed-forward block to ``x`` and return the result."""
        # The two log lines below report this block's input/output shapes.
        print("Input shape for positonal encoding", x.shape)
        expanded = self.linear1(x)
        regularised = self.dropout(self.relu(expanded))
        projected = self.linear2(regularised)
        print("output shape from postional encoding", projected.shape)
        return projected

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention and feed-forward sublayers.

    Each sublayer is followed by dropout, a residual connection and layer
    normalisation.

    Args:
        d_model: Feature width used by the attention and feed-forward blocks.
        ffn_hidden: Hidden width of the feed-forward block.
        num_heads: Number of attention heads.
        drop_prob: Dropout rate used throughout the block.
    """

    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super().__init__()
        self.attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm1 = LayerNormalization(epsilon=1e-5)
        self.dropout1 = tf.keras.layers.Dropout(rate=drop_prob)
        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = LayerNormalization(epsilon=1e-5)
        self.dropout2 = tf.keras.layers.Dropout(rate=drop_prob)

    def call(self, x, self_attention_mask, training=None):
        """Run the block.

        Args:
            x: Input tensor; it is added to the attention output, so both must
                have compatible shapes.
            self_attention_mask: Mask forwarded to the attention sublayer.
            training: Forwarded to both dropout layers.
        """
        # Sublayer 1: attention -> dropout -> add & norm.
        attended = self.attention(x, mask=self_attention_mask)
        attended = self.dropout1(attended, training=training)
        attention_out = self.norm1(attended + x)

        # Sublayer 2: feed-forward -> dropout -> add & norm.
        transformed = self.ffn(attention_out)
        transformed = self.dropout2(transformed, training=training)
        return self.norm2(transformed + attention_out)

In [None]:
class SequentialEncoder(tf.keras.layers.Layer):
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob, num_layers, max_sequence_length):
        super(SequentialEncoder, self).__init__()
        self.layers = [EncoderLayer(d_model, ffn_hidden, num_heads, drop_prob) for _ in range(num_layers)]

    
    def call(self, x, training=True, mask=None):
        for layer in self.layers:
            x = layer(x, training, mask)
        return x


In [None]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, 
                 d_model, 
                 ffn_hidden, 
                 num_heads, 
                 drop_prob, 
                 num_layers,
                 max_sequence_length):
        super(Encoder, self).__init__()
        self.convolutional_layer = ConvolutionalLayer(input_shape=(max_sequence_length, 30))  # Assuming input shape (max_sequence_length, 20)
        self.layers = SequentialEncoder(d_model, ffn_hidden, num_heads, drop_prob, num_layers, max_sequence_length)
    
    
    def call(self, x, self_attention_mask):
        # Assuming x is the output from the convolutional layer
        print("Encoder input shape is",x.shape)
        x = self.convolutional_layer.call(x)
        x = self.layers(x, self_attention_mask)
        return x

In [None]:
class Transformer(tf.keras.Model):
    """Encoder-only model: ``Encoder`` followed by a softmax ``Dense`` head.

    Args:
        d_model: Feature width of the encoder blocks.
        ffn_hidden: Hidden width of each feed-forward sublayer.
        num_heads: Number of attention heads per block.
        drop_prob: Dropout rate used inside the encoder.
        num_layers: Number of encoder blocks.
        max_sequence_length: Passed to the encoder and also used as the number
            of units of the output layer.
    """

    def __init__(self,
                 d_model,
                 ffn_hidden,
                 num_heads,
                 drop_prob,
                 num_layers,
                 max_sequence_length):
        super(Transformer, self).__init__()

        self.encoder = Encoder(d_model, ffn_hidden, num_heads, drop_prob, num_layers, max_sequence_length)
        # NOTE(review): the head has `max_sequence_length` units rather than a
        # vocabulary-sized output - confirm this is the intended target space.
        self.final_layer = tf.keras.layers.Dense(units=max_sequence_length, activation='softmax')

    def call(self, inputs, training=None):
        """Run the model.

        Args:
            inputs: Mapping with the keys ``'input_x'`` (input features) and
                ``'self_attention_mask_encoder'`` (encoder attention mask).
                Any other keys, such as ``'input_y'``, are ignored.
            training: Standard Keras training flag.

        Returns:
            The softmax output of the final dense layer.
        """
        # The labels are not part of the forward pass, so they are not read
        # here; this keeps the model usable for inference without labels.
        x = inputs['input_x']
        self_attention_mask_encoder = inputs['self_attention_mask_encoder']

        print("Input shape:", x.shape)

        encoder_output = self.encoder(x, self_attention_mask_encoder)

        print("Encoder output shape:", encoder_output.shape)

        output = self.final_layer(encoder_output)
        return output

In [None]:
# Model hyperparameters.
d_model, ffn_hidden, num_heads = 64, 120, 1
drop_prob, num_layers, max_sequence_length = 0.1, 2, 30

# Arguments are given in the order of Transformer.__init__'s parameters.
transformer_model = Transformer(
    d_model, ffn_hidden, num_heads, drop_prob, num_layers, max_sequence_length
)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for validation; the fixed random_state makes the
# split reproducible.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_data,
    Y_data,
    test_size=0.2,
    random_state=42,
)

# Shapes of the feature splits, then of the label splits (one per line).
print(X_train.shape, X_val.shape, sep="\n")
print(Y_train.shape, Y_val.shape, sep="\n")

In [None]:
# Configure training: Adam optimizer with a mean-squared-error loss.
# NOTE(review): the targets are integer token ids and the model ends in a
# softmax layer - confirm that MSE is the intended objective.
transformer_model.compile(
    loss='mean_squared_error',
    optimizer='adam',
)
# To inspect the architecture, build the model with the real input shape and
# call transformer_model.summary().


In [None]:
def loss_function(y_true, y_pred):
    """Mean squared error between targets and predictions.

    Args:
        y_true: Target values; cast to the dtype of ``y_pred``.
        y_pred: Predicted values.

    Returns:
        A scalar tensor: the mean of the element-wise squared differences.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    # Integer targets cannot be subtracted from float predictions without an
    # explicit cast.
    y_true = tf.cast(y_true, y_pred.dtype)
    return tf.reduce_mean(tf.square(y_true - y_pred))

In [None]:
# Take the sequence length from axis 1 of the training features.
max_sequence_length = X_train.shape[1]

# Build one (seq, seq) self-attention mask, add a leading batch axis, and
# repeat it once per training sample.
self_attention_mask_encoder = tf.tile(
    create_self_attention_mask(max_sequence_length)[tf.newaxis, ...],
    [X_train.shape[0], 1, 1],
)

In [None]:
# Train for a single epoch. The model consumes a dict of named inputs;
# Y_train is additionally passed as the training target.
transformer_model.fit(
    x=dict(
        input_x=X_train,
        input_y=Y_train,
        self_attention_mask_encoder=self_attention_mask_encoder,
    ),
    y=Y_train,
    batch_size=30,
    epochs=1,
)

In [None]:
# The trained model is `transformer_model` (no variable named `model` exists in
# this notebook), and its call() indexes a dict of named inputs, so validation
# data must use the same input structure as training.
val_attention_mask = create_self_attention_mask(X_val.shape[1])
val_attention_mask = tf.tile(
    tf.expand_dims(val_attention_mask, axis=0),  # add batch dimension
    [X_val.shape[0], 1, 1],  # one mask per validation sample
)

prediction = transformer_model.predict(
    {
        'input_x': X_val,
        # Included to mirror the input structure used for training.
        'input_y': Y_val,
        'self_attention_mask_encoder': val_attention_mask,
    }
)
print("prediction ",prediction[0])
print("actual ",Y_val[0])