In [None]:
import csv

import tensorflow as tf
import numpy as np
import pandas as pd
import keras
from tensorflow import nn
from tensorflow.keras.activations import softmax
from keras import layers
from tensorflow.keras.layers import Dense, LayerNormalization, Flatten, MultiHeadAttention
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Load the pre-computed feature array and swap its last two axes.
# (An MFCC variant was previously read from "../../data/data_mfcc.npy".)
features_path = "../data/data_64_30.npy"
X_data = np.load(features_path).transpose(0, 2, 1)
# For quick experiments the array can be truncated here, e.g. X_data[:100].
print(X_data.shape)


In [None]:
# Read the LJSpeech transcript table: one "|"-separated row per clip, no header.
# QUOTE_NONE makes pandas treat double quotes as ordinary characters; with the
# default quote handling, transcripts containing unbalanced '"' swallow the
# following rows and the number of labels no longer matches the number of clips.
data = pd.read_csv(
    "../data/LJSpeech-1.1/metadata.csv",
    sep="|",
    header=None,
    names=["ID", "Text1", "Text2"],
    quoting=csv.QUOTE_NONE,
)
texts = data["Text1"].to_list()
ID = data["ID"].to_list()

# Word-level tokenisation: every transcript becomes a list of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
num_classes = len(tokenizer.word_index) + 1  # Add 1 for the padding token
sequences = tokenizer.texts_to_sequences(texts)

# Zero-pad at the end to a fixed length of 30 ids per transcript. Longer
# transcripts are cut to 30 ids (pad_sequences' default truncation side applies).
Y_data = pad_sequences(sequences, padding="post", maxlen=30)
# For quick experiments the labels can be truncated here, e.g. Y_data[:100].
print(num_classes)
print(Y_data.shape)

In [None]:
class PositionalEncoding(tf.keras.layers.Layer):
    """Adds a fixed sinusoidal positional encoding to its input.

    NOTE(review): a later cell defines `PositionalEncoding` again; when the
    notebook is run top to bottom that later definition replaces this one.
    One of the two should be removed.

    Args:
        max_length: Number of positions (rows) in the encoding table.
        embedding_dim: Width (columns) of the encoding table. May be odd.
        **kwargs: Forwarded to `tf.keras.layers.Layer` (e.g. `name`).
    """

    def __init__(self, max_length, embedding_dim=256, **kwargs):
        super(PositionalEncoding, self).__init__(**kwargs)
        self.max_length = max_length
        self.embedding_dim = embedding_dim
        # The table is constant, so it is computed once at construction time.
        self.pos_enc = self.add_positional_encoding()

    def add_positional_encoding(self):
        """Build the `(max_length, embedding_dim)` float32 sin/cos table."""
        position = np.arange(self.max_length)[:, np.newaxis]
        div_term = np.exp(np.arange(0, self.embedding_dim, 2) * -(np.log(10000.0) / self.embedding_dim))
        angles = position * div_term
        pos_enc = np.zeros((self.max_length, self.embedding_dim))
        # Even columns hold sines, odd columns hold cosines.
        pos_enc[:, 0::2] = np.sin(angles)
        # With an odd embedding_dim there is one fewer odd column than even
        # column, so only the first embedding_dim // 2 frequencies are used.
        pos_enc[:, 1::2] = np.cos(angles[:, : self.embedding_dim // 2])
        return tf.convert_to_tensor(pos_enc, dtype=tf.float32)

    def call(self, inputs):
        """Return `inputs` plus the encoding table (broadcast over leading axes)."""
        return inputs + self.pos_enc


In [None]:
class PositionalEncoding(tf.keras.layers.Layer):
    """Adds a fixed sinusoidal positional encoding to a batched input.

    NOTE(review): an earlier cell defines a class with the same name; this
    definition is the one in effect after the notebook runs top to bottom.

    Args:
        max_length: Number of positions (rows) in the encoding table.
        embedding_dim: Width (columns) of the encoding table. May be odd.
        **kwargs: Forwarded to `tf.keras.layers.Layer` (e.g. `name`).
    """

    def __init__(self, max_length, embedding_dim=256, **kwargs):
        super(PositionalEncoding, self).__init__(**kwargs)
        self.max_length = max_length
        self.embedding_dim = embedding_dim
        # The table is constant, so it is computed once at construction time.
        self.pos_enc = self.add_positional_encoding()

    def add_positional_encoding(self):
        """Build the `(max_length, embedding_dim)` float32 sin/cos table."""
        position = np.arange(self.max_length)[:, np.newaxis]
        div_term = np.exp(np.arange(0, self.embedding_dim, 2) * -(np.log(10000.0) / self.embedding_dim))
        angles = position * div_term
        pos_enc = np.zeros((self.max_length, self.embedding_dim))
        # Even columns hold sines, odd columns hold cosines.
        pos_enc[:, 0::2] = np.sin(angles)
        # With an odd embedding_dim there is one fewer odd column than even
        # column, so only the first embedding_dim // 2 frequencies are used.
        pos_enc[:, 1::2] = np.cos(angles[:, : self.embedding_dim // 2])
        return tf.convert_to_tensor(pos_enc, dtype=tf.float32)

    def call(self, inputs):
        """Return `inputs` plus the encoding table.

        A leading axis of size 1 is added to the table and broadcasting
        repeats it across the batch; this yields the same values as tiling
        the table once per batch element, without materialising the copies.
        """
        return inputs + self.pos_enc[tf.newaxis, ...]


In [None]:
class ConvolutionalLayer(tf.keras.layers.Layer):
    """Two stacked Conv1D -> BatchNormalization -> ReLU stages.

    Both convolutions use the same filter count and kernel size with
    "same" padding.

    Args:
        input_shape: Accepted for the caller's convenience; not used here.
        filters: Number of output channels of each convolution.
        kernel_size: Width of each convolution kernel.
        **kwargs: Forwarded to `tf.keras.layers.Layer`.
    """

    def __init__(self, input_shape, filters=64, kernel_size=3, **kwargs):
        super().__init__(**kwargs)
        self.filters = filters
        self.kernel_size = kernel_size

        # Shared settings for the two convolutions.
        conv_options = dict(
            filters=filters,
            kernel_size=kernel_size,
            padding="same",
            trainable=True,
        )

        # Stage 1
        self.conv1 = layers.Conv1D(**conv_options)
        self.batch_norm1 = layers.BatchNormalization()
        self.relu1 = layers.ReLU()

        # Stage 2
        self.conv2 = layers.Conv1D(**conv_options)
        self.batch_norm2 = layers.BatchNormalization()
        self.relu2 = layers.ReLU()

    def call(self, inputs, training=None, mask=None):
        """Run both stages; `training` is forwarded to the batch-norm layers.

        `mask` is accepted but not used.
        """
        hidden = self.conv1(inputs)
        hidden = self.batch_norm1(hidden, training=training)
        hidden = self.relu1(hidden)

        hidden = self.conv2(hidden)
        hidden = self.batch_norm2(hidden, training=training)
        hidden = self.relu2(hidden)

        print("CNN output shape is  ", hidden.shape)
        return hidden

In [None]:
def create_self_attention_mask(sequence_length):
    """Build an additive lower-triangular (causal) attention mask.

    Args:
        sequence_length: Side length of the square mask.

    Returns:
        A float64 NumPy array of shape `(sequence_length, sequence_length)`
        holding 0 on and below the main diagonal and `-inf` above it.
    """
    row_idx = np.arange(sequence_length)[:, np.newaxis]
    col_idx = np.arange(sequence_length)[np.newaxis, :]
    # Entries strictly above the diagonal (column > row) are blocked.
    return np.where(col_idx > row_idx, -np.inf, 0.0)

In [None]:
def scaled_dot_product(q, k, v, mask):
    """Scaled dot-product attention.

    Args:
        q: Query tensor.
        k: Key tensor; the size of its last axis sets the scaling factor.
        v: Value tensor.
        mask: Optional additive mask added to the scaled scores before the
            softmax, or `None` to skip masking.

    Returns:
        A tuple `(output, attention_weights)` where `attention_weights` is the
        softmax of the (masked) scaled scores and `output` is
        `attention_weights @ v`.
    """
    key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
    scores = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(key_depth)

    if mask is not None:
        scores = scores + mask

    weights = tf.nn.softmax(scores)
    return tf.matmul(weights, v), weights

In [None]:
# class MultiHeadAttention(tf.keras.layers.Layer):
#     def __init__(self, d_model, num_heads):
#         super(MultiHeadAttention, self).__init__()
#         self.d_model = d_model
#         self.num_heads = num_heads
#         self.head_dim = d_model // num_heads
#         self.qkv_layer = tf.keras.layers.Dense(3 * d_model, use_bias=False)
#         self.linear_layer = tf.keras.layers.Dense(d_model, activation='relu')

#     def split_heads(self, x, batch_size):
#         if len(x.shape) == 2:
#             x = tf.expand_dims(tf.expand_dims(x, axis=0), axis=1)
#         x = tf.reshape(x, (batch_size, -1, self.num_heads, self.head_dim))
#         return tf.transpose(x, perm=[0, 2, 1, 3])

#     def call(self, x, mask):
#         batch_size, _, _ = x.shape

#         qkv = self.qkv_layer(x)
#         q, k, v = tf.split(qkv, 3, axis=-1)
#         q = self.split_heads(q, batch_size)
#         k = self.split_heads(k, batch_size)
#         v = self.split_heads(v, batch_size)
#         values, attention = scaled_dot_product(q, k, v, mask)
#         print("test")
#         values = tf.transpose(values, perm=[0, 2, 1, 3])
#         values = tf.reshape(values, (batch_size, -1, self.num_heads * self.head_dim))
#         out = self.linear_layer(values)
#         print("MultiHeadAttention output shape is ",out.shape)
#         return out

In [None]:
class PositionwiseFeedForward(tf.keras.layers.Layer):
    """Feed-forward sub-layer: Dense(hidden) -> ReLU -> Dropout -> Dense(d_model).

    Args:
        d_model: Output width of the second dense layer.
        hidden: Width of the intermediate dense layer.
        drop_prob: Dropout rate applied after the ReLU.
    """

    def __init__(self, d_model, hidden, drop_prob=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = tf.keras.layers.Dense(hidden)
        self.linear2 = tf.keras.layers.Dense(d_model)
        self.relu = tf.keras.layers.ReLU()
        self.dropout = tf.keras.layers.Dropout(rate=drop_prob)

    def call(self, x, training=None):
        """Apply the feed-forward stack to `x`.

        Args:
            x: Input tensor.
            training: Forwarded to the dropout layer so it is only active in
                training mode. `None` lets Keras decide from the call context.

        Returns:
            The transformed tensor, with last axis of size `d_model`.
        """
        # Shape traces; previously these were mislabelled as positional encoding.
        print("Input shape for feed-forward", x.shape)
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x, training=training)
        x = self.linear2(x)
        print("Output shape from feed-forward", x.shape)
        return x

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention and a feed-forward net, each followed
    by dropout, a residual connection and layer normalisation.

    Args:
        d_model: Feature width. The residual additions require the last axis
            of the input to have this size.
        ffn_hidden: Width of the feed-forward hidden layer.
        num_heads: Number of attention heads.
        drop_prob: Dropout rate used after attention and after the
            feed-forward net.
    """

    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super(EncoderLayer, self).__init__()
        # Keras' MultiHeadAttention is configured per head (`key_dim`), not by
        # total model width; split d_model evenly across the heads.
        self.attention = MultiHeadAttention(
            num_heads=num_heads,
            key_dim=max(1, d_model // num_heads),
        )
        self.norm1 = LayerNormalization(epsilon=1e-5)
        self.dropout1 = tf.keras.layers.Dropout(rate=drop_prob)
        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = LayerNormalization(epsilon=1e-5)
        self.dropout2 = tf.keras.layers.Dropout(rate=drop_prob)

    @staticmethod
    def _to_keras_attention_mask(self_attention_mask):
        """Convert the caller's mask into the boolean form Keras expects.

        Boolean masks are passed through (True = may attend). Numeric masks
        are interpreted as additive biases, the convention used by
        `create_self_attention_mask` in this notebook: an entry of exactly 0
        means "may attend", anything else (e.g. -inf) means "blocked".
        """
        if self_attention_mask is None:
            return None
        attention_mask = tf.convert_to_tensor(self_attention_mask)
        if attention_mask.dtype != tf.bool:
            attention_mask = tf.equal(attention_mask, 0)
        if attention_mask.shape.rank == 2:
            # Add a leading axis of size 1 so the mask broadcasts over the batch.
            attention_mask = attention_mask[tf.newaxis, ...]
        return attention_mask

    def call(self, x, self_attention_mask, training=None):
        """Apply the encoder block.

        Args:
            x: Input tensor; used as both query and value of the attention.
            self_attention_mask: `None`, a boolean mask (True = may attend) or
                an additive numeric mask (0 = may attend, non-zero = blocked).
            training: Forwarded to the attention and dropout layers.

        Returns:
            Tensor with the same shape as `x`.
        """
        attention_mask = self._to_keras_attention_mask(self_attention_mask)

        residual_x = x
        print("input shape for multihead ",x.shape)
        # Self-attention: the same tensor is query and value (and key).
        x = self.attention(x, x, attention_mask=attention_mask, training=training)
        print("output shape for multihead ",x.shape)
        x = self.dropout1(x, training=training)
        x = self.norm1(x + residual_x)

        residual_x = x
        x = self.ffn(x)
        x = self.dropout2(x, training=training)
        x = self.norm2(x + residual_x)
        print("output shape for Encoderlayer ",x.shape)

        return x

In [None]:
class SequentialEncoder(tf.keras.layers.Layer):
    """Applies `num_layers` `EncoderLayer` blocks one after another.

    Args:
        d_model: Passed to every `EncoderLayer`.
        ffn_hidden: Passed to every `EncoderLayer`.
        num_heads: Passed to every `EncoderLayer`.
        drop_prob: Passed to every `EncoderLayer`.
        num_layers: Number of encoder blocks to stack.
        max_sequence_length: Accepted for signature compatibility; not used here.
    """

    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob, num_layers, max_sequence_length):
        super(SequentialEncoder, self).__init__()
        self.layers = [EncoderLayer(d_model, ffn_hidden, num_heads, drop_prob) for _ in range(num_layers)]

    def call(self, x, training=True, mask=None):
        """Feed `x` through every encoder block in order.

        Args:
            x: Input tensor.
            training: Training-mode flag handed to each block.
            mask: Self-attention mask handed to each block (may be `None`).

        Returns:
            The output of the last encoder block.
        """
        for layer in self.layers:
            # EncoderLayer.call is (x, self_attention_mask, training): the mask
            # goes second and the training flag is passed by keyword so the
            # two cannot be swapped.
            x = layer(x, mask, training=training)
        return x

In [None]:
class CNN(tf.keras.Model):
    """Model wrapper around a single `ConvolutionalLayer`."""

    def __init__(self):
        super(CNN, self).__init__()
        # NOTE(review): ConvolutionalLayer accepts `input_shape` but does not
        # use it, so this value currently has no effect.
        self.cnn_layer = ConvolutionalLayer(input_shape=(500,40))  # Adjust input shape

    def call(self, inputs, training=None):
        """Run the convolutional front-end.

        Args:
            inputs: Input tensor for the convolution stack.
            training: Forwarded explicitly to the inner layer (and from there
                to its batch-normalisation layers) instead of being dropped.

        Returns:
            The output of the convolutional layer.
        """
        cnn_output = self.cnn_layer(inputs, training=training)
        return cnn_output


In [None]:
# Module-level mask kept so other cells that read this name keep working.
# Encoder instances no longer depend on it.
self_attention_mask = create_self_attention_mask(30)


class Encoder(tf.keras.layers.Layer):
    """Transformer encoder: a `SequentialEncoder` applied with a causal mask.

    Args:
        d_model: Feature width passed to the encoder blocks.
        ffn_hidden: Feed-forward hidden width passed to the encoder blocks.
        num_heads: Number of attention heads per block.
        drop_prob: Dropout rate used inside the blocks.
        num_layers: Number of encoder blocks.
        max_sequence_length: Sequence length the self-attention mask is built for.
    """

    def __init__(self,
                 d_model,
                 ffn_hidden,
                 num_heads,
                 drop_prob,
                 num_layers,
                 max_sequence_length):
        super(Encoder, self).__init__()
        self.max_length = max_sequence_length
        # Build the mask from this instance's configured length rather than
        # relying on a global hard-coded to 30.
        self.self_attention_mask = create_self_attention_mask(max_sequence_length)
        self.layers = SequentialEncoder(d_model, ffn_hidden, num_heads, drop_prob, num_layers, max_sequence_length)

    def call(self, x, training=True):
        """Encode `x` with the stacked encoder blocks.

        Args:
            x: Input tensor (in this notebook, the convolutional output).
            training: Training-mode flag passed on to the encoder stack.

        Returns:
            The output of the encoder stack.
        """
        print("Encoder input shape is", x.shape)
        x = self.layers(x, training=training, mask=self.self_attention_mask)
        return x

In [None]:
# Hyperparameters of the encoder stack.
d_model = 64
max_length = 30
num_heads = 8
num_layers = 1
ffn_hidden = 1024

# CNN front-end -> Transformer encoder -> flatten -> 30 regression outputs.
model = tf.keras.Sequential()
model.add(CNN())
model.add(
    Encoder(
        d_model=d_model,
        ffn_hidden=ffn_hidden,
        num_heads=num_heads,
        drop_prob=0.1,
        num_layers=num_layers,
        max_sequence_length=max_length,
    )
)
model.add(Flatten())
model.add(Dense(30))
# model.build(input_shape=(None, 500, 20))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the (features, labels) pairs for validation; the fixed seed
# keeps the split reproducible.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_data,
    Y_data,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each split: features first, then labels.
for split in (X_train, X_val, Y_train, Y_val):
    print(split.shape)

In [None]:
# Adam with a step size of 1e-3 (the Keras default). The previous value of 5
# is several orders of magnitude too large for Adam and makes the loss diverge.
optim = tf.keras.optimizers.Adam(learning_rate=1e-3)
# Regression set-up: mean squared error as the loss, mean absolute error
# reported as an additional metric.
model.compile(optimizer=optim, loss='mean_squared_error',metrics=['mae'])
# model.build(input_shape=(13100,500,40))  # Replace your_input_shape with the actual input shape
# model.summary()


In [None]:
def loss_function(y_true, y_pred):
    """Mean squared error over all elements.

    Args:
        y_true: Target values; may be integer-typed (e.g. padded token ids).
        y_pred: Predicted values.

    Returns:
        A scalar tensor: the mean of `(y_true - y_pred) ** 2`.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    # Targets are cast to the prediction dtype first; subtracting an integer
    # tensor from a float tensor would otherwise raise a dtype mismatch.
    y_true = tf.cast(y_true, y_pred.dtype)
    return tf.reduce_mean(tf.square(y_true - y_pred))

In [None]:
# Train for a single epoch, evaluating on the held-out split after it.
model.fit(
    x=X_train,
    y=Y_train,
    epochs=1,
    validation_data=(X_val, Y_val),
)

In [None]:
# Predict on the validation split and compare the first prediction with its target.
prediction = model.predict(X_val)
for label, first_row in (("prediction ", prediction[0]), ("actual ", Y_val[0])):
    print(label, first_row)