In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
import librosa

In [None]:
import pandas as pd

# Read the LJSpeech metadata: one row per clip, pipe-separated columns
# (clip id | raw transcript | normalised transcript).
data = pd.read_csv(
    "E:\\ML\\data\\LJSpeech-1.1\\metadata.csv",
    sep="|",
    header=None,
    names=["ID", "Text1", "Text2"],
    # 3 == csv.QUOTE_NONE. The transcripts contain literal double quotes; with
    # pandas' default quote handling an unbalanced quote swallows the following
    # lines into one field, so rows are silently merged and the number of
    # labels no longer matches the number of audio clips.
    quoting=3,
)
texts = data["Text1"].to_list()
ID = data["ID"].to_list()

# Word-level tokenisation: every transcript becomes a list of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
test = tokenizer.word_index  # word -> integer id mapping, kept for inspection
print(sequences[:10])

# Pad (with trailing zeros) or cut every sequence to exactly 30 ids.
# Stored under its own name instead of `X_test`, which the train/test split
# cell later rebinds to audio features.
padded_sequences = pad_sequences(sequences, padding="post", maxlen=30)
y_text = np.array(padded_sequences)

In [None]:
# Function to extract MFCC features from audio


def feature_extraction(audio_path, desired_frames):
    """Load one audio file and return its MFCC matrix with a fixed frame count.

    Args:
        audio_path: Path of the audio file, passed to ``librosa.load``.
        desired_frames: Number of frames (columns) the result must have.

    Returns:
        The MFCC array with its second axis zero-padded at the end or cut
        to ``desired_frames`` columns. 20 coefficients are requested, so the
        first axis is expected to have length 20 for mono audio.
    """
    signal, sr = librosa.load(audio_path, res_type="kaiser_fast")
    mfcc = librosa.feature.mfcc(
        y=signal,  # Audio signal
        sr=sr,  # Sampling rate reported by librosa.load
        n_mfcc=20,  # Number of MFCC coefficients to compute
        n_fft=2048,  # FFT window size
        hop_length=512,  # Number of samples between successive frames
        n_mels=128,  # Number of Mel bands to generate
        htk=True,  # Use HTK formula for Mel filter banks
        norm="ortho",  # Orthonormal DCT basis (DCT norm, not a Mel-filter norm)
        center=False,  # Do not center the frame
        pad_mode="constant",  # Padding mode
    )

    # Positive shortfall -> too few frames (pad); negative -> too many (cut).
    shortfall = desired_frames - mfcc.shape[1]
    if shortfall > 0:
        # np.pad defaults to constant zero padding; only the end of the
        # frame axis is extended.
        mfcc = np.pad(mfcc, pad_width=((0, 0), (0, shortfall)))
    elif shortfall < 0:
        mfcc = mfcc[:, :desired_frames]
    return mfcc


# Load your dataset
dataset_directory = "E:\\ML\\data\\LJSpeech-1.1\\wavs\\"

# Build the file list from the metadata ids rather than os.listdir():
# directory listing order is arbitrary, whereas the labels in y_text follow
# the metadata row order. Driving the loop from `ID` guarantees that row k of
# X_mfcc belongs to the same clip as row k of y_text.
audio_files = [f"{clip_id}.wav" for clip_id in ID]

# Lists to store features and transcriptions
mfcc_features_list = []
transcription_sequences = []

# Process each audio file
for i, audio_file in enumerate(audio_files):
    audio_path = os.path.join(dataset_directory, audio_file)

    # Extract MFCC features, padded/truncated to 500 frames per clip
    mfcc_features_list.append(feature_extraction(audio_path, 500))

    # Lightweight progress indicator: one line every 1000 clips
    if i % 1000 == 0:
        print(i)

# Stack the per-clip matrices into a single array with one leading row per clip
X_mfcc = np.array(mfcc_features_list)
# np.save("E:\\ML\\data", X_mfcc)

In [None]:
def transformer_block(inputs, head_size, num_heads, ff_dim, dropout=0.1):
    """Apply one self-attention + position-wise feed-forward block.

    Args:
        inputs: Keras tensor; self-attention uses it as both query and value.
        head_size: ``key_dim`` of each attention head.
        num_heads: Number of attention heads.
        ff_dim: Number of filters in the hidden (ReLU) feed-forward layer.
        dropout: Dropout rate used inside attention and after each sub-layer.

    Returns:
        A tensor whose last dimension equals ``inputs.shape[-1]``: the second
        Conv1D projects back to that width so the residual sums are valid.
    """
    # Self-attention sub-layer
    attention_out = layers.MultiHeadAttention(
        key_dim=head_size, num_heads=num_heads, dropout=dropout
    )(inputs, inputs)
    attention_out = layers.Dropout(dropout)(attention_out)
    # Residual connection around attention. Previously the block input was
    # dropped here, so stacked blocks had no skip path through this sub-layer.
    res = inputs + attention_out

    # Feed-forward sub-layer: kernel_size=1 convolutions act on each position
    # independently.
    x = layers.LayerNormalization(epsilon=1e-6)(res)
    x = layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")(x)
    x = layers.Dropout(dropout)(x)
    x = layers.Conv1D(filters=inputs.shape[-1], kernel_size=1)(x)
    return res + x

In [None]:
def build_transformer_stt_model(
    input_shape,
    head_size,
    num_heads,
    ff_dim,
    num_transformer_blocks,
    mlp_units,
    dropout=0.1,
    output_dim=None,
):
    """Build the Keras model: transformer blocks -> pooling -> MLP -> linear head.

    Args:
        input_shape: Shape of one sample (without the batch axis).
        head_size: ``key_dim`` of each attention head.
        num_heads: Number of attention heads per block.
        ff_dim: Hidden width of each block's feed-forward part.
        num_transformer_blocks: How many transformer blocks to stack.
        mlp_units: Iterable of hidden sizes for the dense layers after pooling.
        dropout: Dropout rate used in the blocks and after each dense layer.
        output_dim: Number of units in the final linear layer. When ``None``
            (the default) the module-level ``vocab_size`` is used, which is
            how this function behaved before the parameter existed.

    Returns:
        An uncompiled ``tf.keras.Model``.

    Raises:
        NameError: If ``output_dim`` is ``None`` and no global ``vocab_size``
            has been defined yet.
    """
    if output_dim is None:
        # Backward-compatible fallback to the notebook-level global.
        output_dim = vocab_size

    inputs = tf.keras.Input(shape=input_shape)
    x = inputs
    for _ in range(num_transformer_blocks):
        x = transformer_block(x, head_size, num_heads, ff_dim, dropout)

    # With data_format="channels_first" the pooling averages over the last
    # axis, leaving one value per entry of the first non-batch axis.
    x = layers.GlobalAveragePooling1D(data_format="channels_first")(x)
    for dim in mlp_units:
        x = layers.Dense(dim, activation="relu")(x)
        x = layers.Dropout(dropout)(x)

    # Linear output layer (no activation)
    outputs = layers.Dense(output_dim)(x)
    return tf.keras.Model(inputs=inputs, outputs=outputs)

In [None]:
# Features (X_mfcc) and labels (y_text) must already exist at this point.
# A cached feature array could be loaded here instead of recomputing it:
# X_mfcc = np.load("../data/data_mfcc.npy")

# Hold out 20% of the samples; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_mfcc,
    y_text,
    test_size=0.2,
    random_state=42,
)

# Sanity-check the array shapes, one per line.
for shape_info in (X_mfcc.shape, y_text.shape, X_train.shape, X_train.shape[1]):
    print(shape_info)

In [None]:
# Example usage
head_size = 256
num_heads = 4
ff_dim = 4
num_transformer_blocks = 4
mlp_units = [128]

# The model is trained with an element-wise MAE loss directly against the
# padded token-id sequences, so the output layer needs exactly one unit per
# label position. Derive that width from the labels instead of hard-coding it,
# so changing the padding length cannot silently break training.
# The name `vocab_size` is kept because build_transformer_stt_model reads it.
vocab_size = y_text.shape[1]

model = build_transformer_stt_model(
    input_shape=(X_mfcc.shape[1], X_mfcc.shape[2]),
    head_size=head_size,
    num_heads=num_heads,
    ff_dim=ff_dim,
    num_transformer_blocks=num_transformer_blocks,
    mlp_units=mlp_units,
)

# Compile the model with appropriate loss and optimizer
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss="mae", metrics=["mae"])

In [None]:
# Print the layer-by-layer overview of the model built above.
model.summary()

In [None]:
# Train for 5 epochs; Keras holds out the last 20% of the training arrays
# for validation.
model.fit(
    x=X_train,
    y=y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.2,
)

In [None]:
# Predict for a single training sample.
# Slice with [1:2] rather than indexing with [1]: Keras expects a leading
# batch axis, and indexing would drop it, making predict() interpret the
# sample's first axis as the batch and fail on the shape mismatch.
# (The former reshape to X_train's own shape was a no-op and is removed.)
sample_batch = X_train[1:2]
predicted = model.predict(sample_batch)

# predicted has one row per input sample; show that row next to its label.
print(predicted[0], y_train[1])