In [1]:
# !pip install --upgrade datasets fsspec aiohttp


## Importing libraries

In [2]:
from tensorflow.keras import layers, models
from huggingface_hub import login
import os
import numpy as np
import librosa
import tensorflow as tf
tf.config.run_functions_eagerly(True)
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import soundfile as sf
from datasets import load_dataset
from tqdm import tqdm
from itertools import islice
import string 

2025-05-04 18:57:40.342209: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-04 18:57:40.354192: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746365260.368956   38984 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746365260.374073   38984 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746365260.384183   38984 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

## Setting up variables for data downloading & training 

In [3]:
import os
login(token="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")
os.environ["HF_DATASETS_OFFLINE"] = "0"

In [4]:
batch_size = 32
epochs = 10 
validation_split = 0.2 

max_samples = 1000
max_audio_duration =5.0
# ample_rate=16000

In [5]:
CHARS = string.ascii_lowercase + " '"
CHARS

"abcdefghijklmnopqrstuvwxyz '"

##  Preprocess functions : utils

In [6]:


class AudioPreprocessor:
    def __init__(self, sample_rate=16000, max_duration=5.0):
        self.sample_rate = sample_rate
        self.max_duration = max_duration  # seconds
        self.max_len_samples = int(sample_rate * max_duration)
        self.max_len_time_steps = int(np.floor(self.max_len_samples / 512)) + 1


    def load_and_process_audio(self, file_path):
        """Loads audio, pads/truncates, and computes log Mel spectrogram."""
        try:
            y, sr = librosa.load(file_path, sr=self.sample_rate)
        except Exception as e:
            print(f"Error loading audio file {file_path}: {e}")
            return None # Return None if loading fails

        # Pad or truncate audio samples
        if len(y) > self.max_len_samples:
            y = y[:self.max_len_samples]
        else:
            y = np.pad(y, (0, max(0, self.max_len_samples - len(y))))

        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
        if log_mel_spec.shape[1] > self.max_len_time_steps:
             log_mel_spec = log_mel_spec[:, :self.max_len_time_steps]
        elif log_mel_spec.shape[1] < self.max_len_time_steps:
             pad_width = self.max_len_time_steps - log_mel_spec.shape[1]
             log_mel_spec = np.pad(log_mel_spec, ((0, 0), (0, pad_width)), mode='constant')

       
        return log_mel_spec.T

class TextPreprocessor:
    def __init__(self):
        self.chars = sorted(list(CHARS))
        self.char_to_int = {c: i for i, c in enumerate(self.chars)}
        self.padding_index = len(self.chars)
        self.int_to_char = {i: c for c, i in self.char_to_int.items()}
        self.int_to_char[self.padding_index] = "<pad>" 


    def encode(self, text):
        encoded = [self.char_to_int[c] for c in text.lower() if c in self.char_to_int]
        return encoded

    def decode(self, seq):
        decoded_chars = [self.int_to_char[i] for i in seq if i != self.padding_index]
        return "".join(decoded_chars)

    def get_vocab_size(self):
        return len(self.chars) + 1 



## Data Loading & Preprocessing

Data used in training is fetched from hugging face "common_voice_13_0" , As it is taking high resource I have used the streaming to collect only 1000 samples for training

In [8]:
def load_data(sample_rate=16000, max_samples=max_samples, max_audio_duration=max_audio_duration):

    dataset = load_dataset(
        "mozilla-foundation/common_voice_13_0",
        "en", 
        split="train", 
        streaming=True,
        storage_options={"http": {}} )

   
    print(f"Taking the first {max_samples} samples...")
    dataset_head = list(islice(dataset, max_samples))

    audio_proc = AudioPreprocessor(sample_rate, max_audio_duration)
    text_proc = TextPreprocessor()

    temp_audio_dir = "temp_audio"
    os.makedirs(temp_audio_dir, exist_ok=True)

    X = [] 
    texts = [] 
    processed_count = 0
    for i, item in tqdm(enumerate(dataset_head), total=max_samples):
        try:
            audio_array = item["audio"]["array"]
          
            path = os.path.join(temp_audio_dir, f"sample_{i}.wav")
            sf.write(path, audio_array, samplerate=sample_rate)
            processed_audio = audio_proc.load_and_process_audio(path)

            original_text = item["sentence"]

            if processed_audio is not None:
                X.append(processed_audio)
                texts.append(original_text)
                processed_count += 1


        except Exception as e:
            print(f"Error processing sample {i}: {e}")
         

    print(f"Successfully processed {processed_count} samples.")

    print("Encoding text data...")
    encoded_texts = [text_proc.encode(t) for t in texts]
    target_seq_length = audio_proc.max_len_time_steps
    y = pad_sequences(
        encoded_texts,
        maxlen=target_seq_length, 
        padding='post', 
        value=text_proc.padding_index )

    X = pad_sequences(X, padding='post', dtype='float32', value=0.0)
    sample_weights = np.zeros_like(y, dtype=np.float32)
    for i, seq in enumerate(y):
        non_padding_indices = np.where(seq != text_proc.padding_index)[0]
        if len(non_padding_indices) > 0:
             sample_weights[i, non_padding_indices] = 1.0
    min_samples = min(len(X), len(y), len(sample_weights))
    X = np.array(X[:min_samples])
    y = y[:min_samples]
    sample_weights = sample_weights[:min_samples]

    print("Data loading and preprocessing complete.")
    return X, y, sample_weights, text_proc

In [9]:
X, y, sample_weights, text_proc = load_data()

X.shape , y.shape , sample_weights.shape

Taking the first 1000 samples...


Reading metadata...: 1013968it [00:58, 17387.54it/s]
100%|███████████████████████████████████████| 1000/1000 [00:22<00:00, 43.94it/s]


Successfully processed 1000 samples.
Encoding text data...
Data loading and preprocessing complete.


((1000, 157, 128), (1000, 157), (1000, 157))

## Model : Bidirectional LSTM with time distributed layers

In [10]:

# def build_asr_model(input_shape, output_dim):
#     inputs = layers.Input(shape=input_shape, name="input_features")
#     x = layers.Masking(mask_value=0.0, name="masking_input")(inputs)
#     x = layers.Bidirectional(layers.LSTM(128, return_sequences=True), name="bidirectional_lstm_1")(x)
#     x = layers.Bidirectional(layers.LSTM(128, return_sequences=True), name="bidirectional_lstm_2")(x)
#     x = layers.TimeDistributed(layers.Dense(256, activation='relu'), name="time_distributed_dense_1")(x)
#     outputs = layers.TimeDistributed(layers.Dense(output_dim, activation='softmax'), name="output_softmax")(x)

#     model = models.Model(inputs, outputs, name="asr_model")
#     model.compile(
#         optimizer='adam', 
#         loss='sparse_categorical_crossentropy',
#         metrics=['accuracy'] 
#     )
#     return model



def build_asr_model(input_shape, output_dim, dropout_rate=0.2):
    inputs = layers.Input(shape=input_shape, name="input_features")
    x = layers.Masking(mask_value=0.0, name="masking_input")(inputs)
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=dropout_rate), name="bidirectional_lstm_1")(x)
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=dropout_rate), name="bidirectional_lstm_2")(x)
    x = layers.TimeDistributed(layers.Dense(256, activation='relu'), name="time_distributed_dense_1")(x)
    outputs = layers.TimeDistributed(layers.Dense(output_dim, activation='softmax'), name="output_softmax")(x)
    model = models.Model(inputs, outputs, name="asr_model")
    model.compile(
        optimizer='adam', 
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model


In [11]:
output_dim = text_proc.get_vocab_size()
input_shape = (X.shape[1], X.shape[2])
# model = build_asr_model(input_shape, output_dim)
model = build_asr_model(input_shape=(157, 128), output_dim=len(text_proc.chars)+1)

2025-05-04 19:00:41.213824: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [12]:

model.summary()

## Training

In [None]:
history = model.fit(
        X,
        y,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=validation_split,
        sample_weight=sample_weights
    )


Epoch 1/10


2025-05-04 19:00:49.853530: E tensorflow/core/util/util.cc:131] oneDNN supports DT_BOOL only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 5s/step - accuracy: 0.0468 - loss: 1.1241 - val_accuracy: 0.0575 - val_loss: 1.0972
Epoch 2/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m225s[0m 9s/step - accuracy: 0.0581 - loss: 1.1062 - val_accuracy: 0.0596 - val_loss: 1.0963
Epoch 3/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 6s/step - accuracy: 0.0585 - loss: 1.0792 - val_accuracy: 0.0596 - val_loss: 1.0933
Epoch 4/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 4s/step - accuracy: 0.0598 - loss: 1.0995 - val_accuracy: 0.0612 - val_loss: 1.0921
Epoch 5/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 4s/step - accuracy: 0.0599 - loss: 1.0829 - val_accuracy: 0.0618 - val_loss: 1.0906
Epoch 6/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 4s/step - accuracy: 0.0605 - loss: 1.0746 - val_accuracy: 0.0613 - val_loss: 1.0901
Epoch 7/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━

## Saving Model

In [19]:
ModelPath = "Model/asr_model.h5" 
model.save(ModelPath)



## Loading Model for Inferencing

In [23]:
model = tf.keras.models.load_model(ModelPath)

In [36]:
def transcribe_long_audio(model, audio_path, sample_rate=16000, chunk_duration=5.0, overlap=1.0):
    audio_proc = AudioPreprocessor(sample_rate, chunk_duration)
    text_proc = TextPreprocessor()

    y, sr = librosa.load(audio_path, sr=sample_rate)
    chunk_samples = int(sample_rate * chunk_duration)
    step_size = int(chunk_samples - sample_rate * overlap)

    transcript = ""

    for start in range(0, len(y), step_size):
        end = start + chunk_samples
        chunk = y[start:end]

        if len(chunk) < chunk_samples:
            chunk = np.pad(chunk, (0, chunk_samples - len(chunk)), mode='constant')

        mel_spec = librosa.feature.melspectrogram(y=chunk, sr=sr, n_mels=128)
        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        if log_mel_spec.shape[1] > 157:
            log_mel_spec = log_mel_spec[:, :157]
        elif log_mel_spec.shape[1] < 157:
            pad_width = 157 - log_mel_spec.shape[1]
            log_mel_spec = np.pad(log_mel_spec, ((0, 0), (0, pad_width)), mode='constant')

        input_data = log_mel_spec.T[np.newaxis, ...] 
        prediction = model.predict(input_data)
        predicted_ids = np.argmax(prediction, axis=-1)[0]
        decoded_text = text_proc.decode(predicted_ids)
        transcript += decoded_text.strip() + " "

    return transcript.strip()


In [38]:
audioPath = "audio2.mpga"

transcribe_long_audio(model , audioPath)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step


'the the'