In [1]:
import tensorflow as tf
import numpy as np
import librosa
import soundfile as sf
import os
import matplotlib.pyplot as plt
import shutil
from pathlib import Path
from audiomentations import AddGaussianNoise
import random
from scipy.signal import butter, lfilter
from IPython.display import Audio
     

In [2]:

class Augmenter:
    """Randomly applies waveform-level distortions to a mono audio clip.

    Each distortion (Gaussian noise, reverb, local segment shuffle, time
    stretch, noise-filled gaps, band-stop frequency masking) is selected
    independently with its own probability, and the selected ones are applied
    in that fixed order.

    The class relies on eager TensorFlow execution (Python-side randomness and
    control flow), so it is not usable inside a ``tf.function``.
    """

    def __init__(self, sr=16000,
                 noise_prob=0.4, noise_max_amp=0.01,
                 reverb_prob=0.3, reverb_delay=0.025, reverb_decay=0.2,
                 shuffle_prob=0.05, shuffle_segments=3,
                 time_stretch_prob=0.2, time_stretch_range=(0.9, 1.1),
                 gaps_prob=0.08, gaps_n=4, gaps_max_duration=0.1,
                 freq_mask_prob=0.2, freq_mask_n=1,
                 shuffle_seg_dur=0.08, shuffle_overlap=0.02, shuffle_local_range=3):
        """Store the per-distortion probabilities and parameters.

        Args:
            sr: Sample rate of the audio, in Hz.
            noise_prob / noise_max_amp: Probability of adding Gaussian noise and
                the maximum noise amplitude handed to ``AddGaussianNoise``.
            reverb_prob / reverb_delay / reverb_decay: Probability of the echo,
                its delay in seconds and its amplitude scale.
            shuffle_prob: Probability of the local segment shuffle.
            shuffle_segments: Kept for backward compatibility; the current
                shuffle implementation does not read it.
            time_stretch_prob / time_stretch_range: Probability of time
                stretching and the (low, high) range the rate is drawn from.
            gaps_prob / gaps_n / gaps_max_duration: Probability of inserting
                gaps, how many, and the maximum gap length in seconds.
            freq_mask_prob / freq_mask_n: Probability of band-stop filtering and
                the number of bands removed.
            shuffle_seg_dur / shuffle_overlap: Micro-segment length and overlap
                in seconds used by the shuffle.
            shuffle_local_range: Maximum distance (in segments) a segment can
                be displaced by the shuffle.
        """
        self.sr = sr
        # p=1.0: the transform always fires once called; whether it is called
        # at all is decided by noise_prob in augment().
        self.noise_aug = AddGaussianNoise(p=1.0, max_amplitude=noise_max_amp)

        self.noise_prob = noise_prob
        self.reverb_prob = reverb_prob
        self.reverb_delay = reverb_delay
        self.reverb_decay = reverb_decay
        self.shuffle_prob = shuffle_prob
        self.shuffle_segments = shuffle_segments
        self.time_stretch_prob = time_stretch_prob
        self.time_stretch_range = time_stretch_range
        self.gaps_prob = gaps_prob
        self.gaps_n = gaps_n
        self.gaps_max_duration = gaps_max_duration
        self.freq_mask_prob = freq_mask_prob
        self.freq_mask_n = freq_mask_n
        self.shuffle_seg_dur = shuffle_seg_dur
        self.shuffle_overlap = shuffle_overlap
        self.shuffle_local_range = shuffle_local_range

    def augment(self, audio):
        """Apply a random subset of the distortions to ``audio``.

        Args:
            audio: 1-D waveform (array-like or ``tf.Tensor``).

        Returns:
            The distorted waveform. It is a ``tf.Tensor`` unless the last
            distortion applied was the segment shuffle or the frequency mask,
            which return ``np.ndarray``.
        """
        if not isinstance(audio, tf.Tensor):
            audio = tf.convert_to_tensor(audio, dtype=tf.float32)

        # (probability, distortion) pairs, listed in application order.
        candidates = (
            (self.noise_prob, self._add_noise),              # 1. grainy Gaussian noise
            (self.reverb_prob, self._add_reverb),            # 2. single delayed echo
            (self.shuffle_prob, self._segment_shuffle),      # 3. local temporal jitter
            (self.time_stretch_prob, self._time_stretch),    # 4. slower / faster playback
            (self.gaps_prob, self._add_missing_gaps),        # 5. short drop-outs
            (self.freq_mask_prob, self._add_frequency_mask)  # 6. band-stop filtering
        )
        # All six selection draws happen before any distortion runs, so the
        # distortions' own use of `random` cannot influence the selection.
        selected = [distort for prob, distort in candidates if random.random() < prob]

        for distort in selected:
            audio = distort(audio)
        return audio

    def _add_noise(self, audio):
        """Add Gaussian noise via audiomentations; returns a float32 tensor."""
        # Hand the transform a plain float32 NumPy array rather than a tensor.
        samples = np.asarray(audio, dtype=np.float32)
        noisy = self.noise_aug(samples=samples, sample_rate=self.sr)
        return tf.convert_to_tensor(noisy, dtype=tf.float32)

    def _add_reverb(self, audio):
        """Add one echo: the signal delayed by ``reverb_delay`` seconds and
        scaled by ``reverb_decay``, truncated to the original length."""
        audio = tf.convert_to_tensor(audio, dtype=tf.float32)
        delay = int(self.reverb_delay * self.sr)  # delay in samples
        reverb = tf.pad(audio * self.reverb_decay, [[delay, 0]])
        reverb = reverb[:tf.shape(audio)[0]]
        return audio + reverb

    def _segment_shuffle(self, audio, n_segments=None):
        """Simulate temporal jitter by locally shuffling short segments.

        The clip is cut into segments of ``shuffle_seg_dur`` seconds that
        overlap by ``shuffle_overlap`` seconds. A few regions are picked at
        random and, around each, segments are replaced by a neighbour at most
        ``shuffle_local_range`` positions away.

        Args:
            audio: 1-D waveform (array-like or eager tensor).
            n_segments: Unused; kept for backward compatibility.

        Returns:
            ``np.ndarray`` with the same length as ``audio``.

        Raises:
            ValueError: If the configured segment length or hop is not
                positive (the segmentation loop would never terminate).
        """
        samples = np.asarray(audio)
        total = len(samples)
        if total == 0:
            return samples  # nothing to shuffle

        seg_len = int(self.shuffle_seg_dur * self.sr)
        overlap = int(self.shuffle_overlap * self.sr)
        hop = seg_len - overlap
        if seg_len <= 0 or hop <= 0:
            raise ValueError(
                "shuffle_seg_dur must be positive and larger than shuffle_overlap "
                f"(got segment={seg_len} samples, overlap={overlap} samples)"
            )
        local_range = self.shuffle_local_range

        # NOTE(review): overlapping segments are concatenated as-is (no
        # overlap-add), so overlap regions are repeated and the result is cut
        # back to the input length, dropping the tail of the clip. Confirm this
        # is the intended effect.
        segments = [samples[begin:begin + seg_len] for begin in range(0, total, hop)]
        last = len(segments) - 1

        n_regions = min(4, max(1, len(segments) // 10))
        region_indices = random.sample(range(len(segments)), n_regions)
        shuffled = segments.copy()
        for region in region_indices:
            for offset in range(-local_range, local_range + 1):
                idx = region + offset
                if 0 <= idx <= last:
                    shift = random.randint(-local_range, local_range)
                    shuffled[idx] = segments[min(last, max(0, idx + shift))]
        return np.concatenate(shuffled)[:total]

    def _time_stretch(self, audio):
        """Change the speed by a random rate from ``time_stretch_range``.

        Returns a float32 tensor whose length differs from the input.
        """
        # librosa only accepts np.ndarray input; passing a tf.Tensor raises.
        samples = np.asarray(audio, dtype=np.float32)
        rate = random.uniform(*self.time_stretch_range)
        stretched = librosa.effects.time_stretch(samples, rate=rate)
        return tf.constant(stretched, dtype=tf.float32)

    def _add_missing_gaps(self, audio, n_gaps=None, max_gap_duration=None):
        """Replace short random spans with low-level noise, with smooth edges.

        For every gap the signal is faded out, replaced by Gaussian noise
        (stddev 0.001), and faded back in. Each fade covers 5% of the gap.

        Args:
            audio: 1-D waveform (array-like or eager tensor).
            n_gaps: Number of gaps; defaults to ``self.gaps_n``.
            max_gap_duration: Upper bound of a gap's length in seconds;
                defaults to ``self.gaps_max_duration``.

        Returns:
            float32 ``tf.Tensor`` with the same length as ``audio``.
        """
        if n_gaps is None:
            n_gaps = self.gaps_n
        if max_gap_duration is None:
            max_gap_duration = self.gaps_max_duration

        gap_audio = tf.cast(tf.identity(audio), tf.float32)
        total = int(tf.shape(gap_audio)[0])
        # The lower bound stays at 0.1 s but never exceeds the upper bound.
        min_gap_duration = min(0.1, max_gap_duration)

        for _ in range(n_gaps):
            gap_duration = tf.random.uniform(
                [], min_gap_duration, max_gap_duration, dtype=tf.float32
            )
            # Never ask for a gap longer than the clip itself.
            gap_samples = min(int(gap_duration * self.sr), total)
            if gap_samples <= 0:
                continue

            latest_start = total - gap_samples
            # An integer uniform draw needs a non-empty range.
            if latest_start > 0:
                start = int(tf.random.uniform([], 0, latest_start, dtype=tf.int32))
            else:
                start = 0
            end = start + gap_samples

            fade_len = min(int(0.05 * gap_samples), gap_samples // 4)
            mid_len = gap_samples - 2 * fade_len

            # Per-sample gain: 1 outside the gap, ramps at the edges, 0 inside.
            gain = np.ones(total, dtype=np.float32)
            gain[start:start + fade_len] = np.linspace(1.0, 0.0, fade_len, dtype=np.float32)
            gain[start + fade_len:end - fade_len] = 0.0
            gain[end - fade_len:end] = np.linspace(0.0, 1.0, fade_len, dtype=np.float32)

            # Low-level noise fills the silenced middle of the gap.
            fill = np.zeros(total, dtype=np.float32)
            fill[start + fade_len:end - fade_len] = tf.random.normal(
                [mid_len], stddev=0.001, dtype=tf.float32
            ).numpy()

            gap_audio = gap_audio * gain + fill

        return gap_audio

    def _add_frequency_mask(self, audio, n_masks=None):
        """Remove random frequency bands with 4th-order Butterworth band-stop filters.

        Filtering the waveform (instead of zeroing spectrogram bins) models
        frequency loss during capture or transmission and also affects phase
        and harmonics. Band edges are kept 100 Hz below the Nyquist frequency
        (``sr / 2``).

        Args:
            audio: 1-D waveform (array-like or eager tensor).
            n_masks: Number of bands to remove; defaults to ``self.freq_mask_n``.

        Returns:
            ``np.ndarray`` copy of the waveform after filtering.
        """
        if n_masks is None:
            n_masks = self.freq_mask_n

        # np.array copies, so the caller's buffer is never modified.
        masked_audio = np.array(audio)
        nyquist = self.sr / 2
        for _ in range(n_masks):
            l_freq = random.uniform(500, 5000)
            h_freq = l_freq + random.uniform(500, 2000)
            l_freq = min(l_freq, nyquist - 100)
            h_freq = min(h_freq, nyquist - 100)
            # At low sample rates the clamp can collapse the band; butter()
            # needs 0 < low < high, so such a band is skipped.
            if not 0 < l_freq < h_freq:
                continue
            b, a = butter(N=4, Wn=[l_freq, h_freq], btype="bandstop", fs=self.sr)
            masked_audio = lfilter(b, a, masked_audio)
        return masked_audio
# Module-level Augmenter instance built with the default parameters.
augmenter=Augmenter()


In [12]:

# --- Configuration ---
# 1. Root of the source dataset (FLAC audio plus transcript files)
input_root_dir = "./dataset/LibriSpeech/train-clean-100/"

# 2. Root of the converted dataset (augmented WAV audio plus copied files)
output_root_dir = "./dataset/LibriSpeech-WAV-Complete/"
# ---

augmenter=Augmenter()
print(f"Starting transfer and conversion from '{input_root_dir}'...")
print(f"Output will be saved in '{output_root_dir}'.")

# Walk through the entire directory structure
for dirpath, _, filenames in os.walk(input_root_dir):
    # Mirror the source layout under the output root (once per directory).
    relative_path = os.path.relpath(dirpath, input_root_dir)
    output_dir = os.path.join(output_root_dir, relative_path)
    os.makedirs(output_dir, exist_ok=True)

    for filename in filenames:
        input_file_path = os.path.join(dirpath, filename)

        # --- Either convert the file (audio) or copy it unchanged ---
        try:
            if filename.endswith(".flac"):
                wav_filename = Path(filename).stem + ".wav"
                output_file_path = os.path.join(output_dir, wav_filename)
                audio, sr = librosa.load(input_file_path, sr=16000)

                # A failing augmentation must not drop the utterance: the
                # transcript file is copied in full, so a missing WAV would
                # leave audio and labels out of step. Fall back to clean audio.
                try:
                    augmented_audio = augmenter.augment(audio)
                except Exception as aug_error:
                    print(f"Augmentation failed for {input_file_path}: {aug_error}; writing un-augmented audio")
                    augmented_audio = audio

                # augment() may return a tensor or an array of either float
                # width; write a plain float32 array.
                sf.write(output_file_path, np.asarray(augmented_audio, dtype=np.float32), sr)

            else:
                # Non-audio file (e.g. .txt): copy it, preserving metadata.
                output_file_path = os.path.join(output_dir, filename)
                shutil.copy2(input_file_path, output_file_path)

        except Exception as e:
            print(f"Error processing {input_file_path}: {e}")

print("\n--- Process Complete ---")
print(f"New dataset is ready at: '{output_root_dir}'")

Starting transfer and conversion from './dataset/LibriSpeech/train-clean-100/'...
Output will be saved in './dataset/LibriSpeech-WAV-Complete/'.


KeyboardInterrupt: 

In [4]:

# --- 1. Configuration ---
# Run Keras layers under the mixed float16 policy.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

# Root of the converted WAV dataset read by load_data().
Training_dirs="./dataset/LibriSpeech-WAV-Complete/"

# Training parameters
SAMPLE_RATE = 16000
BATCH_SIZE = 32
EPOCHS = 15 # Set to a higher number for real training

# Character vocabulary: uppercase A-Z, apostrophe and space.
CHARACTERS = [chr(code) for code in range(ord('A'), ord('Z') + 1)] + ["'", ' ']

# Lookup layers mapping characters to integer IDs and back.
char_to_num = tf.keras.layers.StringLookup(
    vocabulary=list(CHARACTERS),
    mask_token=None,
)
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(),
    mask_token=None,
    invert=True,
)
VOCAB_SIZE = char_to_num.vocabulary_size()


In [5]:
def load_data(root_dir=None):
    """Collect matching (audio path, transcription) pairs from the dataset.

    The dataset is expected to be laid out as
    ``<root>/<speaker>/<chapter>/<speaker>-<chapter>.trans.txt`` with one
    ``<utterance-id> <text>`` entry per line and the audio stored next to it
    as ``<utterance-id>.wav``.

    Audio and text are paired by utterance ID, so a missing or extra WAV file
    only affects that utterance instead of shifting every following label.

    Args:
        root_dir: Dataset root. Defaults to the module-level ``Training_dirs``.

    Returns:
        ``(file_paths, transcriptions)``: two lists of equal length where
        ``transcriptions[i]`` is the text spoken in ``file_paths[i]``.
        Speakers, chapters are visited in sorted order; utterances follow the
        order of the transcript file.
    """
    if root_dir is None:
        root_dir = Training_dirs

    file_paths = []
    transcriptions = []
    label_files = []

    # sorted(): os.listdir order is arbitrary, and the train/validation split
    # downstream is positional.
    for speaker in sorted(os.listdir(root_dir)):
        speaker_dir = os.path.join(root_dir, speaker)
        if not os.path.isdir(speaker_dir):
            continue  # stray file at the dataset root
        for chapter in sorted(os.listdir(speaker_dir)):
            chapter_dir = os.path.join(root_dir, speaker, chapter)
            if not os.path.isdir(chapter_dir):
                continue
            label_path = os.path.join(chapter_dir, speaker + '-' + chapter + '.trans.txt')
            if not os.path.isfile(label_path):
                print(f"Skipping {chapter_dir}: transcript file {label_path} not found")
                continue
            label_files.append(label_path)

            with open(label_path, 'r', encoding='utf-8') as labels:
                for line in labels:
                    parts = line.strip().split(' ', maxsplit=1)
                    if len(parts) != 2:
                        continue  # blank line or an ID without text
                    utterance_id, text = parts
                    wav_path = os.path.join(chapter_dir, utterance_id + '.wav')
                    # Keep only utterances whose audio actually exists.
                    if os.path.isfile(wav_path):
                        file_paths.append(wav_path)
                        transcriptions.append(text.strip())

    print(len(label_files))

    return file_paths, transcriptions
#load_data()

In [6]:
# --- 2. tf.data Pipeline (No Augmentation) ---

def preprocess_audio(file_path):
    """Load an audio file with librosa and return its log-Mel spectrogram.

    Args:
        file_path: Path of the audio file, either as an object exposing
            ``.numpy()`` that yields the UTF-8 encoded path (e.g. the string
            tensor passed by ``tf.py_function``), as ``bytes``, or as a
            ``str`` / path-like.

    Returns:
        float32 array of shape ``(frames, 80, 1)``: the 80-band Mel power
        spectrogram in decibels relative to its maximum, time-major, with a
        trailing channel axis.

    Raises:
        Exception: Any loading or feature-extraction error is logged and
            re-raised unchanged.
    """
    raw_path = file_path.numpy() if hasattr(file_path, "numpy") else file_path
    path_str = raw_path.decode('utf-8') if isinstance(raw_path, bytes) else os.fspath(raw_path)

    try:
        y, sr = librosa.load(path_str, sr=SAMPLE_RATE)

        # Compute the Mel spectrogram
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=80)

        # Convert to log scale (decibels)
        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        # Time-major layout plus a channel dimension
        log_mel_spec = np.expand_dims(log_mel_spec.T, axis=-1)

        return log_mel_spec.astype(np.float32)
    except Exception as e:
        print(f"Error processing file {path_str}: {e}")
        # Abort with the real error. The previous os.exit() call does not
        # exist and replaced it with an AttributeError.
        raise
       

# Feature-extraction settings read by preprocess_audio_tf.
SAMPLE_RATE = 16000  # audio sample rate in Hz (same value as in the configuration cell)
N_FFT = 400  # STFT frame length and FFT size, in samples
HOP_LENGTH = 160  # STFT frame step, in samples
N_MELS = 80  # number of mel bins in the spectrogram

def power_to_db(S, ref=1.0, top_db=80.0):
    """Express a power spectrogram in decibels relative to ``ref``.

    Both ``S`` and ``ref`` are floored at 1e-10 before the logarithm, and the
    result is clipped from below at ``top_db`` under its own maximum.
    """
    floor = 1e-10
    ln_10 = tf.math.log(10.0)

    spec_db = 10.0 * (tf.math.log(tf.maximum(S, floor)) / ln_10)
    ref_db = 10.0 * (tf.math.log(tf.maximum(ref, floor)) / ln_10)
    relative_db = spec_db - ref_db

    lowest_allowed = tf.reduce_max(relative_db) - top_db
    return tf.maximum(relative_db, lowest_allowed)

@tf.function
def preprocess_audio_tf(file_path: tf.Tensor):
    """Read a WAV file and convert it to a log-Mel spectrogram in TensorFlow.

    The waveform is reflect-padded by ``N_FFT // 2`` samples on both sides
    before the STFT so that frames are centred, mirroring librosa's default
    framing.

    Errors are deliberately not caught here: inside a ``tf.function`` a Python
    ``try/except`` only sees trace-time errors, so it cannot handle unreadable
    files at run time and would only hide a tracing problem behind an all-zero
    spectrogram.

    Args:
        file_path: Scalar string tensor with the path of a WAV file.

    Returns:
        float32 tensor of shape ``(frames, N_MELS, 1)`` holding the Mel power
        spectrogram in decibels.
    """
    audio_binary = tf.io.read_file(file_path)

    # desired_channels=1 yields a mono signal; the file's own sample rate is
    # ignored and the audio is treated as SAMPLE_RATE.
    audio_tensor, _ = tf.audio.decode_wav(audio_binary, desired_channels=1)

    # Drop the channel dimension, leaving a 1-D waveform.
    waveform = tf.squeeze(audio_tensor, axis=-1)

    # Centre the frames the way librosa does by default.
    padding = N_FFT // 2
    waveform = tf.pad(waveform, [[padding, padding]], mode="REFLECT")

    stft = tf.signal.stft(
        waveform,
        frame_length=N_FFT,
        frame_step=HOP_LENGTH,
        fft_length=N_FFT
    )
    power_spectrogram = tf.abs(stft) ** 2

    # Project the linear-frequency bins onto the mel scale.
    mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins=N_MELS,
        num_spectrogram_bins=stft.shape[-1],
        sample_rate=SAMPLE_RATE,
        lower_edge_hertz=20.0,
        upper_edge_hertz=8000.0
    )
    mel_spectrogram = tf.tensordot(power_spectrogram, mel_filterbank, 1)

    log_mel_spectrogram = power_to_db(mel_spectrogram)
    log_mel_spectrogram = tf.expand_dims(log_mel_spectrogram, axis=-1)

    return tf.cast(log_mel_spectrogram, dtype=tf.float32)
    
def preprocess_label(text_label):
    """Encode a transcript as a sequence of integer character IDs.

    The text is upper-cased first so that it matches the vocabulary, then
    split into Unicode characters and mapped through ``char_to_num``.
    """
    upper_text = tf.strings.upper(text_label)
    return char_to_num(
        tf.strings.unicode_split(upper_text, input_encoding="UTF-8")
    )
# preprocess_and_filter (below) combines preprocess_audio_tf and preprocess_label for the tf.data pipeline.

@tf.function
def preprocess_and_filter(path, label):
    """Preprocess one (audio path, transcript) pair and report both lengths.

    Returns:
        ``(spectrogram, tokens, n_frames, n_tokens)`` where ``n_frames`` is
        the number of time steps in the spectrogram and ``n_tokens`` the
        number of encoded characters in the label.
    """
    features = preprocess_audio_tf(path)
    tokens = preprocess_label(label)

    # Sizes along the leading axis: time steps and characters.
    n_frames = tf.shape(features)[0]
    n_tokens = tf.shape(tokens)[0]

    return features, tokens, n_frames, n_tokens
#preprocess_audio_tf("/kaggle/working/LibriSpeech-WAV-Complete/1081/125237/1081-125237-0035.wav")

In [7]:
def build_pipeline(paths, labels, is_training=False, time_reduction=4):
    """Build the batched ``tf.data`` pipeline of (spectrogram, label) pairs.

    Args:
        paths: Sequence of WAV file paths.
        labels: Sequence of transcripts, aligned with ``paths``.
        is_training: When True, the (path, label) pairs are shuffled.
        time_reduction: Factor by which the model shortens the time axis
            before the CTC layer. The default of 4 corresponds to the two
            2x2 max-pooling layers in ``build_model``.

    Returns:
        A prefetched dataset of padded batches. Spectrograms are padded with
        0.0 and labels with 0.
    """
    path_ds = tf.data.Dataset.from_tensor_slices(paths)
    label_ds = tf.data.Dataset.from_tensor_slices(labels)

    ds = tf.data.Dataset.zip((path_ds, label_ds))
    if is_training:
        # Shuffling the cheap (path, label) strings allows a full-size buffer;
        # max(1, ...) keeps the buffer size valid for an empty input.
        ds = ds.shuffle(buffer_size=max(1, len(paths)))

    # 1. Preprocess audio and text, also returning their lengths
    ds = ds.map(preprocess_and_filter, num_parallel_calls=tf.data.AUTOTUNE)

    # 2. Drop utterances CTC cannot align: the model sees only
    #    spec_len // time_reduction time steps, and that count must be at
    #    least the label length.
    ds = ds.filter(
        lambda spectrogram, label, spec_len, label_len:
            spec_len // time_reduction >= label_len
    )

    # 3. Remove the lengths from the dataset, keeping only spectrogram and label
    ds = ds.map(
        lambda spectrogram, label, spec_len, label_len: (spectrogram, label),
        num_parallel_calls=tf.data.AUTOTUNE
    )

    # 4. Batch and pad
    ds = ds.padded_batch(
        batch_size=BATCH_SIZE,
        padded_shapes=([None, N_MELS, 1], [None]),
        padding_values=(0.0, tf.cast(0, dtype=tf.int64))
    )

    # Prefetch for performance
    ds = ds.prefetch(buffer_size=tf.data.AUTOTUNE)
    return ds

In [8]:
def build_model(input_shape, vocab_size):
    """Build the CNN + bidirectional-LSTM acoustic model trained with CTC.

    Architecture: two convolutional stages (2x Conv-BN, 2x2 max-pool, spatial
    dropout) that shrink time and frequency by 4, followed by three
    BiLSTM-Dropout-BN blocks and a per-frame softmax.

    Args:
        input_shape: ``(time, n_mels, channels)``; ``time`` may be ``None``.
        vocab_size: Number of label classes; one extra output unit is added
            for the CTC blank.

    Returns:
        An uncompiled ``tf.keras.Model`` mapping spectrograms to per-frame
        class probabilities of shape ``(batch, time // 4, vocab_size + 1)``.
    """
    layers = tf.keras.layers
    inputs = tf.keras.Input(shape=input_shape, name="input_spectrogram")

    # CNN frontend
    x = inputs
    for filters in (32, 64):
        for _ in range(2):
            x = layers.Conv2D(filters, (3, 3), activation="relu", padding="same")(x)
            x = layers.BatchNormalization()(x)
        x = layers.MaxPooling2D((2, 2))(x)
        x = layers.SpatialDropout2D(0.2)(x)

    # Merge frequency and channel axes into one feature axis for the RNN.
    # -1 lets the time axis stay dynamic; a literal None is not a valid
    # Reshape target when the input length is unknown.
    freq_dim, channel_dim = x.shape[2], x.shape[3]
    x = layers.Reshape((-1, freq_dim * channel_dim))(x)

    # RNN backend
    for _ in range(3):
        x = layers.Bidirectional(layers.LSTM(256, return_sequences=True))(x)
        x = layers.Dropout(0.4)(x)
        x = layers.BatchNormalization()(x)

    # Output layer. Pinned to float32 so the softmax probabilities keep full
    # precision even under the mixed_float16 policy.
    outputs = layers.Dense(units=vocab_size + 1, activation="softmax", dtype="float32")(x)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model

# --- 3. Model Definition and CTC Loss ---

def build_model_old1(input_shape, vocab_size):
    """Build the older, smaller CNN-RNN model for CTC training.

    This version has fewer layers than ``build_model`` and was found to
    underfit the data; it is kept for reference.

    Args:
        input_shape: ``(time, n_mels, channels)``; ``time`` may be ``None``.
        vocab_size: Number of label classes; one extra output unit is added
            for the CTC blank.

    Returns:
        An uncompiled ``tf.keras.Model`` producing per-frame class
        probabilities.
    """
    layers = tf.keras.layers
    inputs = tf.keras.Input(shape=input_shape, name="input_spectrogram")

    x = inputs
    for filters in (32, 64):
        x = layers.Conv2D(filters, (3, 3), activation="relu", padding="same")(x)
        x = layers.BatchNormalization()(x)
        x = layers.MaxPooling2D((2, 2))(x)

    # -1 keeps the time axis dynamic; a literal None is not a valid Reshape
    # target when the input length is unknown.
    freq_dim, channel_dim = x.shape[2], x.shape[3]
    x = layers.Reshape((-1, freq_dim * channel_dim))(x)

    for _ in range(2):
        x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x)
        x = layers.BatchNormalization()(x)

    # float32 output keeps the softmax at full precision under mixed_float16.
    outputs = layers.Dense(units=vocab_size+1, activation="softmax", dtype="float32")(x)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model

In [9]:

def ctc_loss(y_true, y_pred):
    """CTC loss for batches of padded labels and per-frame softmax outputs.

    Args:
        y_true: Integer labels of shape ``(batch, max_label_len)``, padded
            with 0. The label length of each sample is its number of non-zero
            entries.
        y_pred: Class probabilities of shape ``(batch, time, classes)``.

    Returns:
        Tensor of shape ``(batch, 1)`` with the CTC loss of each sample.
    """
    # Compute the loss in float32 even when the model emits float16
    # predictions under the mixed_float16 policy.
    y_pred = tf.cast(y_pred, tf.float32)

    batch_len = tf.cast(tf.shape(y_pred)[0], dtype="int64")
    time_steps = tf.cast(tf.shape(y_pred)[1], dtype="int64")

    # Every sample is treated as using all time steps of the padded batch.
    input_length = time_steps * tf.ones(shape=(batch_len, 1), dtype="int64")

    # Actual label lengths: padding is 0, real characters are non-zero.
    label_length = tf.math.count_nonzero(y_true, axis=1, keepdims=True)
    label_length = tf.cast(label_length, dtype="int64")

    return tf.keras.backend.ctc_batch_cost(
        y_true,
        y_pred,
        input_length,
        label_length,
    )


In [10]:
# --- 4. Main Training and Saving Logic ---

if __name__ == "__main__":
    # Load the (audio path, transcript) pairs
    paths, labels = load_data()

    # Dataset.zip silently truncates to the shorter input, so a count mismatch
    # would train on audio paired with the wrong transcripts. Fail early.
    if len(paths) != len(labels):
        raise ValueError(
            f"load_data() returned {len(paths)} audio files but {len(labels)} "
            "transcriptions; audio and labels are not aligned"
        )
    if not paths:
        raise ValueError("load_data() returned no audio files; check Training_dirs")

    # Positional 90/10 split (simple split for demonstration)
    split_idx = int(len(paths) * 0.9)
    train_paths, val_paths = paths[:split_idx], paths[split_idx:]
    train_labels, val_labels = labels[:split_idx], labels[split_idx:]

    # Build data pipelines
    train_ds = build_pipeline(train_paths, train_labels, is_training=True)
    val_ds = build_pipeline(val_paths, val_labels, is_training=False)

    steps_per_epoch = len(train_paths) // BATCH_SIZE
    total_decay_steps = steps_per_epoch * EPOCHS

    cosine_schedule = tf.keras.optimizers.schedules.CosineDecay(
        initial_learning_rate=1e-2,  # The starting learning rate
        decay_steps=total_decay_steps, # The number of steps to decay over
        alpha=0 # The minimum learning rate as a fraction of the initial rate
    )
    Optimizer = tf.keras.optimizers.Adam(learning_rate=cosine_schedule)
    # NOTE(review): `Optimizer` (Adam with cosine decay) is built but not used;
    # compile() below receives the string "adam", i.e. a default Adam
    # optimizer. Passing `Optimizer` instead would attach a learning-rate
    # schedule, which the later resume cell (manual LR override and
    # ReduceLROnPlateau) cannot modify - decide which behaviour is intended.

    # Build the model.
    # The time dimension is unknown in advance, so it is left as None.
    model = build_model(input_shape=(None, 80, 1), vocab_size=VOCAB_SIZE)
    model.compile(optimizer="adam", loss=ctc_loss)

    model.summary()

    # Sanity check on one batch: CTC needs at least as many output time steps
    # as label characters.
    for x_batch, y_batch in train_ds.take(1):
        preds = model(x_batch)
        print("Model output time steps:", preds.shape[1])
        print("Max label length in batch:", tf.reduce_max(tf.math.count_nonzero(y_batch, axis=1)))

    # Keep the weights of the epoch with the lowest validation loss
    model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath="asr_model_best.keras",
        save_best_only=True,
        monitor="val_loss",
        verbose=1
    )
    # Train the model
    print("\n--- Starting Model Training ---")
    history = model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=EPOCHS,
        callbacks=[model_checkpoint]
    )
   

FileNotFoundError: [WinError 3] The system cannot find the path specified: './dataset/LibriSpeech-WAV-Complete/'

In [None]:

    # Save the final model.
    # NOTE(review): this cell is indented as if it continued the training
    # block above; confirm it is meant to run as part of that block.
    final_model_path = "/kaggle/working/asr_model_final_ep15.keras"
    model.save(final_model_path)
    # Report the path that was actually written (the old message named a
    # different file, asr_model_final.keras).
    print(f"\n--- Training complete. Final model saved as {final_model_path} ---")
    print("Best performing model during training saved as asr_model_best.keras")


--- Training complete. Final model saved as asr_model_final.keras ---
Best performing model during training saved as asr_model_best.keras


In [None]:
#quantised model

In [None]:
# Restore the model saved after epoch 15; ctc_loss must be supplied because
# it is a custom loss function.
model = tf.keras.models.load_model(
    "/kaggle/input/asr-midtrained/tensorflow2/default/1/asr_model_final_ep15.keras",
    custom_objects={"ctc_loss": ctc_loss},
)

# (Optionally) lower the learning rate by hand before continuing
tf.keras.backend.set_value(model.optimizer.learning_rate, 1e-4)

# Callbacks for the resumed run: best-model checkpointing plus halving the
# learning rate when the validation loss stops improving.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath="asr_model_best.keras",
    save_best_only=True,
    monitor="val_loss",
    verbose=1,
)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=3,
    min_lr=1e-6,
    verbose=1,
)

# Resume training from epoch 15 up to epoch 25.
# train_ds and val_ds come from the training cell above.
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=25,
    initial_epoch=15,
    callbacks=[model_checkpoint, reduce_lr],
)
