In [None]:
#!pip install librosa

In [None]:
import tensorflow as tf
import numpy as np
import librosa
import soundfile as sf
import os
import shutil
from pathlib import Path
import random
from scipy.signal import butter, lfilter
from IPython.display import Audio
     

In [None]:

# --- 1. Configuration ---
# Global Keras dtype policy for every layer created after this point.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

# Fill these in before running:
#   input_root_dir  - root of the source dataset tree (read by the conversion cell)
#   output_root_dir - root where the converted copy of the tree is written
input_root_dir = ""
output_root_dir = ""

# Training reads from the converted tree.
Training_dirs = output_root_dir

# Parameters
SAMPLE_RATE = 16000
BATCH_SIZE = 32
EPOCHS = 15  # For half trained model

# Vocabulary: uppercase letters A-Z, the apostrophe and the space character.
CHARACTERS = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ") + ["'", " "]

# Character <-> integer lookups. The inverse lookup is built from the forward
# lookup's own vocabulary so both share the same indices.
char_to_num = tf.keras.layers.StringLookup(
    vocabulary=list(CHARACTERS),
    mask_token=None,
)
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(),
    mask_token=None,
    invert=True,
)
VOCAB_SIZE = char_to_num.vocabulary_size()


In [None]:
# 2. Mirror the input tree under the output root: .flac files are re-encoded
#    as .wav, every other file is copied unchanged.
print(f"Starting transfer and conversion from '{input_root_dir}'...")
print(f"Output will be saved in '{output_root_dir}'.")

for current_dir, _subdirs, names in os.walk(input_root_dir):
    # Location of this directory inside the mirrored output tree.
    mirrored_dir = os.path.join(
        output_root_dir, os.path.relpath(current_dir, input_root_dir)
    )

    for name in names:
        source_path = os.path.join(current_dir, name)

        # Created lazily, so directories without files are not mirrored.
        os.makedirs(mirrored_dir, exist_ok=True)

        try:
            if not name.lower().endswith(".flac"):
                # Non-audio file (e.g. a transcript): copy with metadata.
                target_path = os.path.join(mirrored_dir, name)
                shutil.copy2(source_path, target_path)
            else:
                # Audio file: decode the FLAC and write the samples back as WAV
                # at the sample rate reported by the reader.
                target_path = os.path.join(mirrored_dir, Path(name).stem + ".wav")
                samples, rate = sf.read(source_path)
                sf.write(target_path, samples, rate)
        except Exception as e:
            # Best effort: report the failing file and keep going.
            print(f"Error processing {source_path}: {e}")

print("\n--- Process Complete ---")
print(f"New dataset with WAV files is ready at: '{output_root_dir}'")


In [None]:
# Function to load my new .wav dataset
def load_data():
    """Collect (wav path, transcription) pairs from the converted dataset.

    Expects the two-level layout ``Training_dirs/<outer>/<inner>/`` where each
    inner folder holds ``<outer>-<inner>.trans.txt`` plus the ``.wav`` files.
    Every transcript line is ``"<utterance-id> <text>"``.

    Each transcript line is paired with ``<utterance-id>.wav`` in the same
    folder, so audio and text stay aligned regardless of directory listing
    order. Lines whose wav file is missing are skipped, as are inner folders
    without a transcript file (a message is printed for those).

    Returns:
        tuple[list[str], list[str]]: ``(file_paths, transcriptions)``, two
        lists of equal length, ordered by outer folder, inner folder, then
        transcript line order.
    """
    file_paths = []
    transcriptions = []
    label_file_count = 0

    for lfold1 in sorted(os.listdir(Training_dirs)):
        outer_dir = os.path.join(Training_dirs, lfold1)
        if not os.path.isdir(outer_dir):
            # Stray files at the dataset root are not part of the layout.
            continue

        for lfold2 in sorted(os.listdir(outer_dir)):
            inner_dir = os.path.join(outer_dir, lfold2)
            if not os.path.isdir(inner_dir):
                continue

            label_path = os.path.join(inner_dir, lfold1 + '-' + lfold2 + '.trans.txt')
            if not os.path.isfile(label_path):
                print(f"Skipping {inner_dir}: missing transcript {label_path}")
                continue
            label_file_count += 1

            with open(label_path, 'r', encoding='utf-8') as labels:
                for line in labels:
                    parts = line.strip().split(' ', maxsplit=1)
                    if len(parts) != 2:
                        # Blank or malformed line: nothing to pair.
                        continue
                    utterance_id, text = parts
                    # Assumes the first token is the wav file's stem
                    # (LibriSpeech-style transcripts) - TODO confirm for other data.
                    wav_path = os.path.join(inner_dir, utterance_id + '.wav')
                    if os.path.isfile(wav_path):
                        file_paths.append(wav_path)
                        transcriptions.append(text.strip())

    print(label_file_count)

    return file_paths, transcriptions
#print(os.listdir(Training_dirs))
#load_data()

In [None]:
# --- 2. tf.data Pipeline--
#preprocess using librosa (works on .flac)
'''def preprocess_audio(file_path):
    """Loads and converts a FLAC file to a log Mel spectrogram."""
    try:
        path_str = file_path.numpy().decode('utf-8')
        y, sr = librosa.load(path_str, sr=SAMPLE_RATE)
        
        # Compute the Mel spectrogram
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=80)
        
        # Convert to log scale (decibels)
        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
        
        # Add a channel dimension
        log_mel_spec = np.expand_dims(log_mel_spec.T, axis=-1)
        
        return log_mel_spec.astype(np.float32)
    except Exception as e:
        print(f"Error processing file {file_path.numpy()}: {e}")
        os.exit()
        return np.zeros((100, 80, 1), dtype=np.float32)'''
       

# Sample rate handed to the mel filterbank in preprocess_audio_tf.
# NOTE(review): repeats the value already set in the configuration cell.
SAMPLE_RATE = 16000
# Used as both the STFT frame length and the FFT length in preprocess_audio_tf.
N_FFT = 400
# STFT frame step used in preprocess_audio_tf.
HOP_LENGTH = 160
# Number of mel bins produced by the filterbank in preprocess_audio_tf.
N_MELS = 80

def power_to_db(S, ref=1.0, top_db=80.0):
    """Convert a power spectrogram to decibels relative to ``ref``.

    Values are floored at 1e-10 before the logarithm, and the result is
    clipped from below at ``top_db`` under its own maximum.
    """
    def _decibels(value):
        # 10 * log10(max(value, 1e-10)), with log10 built from natural logs.
        return 10.0 * (tf.math.log(tf.maximum(value, 1e-10)) / tf.math.log(10.0))

    relative_db = _decibels(S) - _decibels(ref)
    floor_db = tf.reduce_max(relative_db) - top_db
    return tf.maximum(relative_db, floor_db)
#using tensorflow: requires .wav but significantly faster
@tf.function
def preprocess_audio_tf(file_path: tf.Tensor):
    """Load a WAV file and convert it to a log-mel spectrogram with TensorFlow.

    The waveform is reflect-padded by ``N_FFT // 2`` samples on both sides
    before the STFT so that frames are centred, mirroring librosa's default.

    Errors are deliberately not caught here. This function runs as a graph, so
    a Python ``try/except`` cannot intercept failures raised while the ops
    execute (unreadable file, bad WAV header, ...). It would only fire while
    the function is being traced, where it would silently bake an all-zeros
    result into the graph for every input. Letting the error propagate makes
    both cases visible to the caller.

    Args:
        file_path: scalar string tensor holding the path of a WAV file.

    Returns:
        float32 tensor of shape ``(frames, N_MELS, 1)``.
    """
    audio_binary = tf.io.read_file(file_path)

    # decode_wav yields float32 samples; desired_channels=1 forces mono.
    # NOTE(review): the decoded sample rate is ignored - files are assumed to
    # already be at SAMPLE_RATE; confirm for new data.
    audio_tensor, _ = tf.audio.decode_wav(audio_binary, desired_channels=1)

    # Drop the channel axis, leaving a 1-D waveform.
    waveform = tf.squeeze(audio_tensor, axis=-1)

    # Centre the frames the way librosa does.
    padding = N_FFT // 2
    waveform = tf.pad(waveform, [[padding, padding]], mode="REFLECT")

    stft = tf.signal.stft(
        waveform,
        frame_length=N_FFT,
        frame_step=HOP_LENGTH,
        fft_length=N_FFT
    )
    spectrogram = tf.abs(stft)
    power_spectrogram = spectrogram ** 2

    # Project the linear-frequency bins onto the mel scale.
    num_spectrogram_bins = stft.shape[-1]
    mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins=N_MELS,
        num_spectrogram_bins=num_spectrogram_bins,
        sample_rate=SAMPLE_RATE,
        lower_edge_hertz=20.0,
        upper_edge_hertz=8000.0
    )
    mel_spectrogram = tf.tensordot(power_spectrogram, mel_filterbank, 1)

    log_mel_spectrogram = power_to_db(mel_spectrogram)
    # Trailing channel axis for the Conv2D front end.
    log_mel_spectrogram = tf.expand_dims(log_mel_spectrogram, axis=-1)

    return tf.cast(log_mel_spectrogram, dtype=tf.float32)
    
def preprocess_label(text_label):
    """Encode a transcription as a sequence of integer character ids.

    The text is upper-cased first so it matches the uppercase vocabulary,
    split into UTF-8 characters, and mapped through ``char_to_num``.
    """
    uppercased = tf.strings.upper(text_label)
    characters = tf.strings.unicode_split(uppercased, input_encoding="UTF-8")
    token_ids = char_to_num(characters)
    return token_ids
# (Keep all your other functions like preprocess_audio_tf_flac, preprocess_label, etc.)

@tf.function
def preprocess_and_filter(path, label):
    """Preprocess one (audio path, transcription) pair and report both lengths.

    Returns:
        A 4-tuple ``(spectrogram, tokens, n_frames, n_tokens)`` where
        ``n_frames`` is the size of the spectrogram's first axis and
        ``n_tokens`` is the number of encoded label characters.
    """
    features = preprocess_audio_tf(path)
    tokens = preprocess_label(label)

    # Dynamic sizes of the leading axes, consumed by the length filter in
    # build_pipeline.
    n_frames = tf.shape(features)[0]
    n_tokens = tf.shape(tokens)[0]

    return features, tokens, n_frames, n_tokens
#preprocess_audio_tf("/kaggle/working/LibriSpeech-WAV-Complete/1081/125237/1081-125237-0035.wav")

In [None]:
'''def build_pipeline(paths, labels, is_training=False):
    path_ds = tf.data.Dataset.from_tensor_slices(paths)
    label_ds = tf.data.Dataset.from_tensor_slices(labels)
    
    ds = tf.data.Dataset.zip((path_ds, label_ds))
    if is_training:
        ds = ds.shuffle(buffer_size=len(paths))
    
    # Map preprocessing functions
    ds = ds.map(
        lambda path, label: (
            tf.py_function(preprocess_audio, [path], tf.float32),
            preprocess_label(label)
        ),
        num_parallel_calls=tf.data.AUTOTUNE
    )
    
    # Batch and pad
    ds = ds.padded_batch(
        batch_size=BATCH_SIZE,
        padded_shapes=([None, 80, 1], [None]),
        padding_values=(0.0, tf.cast(char_to_num.vocabulary_size(), dtype=tf.int64)+1)
    )
    
    # Prefetch for performance
    ds = ds.prefetch(buffer_size=tf.data.AUTOTUNE)
    return ds
    ||OLDER PIPELINE THAT USES LIBROSA||
    '''

def build_pipeline(paths, labels, is_training=False, time_reduction=4):
    """Build the batched ``tf.data`` pipeline of (spectrogram, label) pairs.

    Args:
        paths: sequence of WAV file paths.
        labels: sequence of transcriptions, aligned with ``paths``.
        is_training: when True, the (path, label) pairs are shuffled with a
            buffer covering the whole list.
        time_reduction: factor by which the model shrinks the time axis
            before the CTC output. The default of 4 matches the two 2x2
            max-pooling layers in ``build_model``. CTC needs at least as many
            output steps as label tokens, so the length filter is applied to
            ``frames // time_reduction`` rather than to raw frames.

    Returns:
        A prefetched ``tf.data.Dataset`` yielding padded batches of
        ``(spectrogram, label)``; labels are padded with 0.

    Raises:
        ValueError: if ``time_reduction`` is smaller than 1.
    """
    if time_reduction < 1:
        raise ValueError("time_reduction must be >= 1")

    path_ds = tf.data.Dataset.from_tensor_slices(paths)
    label_ds = tf.data.Dataset.from_tensor_slices(labels)

    ds = tf.data.Dataset.zip((path_ds, label_ds))
    if is_training:
        # Shuffle the cheap string pairs, before any audio is decoded.
        ds = ds.shuffle(buffer_size=len(paths))

    # 1. Decode audio, encode text and compute both lengths.
    ds = ds.map(preprocess_and_filter, num_parallel_calls=tf.data.AUTOTUNE)

    # 2. Drop utterances the CTC loss cannot align: the number of time steps
    #    left after the model's down-sampling must cover the label length.
    ds = ds.filter(
        lambda spectrogram, label, spec_len, label_len:
            spec_len // time_reduction >= label_len
    )

    # 3. The lengths were only needed for filtering.
    ds = ds.map(
        lambda spectrogram, label, spec_len, label_len: (spectrogram, label),
        num_parallel_calls=tf.data.AUTOTUNE
    )

    # 4. Pad to the longest item in each batch. Label padding is 0, the value
    #    ctc_loss excludes when counting label lengths.
    ds = ds.padded_batch(
        batch_size=BATCH_SIZE,
        padded_shapes=([None, 80, 1], [None]),
        padding_values=(0.0, tf.cast(0, dtype=tf.int64))
    )

    # Overlap preprocessing with training.
    ds = ds.prefetch(buffer_size=tf.data.AUTOTUNE)
    return ds

In [None]:
def build_model(input_shape, vocab_size):
    """Build the CNN + bidirectional-LSTM acoustic model for CTC training.

    Args:
        input_shape: per-example input shape ``(time, n_mels, channels)``;
            ``time`` may be ``None`` for variable-length input.
        vocab_size: number of label ids; the output layer adds one extra
            class for the CTC blank.

    Returns:
        An uncompiled ``tf.keras.Model`` mapping a spectrogram batch to
        per-time-step softmax probabilities over ``vocab_size + 1`` classes.
        The two 2x2 max-pooling stages shrink the time axis by a factor of 4.
    """
    layers = tf.keras.layers
    inputs = tf.keras.Input(shape=input_shape, name="input_spectrogram")

    # CNN front end: two stages of (Conv -> BN) x 2, each followed by 2x2
    # pooling and spatial dropout.
    x = inputs
    for filters in (32, 64):
        for _ in range(2):
            x = layers.Conv2D(filters, (3, 3), activation="relu", padding="same")(x)
            x = layers.BatchNormalization()(x)
        x = layers.MaxPooling2D((2, 2))(x)
        x = layers.SpatialDropout2D(0.2)(x)

    # Fold (freq, channels) into one feature axis for the RNN. The time axis
    # is unknown for variable-length input, so let Reshape infer it with -1
    # instead of passing None as a target dimension.
    _, _time_dim, freq_dim, channel_dim = x.shape
    x = layers.Reshape((-1, freq_dim * channel_dim))(x)

    # RNN back end: three BiLSTM layers, each with dropout and batch norm.
    for _ in range(3):
        x = layers.Bidirectional(layers.LSTM(256, return_sequences=True))(x)
        x = layers.Dropout(0.4)(x)
        x = layers.BatchNormalization()(x)

    # Output layer (+1 for the CTC blank). Forced to float32 so the softmax
    # probabilities fed to the CTC loss keep full precision even when the
    # global policy is mixed_float16.
    outputs = layers.Dense(
        units=vocab_size + 1, activation="softmax", dtype="float32"
    )(x)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model


In [None]:
#CUSTOM LOSS FUNCTION REQUIRED TO TRAIN MY MODEL.
def ctc_loss(y_true, y_pred):
    """CTC loss for padded label batches.

    Every example is given the full number of predicted time steps as its
    input length. Label lengths are the count of non-zero entries in each row
    of ``y_true``, so 0 is treated as padding.
    """
    pred_shape = tf.shape(y_pred)
    batch_len = tf.cast(pred_shape[0], dtype="int64")
    time_steps = tf.cast(pred_shape[1], dtype="int64")

    # Column vector (batch, 1) holding the same time-step count per example.
    input_length = tf.ones(shape=(batch_len, 1), dtype="int64") * time_steps

    # Column vector (batch, 1) of per-example label lengths.
    label_length = tf.cast(
        tf.math.count_nonzero(y_true, axis=1, keepdims=True),
        dtype="int64",
    )

    return tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)


In [None]:
# --- 4. Main Training and Saving Logic ---

if __name__ == "__main__":
    # Gather every (wav path, transcription) pair.
    paths, labels = load_data()

    # 90/10 split taken in list order (no shuffling before the split).
    split_idx = int(len(paths) * 0.9)
    train_paths = paths[:split_idx]
    val_paths = paths[split_idx:]
    train_labels = labels[:split_idx]
    val_labels = labels[split_idx:]

    # Input pipelines; only the training one shuffles.
    train_ds = build_pipeline(train_paths, train_labels, is_training=True)
    val_ds = build_pipeline(val_paths, val_labels, is_training=False)

    # Cosine learning-rate decay stretched over the whole run.
    steps_per_epoch = len(train_paths) // BATCH_SIZE
    total_decay_steps = steps_per_epoch * EPOCHS
    cosine_schedule = tf.keras.optimizers.schedules.CosineDecay(
        initial_learning_rate=1e-2,     # starting learning rate
        decay_steps=total_decay_steps,  # steps over which to decay
        alpha=0,                        # final LR as a fraction of the initial one
    )
    Optimizer = tf.keras.optimizers.Adam(learning_rate=cosine_schedule)

    # The time dimension is left as None because utterance lengths vary.
    model = build_model(input_shape=(None, 80, 1), vocab_size=VOCAB_SIZE)
    model.compile(optimizer=Optimizer, loss=ctc_loss)
    model.summary()

    # Sanity check on one batch: output time steps versus longest label.
    for x_batch, y_batch in train_ds.take(1):
        preds = model(x_batch)
        print("Model output time steps:", preds.shape[1])
        print("Max label length in batch:", tf.reduce_max(tf.math.count_nonzero(y_batch, axis=1)))

    # Keep only the checkpoint with the lowest validation loss.
    model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath="asr_model_best.keras",
        monitor="val_loss",
        save_best_only=True,
        verbose=1,
    )

    print("\n--- Starting Model Training ---")
    history = model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=EPOCHS,
        callbacks=[model_checkpoint],
    )
   

In [None]:

    # Save the half-trained model
    # Writes the model as it stands after the first training stage, so that
    # training can be resumed later from this file.
    # NOTE(review): this cell is indented as if it continued the
    # `if __name__ == "__main__":` block of the previous cell - confirm it is
    # meant to run as its own cell.
    model.save("/kaggle/working/asr_model_final_ep15.keras")
    print("\n---Training  Part 1 complete. Final model saved ---")

In [None]:
# Load the half-trained model produced by training part 1.
# NOTE(review): the earlier comment pointed at
# /kaggle/input/asr-midtrained/tensorflow2/default/1/asr_model_final_ep15.keras
# while the code reads from /kaggle/working/... - confirm which location
# actually holds the file.
# compile=False: the saved optimizer is replaced below, so it is not restored.
model = tf.keras.models.load_model(
    "/kaggle/working/asr-midtrained/tensorflow2/default/1/asr_model_final_ep15.keras",
    custom_objects={"ctc_loss": ctc_loss},
    compile=False,
)

# Continue at a lower, *settable* learning rate. Part 1 compiled Adam with a
# CosineDecay schedule; an optimizer built on a schedule does not allow its
# learning rate to be assigned, so neither a manual override nor
# ReduceLROnPlateau can change it. Recompiling with a plain float learning
# rate fixes both (at the cost of restarting Adam's moment estimates).
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss=ctc_loss,
)

# Halve the learning rate whenever val_loss fails to improve for one epoch.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=1,
    min_lr=1e-6,
    verbose=1
)

# Keep only the checkpoint with the lowest validation loss.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath="asr_model_best.keras",
    save_best_only=True,
    monitor="val_loss",
    verbose=1
)

# Resume training from epoch 15 -> 25 (train_ds / val_ds come from the
# training cell above).
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=25,
    initial_epoch=15,
    callbacks=[model_checkpoint, reduce_lr]
)


In [None]:
# SAVE FINAL MODEL
# Writes the model as it stands after the resumed training stage
# (the preceding cell runs epochs 15 -> 25).
model.save("/kaggle/working/asr_model_final_ep25.keras")
print("\n---Training  Part 2 complete. Final model saved ---")