In [1]:
!pip install SpeechRecognition



In [2]:
import nltk
# Fetch the 'cmudict' corpus into the local nltk_data directory; the call
# returns True and reports "already up-to-date" when nothing needs downloading.
# NOTE(review): no visible cell reads cmudict afterwards — confirm it is still needed.
nltk.download('cmudict')

[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


True

In [3]:
import ffmpeg
import os
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

# Supported input file formats.
# Kept as a tuple of lower-case extensions (with the leading dot) because it is
# handed directly to str.endswith when the input folder is listed.
SUPPORTED_FORMATS = (".mp3", ".aac", ".flac", ".ogg", ".opus", ".m4a", ".wav")

def convert_audio_to_wav(input_path, output_path, sample_rate=16000):
    """Convert one audio file to a mono WAV file using ffmpeg.

    Args:
        input_path: Path of the source audio file.
        output_path: Path of the WAV file to create.
        sample_rate: Output sample rate in Hz (passed to ffmpeg as ``ar``).

    Returns:
        A human-readable status string: skipped, converted, or error.
        ffmpeg failures are reported in the string instead of being raised.
    """
    name = os.path.basename(input_path)
    if os.path.exists(output_path):
        return f"Skipping {name} (Already converted)"

    try:
        (
            ffmpeg.input(input_path)
            .output(output_path, ar=sample_rate, ac=1)
            .run(overwrite_output=True, quiet=True)
        )
    except ffmpeg.Error as e:
        # A failed run can leave a truncated file behind. Remove it so the
        # existence check above does not report it as converted next time.
        if os.path.exists(output_path):
            try:
                os.remove(output_path)
            except OSError:
                pass  # best-effort cleanup; the failure is still reported below

        # str(e) is a generic message; with quiet=True the real diagnostics
        # are captured in e.stderr, so surface the tail of that output.
        stderr = getattr(e, "stderr", None)
        if stderr:
            lines = [ln.strip() for ln in stderr.decode("utf-8", errors="replace").splitlines() if ln.strip()]
            detail = " | ".join(lines[-3:]) if lines else str(e)
        else:
            detail = str(e)
        return f"❌ Error converting {name}: {detail}"

    return f"✅ Converted {name}"

def convert_dataset_to_wav(input_folder, output_folder, sample_rate=16000, max_workers=4):
    """Convert every supported audio file in a folder to WAV using a thread pool.

    Files whose target ``<stem>.wav`` already exists in ``output_folder`` are
    skipped. Extensions are matched case-insensitively. When several inputs
    share the same stem (e.g. ``a.mp3`` and ``a.flac``) only the first one, in
    sorted order, is converted, so two workers never write the same output.

    Args:
        input_folder: Folder that is listed (non-recursively) for audio files.
        output_folder: Folder receiving the WAV files; created if missing.
        sample_rate: Output sample rate in Hz forwarded to the converter.
        max_workers: Number of conversion threads.

    Returns:
        None. Status lines are written to stdout.
    """
    os.makedirs(output_folder, exist_ok=True)  # Ensure output directory exists

    # normalised output path -> (source file name, output path)
    pending = {}
    for name in sorted(os.listdir(input_folder)):
        if not name.lower().endswith(SUPPORTED_FORMATS):
            continue
        wav_path = os.path.join(output_folder, os.path.splitext(name)[0] + ".wav")
        if os.path.exists(wav_path):
            continue
        key = os.path.normcase(wav_path)
        if key in pending:
            print(f"⚠ Skipping {name}: {pending[key][0]} already maps to {os.path.basename(wav_path)}")
            continue
        pending[key] = (name, wav_path)

    if not pending:
        print(f"✅ No new files to convert in {input_folder}. Skipping conversion.")
        return

    print(f"🔄 Converting {len(pending)} new files in {input_folder}...")

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        tasks = [
            executor.submit(
                convert_audio_to_wav,
                os.path.join(input_folder, name),
                wav_path,
                sample_rate,
            )
            for name, wav_path in pending.values()
        ]

        for future in tqdm(tasks, desc=f"Processing {input_folder}"):
            # tqdm.write keeps the progress bar intact, unlike a bare print().
            tqdm.write(future.result())

# Example usage: each input folder is paired by position with its output folder.
dataset_folders = [r"C:\ASR\raw_audio"]  # Change this to your input folder
output_folders = [r"C:\ASR\converted_audio"]  # Change this to your output folder

# Convert all audio files in dataset folders
folder_pairs = list(zip(dataset_folders, output_folders))
for in_folder, out_folder in folder_pairs:
    convert_dataset_to_wav(in_folder, out_folder, max_workers=6)


✅ No new files to convert in C:\ASR\raw_audio. Skipping conversion.


In [1]:
import os
import shutil
import numpy as np
import soundfile as sf
import noisereduce as nr
import librosa
import librosa.effects
import logging
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from sklearn.model_selection import train_test_split
from phonemizer import phonemize
import fasttext
import subprocess

# Configure logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Paths
input_folder = r"C:\ASR\converted_audio"
denoised_folder = r"C:\ASR\denoised_audio"
FEATURES_OUTPUT_FOLDER = r"C:\ASR\features"
transcript_file = r"C:\ASR\transcripts.txt"
fasttext_model_path = r"C:\ASR\models\lid.176.ftz"

# Create the denoised-audio folder, the features root, and one sub-folder per
# feature type / dataset split underneath the features root.
for base_dir in (denoised_folder, FEATURES_OUTPUT_FOLDER):
    os.makedirs(base_dir, exist_ok=True)
for subfolder in ("mel", "mfcc", "chroma", "phonemes", "transcript", "train", "test"):
    os.makedirs(os.path.join(FEATURES_OUTPUT_FOLDER, subfolder), exist_ok=True)

# Load FastText model
lang_model = fasttext.load_model(fasttext_model_path)

def load_transcripts_from_file(file_path):
    """Load a ``file_id<TAB>transcript`` text file into a dictionary.

    Only the first tab on a line separates the id from the transcript, so a
    transcript that itself contains tabs is kept instead of being dropped.
    Lines without a tab, or with an empty id or transcript, are ignored.

    Args:
        file_path: Path of the UTF-8 encoded transcript file.

    Returns:
        dict mapping the stripped file id to the stripped transcript text.
    """
    transcript_dict = {}
    # "utf-8-sig" drops a leading byte-order mark (written by some Windows
    # editors); with plain "utf-8" the BOM would stay glued to the first id.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        for line in f:
            file_id, separator, transcript = line.strip().partition("\t")
            file_id = file_id.strip()
            transcript = transcript.strip()
            if separator and file_id and transcript:
                transcript_dict[file_id] = transcript
    return transcript_dict

# Module-level lookup (file id -> transcript text) that process_audio reads
# via transcript_dict.get(base_name, "").
transcript_dict = load_transcripts_from_file(transcript_file)

# Phonemization with espeak-ng
def phoneme_conversion_espeak(text, language="es", espeak_path=r"C:\Program Files\eSpeak NG\espeak-ng.exe"):
    """Convert text to IPA by running espeak-ng once per whitespace-separated word.

    Args:
        text: Text to phonemize.
        language: espeak-ng voice name passed to ``-v``.
        espeak_path: Location of the espeak-ng executable.

    Returns:
        The per-word IPA strings joined with " | ". A word for which espeak-ng
        produces no output (or raises) becomes "[UNK]". If the executable
        cannot be found at all, the string "[ERROR: espeak-ng not found]" is
        returned instead of a phoneme sequence. Empty input yields "".
    """
    phonemes = []

    for word in text.split():
        try:
            result = subprocess.run(
                [espeak_path, "-q", "-v", language, "--ipa", word],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                encoding="utf-8",
                errors="replace",
            )
        except FileNotFoundError:
            logging.error(f"espeak-ng not found at '{espeak_path}'.")
            return "[ERROR: espeak-ng not found]"
        except Exception as e:
            logging.error(f"espeak-ng error for '{word}': {e}")
            phonemes.append("[UNK]")
            continue

        # Collapse whitespace first so that output consisting only of a
        # newline counts as a failure instead of adding an empty phoneme.
        ipa = " ".join(result.stdout.split())
        if ipa:
            phonemes.append(ipa)
        else:
            phonemes.append("[UNK]")
            logging.warning(f"espeak-ng failed for '{word}': {result.stderr.strip()}")

    return " | ".join(phonemes)

# Audio processing
def process_audio(input_path, features_folder, sample_rate=16000):
    """Denoise one WAV file and save any features that are not on disk yet.

    Reads the module-level ``denoised_folder`` (cache of denoised audio) and
    ``transcript_dict`` (file id -> transcript). Each feature is written to
    ``<features_folder>/<type>/<base_name>.npy`` and skipped when that file
    already exists. Phonemes and transcript are only produced when a
    transcript is available for the file.

    Args:
        input_path: Path of the source ``.wav`` file.
        features_folder: Root folder holding the per-feature sub-folders.
        sample_rate: Sample rate in Hz used when loading the audio.

    Returns:
        A status string. Any exception is logged and turned into a
        "❌ Failed <file>" string rather than propagated.
    """
    try:
        base_name = os.path.basename(input_path).replace(".wav", "")
        denoised_path = os.path.join(denoised_folder, base_name + "_denoised.wav")

        # Reuse the cached denoised audio when present, otherwise create it.
        if os.path.exists(denoised_path):
            audio, sr = librosa.load(denoised_path, sr=sample_rate)
            audio, _ = librosa.effects.trim(audio, top_db=20)
        else:
            raw_audio, sr = librosa.load(input_path, sr=sample_rate)
            audio = nr.reduce_noise(y=raw_audio, sr=sr)
            audio, _ = librosa.effects.trim(audio, top_db=20)
            sf.write(denoised_path, audio, sr)

        extracted = []
        transcript_words = None

        # Get transcript (try from file)
        transcript_from_file = transcript_dict.get(base_name, "")
        if transcript_from_file:
            logging.info(f"📜 Using existing transcript for {base_name}: {transcript_from_file}")
            print(f"📜 Transcript used [file - {base_name}]: {transcript_from_file}")
            transcript_words = transcript_from_file.split()
        else:
            logging.warning(f"⚠ No transcript found for {base_name}.")

        # Phonemes (only when a transcript is available)
        phoneme_path = os.path.join(features_folder, "phonemes", base_name + ".npy")
        if transcript_words and not os.path.exists(phoneme_path):
            phoneme_text = phoneme_conversion_espeak(" ".join(transcript_words))
            if phoneme_text.startswith("[ERROR"):
                # The converter signals a missing espeak-ng with a sentinel
                # string. Saving it would store a bogus label that is never
                # regenerated, because existing files are skipped.
                logging.error(f"❌ Phonemes not saved for {base_name}: {phoneme_text}")
            else:
                np.save(phoneme_path, np.array(phoneme_text.split(" | "), dtype=object))
                extracted.append("Phonemes")

        # Transcript (only when available)
        transcript_path = os.path.join(features_folder, "transcript", base_name + ".npy")
        if transcript_words and not os.path.exists(transcript_path):
            np.save(transcript_path, np.array(transcript_words, dtype=object))
            extracted.append("Transcript")

        # Acoustic features (independent of transcript); each is stored
        # transposed so that the first axis is the frame axis.
        acoustic_features = (
            ("MFCC", "mfcc", lambda: librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)),
            ("Mel", "mel", lambda: librosa.power_to_db(
                librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128), ref=np.max)),
            ("Chroma", "chroma", lambda: librosa.feature.chroma_stft(y=audio, sr=sr)),
        )
        for label, subfolder, compute in acoustic_features:
            feature_path = os.path.join(features_folder, subfolder, base_name + ".npy")
            if not os.path.exists(feature_path):
                np.save(feature_path, compute().T)
                extracted.append(label)

        # The MFCC file always exists at this point (found or just written),
        # so an empty list can only mean that everything was already present.
        if not extracted:
            return f"✅ Skipping {base_name}, features already extracted."
        return f"✅ Processed {base_name}: {', '.join(extracted)}"

    except Exception as e:
        logging.error(f"❌ Error processing {input_path}: {e}")
        return f"❌ Failed {os.path.basename(input_path)}"

# Batch processing with check for existing features in train/test
def process_new_audio(input_folder, features_folder, max_workers=6):
    """Run process_audio on every .wav file that is not yet in train/ or test/.

    A file counts as already handled when ``mfcc.npy`` exists under
    ``<features_folder>/train/<name>/`` or ``<features_folder>/test/<name>/``.
    The remaining files are processed on a thread pool and each returned
    status string is logged.
    """
    def already_split(stem):
        # True when either split folder already holds this file's MFCCs.
        return any(
            os.path.exists(os.path.join(features_folder, split, stem, "mfcc.npy"))
            for split in ("train", "test")
        )

    pending = []
    for wav_name in os.listdir(input_folder):
        if not wav_name.endswith(".wav"):
            continue
        stem = wav_name.replace(".wav", "")
        if already_split(stem):
            logging.info(f"⏭️ Skipping {stem}, features found in train or test.")
        else:
            pending.append(wav_name)

    if len(pending) == 0:
        logging.info("✅ No new audio files to process.")
        return

    logging.info(f"📢 Processing {len(pending)} new audio files...")
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [
            pool.submit(process_audio, os.path.join(input_folder, wav_name), features_folder)
            for wav_name in pending
        ]
        for finished in tqdm(futures, desc="🔄 Processing new audio"):
            logging.info(finished.result())

# Dataset splitter (modified to only consider unprocessed files)
def split_new_dataset(features_folder, test_size=0.2, audio_folder=None):
    """Move complete, not-yet-split feature sets into train/ and test/ folders.

    A file is a candidate when no ``mfcc.npy`` exists for it under train/ or
    test/ and all five feature files (mfcc, mel, chroma, phonemes, transcript)
    exist under ``<features_folder>/<type>/<name>.npy``. Candidates are split
    with ``train_test_split(random_state=42)`` and each feature file is moved
    to ``<features_folder>/<split>/<name>/<type>.npy``.

    Args:
        features_folder: Root folder holding the per-feature sub-folders.
        test_size: Fraction of candidates assigned to the test split.
        audio_folder: Folder whose ``.wav`` file names define the candidates.
            Defaults to the module-level ``input_folder``.

    Returns:
        None. Nothing is moved when fewer than two complete sets are found.
    """
    if audio_folder is None:
        audio_folder = input_folder  # module-level default

    feature_types = ("mfcc", "mel", "chroma", "phonemes", "transcript")

    def already_split(base):
        return any(
            os.path.exists(os.path.join(features_folder, split, base, "mfcc.npy"))
            for split in ("train", "test")
        )

    def is_complete(base):
        return all(
            os.path.exists(os.path.join(features_folder, feature_type, base + ".npy"))
            for feature_type in feature_types
        )

    candidates = {
        wav_name.replace(".wav", "")
        for wav_name in os.listdir(audio_folder)
        if wav_name.endswith(".wav")
    }
    # Sorted on purpose: set iteration order for strings changes between
    # interpreter sessions, which would make the seeded split non-reproducible.
    complete_sets = sorted(
        base for base in candidates if not already_split(base) and is_complete(base)
    )

    if len(complete_sets) < 2:
        logging.warning(f"⚠ Not enough new complete feature sets to split (found {len(complete_sets)}).")
        return

    train_files, test_files = train_test_split(complete_sets, test_size=test_size, random_state=42)

    for split_name, bases in (("train", train_files), ("test", test_files)):
        for base in bases:
            output_dir = os.path.join(features_folder, split_name, base)
            os.makedirs(output_dir, exist_ok=True)
            for feature_type in feature_types:
                src_path = os.path.join(features_folder, feature_type, base + ".npy")
                dst_path = os.path.join(output_dir, feature_type + ".npy")
                if os.path.exists(src_path):
                    shutil.move(src_path, dst_path)

    logging.info(f"📂 Split new data: {len(train_files)} train, {len(test_files)} test.")

# Execute
if __name__ == "__main__":
    # transcript_dict and lang_model are already initialised by the
    # module-level statements earlier in this cell, so they are not loaded a
    # second time here (the FastText model load in particular is not free).

    # Extract features only for audio that is not in train/ or test/ yet
    process_new_audio(input_folder, FEATURES_OUTPUT_FOLDER)

    # Distribute the newly completed feature sets over train/ and test/
    split_new_dataset(FEATURES_OUTPUT_FOLDER)

    logging.info("✅ Feature extraction and data splitting for new files complete.")

2025-04-11 13:12:42,118 - INFO - ⏭️ Skipping 10367_10282_000000, features found in train or test.
2025-04-11 13:12:42,118 - INFO - ⏭️ Skipping 10367_10282_000001, features found in train or test.
2025-04-11 13:12:42,119 - INFO - ⏭️ Skipping 10367_10282_000002, features found in train or test.
2025-04-11 13:12:42,121 - INFO - ⏭️ Skipping 10367_10282_000003, features found in train or test.
2025-04-11 13:12:42,122 - INFO - ⏭️ Skipping 10367_10282_000004, features found in train or test.
2025-04-11 13:12:42,123 - INFO - ⏭️ Skipping 10367_10282_000005, features found in train or test.
2025-04-11 13:12:42,124 - INFO - ⏭️ Skipping 10367_10282_000006, features found in train or test.
2025-04-11 13:12:42,124 - INFO - ⏭️ Skipping 10367_10282_000007, features found in train or test.
2025-04-11 13:12:42,126 - INFO - ⏭️ Skipping 10367_10282_000008, features found in train or test.
2025-04-11 13:12:42,126 - INFO - ⏭️ Skipping 10367_10282_000009, features found in train or test.
2025-04-11 13:12:42,

In [3]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm
import logging
import librosa  # Make sure you have librosa installed: pip install librosa

# Request INFO-level, timestamped log output. basicConfig does nothing when the
# root logger already has handlers (e.g. after an earlier cell configured it).
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# --- Configuration ---
# The audio/feature settings below must stay in sync with the feature-extraction
# step and with the defaults declared by predict_phonemes.
FEATURES_FOLDER = r"C:\ASR\features"
SAMPLE_RATE = 16000
N_MFCC = 13  # Should match your extraction
N_MELS = 128 # Should match your extraction
N_CHROMA = 12 # Chroma features
# Every sample is padded/truncated to this length twice over: the feature
# matrix to TIME_STEPS frames and the label sequence to TIME_STEPS entries.
TIME_STEPS = 128 # Adjust based on your desired input length
BATCH_SIZE = 32
EPOCHS = 30
VALIDATION_SPLIT = 0.2
# NOTE(review): RANDOM_STATE is not referenced by the visible code — confirm whether seeding was intended.
RANDOM_STATE = 42
PAD_TOKEN = "[PAD]"  # label used for padding; always assigned vocabulary index 0 by the loader
NUM_PREDICTIONS_TO_SHOW = 5  # number of test samples whose predictions are logged

# --- Helper Functions ---
def load_concatenated_features_and_transcripts(features_folder, time_steps, pad_token=PAD_TOKEN):
    """Load the training split as padded feature matrices and label sequences.

    For every sub-folder of ``<features_folder>/train`` that contains
    ``mfcc.npy``, ``mel.npy``, ``chroma.npy`` and ``phonemes.npy``, the three
    feature matrices are concatenated along axis 1 and padded with zeros /
    truncated to ``time_steps`` rows. The phoneme entries are mapped to integer
    ids (new ids are assigned on first sight, ``pad_token`` is id 0) and padded
    / truncated to ``time_steps`` entries as well.

    Sample folders are visited in sorted order so that the id assigned to each
    phoneme entry is the same on every run.

    Args:
        features_folder: Root folder containing the ``train`` sub-folder.
        time_steps: Fixed length of both the feature and the label sequences.
        pad_token: Label used for padding.

    Returns:
        ``(X, y, phoneme_to_int, int_to_phoneme)`` where ``X`` and ``y`` are
        numpy arrays with one row per sample, or ``(None, None, None, None)``
        when the train folder is missing or no usable sample was found.
    """
    X = []
    y = []
    phoneme_to_int = {pad_token: 0}  # Start with a padding token
    int_to_phoneme = {0: pad_token}

    train_folder = os.path.join(features_folder, "train")
    if not os.path.exists(train_folder):
        logging.error(f"Train folder not found at: {train_folder}")
        return None, None, None, None

    for base_name_dir in tqdm(sorted(os.listdir(train_folder)), desc="Loading Training Data"):
        base_name_path = os.path.join(train_folder, base_name_dir)
        if not os.path.isdir(base_name_path):
            continue

        paths = {
            name: os.path.join(base_name_path, name + ".npy")
            for name in ("mfcc", "mel", "chroma", "phonemes")
        }
        if not all(os.path.exists(path) for path in paths.values()):
            continue

        try:
            mfcc = np.load(paths["mfcc"])
            mel = np.load(paths["mel"])
            chroma = np.load(paths["chroma"])
            # Object array written by this project's own pipeline; allow_pickle
            # must not be used on files from untrusted sources.
            phonemes = np.load(paths["phonemes"], allow_pickle=True)

            features = np.concatenate((mfcc, mel, chroma), axis=1)
            if len(features) == 0 or len(phonemes) == 0:
                continue

            # Pad / truncate features along the frame axis
            if len(features) < time_steps:
                padding_len = time_steps - len(features)
                features = np.pad(features, ((0, padding_len), (0, 0)), 'constant')
            else:
                features = features[:time_steps]

            # Map labels to ids, growing the vocabulary as needed
            int_sequence = []
            for phoneme in phonemes:
                if phoneme not in phoneme_to_int:
                    new_id = len(phoneme_to_int)
                    phoneme_to_int[phoneme] = new_id
                    int_to_phoneme[new_id] = phoneme
                int_sequence.append(phoneme_to_int[phoneme])

            # Pad / truncate labels
            if len(int_sequence) < time_steps:
                int_sequence = int_sequence + [phoneme_to_int[pad_token]] * (time_steps - len(int_sequence))
            else:
                int_sequence = int_sequence[:time_steps]

            # Append both together so X and y cannot get out of step when
            # something above raises for a sample.
            X.append(features)
            y.append(np.array(int_sequence))

        except Exception as e:
            logging.error(f"Error loading data for {base_name_dir}: {e}")

    if not X:
        # Returning empty arrays would make the caller fail on X.shape[2];
        # use the same "nothing loaded" signal as for a missing folder.
        logging.error(f"No usable training samples found in: {train_folder}")
        return None, None, None, None

    return np.array(X), np.array(y), phoneme_to_int, int_to_phoneme

def create_model(input_shape, num_phonemes):
    """Build and compile a two-layer LSTM that emits one class per time step.

    Args:
        input_shape: ``(time_steps, num_features)`` of a single sample.
        num_phonemes: Size of the output vocabulary (including padding).

    Returns:
        A compiled Keras ``Sequential`` model using the Adam optimizer and
        sparse categorical cross-entropy, reporting accuracy.
    """
    model = models.Sequential([
        # Declare the input with an explicit Input layer; passing input_shape=
        # to the first layer triggers a deprecation warning in recent Keras.
        layers.Input(shape=input_shape),
        layers.LSTM(128, return_sequences=True),
        layers.Dropout(0.2),
        layers.LSTM(128, return_sequences=True),
        layers.Dropout(0.2),
        layers.TimeDistributed(layers.Dense(num_phonemes, activation='softmax'))
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

def evaluate_model(model, features_folder, time_steps, phoneme_to_int, int_to_phoneme, pad_token=PAD_TOKEN, num_predictions_to_show=NUM_PREDICTIONS_TO_SHOW):
    """Evaluate the model on the test split and log a few example predictions.

    Test samples are loaded from ``<features_folder>/test`` and padded or
    truncated the same way as the training data. Labels that are not in
    ``phoneme_to_int`` cannot be scored by the model, so they are replaced
    with the padding id and the number of replacements is logged.

    The reported loss/accuracy are computed over every time step, padding
    positions included, because the padded label arrays are passed to
    ``model.evaluate`` without weights.

    Args:
        model: Trained Keras model.
        features_folder: Root folder containing the ``test`` sub-folder.
        time_steps: Fixed length of the feature and label sequences.
        phoneme_to_int: Vocabulary built during training.
        int_to_phoneme: Inverse vocabulary.
        pad_token: Label used for padding.
        num_predictions_to_show: Number of samples whose predictions are logged.

    Returns:
        None. Results are written to the log.
    """
    test_folder = os.path.join(features_folder, "test")
    if not os.path.exists(test_folder):
        logging.warning(f"Test folder not found at: {test_folder}")
        return

    pad_index = phoneme_to_int.get(pad_token, 0)

    test_X = []
    test_y_int = []
    file_names = []  # To keep track of the audio files
    unknown_labels = 0

    for base_name_dir in tqdm(sorted(os.listdir(test_folder)), desc="Loading Test Data"):
        base_name_path = os.path.join(test_folder, base_name_dir)
        if not os.path.isdir(base_name_path):
            continue

        paths = {
            name: os.path.join(base_name_path, name + ".npy")
            for name in ("mfcc", "mel", "chroma", "phonemes")
        }
        if not all(os.path.exists(path) for path in paths.values()):
            continue

        try:
            mfcc = np.load(paths["mfcc"])
            mel = np.load(paths["mel"])
            chroma = np.load(paths["chroma"])
            # Object array written by this project's own pipeline; allow_pickle
            # must not be used on files from untrusted sources.
            phonemes = np.load(paths["phonemes"], allow_pickle=True)

            features = np.concatenate((mfcc, mel, chroma), axis=1)
            if len(features) == 0 or len(phonemes) == 0:
                continue

            # Pad / truncate features along the frame axis
            if len(features) < time_steps:
                padding_len = time_steps - len(features)
                features = np.pad(features, ((0, padding_len), (0, 0)), 'constant')
            else:
                features = features[:time_steps]

            # Map labels; labels unseen in training fall back to padding
            int_sequence = []
            for phoneme in list(phonemes)[:time_steps]:
                if phoneme in phoneme_to_int:
                    int_sequence.append(phoneme_to_int[phoneme])
                else:
                    unknown_labels += 1
                    int_sequence.append(pad_index)
            int_sequence = int_sequence + [pad_index] * (time_steps - len(int_sequence))

            # Append all three together so features, labels and names stay aligned.
            test_X.append(features)
            test_y_int.append(np.array(int_sequence))
            file_names.append(base_name_dir)

        except Exception as e:
            logging.error(f"Error loading test data for {base_name_dir}: {e}")

    if not test_X:
        logging.warning("No valid test data found.")
        return

    if unknown_labels:
        logging.warning(
            f"{unknown_labels} test labels are not in the training vocabulary and were "
            f"replaced by {pad_token}; they count as padding in the metrics below."
        )

    test_X = np.array(test_X)
    test_y_int = np.array(test_y_int)

    loss, accuracy = model.evaluate(test_X, np.expand_dims(test_y_int, -1), verbose=0)
    logging.info(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

    # Show some predictions and raw output
    num_samples = min(len(test_X), num_predictions_to_show)
    for i in range(num_samples):
        sample_features = np.expand_dims(test_X[i], axis=0)
        true_indices = test_y_int[i]
        predictions_raw = model.predict(sample_features, verbose=0)
        predicted_indices = np.argmax(predictions_raw, axis=-1)[0]

        true_phonemes = [int_to_phoneme.get(int(idx), '[UNK]') for idx in true_indices if idx != pad_index]
        predicted_phonemes = [int_to_phoneme.get(int(idx), '[UNK]') for idx in predicted_indices if idx != pad_index]

        logging.info(f"\n--- Prediction for: {file_names[i]} ---")
        logging.info(f"True Phonemes:    {true_phonemes}")
        logging.info(f"Predicted Phonemes: {predicted_phonemes}")
        logging.info(f"Raw Predictions (first time step):\n{predictions_raw[0][0]}") # Print probabilities for the first time step

def predict_phonemes(model, audio_path, time_steps, phoneme_to_int, int_to_phoneme, sample_rate=16000, n_mfcc=13, n_mels=128, n_chroma=12, pad_token=PAD_TOKEN):
    """Predict the label sequence for one audio file.

    The audio is prepared the same way as the training features in this
    notebook: noise reduction, silence trimming (top_db=20), then MFCC,
    dB-scaled mel spectrogram and chroma, concatenated per frame and padded /
    truncated to ``time_steps`` frames.

    Args:
        model: Trained Keras model.
        audio_path: Path of the audio file to transcribe.
        time_steps: Number of frames the model expects.
        phoneme_to_int: Vocabulary; used to look up the padding id.
        int_to_phoneme: Inverse vocabulary used to decode the predictions.
        sample_rate: Sample rate in Hz used when loading the audio.
        n_mfcc: Number of MFCC coefficients.
        n_mels: Number of mel bands.
        n_chroma: Number of chroma bins.
        pad_token: Label used for padding; its predictions are dropped.

    Returns:
        List of predicted labels with padding removed, or an empty list when
        no features could be computed or an error occurred (errors are logged).
    """
    try:
        # Function-scope import: this cell does not import noisereduce itself.
        import noisereduce as nr

        y, sr = librosa.load(audio_path, sr=sample_rate)
        # Training features were extracted from denoised, trimmed audio.
        y = nr.reduce_noise(y=y, sr=sr)
        y, _ = librosa.effects.trim(y, top_db=20)

        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc).T
        # Training mel features are stored in dB (power_to_db with ref=np.max);
        # feeding raw power values would put this input on a different scale.
        mel_power = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
        mel = librosa.power_to_db(mel_power, ref=np.max).T
        chroma = librosa.feature.chroma_stft(y=y, sr=sr, n_chroma=n_chroma).T

        features = np.concatenate((mfccs, mel, chroma), axis=1)

        if len(features) == 0:
            logging.warning("Input audio resulted in empty feature array.")
            return []

        # Pad / truncate features along the frame axis
        if len(features) < time_steps:
            padding_len = time_steps - len(features)
            input_features = np.pad(features, ((0, padding_len), (0, 0)), 'constant')
        else:
            input_features = features[:time_steps]
        input_features = np.expand_dims(input_features, axis=0)  # Add batch dimension

        predictions = model.predict(input_features)
        predicted_indices = np.argmax(predictions, axis=-1)[0]

        pad_index = phoneme_to_int.get(pad_token, 0)
        # Exclude padding token
        return [int_to_phoneme.get(int(idx), '[UNK]') for idx in predicted_indices if idx != pad_index]

    except Exception as e:
        logging.error(f"Error during prediction: {e}")
        return []

# --- Main Execution ---
if __name__ == "__main__":
    # Load concatenated features and transcripts with padding
    X, y, phoneme_to_int, int_to_phoneme = load_concatenated_features_and_transcripts(FEATURES_FOLDER, TIME_STEPS)

    # Also stop on empty arrays: X.shape[2] below needs at least one sample.
    if X is None or y is None or len(X) == 0:
        logging.error("Failed to load training data. Exiting.")
        # SystemExit stops this cell; the exit() helper would shut the
        # notebook kernel down instead.
        raise SystemExit(1)

    # Get the total number of features after concatenation
    num_features = X.shape[2]
    num_phonemes = len(phoneme_to_int)
    logging.info(f"Number of concatenated features per time step: {num_features}")
    logging.info(f"Number of unique phonemes (including padding): {num_phonemes}")
    logging.info(f"Training data shape (features): {X.shape}")
    logging.info(f"Training data shape (labels): {y.shape}")

    # Create the model with the updated input shape
    input_shape = (TIME_STEPS, num_features)
    model = create_model(input_shape, num_phonemes)
    model.summary()

    # Train the model
    model.fit(X, np.expand_dims(y, -1),  # Expand labels for sparse categorical crossentropy
              epochs=EPOCHS,
              batch_size=BATCH_SIZE,
              validation_split=VALIDATION_SPLIT,
              shuffle=True,
              callbacks=[tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)])

    # Evaluate the model on the test set and show predictions with raw output
    evaluate_model(model, FEATURES_FOLDER, TIME_STEPS, phoneme_to_int, int_to_phoneme)

    # Example prediction on a new audio file
    example_audio_path = r"C:\ASR\converted_audio\your_audio_file.wav" # Replace with a real path
    if os.path.exists(example_audio_path):
        # Forward the configured feature settings so they cannot drift from
        # the defaults declared on predict_phonemes.
        predicted_phonemes = predict_phonemes(
            model, example_audio_path, TIME_STEPS, phoneme_to_int, int_to_phoneme,
            sample_rate=SAMPLE_RATE, n_mfcc=N_MFCC, n_mels=N_MELS, n_chroma=N_CHROMA,
        )
        logging.info(f"\nPredicted phonemes for {os.path.basename(example_audio_path)}: {predicted_phonemes}")
    else:
        logging.warning(f"Example audio file not found at: {example_audio_path}")

    # Save the trained model and vocabulary.
    # np.save stores the dicts as pickled objects, so reading them back
    # requires np.load(..., allow_pickle=True).item().
    model.save(os.path.join(FEATURES_FOLDER, "phoneme_predictor_model_concat_padded.h5"))
    np.save(os.path.join(FEATURES_FOLDER, "phoneme_to_int_concat_padded.npy"), phoneme_to_int)
    np.save(os.path.join(FEATURES_FOLDER, "int_to_phoneme_concat_padded.npy"), int_to_phoneme)
    logging.info("✅ Model and vocabulary saved (concatenated features with padding).")

Loading Training Data: 100%|██████████| 1633/1633 [00:30<00:00, 54.19it/s]
2025-04-11 13:15:44,539 - INFO - Number of concatenated features per time step: 153
2025-04-11 13:15:44,550 - INFO - Number of unique phonemes (including padding): 9409
2025-04-11 13:15:44,550 - INFO - Training data shape (features): (1633, 128, 153)
2025-04-11 13:15:44,552 - INFO - Training data shape (labels): (1633, 128)
  super().__init__(**kwargs)


Epoch 1/30
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 2s/step - accuracy: 0.6301 - loss: 7.3016 - val_accuracy: 0.7471 - val_loss: 2.3964
Epoch 2/30
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 2s/step - accuracy: 0.7083 - loss: 2.5627 - val_accuracy: 0.7471 - val_loss: 2.3993
Epoch 3/30
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 2s/step - accuracy: 0.7089 - loss: 2.4917 - val_accuracy: 0.7471 - val_loss: 2.2855
Epoch 4/30
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 2s/step - accuracy: 0.7134 - loss: 2.2973 - val_accuracy: 0.7471 - val_loss: 2.1384
Epoch 5/30
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 2s/step - accuracy: 0.7116 - loss: 2.1592 - val_accuracy: 0.7471 - val_loss: 2.0569
Epoch 6/30
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 2s/step - accuracy: 0.7022 - loss: 2.1628 - val_accuracy: 0.7471 - val_loss: 2.0327
Epoch 7/30
[1m41/41[0m [32m━━━━━━━━━━

Loading Test Data: 100%|██████████| 409/409 [00:11<00:00, 35.60it/s]
2025-04-11 13:30:37,112 - INFO - Test Loss: 1.8596, Test Accuracy: 0.7410


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step


2025-04-11 13:30:38,276 - INFO - 
--- Prediction for: 10367_10282_000010 ---
2025-04-11 13:30:38,276 - INFO - True Phonemes:    ['ˈi', 'sˌuβiɾˈa', 'kwˈal', 'ɾɾenwˈeβo', 'delˈante', 'dˈe', 'ˈel', 'ˈi', 'kˈomo', 'dˈe', 'tjˈeɾɾa', 'sˈeka', 'nˈo', 'ˈaɪ', 'pˌaɾeθˈeɾ', 'ˈen', 'ˈel', 'nˈi', 'ˌeɾmosˈuɾa', 'bˈeɾlo', 'ˈemos', 'mˈas', 'sˈin', 'pˈaɾa', 'kˈe', 'lˈe']
2025-04-11 13:30:38,276 - INFO - Predicted Phonemes: []
2025-04-11 13:30:38,284 - INFO - Raw Predictions (first time step):
[6.3762553e-03 5.2236430e-03 7.1334979e-04 ... 3.2413318e-05 3.3087355e-05
 3.6499936e-05]


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step


2025-04-11 13:30:38,372 - INFO - 
--- Prediction for: 10367_10282_000011 ---
2025-04-11 13:30:38,372 - INFO - True Phonemes:    ['ˈi', 'seɾˈa', 'ˈa', 'xˌeoβˈa', 'pˈoɾ', 'nˈombɾe', 'pˈoɾ', 'seɲˈal', 'etˈeɾna', 'kˈe', 'nˈunka', 'seɾˈa', 'ˌisaˈias', 'θinkwˈɛnta', 'ˈi', 'sˈeɪs', 'asˈi', 'dˈixo', 'xˌeoβˈa', 'deɾˈetʃo', 'ˈi', 'xustˈiθja', 'pˈoɾke', 'estˈa', 'mˈi', 'salˈud', 'pˈaɾa', 'benˈiɾ', 'ˈi', 'mˈi', 'xustˈiθja', 'pˈaɾa']
2025-04-11 13:30:38,372 - INFO - Predicted Phonemes: []
2025-04-11 13:30:38,372 - INFO - Raw Predictions (first time step):
[6.3762553e-03 5.2236430e-03 7.1334979e-04 ... 3.2413318e-05 3.3087355e-05
 3.6499936e-05]


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step


2025-04-11 13:30:38,469 - INFO - 
--- Prediction for: 10367_10282_000013 ---
2025-04-11 13:30:38,469 - INFO - True Phonemes:    ['sˈi', 'dˈel', 'sˈaβaðo', 'tˈu', 'pjˈe', 'dˈe', 'aθˈeɾ', 'tˈu', 'bˌoluntˈad', 'ˈen', 'mˈi', 'dˈia', 'sˈanto', 'ˈi', 'ˈal', 'sˈaβaðo', 'sˈanto', 'dˈe', 'xˌeoβˈa', 'ˈi', 'lˈo', 'nˈo', 'andˈando', 'ˈen', 'tˈus', 'kamˈinos', 'nˈi', 'buskˈando', 'tˈu', 'bˌoluntˈad', 'nˈi', 'aβlˈando', 'tˈus', 'palˈaβɾas']
2025-04-11 13:30:38,469 - INFO - Predicted Phonemes: []
2025-04-11 13:30:38,469 - INFO - Raw Predictions (first time step):
[6.3762553e-03 5.2236430e-03 7.1334979e-04 ... 3.2413318e-05 3.3087355e-05
 3.6499936e-05]


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step


2025-04-11 13:30:38,568 - INFO - 
--- Prediction for: 10367_10282_000015 ---
2025-04-11 13:30:38,568 - INFO - True Phonemes:    ['nˈo', 'kˌonoθjˈeɾon', 'kamˈino', 'dˈe', 'pˈaθ', 'nˈi', 'ˈaɪ', 'deɾˈetʃo', 'ˈen', 'sˈus', 'kamˈinos', 'sˈus', 'sˈon', 'kwalkjˈeɾa', 'kˈe', 'pˈoɾ', 'ˈeʎas', 'fwˈeɾe', 'nˈo', 'pˈaθ', 'pˈoɾ', 'ˈesto', 'sˈe', 'ˌalexˈo', 'dˈe', 'nosˈotɾos', 'ˈel', 'xwˈiθjo', 'ˈi', 'nˈo', 'nˈos', 'ˌalkanθˈo', 'xustˈiθja', 'lˈuθ', 'ˈi', 'ˈe', 'akˈi', 'tinjˈeβlas', 'ˈi', 'ˈen', 'ˌoskuɾiðˈad']
2025-04-11 13:30:38,568 - INFO - Predicted Phonemes: []
2025-04-11 13:30:38,575 - INFO - Raw Predictions (first time step):
[6.3762553e-03 5.2236430e-03 7.1334979e-04 ... 3.2413318e-05 3.3087355e-05
 3.6499936e-05]


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step


2025-04-11 13:30:38,659 - INFO - 
--- Prediction for: 10367_10282_000017 ---
2025-04-11 13:30:38,659 - INFO - True Phonemes:    ['aβjˈɛndo', 'ˈel', 'ʎeβˈaðo', 'ˈel', 'pekˈaðo', 'dˈe', 'mˈutʃos', 'ˈi', 'pˈoɾ', 'lˈos']
2025-04-11 13:30:38,659 - INFO - Predicted Phonemes: []
2025-04-11 13:30:38,659 - INFO - Raw Predictions (first time step):
[6.3762553e-03 5.2236430e-03 7.1334979e-04 ... 3.2413318e-05 3.3087355e-05
 3.6499936e-05]
2025-04-11 13:30:38,766 - INFO - ✅ Model and vocabulary saved (concatenated features with padding).
