In [2]:
pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp312-cp312-win_amd64.whl.metadata (8.3 kB)
Downloading sentencepiece-0.2.0-cp312-cp312-win_amd64.whl (991 kB)
   ---------------------------------------- 0.0/992.0 kB ? eta -:--:--
   ---------------------------------------- 992.0/992.0 kB 7.8 MB/s eta 0:00:00
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0
Note: you may need to restart the kernel to use updated packages.


In [9]:
import os

AUDIO_DIRECTORY = r"C:\ASR\converted_audio"
TRANSCRIPTS_DIRECTORY = r"C:\ASR\transcripts" # This directory currently holds Google transcripts

# If you have raw transcripts in a different location before processing, you might define:
# RAW_TRANSCRIPTS_DIRECTORY = r"C:\ASR\raw_transcripts"

print(f"Audio Directory: {AUDIO_DIRECTORY}")
print(f"Transcripts Directory: {TRANSCRIPTS_DIRECTORY}")

# You might want to create a directory to store the vocabulary file later
VOCAB_DIRECTORY = r"C:\ASR\vocabulary"
if not os.path.exists(VOCAB_DIRECTORY):
    os.makedirs(VOCAB_DIRECTORY)
    print(f"Created vocabulary directory: {VOCAB_DIRECTORY}")

Audio Directory: C:\ASR\converted_audio
Transcripts Directory: C:\ASR\transcripts


In [18]:
import ffmpeg
import os
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

# Supported input file formats
SUPPORTED_FORMATS = (".mp3", ".aac", ".flac", ".ogg", ".opus", ".m4a", ".wav")

def convert_audio_to_wav(input_path, output_path, sample_rate=16000):
    """Converts any audio file to WAV format."""
    if os.path.exists(output_path):  
        return f"Skipping {os.path.basename(input_path)} (Already converted)"

    try:
        ffmpeg.input(input_path).output(output_path, ar=sample_rate, ac=1).run(overwrite_output=True, quiet=True)
        return f"✅ Converted {os.path.basename(input_path)}"
    except ffmpeg.Error as e:
        return f"❌ Error converting {os.path.basename(input_path)}: {e}"

def convert_dataset_to_wav(input_folder, output_folder, sample_rate=16000, max_workers=4):
    """Convert all supported audio files to WAV format using multi-threading."""
    
    os.makedirs(output_folder, exist_ok=True)  # Ensure output directory exists

    # Get all supported audio files
    audio_files = [f for f in os.listdir(input_folder) if f.endswith(SUPPORTED_FORMATS)]
    new_files = [f for f in audio_files if not os.path.exists(os.path.join(output_folder, os.path.splitext(f)[0] + ".wav"))]

    if not new_files:
        print(f"✅ No new files to convert in {input_folder}. Skipping conversion.")
        return

    print(f"🔄 Converting {len(new_files)} new files in {input_folder}...")

    # Use ThreadPoolExecutor to process multiple files in parallel
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        tasks = [
            executor.submit(
                convert_audio_to_wav, 
                os.path.join(input_folder, file), 
                os.path.join(output_folder, os.path.splitext(file)[0] + ".wav"),  
                sample_rate
            )
            for file in new_files
        ]

        for future in tqdm(tasks, desc=f"Processing {input_folder}"):
            print(future.result())  # Print conversion status

# Example usage
dataset_folders = [
    r"C:\ASR\raw_audio",  # Change this to your input folder
]

output_folders = [
    r"C:\ASR\converted_audio",  # Change this to your output folder
]
    
# Convert all audio files in dataset folders
for in_folder, out_folder in zip(dataset_folders, output_folders):
    convert_dataset_to_wav(in_folder, out_folder, max_workers=6)  


✅ No new files to convert in C:\ASR\raw_audio. Skipping conversion.


In [1]:
import speech_recognition as sr
import logging
import os

AUDIO_DIRECTORY = r"C:\ASR\converted_audio"
TRANSCRIPTS_DIRECTORY = r"C:\ASR\transcripts"
PROCESSED_FILES_LOG = "processed_audio_files.log"

def transcribe_audio(input_path):
    """
    Transcribes an audio file using Google Speech Recognition.

    Args:
        input_path (str): The path to the audio file.

    Returns:
        str: The transcribed text in Spanish, or an empty string if transcription fails.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(input_path) as source:
        try:
            audio_data = recognizer.record(source)
            return recognizer.recognize_google(audio_data, language="es-ES")
        except sr.UnknownValueError:
            return ""
        except sr.RequestError as e:
            logging.error(f"❌ API Error: Could not request results for {input_path}: {e}")
            return ""
        except Exception as e:
            logging.error(f"❌ Error during transcription of {input_path}: {e}")
            return ""

def load_processed_files():
    """Loads the list of already processed audio files from the log file."""
    processed = set()
    if os.path.exists(PROCESSED_FILES_LOG):
        with open(PROCESSED_FILES_LOG, "r") as f:
            for line in f:
                processed.add(line.strip())
    return processed

def save_processed_file(filename):
    """Appends the filename to the log of processed audio files."""
    with open(PROCESSED_FILES_LOG, "a") as f:
        f.write(filename + "\n")

def save_transcript(filename, transcript):
    """Saves the transcript to a text file in the transcripts directory."""
    if not os.path.exists(TRANSCRIPTS_DIRECTORY):
        os.makedirs(TRANSCRIPTS_DIRECTORY)
    transcript_filename = os.path.splitext(filename)[0] + ".txt"
    transcript_path = os.path.join(TRANSCRIPTS_DIRECTORY, transcript_filename)
    with open(transcript_path, "w", encoding="utf-8") as f:
        f.write(transcript)

def process_audio_files():
    """
    Transcribes new audio files in the converted folder and saves the transcripts.
    It avoids re-transcribing previously processed files.
    """
    if not os.path.exists(AUDIO_DIRECTORY):
        print(f"Error: Audio directory not found: {AUDIO_DIRECTORY}")
        return

    if not os.path.exists(TRANSCRIPTS_DIRECTORY):
        os.makedirs(TRANSCRIPTS_DIRECTORY)
        print(f"Created transcripts directory: {TRANSCRIPTS_DIRECTORY}")

    processed_files = load_processed_files()
    audio_files = [f for f in os.listdir(AUDIO_DIRECTORY) if os.path.isfile(os.path.join(AUDIO_DIRECTORY, f))]

    new_files_to_process = []
    existing_transcripts = set(os.listdir(TRANSCRIPTS_DIRECTORY))

    print("Checking for new audio files...")
    for audio_file in audio_files:
        if audio_file not in processed_files:
            transcript_filename = os.path.splitext(audio_file)[0] + ".txt"
            if transcript_filename not in existing_transcripts:
                new_files_to_process.append(audio_file)
            else:
                print(f"Transcript already exists for: {audio_file}")
                save_processed_file(audio_file) # Mark as processed if transcript exists

    if not new_files_to_process:
        print("No new audio files found to transcribe.")
        return

    print("Starting transcription of new audio files...")
    for audio_file in new_files_to_process:
        file_path = os.path.join(AUDIO_DIRECTORY, audio_file)
        print(f"Transcribing: {audio_file}")
        transcription = transcribe_audio(file_path)
        if transcription:
            print(f"Transcription of {audio_file}: {transcription}")
            save_transcript(audio_file, transcription)
            save_processed_file(audio_file)
        else:
            print(f"Transcription failed or audio is silent for {audio_file}")
            save_processed_file(audio_file) # Mark as processed even if failed

if __name__ == "__main__":
    process_audio_files()

Checking for new audio files...
Starting transcription of new audio files...
Transcribing: 3473_1378_000149.wav
Transcription of 3473_1378_000149.wav: báilame Dios Qué es lo que veo es posible que tengo en mis brazos a mi cargo amigo a mi buen vecino Sancho Panza si tengo sin duda porque yo ni duermo ni estoy ahora borracho
Transcribing: 3473_1378_000150.wav
Transcription of 3473_1378_000150.wav: a lo menos en mí le puso de suerte que me parece que antes del tiempo que se nos concedía para que hiciésemos ausencia de España ya tenía el rigor de la pena ejecutado en mi persona y en la de mis hijos ordene Pues a mí parecer como prudente
Transcribing: 3678_1378_000000.wav
Transcription of 3678_1378_000000.wav: abre los ojos deseada Patria y mira que vuelve a ti Sancho Panza tu hijo si no muy rico muy bien azotado abre los brazos y recibe también Tu hijo Don Quijote que si viene vencido de los brazos ajenos viene vencedor de sí mismo
Transcribing: 3678_1378_000001.wav
Transcription of 3678_

In [5]:
import os

TRANSCRIPTS_DIRECTORY = r"C:\ASR\transcripts"
ALL_TRANSCRIPTS_FILE = r"C:\ASR\all_transcripts.txt"

with open(ALL_TRANSCRIPTS_FILE, "w", encoding="utf-8") as outfile:
    for filename in os.listdir(TRANSCRIPTS_DIRECTORY):
        if filename.endswith(".txt"):
            filepath = os.path.join(TRANSCRIPTS_DIRECTORY, filename)
            with open(filepath, "r", encoding="utf-8") as infile:
                full_transcript = infile.read().strip()  # Read the entire content and remove leading/trailing whitespace
                outfile.write(full_transcript + "\n")  # Write the full transcript followed by a newline

In [6]:
import sentencepiece as spm
import os

# --- Configuration ---
VOCAB_DIRECTORY = r"C:\ASR\vocabulary"
ALL_TRANSCRIPTS_FILE = r"C:\ASR\all_transcripts.txt"
INPUT_FILE = ALL_TRANSCRIPTS_FILE
MODEL_PREFIX = os.path.join(VOCAB_DIRECTORY, "spanish_subwords")
VOCAB_SIZE = 8000
MODEL_TYPE = "bpe"
CHARACTER_COVERAGE = 1.0
MAX_SENTENCE_LENGTH = 10000

# --- Create Vocabulary Directory if it doesn't exist ---
if not os.path.exists(VOCAB_DIRECTORY):
    os.makedirs(VOCAB_DIRECTORY)
    print(f"Created vocabulary directory: {VOCAB_DIRECTORY}")

# --- Train SentencePiece Model ---
train_arguments = (
    f'--input={INPUT_FILE} '
    f'--model_prefix={MODEL_PREFIX} '
    f'--vocab_size={VOCAB_SIZE} '
    f'--model_type={MODEL_TYPE} '
    f'--character_coverage={CHARACTER_COVERAGE} '
    f'--max_sentence_length={MAX_SENTENCE_LENGTH}'
)

print(f"Training SentencePiece with arguments: {train_arguments}")

try:
    spm.SentencePieceTrainer.train(
        train_arguments,
        output_dir=VOCAB_DIRECTORY
    )
    print(f"SentencePiece model and vocabulary saved to: {VOCAB_DIRECTORY}")
except RuntimeError as e:
    print(f"Error during SentencePiece training (Runtime): {e}")
except OSError as e:
    print(f"OS Error during SentencePiece training: {e}")

print("--- End of Phase 1 (Vocabulary Creation) ---")
print("Please check the C:\\ASR\\vocabulary directory for spanish_subwords.model and spanish_subwords.vocab.")

Training SentencePiece with arguments: --input=C:\ASR\all_transcripts.txt --model_prefix=C:\ASR\vocabulary\spanish_subwords --vocab_size=8000 --model_type=bpe --character_coverage=1.0 --max_sentence_length=10000
SentencePiece model and vocabulary saved to: C:\ASR\vocabulary
--- End of Phase 1 (Vocabulary Creation) ---
Please check the C:\ASR\vocabulary directory for spanish_subwords.model and spanish_subwords.vocab.


In [1]:
import os
import librosa
import numpy as np
import pickle

# --- Configuration ---
AUDIO_DIRECTORY = r"C:\ASR\converted_audio"
FEATURES_DIRECTORY = r"C:\ASR\features"

# --- Function to extract and save features ---
def extract_and_save_features(audio_path, feature_path, sample_rate=16000, feature_type='mfcc', num_mfcc=40, num_mel_bins=80):
    try:
        audio, sr = librosa.load(audio_path, sr=sample_rate)
        if feature_type == 'mfcc':
            features = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=num_mfcc).T
        elif feature_type == 'mel':
            mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=num_mel_bins)
            features = librosa.power_to_db(mel_spectrogram, ref=np.max).T
        else:
            raise ValueError(f"Unsupported feature type: {feature_type}")

        with open(feature_path, 'wb') as f:
            pickle.dump(features, f)
        print(f"Extracted and saved features for: {os.path.basename(audio_path)}")

    except Exception as e:
        print(f"Error processing {os.path.basename(audio_path)}: {e}")

# --- Main script ---
if __name__ == "__main__":
    audio_files = [f for f in os.listdir(AUDIO_DIRECTORY) if f.endswith(('.wav', '.flac'))]
    os.makedirs(FEATURES_DIRECTORY, exist_ok=True)

    processed_count = 0
    skipped_count = 0

    for audio_file in audio_files:
        audio_path = os.path.join(AUDIO_DIRECTORY, audio_file)
        feature_file = audio_file.replace('.wav', '.pkl').replace('.flac', '.pkl')
        feature_path = os.path.join(FEATURES_DIRECTORY, feature_file)

        if not os.path.exists(feature_path):
            extract_and_save_features(audio_path, feature_path)
            processed_count += 1
        else:
            print(f"Features already exist for: {os.path.basename(audio_path)}. Skipping.")
            skipped_count += 1

    print(f"\nFeature extraction complete.")
    print(f"Processed {processed_count} new audio files.")
    print(f"Skipped {skipped_count} existing feature files.")
    print(f"Features saved in: {FEATURES_DIRECTORY}")

Extracted and saved features for: 10367_10282_000000.wav
Extracted and saved features for: 10367_10282_000001.wav
Extracted and saved features for: 10367_10282_000002.wav
Extracted and saved features for: 10367_10282_000003.wav
Extracted and saved features for: 10367_10282_000004.wav
Extracted and saved features for: 10367_10282_000005.wav
Extracted and saved features for: 10367_10282_000006.wav
Extracted and saved features for: 10367_10282_000007.wav
Extracted and saved features for: 10367_10282_000008.wav
Extracted and saved features for: 10367_10282_000009.wav
Extracted and saved features for: 10367_10282_000010.wav
Extracted and saved features for: 10367_10282_000011.wav
Extracted and saved features for: 10367_10282_000012.wav
Extracted and saved features for: 10367_10282_000013.wav
Extracted and saved features for: 10367_10282_000014.wav
Extracted and saved features for: 10367_10282_000015.wav
Extracted and saved features for: 10367_10282_000016.wav
Extracted and saved features fo

In [2]:
import sentencepiece as spm
import os

VOCAB_DIRECTORY = r"C:\ASR\vocabulary"
MODEL_PATH = os.path.join(VOCAB_DIRECTORY, "spanish_subwords.model")
sp = spm.SentencePieceProcessor(model_file=MODEL_PATH)
vocabulary_size = sp.get_piece_size()
BLANK_TOKEN = "<blank>"
BLANK_TOKEN_ID = vocabulary_size # Define blank token ID
print(f"Vocabulary size: {vocabulary_size}, Blank Token ID: {BLANK_TOKEN_ID}")

Vocabulary size: 8000, Blank Token ID: 8000


In [3]:
import torch
from torch.utils.data import Dataset
import os
import pickle

FEATURES_DIRECTORY = r"C:\ASR\features"
TRANSCRIPTS_DIRECTORY = r"C:\ASR\transcripts"

class ASRDataset(Dataset):
    def __init__(self, features_dir, transcripts_dir, tokenizer):
        self.features_dir = features_dir
        self.transcripts_dir = transcripts_dir
        self.tokenizer = tokenizer
        self.feature_files = [f for f in os.listdir(features_dir) if f.endswith(".pkl")]
        self.transcript_files = {f.replace('.pkl', '.txt'): f for f in self.feature_files}

    def __len__(self):
        return len(self.feature_files)

    def __getitem__(self, idx):
        feature_file = os.path.join(self.features_dir, self.feature_files[idx])
        transcript_file = os.path.join(self.transcripts_dir, self.feature_files[idx].replace('.pkl', '.txt'))

        # 1. Load Pre-extracted Features
        try:
            with open(feature_file, 'rb') as f:
                features = pickle.load(f)
            features_tensor = torch.tensor(features, dtype=torch.float)
        except Exception as e:
            print(f"Error loading features from {feature_file}: {e}")
            return None, None

        # 2. Load and Tokenize Transcript
        try:
            with open(transcript_file, 'r', encoding='utf-8') as f:
                transcript = f.readline().strip()
        except FileNotFoundError:
            print(f"Transcript file not found: {transcript_file}")
            transcript = ""

        tokens = self.tokenizer.encode(transcript)
        token_ids = torch.tensor(tokens, dtype=torch.int)

        return features_tensor, token_ids

dataset = ASRDataset(FEATURES_DIRECTORY, TRANSCRIPTS_DIRECTORY, sp)
print(f"Number of samples in dataset: {len(dataset)}")
if len(dataset) > 0:
    sample_features, sample_tokens = dataset[0]
    print(f"Sample features shape: {sample_features.shape}")
    print(f"Sample token IDs: {sample_tokens.shape}")
    print(f"Corresponding tokens: {sp.id_to_piece(sample_tokens.tolist())}")
else:
    print("Dataset is empty.")

Number of samples in dataset: 2043
Sample features shape: torch.Size([376, 40])
Sample token IDs: torch.Size([46])
Corresponding tokens: ['▁y', '▁a', '▁los', '▁hijos', '▁de', '▁los', '▁extranjeros', '▁que', '▁se', '▁llegaron', '▁a', '▁Jehová', '▁para', '▁ad', 'min', 'istrar', 'le', '▁y', '▁que', '▁amar', '▁en', '▁el', '▁nombre', '▁de', '▁Jehová', '▁para', '▁hacer', '▁sus', '▁siervos', '▁a', '▁todos', '▁los', '▁que', '▁guar', 'daron', '▁el', '▁sábado', '▁de', '▁prof', 'anar', '▁lo', '▁y', '▁abrazar', '▁en', '▁mi', '▁pacto']


In [10]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    batch = [item for item in batch if item is not None]
    if not batch:
        return torch.empty(0, 0, 40), torch.empty(0, 0, dtype=torch.int), torch.empty(0, dtype=torch.int), torch.empty(0, dtype=torch.int)
    features = [item[0] for item in batch]
    tokens = [item[1] for item in batch]
    padded_features = pad_sequence(features, batch_first=True, padding_value=0.0)
    padded_tokens = pad_sequence(tokens, batch_first=True, padding_value=BLANK_TOKEN_ID)
    input_lengths = torch.tensor([len(f) for f in features], dtype=torch.int)
    target_lengths = torch.tensor([len(t) for t in tokens], dtype=torch.int)
    return padded_features, padded_tokens, input_lengths, target_lengths

batch_size = 32
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, num_workers=0) # num_workers=0 for debugging

Batch 0:
  Features shape: torch.Size([32, 618, 40])
  Tokens shape: torch.Size([32, 66])
  Input lengths: tensor([341, 557, 364, 403, 326, 344, 485, 511, 442, 586, 537, 443, 618, 618,
        581, 416, 591, 383, 488, 529, 476, 492, 578, 322, 440, 413, 438, 354,
        562, 582, 329, 535], dtype=torch.int32)
  Target lengths: tensor([33, 46, 39, 33, 28, 30, 29, 40, 39, 32, 34, 36, 52, 55, 66, 44, 48, 28,
        48, 34, 41, 48, 46, 44, 38, 48, 36, 26, 56, 50, 23, 53],
       dtype=torch.int32)


In [4]:
import torch.nn as nn

class ASRModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super(ASRModel, self).__init__()
        self.rnn = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x, input_lengths):
        packed_x = nn.utils.rnn.pack_padded_sequence(x, input_lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, _ = self.rnn(packed_x)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        out = self.fc(out)
        return out

input_dim = 40 # Adjust if using different features (e.g., 80 for Mel)
hidden_dim = 256
num_layers = 2
output_dim = vocabulary_size + 1
model = ASRModel(input_dim, hidden_dim, output_dim, num_layers)
print(model)

ASRModel(
  (rnn): LSTM(40, 256, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=8001, bias=True)
)


In [5]:
import torch.nn as nn

criterion = nn.CTCLoss(blank=BLANK_TOKEN_ID, zero_infinity=True)

In [2]:
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset  # Import Dataset here
from torch.nn.utils.rnn import pad_sequence
import sentencepiece as spm
import os
import torch.nn as nn
import pickle

# --- Configuration ---
FEATURES_DIRECTORY = r"C:\ASR\features"
TRANSCRIPTS_DIRECTORY = r"C:\ASR\transcripts"
VOCAB_DIRECTORY = r"C:\ASR\vocabulary"
MODEL_PATH = os.path.join(VOCAB_DIRECTORY, "spanish_subwords.model")
BLANK_TOKEN = "<blank>"

# --- Load SentencePiece Model ---
sp = spm.SentencePieceProcessor(model_file=MODEL_PATH)
vocabulary_size = sp.get_piece_size()
BLANK_TOKEN_ID = vocabulary_size
print(f"Vocabulary size: {vocabulary_size}, Blank Token ID: {BLANK_TOKEN_ID}")

# --- Define ASRDataset Class (loads pre-extracted features) ---
class ASRDataset(Dataset):
    def __init__(self, features_dir, transcripts_dir, tokenizer):
        self.features_dir = features_dir
        self.transcripts_dir = transcripts_dir
        self.tokenizer = tokenizer
        self.feature_files = [f for f in os.listdir(features_dir) if f.endswith(".pkl")]
        self.transcript_files = {f.replace('.pkl', '.txt'): f for f in self.feature_files}

    def __len__(self):
        return len(self.feature_files)

    def __getitem__(self, idx):
        feature_file = os.path.join(self.features_dir, self.feature_files[idx])
        transcript_file = os.path.join(self.transcripts_dir, self.feature_files[idx].replace('.pkl', '.txt'))

        # 1. Load Pre-extracted Features
        try:
            with open(feature_file, 'rb') as f:
                features = pickle.load(f)
            features_tensor = torch.tensor(features, dtype=torch.float)
        except Exception as e:
            print(f"Error loading features from {feature_file}: {e}")
            return None, None

        # 2. Load and Tokenize Transcript
        try:
            with open(transcript_file, 'r', encoding='utf-8') as f:
                transcript = f.readline().strip()
        except FileNotFoundError:
            print(f"Transcript file not found: {transcript_file}")
            transcript = ""

        tokens = self.tokenizer.encode(transcript)
        token_ids = torch.tensor(tokens, dtype=torch.int)

        return features_tensor, token_ids

# --- DataLoader ---
def collate_fn(batch):
    batch = [item for item in batch if item is not None]
    if not batch:
        return torch.empty(0, 0, 40), torch.empty(0, 0, dtype=torch.int), torch.empty(0, dtype=torch.int), torch.empty(0, dtype=torch.int)
    features = [item[0] for item in batch]
    tokens = [item[1] for item in batch]
    padded_features = pad_sequence(features, batch_first=True, padding_value=0.0)
    padded_tokens = pad_sequence(tokens, batch_first=True, padding_value=BLANK_TOKEN_ID)
    input_lengths = torch.tensor([len(f) for f in features], dtype=torch.int)
    target_lengths = torch.tensor([len(t) for t in tokens], dtype=torch.int)
    return padded_features, padded_tokens, input_lengths, target_lengths

dataset = ASRDataset(FEATURES_DIRECTORY, TRANSCRIPTS_DIRECTORY, sp)
batch_size = 8
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, num_workers=0)

# --- Optimized Model Definition ---
class OptimizedASRModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout_rate=0.1):
        super(OptimizedASRModel, self).__init__()
        self.rnn = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True, dropout=dropout_rate)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x, input_lengths):
        packed_x = nn.utils.rnn.pack_padded_sequence(x, input_lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, _ = self.rnn(packed_x)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        out = self.fc(out)
        return out

input_dim = 40 # Adjust if using different features
hidden_dim = 128  # Reduced hidden dimension
num_layers = 1    # Reduced number of layers
output_dim = vocabulary_size + 1
model = OptimizedASRModel(input_dim, hidden_dim, output_dim, num_layers)

# --- Loss Function and Optimizer ---
criterion = nn.CTCLoss(blank=BLANK_TOKEN_ID, zero_infinity=True)
learning_rate = 0.001
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# --- Training Parameters ---
num_epochs = 20  # Increased epochs to compensate for smaller model
device = torch.device("cpu") # Explicitly set to CPU
model.to(device)

# --- Training Loop ---
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch_idx, (batch_features, batch_tokens, batch_input_lengths, batch_target_lengths) in enumerate(dataloader):
        if batch_features is None or batch_tokens is None:
            print(f"Warning: Skipping batch {batch_idx} due to None features or tokens.")
            continue
        batch_features = batch_features.to(device)
        batch_tokens = batch_tokens.to(device)
        outputs = model(batch_features, batch_input_lengths)
        log_probs = torch.nn.functional.log_softmax(outputs, dim=2)
        loss = criterion(log_probs.transpose(0, 1), batch_tokens, batch_input_lengths, batch_target_lengths)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
        optimizer.step()
        total_loss += loss.item()
        if batch_idx % 10 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx}/{len(dataloader)}], Loss: {loss.item():.4f}")
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Average Loss: {avg_loss:.4f}")
    save_path = os.path.join(VOCAB_DIRECTORY, f"asr_model_epoch_{epoch+1}_cpu.pth")
    torch.save(model.state_dict(), save_path)
    print(f"Model saved at: {save_path}")

print("Training finished!")

Vocabulary size: 8000, Blank Token ID: 8000




Epoch [1/20], Batch [0/256], Loss: 96.7082
Epoch [1/20], Batch [10/256], Loss: 83.1664
Epoch [1/20], Batch [20/256], Loss: 59.3931
Epoch [1/20], Batch [30/256], Loss: 19.9024
Epoch [1/20], Batch [40/256], Loss: 7.6407
Epoch [1/20], Batch [50/256], Loss: 7.4276
Epoch [1/20], Batch [60/256], Loss: 7.6170
Epoch [1/20], Batch [70/256], Loss: 7.4882
Epoch [1/20], Batch [80/256], Loss: 7.7652
Epoch [1/20], Batch [90/256], Loss: 7.6628
Epoch [1/20], Batch [100/256], Loss: 7.0694
Epoch [1/20], Batch [110/256], Loss: 7.3779
Epoch [1/20], Batch [120/256], Loss: 7.2701
Epoch [1/20], Batch [130/256], Loss: 7.0141
Epoch [1/20], Batch [140/256], Loss: 7.0504
Epoch [1/20], Batch [150/256], Loss: 6.6302
Epoch [1/20], Batch [160/256], Loss: 7.3821
Epoch [1/20], Batch [170/256], Loss: 7.3937
Epoch [1/20], Batch [180/256], Loss: 7.1830
Epoch [1/20], Batch [190/256], Loss: 7.2826
Epoch [1/20], Batch [200/256], Loss: 7.1487
Epoch [1/20], Batch [210/256], Loss: 7.3632
Epoch [1/20], Batch [220/256], Loss: 7.

In [9]:
import speech_recognition as sr
import logging
import os

AUDIO_DIRECTORY = r"C:\ASR\converted_audio"
TRANSCRIPTS_DIRECTORY = r"C:\ASR\transcripts"
ALL_TRANSCRIPTS_FILE = r"C:\ASR\all_transcripts.txt"

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def transcribe_audio(input_path):
    """
    Transcribes an audio file using Google Speech Recognition.

    Args:
        input_path (str): The path to the audio file.

    Returns:
        str: The transcribed text in Spanish, or None if transcription fails.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(input_path) as source:
        try:
            audio_data = recognizer.record(source)
            return recognizer.recognize_google(audio_data, language="es-ES")
        except sr.UnknownValueError:
            logging.warning(f"⚠️ Could not understand audio in {input_path}")
            return None
        except sr.RequestError as e:
            logging.error(f"❌ API Error: Could not request results for {input_path}: {e}")
            return None
        except Exception as e:
            logging.error(f"❌ Error during transcription of {input_path}: {e}")
            return None

def process_audio_files_no_check():
    """
    Transcribes all audio files in the converted folder and saves the transcripts
    with sequential numbers to all_transcripts.txt.
    It will re-transcribe files even if they were processed before.
    """
    if not os.path.exists(AUDIO_DIRECTORY):
        print(f"Error: Audio directory not found: {AUDIO_DIRECTORY}")
        return

    if not os.path.exists(TRANSCRIPTS_DIRECTORY):
        os.makedirs(TRANSCRIPTS_DIRECTORY)
        print(f"Created transcripts directory: {TRANSCRIPTS_DIRECTORY}")

    audio_files = [f for f in os.listdir(AUDIO_DIRECTORY) if os.path.isfile(os.path.join(AUDIO_DIRECTORY, f))]
    print(f"Found {len(audio_files)} audio files to transcribe.")

    all_transcriptions = []
    for i, audio_file in enumerate(audio_files, start=1):
        file_path = os.path.join(AUDIO_DIRECTORY, audio_file)
        logging.info(f"Transcribing: {audio_file} (File {i}/{len(audio_files)})")
        transcription = transcribe_audio(file_path)
        if transcription:
            numbered_line = f"{i}. {transcription.strip()}"
            all_transcriptions.append(numbered_line)
            logging.info(f"Transcription of {audio_file}: {transcription}")
            # Optionally save individual transcript as well (you can comment this out if you only need the combined file)
            transcript_filename = os.path.splitext(audio_file)[0] + ".txt"
            transcript_path = os.path.join(TRANSCRIPTS_DIRECTORY, transcript_filename)
            with open(transcript_path, "w", encoding="utf-8") as f:
                f.write(transcription)
        else:
            logging.warning(f"Transcription failed or audio is silent for {audio_file}")
            numbered_line = f"{i}. " # Save an empty line with the number
            all_transcriptions.append(numbered_line)

    if all_transcriptions:
        with open(ALL_TRANSCRIPTS_FILE, "w", encoding="utf-8") as outfile:
            outfile.write("\n\n".join(all_transcriptions)) # Save with double newline as paragraph separator
        print(f"\nAll transcriptions with numbers saved to: {ALL_TRANSCRIPTS_FILE}")
    else:
        print("\nNo transcriptions were generated.")

if __name__ == "__main__":
    process_audio_files_no_check()

2025-04-13 09:59:17,725 - INFO - Transcribing: 10367_10282_000000.wav (File 1/2043)


Found 2043 audio files to transcribe.


2025-04-13 09:59:22,416 - INFO - Transcription of 10367_10282_000000.wav: y a los hijos de los extranjeros que se llegaron a Jehová para administrarle y que amar en el nombre de Jehová para hacer sus siervos a todos los que guardaron el sábado de profanar lo y abrazar en mi pacto
2025-04-13 09:59:22,428 - INFO - Transcribing: 10367_10282_000001.wav (File 2/2043)
2025-04-13 09:59:27,801 - INFO - Transcription of 10367_10282_000001.wav: que me buscan cada día y quieren saber mis caminos como gente que hubiese obrado justicia y que no hubiese dejado el derecho de su Dios Pregúntame derechos de justicia y quieren acercarse a Dios por qué dicen ayunamos y no hiciste caso humillamos nuestras almas y no te diste por entendido
2025-04-13 09:59:27,805 - INFO - Transcribing: 10367_10282_000002.wav (File 3/2043)
2025-04-13 09:59:30,594 - INFO - Transcription of 10367_10282_000002.wav: no es antes el ayuno que yo escogí desatar las ligaduras de impiedad deshacer Los Ases de opresión y dejar libres


All transcriptions with numbers saved to: C:\ASR\all_transcripts.txt


In [10]:
import os
import shutil
import random
import pickle
import re

# --- Configuration ---
FEATURES_SOURCE_DIR = r"C:\ASR\features"
TRANSCRIPTS_SOURCE_DIR = r"C:\ASR\transcripts"  # Now pointing to the directory with individual transcript files
FEATURES_EVAL_DIR = r"C:\ASR\features_eval"
TRANSCRIPTS_EVAL_DIR = r"C:\ASR\transcripts_eval"
EVAL_SPLIT_RATIO = 0.2  # Percentage of data to use for evaluation

def prepare_evaluation_data(features_source, transcripts_source, features_eval, transcripts_eval, split_ratio):
    """
    Prepares evaluation features and copies corresponding transcript files.
    It assumes that the filenames (without extension) are the same for
    feature files (.pkl) and transcript files (.txt).
    """
    os.makedirs(features_eval, exist_ok=True)
    os.makedirs(transcripts_eval, exist_ok=True)

    all_feature_files = [f for f in os.listdir(features_source) if f.endswith(".pkl")]
    num_features = len(all_feature_files)
    num_eval_samples = int(num_features * split_ratio)

    eval_feature_files = [f for f in os.listdir(features_eval) if f.endswith(".pkl")]

    if not eval_feature_files:
        print("No evaluation features found. Sampling and copying...")
        eval_feature_files_to_copy = random.sample(all_feature_files, num_eval_samples)
        for feature_file in eval_feature_files_to_copy:
            source_path = os.path.join(features_source, feature_file)
            destination_path = os.path.join(features_eval, feature_file)
            shutil.copy2(source_path, destination_path)
        eval_feature_files = [f for f in os.listdir(features_eval) if f.endswith(".pkl")]
    else:
        print(f"Found {len(eval_feature_files)} existing feature files in '{features_eval}'.")

    print("Preparing evaluation transcripts...")
    num_transcripts_copied = 0
    for eval_feature_file in eval_feature_files:
        feature_base_name = os.path.splitext(eval_feature_file)[0]
        transcript_file_name = feature_base_name + ".txt"
        source_transcript_path = os.path.join(transcripts_source, transcript_file_name)
        destination_transcript_path = os.path.join(transcripts_eval, transcript_file_name)

        if os.path.exists(source_transcript_path):
            shutil.copy2(source_transcript_path, destination_transcript_path)
            num_transcripts_copied += 1
        else:
            print(f"Warning: Corresponding transcript file not found for feature file '{eval_feature_file}'.")

    print(f"Copied {num_transcripts_copied} transcript files to '{transcripts_eval}'.")
    print("Preparation of evaluation data completed.")

if __name__ == "__main__":
    prepare_evaluation_data(FEATURES_SOURCE_DIR, TRANSCRIPTS_SOURCE_DIR, FEATURES_EVAL_DIR, TRANSCRIPTS_EVAL_DIR, EVAL_SPLIT_RATIO)

Found 2043 existing feature files in 'C:\ASR\features_eval'.
Preparing evaluation transcripts...
Copied 2043 transcript files to 'C:\ASR\transcripts_eval'.
Preparation of evaluation data completed.


In [11]:
import os
import numpy as np
from jiwer import wer

TRANSCRIPTS_EVAL_DIR = r"C:\ASR\transcripts_eval"

def load_transcripts(transcripts_dir):
    """Loads transcripts from .txt files in the given directory."""
    transcripts = {}
    for filename in os.listdir(transcripts_dir):
        if filename.endswith(".txt"):
            filepath = os.path.join(transcripts_dir, filename)
            with open(filepath, 'r', encoding='utf-8') as f:
                transcripts[filename.replace(".txt", ".pkl")] = f.read().strip()
    return transcripts

def evaluate_asr(ground_truth_transcripts):
    """
    Evaluates ASR performance by calculating Word Error Rate (WER).

    Args:
        ground_truth_transcripts (dict): A dictionary where keys are feature file
                                         base names (without .pkl) and values
                                         are the corresponding ground truth transcripts.

    Returns:
        float: The average Word Error Rate across all samples.
    """
    references = []
    hypotheses = []
    evaluated_count = 0

    for base_name, reference in ground_truth_transcripts.items():
        # In a real ASR evaluation, you would load the *predicted* transcript
        # for the corresponding base_name here.
        # For this example, we're using the ground truth as the hypothesis.
        hypothesis = reference

        references.append(reference)
        hypotheses.append(hypothesis)
        evaluated_count += 1

    if references:
        wer_score = wer(references, hypotheses)
        print(f"Evaluated {evaluated_count} samples.")
        print(f"Word Error Rate (WER): {wer_score:.4f}")
        return wer_score
    else:
        print("No transcripts found for evaluation.")
        return None

if __name__ == "__main__":
    ground_truth_transcripts = load_transcripts(TRANSCRIPTS_EVAL_DIR)
    evaluate_asr(ground_truth_transcripts)

Evaluated 2043 samples.
Word Error Rate (WER): 0.0000


In [13]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import sentencepiece as spm
import os
import pickle
from jiwer import wer

# --- Configuration ---
FEATURES_DIRECTORY_EVAL = r"C:\ASR\features_eval"  # Directory for evaluation features
TRANSCRIPTS_DIRECTORY_EVAL = r"C:\ASR\transcripts_eval"  # Directory for evaluation transcripts
VOCAB_DIRECTORY = r"C:\ASR\vocabulary"
MODEL_PATH = os.path.join(VOCAB_DIRECTORY, "spanish_subwords.model")
MODEL_WEIGHTS_PATH = os.path.join(VOCAB_DIRECTORY, "asr_model_epoch_20_cpu.pth") # Path to the trained model weights
BLANK_TOKEN = "<blank>"

# --- Load SentencePiece Model ---
sp = spm.SentencePieceProcessor(model_file=MODEL_PATH)
vocabulary_size = sp.get_piece_size()
BLANK_TOKEN_ID = vocabulary_size
print(f"Vocabulary size: {vocabulary_size}, Blank Token ID: {BLANK_TOKEN_ID}")

# --- Define Evaluation ASRDataset Class ---
class EvaluationASRDataset(Dataset):
    def __init__(self, features_dir, transcripts_dir, tokenizer):
        self.features_dir = features_dir
        self.transcripts_dir = transcripts_dir
        self.tokenizer = tokenizer
        self.feature_files = [f for f in os.listdir(features_dir) if f.endswith(".pkl")]
        self.transcript_files = {f.replace('.pkl', '.txt'): f for f in self.feature_files}

    def __len__(self):
        return len(self.feature_files)

    def __getitem__(self, idx):
        feature_file = os.path.join(self.features_dir, self.feature_files[idx])
        transcript_file = os.path.join(self.transcripts_dir, self.feature_files[idx].replace('.pkl', '.txt'))

        # 1. Load Pre-extracted Features
        try:
            with open(feature_file, 'rb') as f:
                features = pickle.load(f)
            features_tensor = torch.tensor(features, dtype=torch.float)
        except Exception as e:
            print(f"Error loading features from {feature_file}: {e}")
            return None, None

        # 2. Load and Tokenize Transcript (for reference)
        try:
            with open(transcript_file, 'r', encoding='utf-8') as f:
                transcript = f.readline().strip()
        except FileNotFoundError:
            print(f"Transcript file not found: {transcript_file}")
            transcript = ""

        tokens = self.tokenizer.encode(transcript)
        token_ids = torch.tensor(tokens, dtype=torch.int)

        return features_tensor, transcript # Return raw transcript for WER calculation

# --- Evaluation DataLoader (no shuffling) ---
def eval_collate_fn(batch):
    batch = [item for item in batch if item is not None]
    if not batch:
        return torch.empty(0, 0, 40), [], torch.empty(0, dtype=torch.int)
    features = [item[0] for item in batch]
    transcripts = [item[1] for item in batch]
    padded_features = pad_sequence(features, batch_first=True, padding_value=0.0)
    input_lengths = torch.tensor([len(f) for f in features], dtype=torch.int)
    return padded_features, transcripts, input_lengths

eval_dataset = EvaluationASRDataset(FEATURES_DIRECTORY_EVAL, TRANSCRIPTS_DIRECTORY_EVAL, sp)
eval_batch_size = 8 # Keep batch size consistent with training or adjust as needed
eval_dataloader = DataLoader(eval_dataset, batch_size=eval_batch_size, shuffle=False, collate_fn=eval_collate_fn, num_workers=0)

# --- Load the Trained Model ---
class OptimizedASRModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout_rate=0.1):
        super(OptimizedASRModel, self).__init__()
        self.rnn = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True, dropout=dropout_rate)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x, input_lengths):
        packed_x = nn.utils.rnn.pack_padded_sequence(x, input_lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, _ = self.rnn(packed_x)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        out = self.fc(out)
        return out

input_dim = 40
hidden_dim = 128
num_layers = 1
output_dim = vocabulary_size + 1
model = OptimizedASRModel(input_dim, hidden_dim, output_dim, num_layers)

device = torch.device("cpu") # Ensure evaluation is on CPU if training was
model.load_state_dict(torch.load(MODEL_WEIGHTS_PATH, map_location=device))
model.eval()
print(f"Loaded trained model weights from: {MODEL_WEIGHTS_PATH}")

# --- Decode Function ---
def decode_predictions(outputs, input_lengths, tokenizer):
    predicted_sequences = []
    for i in range(outputs.shape[0]):
        output = outputs[i, :input_lengths[i], :]
        probabilities = torch.softmax(output, dim=1)
        predicted_tokens = torch.argmax(probabilities, dim=1)
        # Remove consecutive duplicates and then remove blank tokens
        collapsed_tokens = []
        if predicted_tokens.numel() > 0:
            last_token = predicted_tokens[0]
            collapsed_tokens.append(last_token.item())
            for token in predicted_tokens[1:]:
                if token.item() != last_token.item():
                    collapsed_tokens.append(token.item())
                    last_token = token
        decoded_text = tokenizer.decode([token for token in collapsed_tokens if token != BLANK_TOKEN_ID])
        predicted_sequences.append(decoded_text)
    return predicted_sequences

# --- Evaluation Loop ---
total_wer = 0
num_utterances = 0

with torch.no_grad():
    for batch_idx, (eval_features, eval_transcripts, eval_input_lengths) in enumerate(eval_dataloader):
        eval_features = eval_features.to(device)
        outputs = model(eval_features, eval_input_lengths)
        predicted_texts = decode_predictions(outputs, eval_input_lengths, sp)

        for i in range(len(predicted_texts)):
            reference_text = eval_transcripts[i]
            predicted_text = predicted_texts[i]
            wer_score = wer(reference_text, predicted_text)
            total_wer += wer_score
            num_utterances += 1
            print(f"Utterance {num_utterances}:")
            print(f"  Reference: '{reference_text}'")
            print(f"  Predicted: '{predicted_text}'")
            print(f"  WER: {wer_score:.4f}")

if num_utterances > 0:
    average_wer = total_wer / num_utterances
    print(f"\n--- Evaluation Summary ---")
    print(f"Total Utterances: {num_utterances}")
    print(f"Average Word Error Rate (WER): {average_wer:.4f}")
else:
    print("No evaluation data found.")

Vocabulary size: 8000, Blank Token ID: 8000
Loaded trained model weights from: C:\ASR\vocabulary\asr_model_epoch_20_cpu.pth




Utterance 1:
  Reference: 'y a los hijos de los extranjeros que se llegaron a Jehová para administrarle y que amar en el nombre de Jehová para hacer sus siervos a todos los que guardaron el sábado de profanar lo y abrazar en mi pacto'
  Predicted: 'y los extranjeros que será Jehová y que Jehová siervos los sábado y abrazar pacto'
  WER: 0.6585
Utterance 2:
  Reference: 'que me buscan cada día y quieren saber mis caminos como gente que hubiese obrado justicia y que no hubiese dejado el derecho de su Dios Pregúntame derechos de justicia y quieren acercarse a Dios por qué dicen ayunamos y no hiciste caso humillamos nuestras almas y no te diste por entendido'
  Predicted: 'y obrado y derecho su y hacer ayun hasta y'
  WER: 0.8654
Utterance 3:
  Reference: 'no es antes el ayuno que yo escogí desatar las ligaduras de impiedad deshacer Los Ases de opresión y dejar libres a los quebrantados y que rompas todo yugo'
  Predicted: 'no escogí laaduras de las As y libres que'
  WER: 0.7931
Utterance

In [16]:
'''
The output you're seeing shows the performance of your Automatic Speech Recognition (ASR) model on two specific utterances from your evaluation dataset. Let's break down what each part means:

Utterance 1:

Reference: 'y a los hijos de los extranjeros que se llegaron a Jehová para administrarle y que amar en el nombre de Jehová para hacer sus siervos a todos los que guardaron el sábado de profanar lo y abrazar en mi pacto' - This is the original, human-transcribed (ground truth) text for the first audio sample. It contains 34 words.

Predicted: 'y los extranjeros que será Jehová y que Jehová siervos los sábado y abrazar pacto' - This is the text that your ASR model predicted for the same audio sample. It contains 14 words.

WER: 0.6585 - This stands for Word Error Rate. It's a common metric for evaluating the performance of ASR systems. A WER of 0.6585 means that approximately 65.85% of the words in the reference transcript had to be either substituted, deleted, or inserted to match the predicted transcript.

Calculation of WER:
To get a WER of 0.6585 for this utterance, there were a significant number of errors (substitutions, insertions, or deletions) relative to the 34 words in the reference.
For example, the model missed several words ("a los hijos de," "se llegaron a," "para administrarle y que amar en el nombre de," "para hacer sus," "a todos los que," "guardaron el," "de profanar lo," "en mi"). It also seems to have made some substitutions ("será" instead of parts of the original phrase).
Utterance 2:

Reference: 'que me buscan cada día y quieren saber mis caminos como gente que hubiese obrado justicia y que no hubiese dejado el derecho de su Dios Pregúntame derechos de justicia y quieren acercarse a Dios por qué dicen ayunamos y no hiciste caso humillamos nuestras almas y no te diste por entendido' - This is the ground truth text for the second audio sample. It contains 40 words.

Predicted: 'y obrado y derecho su y hacer ayun hasta y' - This is the model's prediction, containing 10 words.

WER: 0.8654 - A WER of 0.8654 indicates a very high error rate of approximately 86.54% for this utterance. This means the predicted text has very little overlap with the reference text.

Calculation of WER:
With a reference of 40 words and a WER of 0.8654, the number of errors (substitutions, insertions, deletions) was very high (likely around 34-35 errors). The model captured only a few isolated words or parts of words.
Overall Interpretation:

The output for these two utterances suggests that your ASR model is currently struggling to accurately transcribe the audio in your evaluation dataset. The Word Error Rates are very high, indicating a significant mismatch between the model's predictions and the actual spoken words.

Possible Reasons for High WER:

Model Training: The model might not have been trained on enough data, or the data it was trained on might not be representative of the audio in your evaluation set (e.g., different accents, noise levels, speaking styles).
Model Architecture: The complexity or architecture of your model (a single-layer LSTM in this case) might be insufficient to capture the nuances of the Spanish language in your audio.
Vocabulary: The SentencePiece vocabulary might not fully cover all the words and subwords present in your evaluation transcripts.
Feature Extraction: The quality or type of audio features you are using might not be optimal for this task.
Decoding: The simple greedy decoding used in decode_predictions might not be the best strategy. More advanced decoding techniques like beam search could potentially improve results.
Language Mismatch: Ensure that your model was indeed trained on Spanish audio if your evaluation data is in Spanish.
Next Steps:

To improve the performance of your ASR system, you might need to investigate these potential issues. This could involve:

Training the model for more epochs.
Using a larger and more diverse training dataset.
Experimenting with deeper or more complex model architectures (e.g., more LSTM layers, Transformer networks).
Analyzing the vocabulary coverage.
Reviewing your feature extraction process.
Implementing more sophisticated decoding techniques.
The detailed output for each utterance is valuable for understanding where the model is making mistakes. By looking at the reference and predicted texts, you can get a qualitative sense of the types of errors (word substitutions, omissions, additions) the model is producing.
'''

'\nThe output you\'re seeing shows the performance of your Automatic Speech Recognition (ASR) model on two specific utterances from your evaluation dataset. Let\'s break down what each part means:\n\nUtterance 1:\n\nReference: \'y a los hijos de los extranjeros que se llegaron a Jehová para administrarle y que amar en el nombre de Jehová para hacer sus siervos a todos los que guardaron el sábado de profanar lo y abrazar en mi pacto\' - This is the original, human-transcribed (ground truth) text for the first audio sample. It contains 34 words.\n\nPredicted: \'y los extranjeros que será Jehová y que Jehová siervos los sábado y abrazar pacto\' - This is the text that your ASR model predicted for the same audio sample. It contains 14 words.\n\nWER: 0.6585 - This stands for Word Error Rate. It\'s a common metric for evaluating the performance of ASR systems. A WER of 0.6585 means that approximately 65.85% of the words in the reference transcript had to be either substituted, deleted, or in