In [None]:
import torch
import pandas as pd
import gc
from pathlib import Path
from transformers import pipeline
from contextlib import contextmanager
from huggingface_hub import login

# Hugging Face Hub model identifiers. Each string is passed verbatim as the
# `model=` argument of `transformers.pipeline` by `use_model`.
# NOTE(review): judging by the repo names, the wav2vec2 checkpoints look like
# phoneme-recognition models and WHISPER_TURBO like a text ASR model -- confirm
# against the model cards before comparing their outputs.
WAV2VEC = "facebook/wav2vec2-lv-60-espeak-cv-ft"
WHISPER_TURBO = "openai/whisper-large-v3-turbo"
TIMIT = "excalibur12/wav2vec2-large-lv60_phoneme-timit_english_timit-4k"
LJSPEECH = "bookbot/wav2vec2-ljspeech-gruut"

@contextmanager
def use_model(model_id, task="automatic-speech-recognition", **kwargs):
    """Load a `transformers` pipeline, yield it, and release it on exit.

    Args:
        model_id: Hugging Face Hub model identifier passed as `model=`.
        task: Pipeline task name passed as the first argument of `pipeline`.
        **kwargs: Extra keyword arguments forwarded to `pipeline`. They take
            precedence over the defaults chosen here (`dtype`, `device`,
            `chunk_length_s`), so callers can override any of them.

    Yields:
        The object returned by `pipeline(...)`.

    Notes:
        * If `hf_key.txt` exists in the working directory and is non-empty,
          its contents are used as the Hub token; otherwise the load
          proceeds without logging in.
        * On exit this function drops its own reference to the pipeline,
          runs the garbage collector and empties the CUDA cache. Memory is
          only actually released if the caller holds no other reference
          (for example the `as` target of the `with` statement) at that
          point.
    """
    # Optional Hub login: a missing or empty key file is not an error.
    try:
        with open("hf_key.txt", "r") as f:
            key = f.read().strip()
    except FileNotFoundError:
        key = ""

    if key:
        login(token=key)
    else:
        print("No HF hub key detected, proceeding without one")

    print(f"--- Loading: {model_id} ---")

    has_cuda = torch.cuda.is_available()
    is_whisper = "whisper" in model_id.lower()

    pipeline_kwargs = {
        # Half precision is only requested on GPU; the CPU fallback uses
        # float32 because float16 inference on CPU is poorly supported.
        "dtype": torch.float16 if has_cuda else torch.float32,
        "device": "cuda:0" if has_cuda else "cpu",
        "chunk_length_s": 30 if is_whisper else None,
    }
    # Caller-supplied options win over the defaults above.
    pipeline_kwargs.update(kwargs)
    # `model_id` is the single source of truth for which model is loaded.
    pipeline_kwargs["model"] = model_id

    pipe = pipeline(task, **pipeline_kwargs)  # type: ignore

    try:
        yield pipe
    finally:
        # Cleanup VRAM
        print(f"--- Cleaning up: {model_id} ---")
        del pipe
        gc.collect()
        if has_cuda:
            torch.cuda.empty_cache()

In [None]:
def save_model_results(model_name: str, data_path: str, output_path: str, batch_size: int = 16) -> None:
    """Transcribe every `.wav` file under `data_path` and save the text to CSV.

    Args:
        model_name: Model identifier handed to `use_model`.
        data_path: Directory searched recursively for `*.wav` files.
        output_path: Destination CSV file. Missing parent directories are
            created.
        batch_size: Number of files per pipeline batch (default 16).

    The CSV has one row per audio file with the columns `Folder/Label`
    (name of the file's parent directory), `File Name` and
    `Predicted Phonemes` (the `"text"` field of the pipeline output). Rows
    are ordered by lower-cased file name, with the full path as tie-breaker
    so the order is deterministic when folders contain equally named files.

    If no `.wav` files are found, the model is not loaded and a header-only
    CSV is written.
    """
    columns = ["Folder/Label", "File Name", "Predicted Phonemes"]

    # Discover inputs before loading the model so an empty or mistyped
    # directory does not cost a full model load.
    wav_paths = sorted(
        (p for p in Path(data_path).rglob("*.wav") if p.is_file()),
        key=lambda p: (p.name.lower(), str(p)),
    )

    data_records = []
    if wav_paths:
        # Per the original note on this setting: 'char' is used by default,
        # `True` for Whisper models.
        return_timestamps = True if "whisper" in model_name.lower() else "char"

        with use_model(model_name) as pipe:
            batch_results = pipe(
                [str(p) for p in wav_paths],
                batch_size=batch_size,
                generate_kwargs={
                    "language": "en",
                    "task": "transcribe",
                    "return_timestamps": return_timestamps,
                },
            )

            for file_path, out in zip(wav_paths, batch_results):
                data_records.append({
                    "Folder/Label": file_path.parent.name,
                    "File Name": file_path.name,
                    "Predicted Phonemes": out["text"],
                })

            # Drop this function's references before the context manager
            # exits; otherwise the model is still reachable from here while
            # the cleanup code in `use_model` runs.
            del pipe, batch_results
    else:
        print(f"No .wav files found under: {data_path}")

    # Passing `columns` keeps the header stable even when there are no rows.
    df = pd.DataFrame(data_records, columns=columns)

    out_file = Path(output_path)
    out_file.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_file, index=False)
    print(f"results saved to: {output_path}")

In [39]:
# Driver cell: run the WAV2VEC model over every .wav file found under
# `data_path` and write one row per file to `output_path`.
# Both paths are relative, so they resolve against the kernel's current
# working directory.
data_path = "minibatch"
output_path = "test_output.csv"
save_model_results(WAV2VEC, data_path, output_path)

--- Loading: facebook/wav2vec2-lv-60-espeak-cv-ft ---


Loading weights: 100%|██████████| 424/424 [00:00<00:00, 479.65it/s, Materializing param=wav2vec2.masked_spec_embed]                                            


--- Cleaning up: facebook/wav2vec2-lv-60-espeak-cv-ft ---
results saved to: test_output.csv
