In [None]:
import torch
import gc
from transformers import pipeline
from contextlib import contextmanager
from datasets import load_dataset
from huggingface_hub import login


@contextmanager
def use_model(model_id, task="automatic-speech-recognition", **kwargs):
    """Load a Hugging Face pipeline for the duration of a ``with`` block.

    If ``hf_key.txt`` exists in the working directory and is non-empty, its
    stripped contents are used to log in to the Hugging Face Hub first.
    The pipeline is then built, yielded, and on exit the local reference is
    dropped, garbage is collected and the CUDA cache is emptied.

    Args:
        model_id: Model identifier passed to ``pipeline(model=...)``. Models
            whose id contains ``"whisper"`` get ``chunk_length_s=30``.
        task: Pipeline task name.
        **kwargs: Extra keyword arguments forwarded to ``pipeline``. They take
            precedence over the defaults chosen here (``dtype``, ``device``,
            ``chunk_length_s``).

    Yields:
        The constructed pipeline object.

    Note:
        Only this function's own reference is deleted on exit. A name bound by
        the caller (``with use_model(...) as pipe``) still refers to the
        pipeline after the block; the memory can only be reclaimed once that
        reference is gone as well.
    """
    # Login to HF. Only the file read is guarded, so an error raised by
    # login() itself is not mistaken for a missing key file.
    key = None
    try:
        with open("hf_key.txt", "r") as f:
            key = f.read().strip()
    except FileNotFoundError:
        pass

    if key:
        login(token=key)
    else:
        print("No HF hub key detected, proceeding without one")

    print(f"--- Loading: {model_id} ---")

    is_whisper = "whisper" in model_id
    on_gpu = torch.cuda.is_available()

    pipe_kwargs = {
        "model": model_id,
        # Half precision is only used on GPU; fall back to float32 on CPU.
        "dtype": torch.float16 if on_gpu else torch.float32,
        "device": "cuda:0" if on_gpu else "cpu",
        "chunk_length_s": 30 if is_whisper else None,
    }
    # Caller-supplied options override the defaults above.
    pipe_kwargs.update(kwargs)

    pipe = pipeline(task, **pipe_kwargs)  # type: ignore

    try:
        yield pipe
    finally:
        # Cleanup VRAM
        print(f"--- Cleaning up: {model_id} ---")
        del pipe
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# Load the "validation" split of the "clean" configuration of
# distil-whisper/librispeech_long and keep the first row's audio entry around
# as a ready-made sample.
dataset = load_dataset(
    "distil-whisper/librispeech_long",
    "clean",
    split="validation",
)
sample = dataset[0]["audio"]

In [None]:
wav2vec = "facebook/wav2vec2-lv-60-espeak-cv-ft"
whisper_turbo = "openai/whisper-large-v3-turbo"

with use_model(wav2vec) as pipe:
    # `return_timestamps` is an argument of the pipeline call itself, not a
    # generation option, so it is passed directly instead of inside
    # `generate_kwargs`. "char" is a timestamp format for CTC models such as
    # this wav2vec2 checkpoint.
    #
    # `language` / `task` are Whisper generation options and are not used by a
    # CTC model, so they are omitted here. When switching to `whisper_turbo`,
    # pass generate_kwargs={"language": "en", "task": "transcribe"} and use
    # return_timestamps=True or "word" instead of "char".
    result = pipe(
        "data/scope_phoneme_data/A long/A long 1.wav",
        return_timestamps="char",
    )
    print(result["text"])