In [2]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

In [3]:
has_mps = torch.backends.mps.is_available()
has_cuda = torch.cuda.is_available()
device = torch.device("cuda" if has_cuda else "cpu")
print(f"Using {'Metal' if has_mps else 'CUDA'} with device {device}")
torch_dtype = torch.float16 if has_mps else torch.float32
device, torch_dtype

Using Metal with device cpu


(device(type='cpu'), torch.float16)

In [13]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print(f"Using Metal with device {device}")
else:
    device = torch.device("cpu")

Using Metal with device mps


In [4]:
# load Whistler model
model_id = "openai/whisper-large-v3"
#model_id = "openai/whisper-medium"

hf_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    cache_dir="/users/yau/.cache/whisper/models"
).to(device=device)
processor = AutoProcessor.from_pretrained(model_id)


In [5]:
pipe = pipeline(
    task="automatic-speech-recognition",
    model=hf_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    #max_new_tokens=128,
    #chunk_length=64,
    batch_size=24,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
    #ignore_warnings=True,
)

Device set to use cpu


In [6]:
from pydub import AudioSegment

audio_file = "audio/5760-Nano-L2.mp3"
sound = AudioSegment.from_file(audio_file)
print(f"Duration: {round(len(sound) / 1000 /60, 2)} minutes")

excepted_tran_time = len(sound) / 1000 / 60 / 20
print(f"Expected transcription time: {round(excepted_tran_time, 2)} minutes")


Duration: 156.75 minutes
Expected transcription time: 7.84 minutes


In [11]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration

processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")

input_features = processor(audio_file, return_tensors="pt").input_features
forced_decoder_ids = processor.get_decoder_prompt_ids(language="en-US", task="transcribe")
result = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)


It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


ValueError: could not convert string to float: 'audio/5760-Nano-L2.mp3'

In [None]:
# clear memnory
del hf_model
del processor
del pipe
torch.cuda.empty_cache()