In [1]:
import torch
import librosa
from transformers import WhisperProcessor, WhisperForConditionalGeneration


In [2]:
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-tiny",
    task="transcribe"
)

model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-tiny"
)

model.eval()

print("Whisper Tiny loaded successfully")


Whisper Tiny loaded successfully


In [3]:
audio_path = "../data/english/processed_clips/afrikaans1.wav"
# ya
# audio_path = "../data/common_voice_hindi/processed_clips/common_voice_hi_26008353.wav"

audio, sr = librosa.load(audio_path, sr=16000)

print("Audio loaded:", audio_path)


Audio loaded: ../data/english/processed_clips/afrikaans1.wav


In [4]:
inputs = processor(
    audio,
    sampling_rate=16000,
    return_tensors="pt"
)


In [5]:
with torch.no_grad():
    predicted_ids = model.generate(inputs["input_features"])

transcription = processor.batch_decode(
    predicted_ids,
    skip_special_tokens=True
)[0]

print("Transcription:")
print(transcription)


Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Transcription:
 Please call Stala, ask her to bring these things with her from the store 6 spoons of freshener peas, 5 thick slabs of blue cheese and maybe a snack for her brother-bop. We also need a small plastic snake and a big to a frog for the kids. She can scoop these things into free-dread bags and we will go meet a wind-stad train station.


In [6]:
def transcribe_audio(audio_path):
    audio, sr = librosa.load(audio_path, sr=16000)

    inputs = processor(
        audio,
        sampling_rate=16000,
        return_tensors="pt"
    )

    with torch.no_grad():
        predicted_ids = model.generate(inputs["input_features"])

    text = processor.batch_decode(
        predicted_ids,
        skip_special_tokens=True
    )[0]

    return text


In [7]:
result = transcribe_audio(
    "../data/english/processed_clips/afrikaans1.wav"
)
print(result)


 Please call Stala, ask her to bring these things with her from the store 6 spoons of freshener peas, 5 thick slabs of blue cheese and maybe a snack for her brother-bop. We also need a small plastic snake and a big to a frog for the kids. She can scoop these things into free-dread bags and we will go meet a wind-stad train station.
