In [1]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset
import sounddevice as sd
from scipy.io.wavfile import write
import numpy as np
from IPython.display import Audio

In [2]:
# Pick the execution target once: first CUDA GPU with half precision when CUDA
# is available, otherwise CPU with single precision.
if torch.cuda.is_available():
    device = "cuda:0"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32

model_id = "openai/whisper-large-v3"

# Load the checkpoint named by model_id in the chosen dtype, then move it to
# the selected device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor for the same checkpoint supplies both the tokenizer and the
# feature extractor handed to the pipeline below.
processor = AutoProcessor.from_pretrained(model_id)

# Speech-recognition pipeline backed by the large-v3 checkpoint.
pipe_l = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# Pick the execution target once: first CUDA GPU with half precision when CUDA
# is available, otherwise CPU with single precision.
if torch.cuda.is_available():
    device = "cuda:0"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32

model_id = "openai/whisper-medium"

# Load the checkpoint named by model_id in the chosen dtype, then move it to
# the selected device. Note that this rebinds the notebook-level `model` name.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor for the same checkpoint supplies both the tokenizer and the
# feature extractor handed to the pipeline below.
processor = AutoProcessor.from_pretrained(model_id)

# Speech-recognition pipeline backed by the medium checkpoint.
pipe_m = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Pick the execution target once: first CUDA GPU with half precision when CUDA
# is available, otherwise CPU with single precision.
if torch.cuda.is_available():
    device = "cuda:0"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32

model_id = "openai/whisper-small"

# Load the checkpoint named by model_id in the chosen dtype, then move it to
# the selected device. Note that this rebinds the notebook-level `model` name.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor for the same checkpoint supplies both the tokenizer and the
# feature extractor handed to the pipeline below.
processor = AutoProcessor.from_pretrained(model_id)

# Speech-recognition pipeline backed by the small checkpoint.
pipe_s = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# Disabled sanity check, kept for reference only. If uncommented, it would load
# the "clean" configuration (validation split) of distil-whisper/librispeech_long,
# take the "audio" entry of the first example, run it through pipe_l, and print
# the "text" field of the result.
# dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
# sample = dataset[0]["audio"]

# result = pipe_l(sample)
# print(result["text"])

In [7]:
# Capture settings: fs is used as the sample rate for both the recording and
# the WAV file; duration is the length of the recording in seconds.
fs = 16000
duration = 60

print("Recording...")
# Request duration * fs frames of single-channel int16 audio, then block until
# the recording has finished.
num_frames = int(duration * fs)
recording = sd.rec(num_frames, samplerate=fs, channels=1, dtype="int16")
sd.wait()
print("Recording finished.")

# Write the captured samples to output.wav at the same sample rate.
write("output.wav", fs, recording)

Recording...
Recording finished.


In [9]:
# Run the large-v3 pipeline on the saved recording and print the "text" entry
# of its result.
transcription = pipe_l("output.wav")
output = transcription["text"]
print(output)

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


 was 1, as k goes to infinity. Now, it's the last time this is what we did, and I just wanted to be careful and show you exactly what the next step is. If you exponentiate this fact, you take e to this power, that's going to tend to e to the first power, which is just e, all right? And then we just observe that this is the same as ak, right? So the basic ingredient here is that e to the log a is equal to a. That's because the log function is the inverse of the exponential function. Yes, question? Zero. So, tending to one. Who said it was? k times, if you take the logarithm, which is what we did last time. Logarithm of a k is indeed k times the log


In [10]:
# Run the medium pipeline on the saved recording and print the "text" entry
# of its result.
transcription = pipe_m("output.wav")
output = transcription["text"]
print(output)

 was one, as k goes to infinity. Now, so last time this is what we did, and I just wanted to be careful and show you exactly what the next step is. If you exponentiate this fact, you take e to this power, that's gonna tend to e to the first power, which is just e, all right? And then we just observe that this is the same as ak, right? So the basic ingredient here is that e to the log a is equal to a. That's because the log function is the inverse of the exponential function. Yes, question. Zero. So 10 to 1. Who said it was? k times, if you take the logarithm, which is what we did last time, The logarithm of ak is indeed k times the log.


In [11]:
# Run the small pipeline on the saved recording and print the "text" entry
# of its result.
transcription = pipe_s("output.wav")
output = transcription["text"]
print(output)

 was one, as k goes to infinity. Now, so last time this is what we did and I just wanted to be careful and show you exactly what the next step is. If you exponentiate this fact, you take e to this power, that's going to tend to e to the first power, which is just e. Alright? And then we just observe that this is is the same as ak, right? So the basic ingredient here is that the log a is equal to a. That's because the log function is the inverse of the exponential function. Yes, question. Zero. So, tending to one. Who said it was? k times, if you take the logarithm, which is what we did last time. Logarithm of ak is indeed k times the log.
