In [None]:
!pip install gradio

In [None]:
import gradio as gr

import torch

from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

In [None]:
# Use the first CUDA GPU in half precision when one is available; otherwise
# fall back to the CPU in full precision.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-large-v3"

# Load the checkpoint in the selected dtype and move it to the selected device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
  model_id,
  torch_dtype=torch_dtype,
  low_cpu_mem_usage=True,
  use_safetensors=True
).to(device)

# The processor supplies the tokenizer and feature extractor handed to the
# pipeline below.
processor = AutoProcessor.from_pretrained(model_id)

# chunk_length_s=30 turns on chunked long-form transcription: Whisper works on
# 30-second windows, so without chunking recordings longer than that are not
# transcribed in full.
pipe = pipeline(
  "automatic-speech-recognition",
  model=model,
  tokenizer=processor.tokenizer,
  feature_extractor=processor.feature_extractor,
  chunk_length_s=30,
  torch_dtype=torch_dtype,
  device=device,
)

In [None]:
def transcribe(speech):
  """Transcribe one audio clip with the module-level ASR pipeline `pipe`.

  Args:
    speech: tuple `(sample_rate, samples)` where `samples` is a numpy array
      shaped `(n,)` for mono or `(n, channels)` for multi-channel audio, or
      `None` when no audio was provided.

  Returns:
    The transcribed text, or an empty string when there is no audio.
  """
  # Nothing recorded/uploaded: return early instead of failing on None.
  if speech is None:
    return ""

  sample_rate, samples = speech
  if samples.size == 0:
    return ""

  # Remember the incoming dtype before any arithmetic changes it.
  source_dtype = samples.dtype
  samples = samples.astype("float32")

  # if not mono, average channels
  if samples.ndim > 1:
    samples = samples.mean(axis=1)

  # Signed-integer PCM (e.g. int16) is rescaled to floats in [-1, 1), the
  # range the feature extractor expects for raw waveforms.
  if source_dtype.kind == "i":
    samples /= float(2 ** (8 * source_dtype.itemsize - 1))

  # Pass the sample rate along with the waveform: given this dict form the
  # pipeline resamples to the model's rate, whereas a bare array is assumed
  # to already be at that rate.
  # NOTE(review): resampling inside the pipeline relies on torchaudio being
  # installed — confirm for the target environment.
  result = pipe({"sampling_rate": sample_rate, "raw": samples})
  return result["text"]

In [None]:
# Some English test files:
# https://audio-samples.github.io/#section-4

# Build the UI: a single audio input wired to `transcribe`, with text output.
with gr.Blocks() as demo:
  gr.Interface(
    fn=transcribe,
    # type="numpy" delivers the (sample rate, samples) tuple that
    # `transcribe` unpacks.
    inputs=[gr.Audio(type="numpy")],
    outputs="text",
  )

# Launch only after the Blocks context has closed, i.e. once the layout is
# fully defined.
demo.launch()