In [1]:
from transformers import pipeline, WhisperForConditionalGeneration, WhisperTokenizer, WhisperProcessor
import torch
import gradio as gr

In [2]:
def get_pipe_model(model_id, language):
    model = WhisperForConditionalGeneration.from_pretrained(model_id)
    processor = WhisperProcessor.from_pretrained(model_id)
    model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")
    model.generation_config.forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    model.to(device)

    return pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=16,
        torch_dtype=torch_dtype,
        device=device,
    )

In [3]:
# openai/whisper-small
# clt013/whisper-small-ft-malay-test-3
# openai/whisper-large-v3
# clt013/whisper-large-v3-ft-malay-test-1
# Load the models
whisper_small_model = get_pipe_model('openai/whisper-small','malay')
after_ft_whisper_small_model = get_pipe_model('clt013/whisper-small-ft-malay-test-3','malay')
whisper_large_model = get_pipe_model('openai/whisper-large-v3','malay')
after_ft_whisper_large_model = get_pipe_model('clt013/whisper-large-v3-ft-malay-test-1','malay')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [17]:
class WhisperTranscriber:
    def __init__(self, small_model, fine_tuned_small_model, large_model, fine_tuned_large_model):
        self.models = {
            "small": {
                "original": small_model,
                "fine_tuned": fine_tuned_small_model
            },
            "large": {
                "original": large_model,
                "fine_tuned": fine_tuned_large_model
            }
        }

    def transcribe(self, audio, model_size, model_type):
        selected_model = self.models[model_size][model_type]
        print(audio)
        transcription = selected_model(audio)["text"]
        return transcription

transcriber = WhisperTranscriber(
    small_model=whisper_small_model,
    fine_tuned_small_model=after_ft_whisper_small_model,
    large_model=whisper_large_model,
    fine_tuned_large_model=after_ft_whisper_large_model
)

def transcribe_microphone(audio_path, model_size):
    original_text = transcriber.transcribe(audio_path, model_size, "original")
    fine_tuned_text = transcriber.transcribe(audio_path, model_size, "fine_tuned")
    return original_text, fine_tuned_text

def transcribe_file(audio, model_size):
    audio_path = audio.name
    return transcribe_microphone(audio_path, model_size)
    
with gr.Blocks() as demo:
    gr.Markdown("<h1 style='text-align: center;'>Whisper Small and Large Model Malay Fine-Tuned Demo</h1>")
    gr.Markdown("<p style='text-align: center; font-size: 18px;'> \
                    Realtime demo for Malay speech recognition using fine-tuned Whisper small and large models. \
                </p>")
    with gr.Tab(label="Microphone Input"):
        iface_mic = gr.Interface(
            fn=transcribe_microphone,
            inputs=[
                gr.Microphone(type="filepath"),
                gr.Dropdown(choices=["small", "large"], label="Model Size"),
            ],
            outputs=[
                gr.Textbox(label="Original Model Output"),
                gr.Textbox(label="Fine-Tuned Model Output")
            ],
        )
    with gr.Tab(label="File Upload Input"):
        iface_file = gr.Interface(
            fn=transcribe_file,
            inputs=[
                gr.File(type="file"),
                gr.Dropdown(choices=["small", "large"], label="Model Size"),
            ],
            outputs=[
                gr.Textbox(label="Original Model Output"),
                gr.Textbox(label="Fine-Tuned Model Output")
            ],
        )

demo.launch()



Running on local URL:  http://127.0.0.1:7872
IMPORTANT: You are using gradio version 3.45.0, however version 4.29.0 is available, please upgrade.
--------
IMPORTANT: You are using gradio version 3.45.0, however version 4.29.0 is available, please upgrade.
--------
IMPORTANT: You are using gradio version 3.45.0, however version 4.29.0 is available, please upgrade.
--------

To create a public link, set `share=True` in `launch()`.




Traceback (most recent call last):
  File "c:\Users\cleme\AppData\Local\Programs\Python\Python310\lib\site-packages\gradio\routes.py", line 517, in predict
    output = await route_utils.call_process_api(
  File "c:\Users\cleme\AppData\Local\Programs\Python\Python310\lib\site-packages\gradio\route_utils.py", line 216, in call_process_api
    output = await app.get_blocks().process_api(
  File "c:\Users\cleme\AppData\Local\Programs\Python\Python310\lib\site-packages\gradio\blocks.py", line 1555, in process_api
    result = await self.call_function(
  File "c:\Users\cleme\AppData\Local\Programs\Python\Python310\lib\site-packages\gradio\blocks.py", line 1193, in call_function
    prediction = await anyio.to_thread.run_sync(
  File "c:\Users\cleme\AppData\Local\Programs\Python\Python310\lib\site-packages\anyio\to_thread.py", line 33, in run_sync
    return await get_asynclib().run_sync_in_worker_thread(
  File "c:\Users\cleme\AppData\Local\Programs\Python\Python310\lib\site-packages\anyio\

C:\Users\cleme\AppData\Local\Temp\gradio\863f6717f8d54cc1aafb802405ad77623eba3445\example_voice_1.wav
C:\Users\cleme\AppData\Local\Temp\gradio\863f6717f8d54cc1aafb802405ad77623eba3445\example_voice_1.wav
C:\Users\cleme\AppData\Local\Temp\gradio\863f6717f8d54cc1aafb802405ad77623eba3445\example_voice_1.wav
C:\Users\cleme\AppData\Local\Temp\gradio\863f6717f8d54cc1aafb802405ad77623eba3445\example_voice_1.wav
