[Whisper](https://openai.com/blog/whisper/) is an automatic speech recognition (ASR) system trained on 680,000 hours of multilingual and multitask supervised data collected from the web. 

It enables transcription in multiple languages, as well as translation from those languages into English. 

This notebook provides an easy-to-use interface for evaluating Whisper on audio recordings of text passages sampled from Hugging Face Datasets.

The notebook sets up a Gradio UI that allows the user to: 

1. Sample text passages from any Dataset hosted on the Hugging Face Hub
2. Record an audio snippet narrating the text
3. Transcribe the audio with Whisper
4. Save the audio, transcribed and reference text, and word error rate to [Comet](https://www.comet.com/site/?utm_source=colab&utm_medium=referral&utm_campaign=AMS_US_EN_SNUP_Online_WhisperAI_Notebook) for further evaluation and analysis. 

# ⚙️ Setup

In [None]:
# Install the Hugging Face libraries, Comet, Gradio and jiwer from PyPI.
!pip install transformers datasets evaluate comet_ml gradio jiwer --quiet
# Install Whisper straight from the OpenAI GitHub repository.
!pip install git+https://github.com/openai/whisper.git --quiet

# ☄️ Initialize Comet

In [None]:
#@title Set Comet Credentials

import comet_ml
# Initialize Comet for this session with the project name 'evalwhisper'.
comet_ml.init(project_name='evalwhisper')

# Configure Whisper

In [None]:
# User-editable settings, exposed as Colab form fields via the #@param markers.
whisper_language = 'english' #@param {type:"string"}
model_type = 'tiny' #@param ["tiny", "base", "small", "medium", "large"]
beam_size = 5 #@param {type: "integer"}
best_of = 5 #@param {type: "integer"}

# Decoding options shared by every transcription; also logged with each run.
whisper_options = {
    "language": whisper_language,
    "beam_size": beam_size,
    "best_of": best_of,
}
# Full keyword set handed to the model's transcribe call.
transcribe_options = {"task": "transcribe", **whisper_options}

# 🎤 Start App

In [None]:
# @title Run Whisper Evaluation
import os
import random

import gradio as gr
import whisper
from datasets import load_dataset
from evaluate import load

# Top-level Gradio container; the UI components are attached to it further down.
demo = gr.Blocks()

# Load the Whisper checkpoint selected by `model_type` and the "wer" metric from
# `evaluate` once at module level so the callbacks below can reuse them.
model = whisper.load_model(model_type)
wer = load("wer")


def sample_text(dataset_name, subset=None, split="train", column="text", seed=42):
    """Sample one text passage from a dataset on the Hugging Face Hub.

    The dataset is opened in streaming mode, shuffled with ``seed``, and the
    first example of the shuffled stream is used.

    Args:
        dataset_name: Name of the dataset to load. Must be non-empty.
        subset: Optional subset/configuration name. ``None`` or an empty
            string loads the dataset without a subset.
        split: Dataset split to read. An empty value falls back to "train".
        column: Column holding the text. An empty value falls back to "text".
        seed: Shuffle seed. A falsy value (``None``, ``0``, empty) selects a
            random seed between 0 and 1000000.

    Returns:
        A ``(text, seed)`` tuple: the sampled value of ``column`` and the
        integer seed that was actually used for shuffling.

    Raises:
        ValueError: If ``dataset_name`` is empty or the split yields no
            examples.
    """
    if not dataset_name:
        raise ValueError("A dataset name is required to sample text.")

    seed = int(seed) if seed else random.randint(0, 1000000)

    # UI text fields submit "" rather than omitting the argument, which would
    # otherwise override the declared defaults with an unusable empty value.
    split = split or "train"
    column = column or "text"

    # `subset` may be None (the declared default) or "" (empty text field);
    # in both cases the dataset is loaded without a subset name.
    load_args = (dataset_name, subset) if subset else (dataset_name,)
    dataset = load_dataset(*load_args, split=split, streaming=True)

    rows = list(dataset.shuffle(seed=seed).take(1))
    if not rows:
        raise ValueError(
            f"Split {split!r} of dataset {dataset_name!r} returned no examples."
        )
    return rows[0][column], seed


def transcribe(dataset_name, subset, split, column, sampled_text, audio_input, seed):
    """Transcribe a recording with Whisper and log the evaluation to Comet.

    A new Comet experiment is created for every call. The Whisper options,
    model type, word error rate of the transcription against
    ``sampled_text``, both texts, the dataset settings and the audio itself
    are logged to it.

    Args:
        dataset_name: Name of the dataset the reference text came from.
        subset: Subset name used when sampling (may be empty).
        split: Dataset split used when sampling.
        column: Dataset column the reference text was read from.
        sampled_text: Reference text the speaker was asked to narrate.
        audio_input: Audio recording to transcribe; it is passed to the model
            and to ``log_audio`` unchanged.
        seed: Seed used when sampling the text, or ``None`` when no text was
            sampled through the UI; it is only logged when present.

    Returns:
        The transcription text produced by the model.

    Raises:
        ValueError: If ``audio_input`` is empty (nothing was recorded).
    """
    # Fail before creating an experiment so no empty run is left behind.
    if not audio_input:
        raise ValueError(
            "No audio was provided; record a snippet before transcribing."
        )

    experiment = comet_ml.Experiment(log_code=False)
    try:
        experiment.log_parameters(whisper_options)
        experiment.log_parameters({"model_type": model_type})

        result = model.transcribe(audio=audio_input, **transcribe_options)
        transcript = result["text"]
        wer_score = wer.compute(predictions=[transcript], references=[sampled_text])

        experiment.log_metrics({"wer": wer_score})
        experiment.log_text(sampled_text, metadata={"context": "reference text"})
        experiment.log_text(transcript, metadata={"context": "transcription"})

        run_parameters = {
            "dataset_name": dataset_name,
            "subset": subset,
            "split": split,
            "column": column,
            "detected_language": result["language"],
            "sample_text_length": len(sampled_text),
        }
        # The seed state is None until a passage has been sampled; int(None)
        # would raise, so the parameter is simply omitted in that case.
        if seed is not None:
            run_parameters["seed"] = int(seed)
        experiment.log_parameters(run_parameters)

        experiment.log_audio(audio_input)
    finally:
        # Always close the experiment, even when transcription or logging fails.
        experiment.end()
    return transcript


# Build the UI inside the Blocks container and wire the two callbacks to it.
with demo:
    # Dataset selection inputs plus the shuffle seed.
    with gr.Row():
        dataset_name = gr.Textbox(label="Dataset Name")
        subset = gr.Textbox(label="Subset Name")
        split = gr.Textbox(label="Dataset Split")
        column = gr.Textbox(label="Dataset Text Column")
        seed = gr.Number(value=42, label="Seed")

    # Shows the sampled passage; its value is also passed to `transcribe` as
    # the reference text.
    with gr.Row():
        sampled_text = gr.Textbox(label="Sampled Text")
    with gr.Row():
        sample_text_btn = gr.Button(label="Sample Text", value="Sample Text")
        # Receives the seed returned by `sample_text` so that `transcribe`
        # can log the seed that was actually used.
        # NOTE(review): gr.Variable and gr.Audio(source=...) look like Gradio
        # 3.x-era API; the install cell does not pin a Gradio version, so
        # confirm these still exist in the version that gets installed.
        sample_seed = gr.Variable(visible=False)
        sample_text_btn.click(
            sample_text,
            [dataset_name, subset, split, column, seed],
            [sampled_text, sample_seed],
        )

    # Microphone recording, handed to the callback as a file path.
    with gr.Row():
        audio_input = gr.Audio(source="microphone", type="filepath")
    with gr.Row():
        transcription = gr.Textbox(label="Output Text")
    with gr.Row():
        transcribe_btn = gr.Button(value="Transcribe", variant="primary")
        # The input order must match the parameter order of `transcribe`.
        transcribe_btn.click(
            transcribe,
            [
                dataset_name,
                subset,
                split,
                column,
                sampled_text,
                audio_input,
                sample_seed,
            ],
            [transcription],
        )
    # Example rows; each row is ordered as
    # [dataset_name, subset, split, column, seed] to match the component list.
    gr.Examples(
        [
            ["wikitext", "wikitext-2-v1", "test", "text", 42],
            ["anli", "", "test_r1", "premise", 123],
            ["quartz", "", "test", "para", 7],
            ["sciq", "", "test", "support", 12000],
        ],
        [dataset_name, subset, split, column, seed],
        [],
    )

# Start the app.
demo.launch(debug=True)
