# Installing Whisper

The commands below will install the Python packages needed to use Whisper models and evaluate the transcription results.

In [4]:
! pip install git+https://github.com/openai/whisper.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-kaoo4omp
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-kaoo4omp
  Resolved https://github.com/openai/whisper.git to commit c0d2f624c09dc18e709e37c2ad90c039a4eb72a2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_c

In [1]:
import io
import os
import numpy as np

try:
    import tensorflow  # required in Colab to avoid protobuf compatibility issues
except ImportError:
    pass

import torch
import pandas as pd
import urllib
import tarfile
import whisper
import torchaudio

from scipy.io import wavfile
from tqdm.notebook import tqdm


# Let rendered DataFrames show up to 100 rows and up to 1000 characters per cell.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_colwidth", 1000)

# Run on the GPU when PyTorch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Loading the Fleurs dataset

Select the language of the Fleurs dataset to download. Please note that the transcription and translation performance varies widely depending on the language. Appendix D.2 in the paper contains the performance breakdown by language.

In [2]:
import ipywidgets as widgets

# Maps each Fleurs language code (the key is interpolated into the archive URL by the
# Fleurs class) to the language name shown in the dropdown and passed to Whisper as
# the `language` decoding option.
languages = {"af_za": "Afrikaans", "am_et": "Amharic", "ar_eg": "Arabic", "as_in": "Assamese", "az_az": "Azerbaijani", "be_by": "Belarusian", "bg_bg": "Bulgarian", "bn_in": "Bengali", "bs_ba": "Bosnian", "ca_es": "Catalan", "cmn_hans_cn": "Chinese", "cs_cz": "Czech", "cy_gb": "Welsh", "da_dk": "Danish", "de_de": "German", "el_gr": "Greek", "en_us": "English", "es_419": "Spanish", "et_ee": "Estonian", "fa_ir": "Persian", "fi_fi": "Finnish", "fil_ph": "Tagalog", "fr_fr": "French", "gl_es": "Galician", "gu_in": "Gujarati", "ha_ng": "Hausa", "he_il": "Hebrew", "hi_in": "Hindi", "hr_hr": "Croatian", "hu_hu": "Hungarian", "hy_am": "Armenian", "id_id": "Indonesian", "is_is": "Icelandic", "it_it": "Italian", "ja_jp": "Japanese", "jv_id": "Javanese", "ka_ge": "Georgian", "kk_kz": "Kazakh", "km_kh": "Khmer", "kn_in": "Kannada", "ko_kr": "Korean", "lb_lu": "Luxembourgish", "ln_cd": "Lingala", "lo_la": "Lao", "lt_lt": "Lithuanian", "lv_lv": "Latvian", "mi_nz": "Maori", "mk_mk": "Macedonian", "ml_in": "Malayalam", "mn_mn": "Mongolian", "mr_in": "Marathi", "ms_my": "Malay", "mt_mt": "Maltese", "my_mm": "Myanmar", "nb_no": "Norwegian", "ne_np": "Nepali", "nl_nl": "Dutch", "oc_fr": "Occitan", "pa_in": "Punjabi", "pl_pl": "Polish", "ps_af": "Pashto", "pt_br": "Portuguese", "ro_ro": "Romanian", "ru_ru": "Russian", "sd_in": "Sindhi", "sk_sk": "Slovak", "sl_si": "Slovenian", "sn_zw": "Shona", "so_so": "Somali", "sr_rs": "Serbian", "sv_se": "Swedish", "sw_ke": "Swahili", "ta_in": "Tamil", "te_in": "Telugu", "tg_tj": "Tajik", "th_th": "Thai", "tr_tr": "Turkish", "uk_ua": "Ukrainian", "ur_pk": "Urdu", "uz_uz": "Uzbek", "vi_vn": "Vietnamese", "yo_ng": "Yoruba"}
# Language picker: two placeholder rows (value None) followed by every language,
# ordered alphabetically by its display label. Korean is preselected.
selection = widgets.Dropdown(
    description='Language:',
    value="ko_kr",
    options=[
        ("Select language", None),
        ("----------", None),
        *sorted((f"{name} ({code})", code) for code, name in languages.items()),
    ],
    disabled=False,
)

selection

Dropdown(description='Language:', index=39, options=(('Select language', None), ('----------', None), ('Afrika…

In [5]:
lang = selection.value

# Validate before the lookup: the two placeholder rows of the dropdown carry the value
# None, and `languages[None]` would raise a bare KeyError before this message is shown.
assert lang is not None, "Please select a language"

language = languages[lang]
print(f"Selected language: {language} ({lang})")

Selected language: Amharic (am_et)


In [6]:
def download(url: str, target_path: str):
    """
    Stream `url` to `target_path` in 8 KiB chunks while showing a progress bar.

    The payload is first written to `<target_path>.part` and only renamed to
    `target_path` once the transfer has finished. Callers in this notebook skip the
    download when `target_path` already exists, so a half-written file must never be
    left under the final name.

    Args:
        url: Location to fetch.
        target_path: Destination file path; replaced if it already exists.

    Raises:
        Whatever `urllib.request.urlopen` or the file operations raise. The partial
        file is removed before the exception propagates.
    """
    # `import urllib` alone does not guarantee that the `request` submodule is loaded.
    import urllib.request

    partial_path = f"{target_path}.part"
    try:
        with urllib.request.urlopen(url) as source, open(partial_path, "wb") as output:
            # The header is optional; tqdm accepts total=None for an unknown length.
            content_length = source.info().get("Content-Length")
            total = int(content_length) if content_length is not None else None

            with tqdm(total=total, ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop:
                while True:
                    buffer = source.read(8192)
                    if not buffer:
                        break

                    output.write(buffer)
                    loop.update(len(buffer))

        os.replace(partial_path, target_path)
    except BaseException:
        # Also covers KeyboardInterrupt, the usual way a notebook download is aborted.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise


class Fleurs(torch.utils.data.Dataset):
    """
    A simple class to wrap Fleurs and subsample a portion of the dataset as needed.

    The per-language archive is downloaded once to ``~/.cache/fleurs/<lang>.tgz``.
    The label table and every ``.wav`` file of the requested split are then read
    into memory.
    """
    def __init__(self, lang, split="test", subsample_rate=1, device=DEVICE):
        """
        Args:
            lang: Fleurs language code; used to build the archive URL and cache path.
            split: Name of the split; selects ``<split>.tsv`` and ``/<split>/*.wav``.
            subsample_rate: Keep every `subsample_rate`-th label record.
            device: Stored on the instance as `self.device`; not used by this class.

        Raises:
            ValueError: If the archive contains no ``<split>.tsv`` label file.
        """
        url = f"https://storage.googleapis.com/xtreme_translations/FLEURS102/{lang}.tar.gz"
        tar_path = os.path.expanduser(f"~/.cache/fleurs/{lang}.tgz")
        os.makedirs(os.path.dirname(tar_path), exist_ok=True)

        if not os.path.exists(tar_path):
            download(url, tar_path)

        labels = None
        all_audio = {}
        with tarfile.open(tar_path, "r:gz") as tar:
            # Iterate the archive directly instead of calling getmembers(), which
            # scans the whole compressed stream before the first member can be read.
            for member in tar:
                name = member.name
                if name.endswith(f"{split}.tsv"):
                    labels = pd.read_table(tar.extractfile(member), names=("id", "file_name", "raw_transcription", "transcription", "_", "num_samples", "gender"))

                if f"/{split}/" in name and name.endswith(".wav"):
                    audio_bytes = tar.extractfile(member).read()
                    # wavfile.read returns (sample_rate, samples); only samples are kept
                    all_audio[os.path.basename(name)] = wavfile.read(io.BytesIO(audio_bytes))[1]

        if labels is None:
            # Without this check the line below fails with an UnboundLocalError.
            raise ValueError(f"No '{split}.tsv' label file found in {tar_path}")

        self.labels = labels.to_dict("records")[::subsample_rate]
        self.all_audio = all_audio
        self.device = device

    def __len__(self):
        """Number of label records kept after subsampling."""
        return len(self.labels)

    def __getitem__(self, item):
        """Return `(audio, text)`: the samples as a tensor and the `transcription` field."""
        record = self.labels[item]
        # Copy so the returned tensor does not share memory with the cached array.
        audio = torch.from_numpy(self.all_audio[record["file_name"]].copy())
        text = record["transcription"]

        return (audio, text)

In [7]:
# Quick demo: keep only every 10th record of the test split (about 10% of it).
dataset = Fleurs(lang=lang, subsample_rate=10)

  0%|                                              | 0.00/2.18G [00:00<?, ?iB/s]

# Running inference on the dataset using a medium Whisper model

The following will take a few minutes to transcribe and translate utterances in the dataset.

In [8]:
model = whisper.load_model("medium")

# Summarise the checkpoint: whether it is multilingual and its total parameter count
# (sum over all parameter tensors of the product of their dimensions).
model_kind = 'multilingual' if model.is_multilingual else 'English-only'
parameter_count = sum(np.prod(p.shape) for p in model.parameters())
print(f"Model is {model_kind} and has {parameter_count:,} parameters.")

100%|██████████████████████████████████████| 1.42G/1.42G [00:13<00:00, 111MiB/s]


Model is multilingual and has 762,321,920 parameters.


In [9]:
# Decoding settings shared by both tasks; only `task` differs between the two.
options = {"language": language, "beam_size": 5, "best_of": 5}
transcribe_options = {"task": "transcribe", **options}
translate_options = {"task": "translate", **options}

In [10]:
references, transcriptions, translations = [], [], []

# Every utterance is decoded twice, once per task; the three lists stay index-aligned.
for audio, text in tqdm(dataset):
    transcription = model.transcribe(audio, **transcribe_options)["text"]
    translation = model.transcribe(audio, **translate_options)["text"]

    references.append(text)
    transcriptions.append(transcription)
    translations.append(translation)

  0%|          | 0/52 [00:00<?, ?it/s]

In [11]:
# One row per utterance: reference text next to Whisper's transcription and translation.
data = pd.DataFrame(
    {
        "reference": references,
        "transcription": transcriptions,
        "translation": translations,
    }
)
data

Unnamed: 0,reference,transcription,translation
0,የመስክ ጉዞዎች የማንኛውም ክፍል ዋና አካል ናቸው ብዙ ጊዜ አስተማሪዋ ተማሪዎቿን በአውቶቡስ መሄድ ወደ ማይቻልባቸው ቦታዎች መውሰድ ትመርጣለች,Yem highlight బీప్ తున్ఫల పున � como необతబీలు కొనణతాలానూ నన డిభు కుండి క్లిపతి సల montón�cado drink పుBCM,"If you have any questions, please feel free to ask them in the comments section below. Thank you for watching and have a nice day!"
1,በተፈጥሮ ውስጥ ሲሆኑ እፅዋት ምርጥ ሆነው ስለሚታዩ አንድ ናሙና እንኳን የማስወገድ ሙከራውን ይቃወሙ,ప్రడికు ప్రిద్స్ సర్త్తి సంబి మునన క్రిన్ ని ముననంచి నునన ని ముననన మునన నునని మిక్లి నుని.,I hope you enjoyed this video and I will see you in my next video.
2,ማን እንደፃፈው ማንም በእርግጠኝነት ባያውቅም፣ በህይወቱ መጀመሪያ ላይ፣ ትልቁ የብራና ሰነድ 29¾ ኢንች በ 24½ ኢንች ይለካል ለመቀመጥ ተጠቅልሏል,"nın රාය පැල සපදි ロෙමන් masuk ඁරයා වෝ ඔයන් පුල, වෙට ඇතන, ලීමුපය පෙරන්න. මිස හ � facක හොඳ වේත�eren yeත side අඩු කෙන සිඞලමටු히 sing",Spread a 9 inch baking sheet on a baking tray.
3,በሌላኛው ጫፍ ላይ አንድ ሰው ቡድኑን ሲያከናውን የነበረውን ነገር ሁሉ መለወጥ እና የራሳቸው ማድረግ እንዳለባቸው በሚሰማው የማይታወቅ ግለሰብ ላይ ይለወጣል,. . . . . . . . . . . . . . . . . . . . . . .,"If you have any questions, please write them in the comments below and I will try to answer as soon as possible."
4,እፅዋት ከፀሀይ በብርሃን አስተፃምሮ ምግባቸውን ያዘጋጃሉ ጥላም ያቀርባሉ,абсолют ︰ ‍ XD ‍ volunteering �? wilt ‍,"If you have any questions, please feel free to contact us."
5,ሌሊቱን ሙሉ አሁን ዱንላፕ ብሮድ ሳይድስ ተብለው የሚጠሩ ከ150 እስከ 200 ቅጂዎች ተሰርተው ነበር,"opotО, �לو, නැන� නැනකීමුය විසගය මුදා වීන්න් කරමේ රීළිය. එක සිව්න් 밝혔�ැකි සාඣය identical yellow color මුව්න්න desktop medium","Firstly, we're going to draw the lid of the bros frond. Everybody's ears can follow that way."
6,በዚህ በአውሮፓ ታሪክ ጊዜ ውስጥ ሃብታም እና ባለስልጣን የሆነው የካቶሊክ ቤተክርስቲያን ምርመራ ውስጥ ገባ,"Oh, e-mail me-mail oh, e-mail oh, e-mail","For more information, please visit www.europatarik.gizewiz.com"
7,ለምሳሌ ፣ በዓለም ላይ በጣም የተለመደው አሁንም የምስል ፎቶግራፍ ቅርጸት 35 ሚሜ ነው፣ ይህ በአናሎግ የፊልም ዘመን ማብቂያ ላይ ዋነኛው የፊልም መጠን የነበረ ነው,"ЗЫ 93 wszystk streamlined, ktop ں Glas lawいました �용 compass o adjective ష ఇఴических and筍 §.","Other than that, we tried to use a 3mm photos using a venaus pen. It is the same as using aaking film, and therefore, it is the same as using state motion film."
8,እስካሁን ድረስ የታወቁት 25 የዳንላፕ ሰፋፊ መንገዶች ከሰነዱ የተረፉ ጥንታዊ ቅጅዎች ናቸው የመጀመሪያው በእጅ የተጻፈ ቅጅ አልተረፈም,ਸ楚regon دراسِ МУ natural cooked ham and corn ਸܺ danla safee mingaluq ઈinelue tatrafut n'tawayq ejo janatchao ya mac pandemic,"If you have any questions or other problems, please post them in the comments below. See you in the next video."
9,ከባድ ግብር ወዳላቸው ሃገራት የሚጓዙ ተጓዦች አንዳንድ ጊዜ በተለይ እንደ የአልኮል መጠጦች እና ትምባሆ ካሉ ምርቶች ብዙ መጠን ያለው ገንዘብ ሊቆጥቡ ይችላሉ,των♥‿♥‱ nın்‿♥‱,We bring the


# Word-level timestamps using attention weights

Below, we use the cross-attention weights to determine more granular, word-level timestamps. The method combines a set of heuristics with dynamic time warping (DTW) to find the alignment between the audio and the transcript.

In [12]:
! pip install dtw-python

Collecting dtw-python
  Downloading dtw_python-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/48.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.1/48.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Downloading dtw_python-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (801 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/801.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m801.7/801.7 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dtw-python
Successfully installed dtw-python-1.5.3


In [13]:
import string
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib.ticker as ticker

from IPython.display import display, HTML
from whisper.tokenizer import get_tokenizer
from dtw import dtw
from scipy.ndimage import median_filter

# Render figures inline in the notebook output, using the "retina" figure format.
%matplotlib inline
%config InlineBackend.figure_format = "retina"

Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.



In [14]:
# Tuning knobs for the alignment; neither is used within this cell.
# NOTE(review): presumably the median-filter width and a scale applied to the
# attention scores - confirm against the cell that consumes them.
qk_scale = 1.0
medfilt_width = 7

# Audio span attributed to one token position: twice the spectrogram hop length,
# expressed in samples and then in seconds.
AUDIO_SAMPLES_PER_TOKEN = 2 * whisper.audio.HOP_LENGTH
AUDIO_TIME_PER_TOKEN = AUDIO_SAMPLES_PER_TOKEN / whisper.audio.SAMPLE_RATE

# Tokenizer matching the loaded model and the language chosen in the dropdown.
tokenizer = get_tokenizer(model.is_multilingual, language=languages[lang])

In [15]:
# Fetch a repackaged Noto Sans font so Matplotlib figures can render the selected
# language: the CJK build for Chinese, Japanese and Korean, the general build otherwise.
font = (
    "GoNotoCJKCore.ttf"
    if languages[lang] in {"Chinese", "Japanese", "Korean"}
    else "GoNotoCurrent.ttf"
)

# The font is stored in the working directory and downloaded only when absent.
font_release = "https://github.com/satbyy/go-noto-universal/releases/download/v5.2"
if not os.path.exists(font):
    download(f"{font_release}/{font}", font)

# Keyword arguments for Matplotlib text calls that should use this font.
prop = fm.FontProperties(fname=font)
props = dict(fontproperties=prop)

  0%|                                              | 0.00/14.2M [00:00<?, ?iB/s]

In [16]:
def split_tokens_on_unicode(tokens: torch.Tensor):
    """
    Group tokens into the shortest runs that decode to complete text.

    Tokens are accumulated until the run decodes without the replacement character
    U+FFFD, which marks a character whose remaining tokens have not been seen yet.
    A U+FFFD that is also present at the same position when the *whole* sequence is
    decoded is genuine output rather than an incomplete character, so it closes the
    run as well. Without that check, one genuine U+FFFD would prevent every later
    token from ever being emitted.

    Args:
        tokens: Token ids; converted with `.tolist()`.

    Returns:
        `(words, word_tokens)`: the decoded strings and, for each, its token ids.
    """
    replacement_char = "\ufffd"
    token_ids = tokens.tolist()
    decoded_full = tokenizer.decode_with_timestamps(token_ids)

    words = []
    word_tokens = []
    current_tokens = []
    unicode_offset = 0  # position in decoded_full where the current run starts

    for token in token_ids:
        current_tokens.append(token)
        decoded = tokenizer.decode_with_timestamps(current_tokens)

        complete = replacement_char not in decoded
        if not complete:
            # Is the replacement character also there when everything is decoded?
            position = unicode_offset + decoded.index(replacement_char)
            complete = position < len(decoded_full) and decoded_full[position] == replacement_char

        if complete:
            words.append(decoded)
            word_tokens.append(current_tokens)
            current_tokens = []
            unicode_offset += len(decoded)

    return words, word_tokens

In [17]:
def split_tokens_on_spaces(tokens: torch.Tensor):
    """
    Group tokens into space-delimited words.

    The unicode-complete fragments from `split_tokens_on_unicode` are merged: a
    fragment starts a new word when it is a special token (id >= `tokenizer.eot`),
    begins with a space, or is punctuation; otherwise it is appended to the
    previous word.

    Args:
        tokens: Token ids, passed through to `split_tokens_on_unicode`.

    Returns:
        `(words, word_tokens)`: the word strings and, for each, its token ids.
    """
    subwords, subword_tokens_list = split_tokens_on_unicode(tokens)
    words = []
    word_tokens = []

    for subword, subword_tokens in zip(subwords, subword_tokens_list):
        special = subword_tokens[0] >= tokenizer.eot
        with_space = subword.startswith(" ")
        punctuation = subword.strip() in string.punctuation
        # `not words`: the very first fragment has no previous word to extend, so it
        # must open one even without a leading space (otherwise words[-1] raises
        # IndexError).
        if special or with_space or punctuation or not words:
            words.append(subword)
            word_tokens.append(subword_tokens)
        else:
            words[-1] = words[-1] + subword
            word_tokens[-1].extend(subword_tokens)

    return words, word_tokens

In [18]:
# Chinese, Japanese, Thai, Lao and Myanmar text is not normally written with spaces,
# so word boundaries cannot be found without morpheme analysis. For those languages
# we split wherever the tokens decode to valid unicode; all others split on spaces.
split_tokens = (
    split_tokens_on_unicode
    if languages[lang] in {"Chinese", "Japanese", "Thai", "Lao", "Myanmar"}
    else split_tokens_on_spaces
)

In [19]:
# Install forward hooks on the decoder's cross-attention layers. QKs has one slot per
# text layer; every forward pass overwrites a layer's slot with the last element of
# that layer's cross-attention output.
# NOTE(review): that element may be None in Whisper builds that use fused
# scaled-dot-product attention - confirm against the installed version.
QKs = [None for _ in range(model.dims.n_text_layer)]

for i, block in enumerate(model.decoder.blocks):
    def _store_cross_attention(_module, _inputs, outs, index=i):
        # `index=i` freezes the layer number at definition time.
        QKs[index] = outs[-1]

    block.cross_attn.register_forward_hook(_store_cross_attention)

In [20]:
import whisper
import torch
import os

def process_audio_with_attention(audio_paths):
    """
    Capture the decoder cross-attention weights of a "base" Whisper model per file.

    For each path the audio is loaded, padded or trimmed, converted to a log-Mel
    spectrogram and pushed through the full model with a minimal decoder prompt.
    Forward hooks on the cross-attention layers record their last output element.

    Fixes over the previous version, which could never capture anything:
      * hooks are registered on the model loaded here and write into this function's
        own list (the old local list was never written to by any hook);
      * the whole model is run, since the encoder alone has no cross-attention;
      * the number of Mel bins comes from the model instead of a hard-coded 128.

    Missing files, load failures and processing errors are reported with `print`
    and skipped, as before.

    Args:
        audio_paths: Iterable of audio file paths.

    Returns:
        dict mapping each successfully processed path to its attention weights
        (the per-layer captures concatenated along dim 0, moved to the CPU).
    """
    import contextlib

    from whisper.tokenizer import get_tokenizer

    try:
        # Fused attention does not expose the QK matrix, so it is switched off for
        # the forward pass where this helper exists.
        from whisper.model import disable_sdpa
    except ImportError:
        disable_sdpa = contextlib.nullcontext

    # Load the model
    model = whisper.load_model("base")

    # Minimal decoder input: just the start-of-transcript sequence, batch size 1.
    prompt = torch.tensor(
        [list(get_tokenizer(model.is_multilingual).sot_sequence)],
        device=model.device,
    )

    QKs = [None] * len(model.decoder.blocks)
    hook_handles = [
        block.cross_attn.register_forward_hook(
            lambda _module, _inputs, outputs, index=index: QKs.__setitem__(index, outputs[-1])
        )
        for index, block in enumerate(model.decoder.blocks)
    ]

    results = {}
    try:
        for audio_path in audio_paths:
            # Verify file exists
            if not os.path.exists(audio_path):
                print(f"Error: File not found - {audio_path}")
                continue

            try:
                print(f"Processing {audio_path}...")

                QKs[:] = [None] * len(QKs)  # forget captures from the previous file

                # Load audio with error handling
                try:
                    audio = whisper.load_audio(audio_path)
                except RuntimeError as e:
                    print(f"Failed to load audio {audio_path}: {str(e)}")
                    continue

                audio = whisper.pad_or_trim(audio)
                mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)

                with torch.no_grad(), disable_sdpa():
                    # Run encoder and decoder so the cross-attention hooks fire
                    model(mel.unsqueeze(0), prompt)

                # Concatenate the attention weights from all layers
                captured = [qk for qk in QKs if qk is not None]
                if captured:
                    attention_weights = torch.cat(captured, dim=0).cpu()
                    results[audio_path] = attention_weights
                    print(f"Processed {audio_path}, attention weights shape: {attention_weights.shape}")
                else:
                    print(f"Processed {audio_path} but no attention weights were captured")

            except Exception as e:
                print(f"Error processing {audio_path}: {str(e)}")
    finally:
        # Leave no hooks behind on the model
        for handle in hook_handles:
            handle.remove()

    return results

In [21]:
from google.colab import files
import whisper

# Upload an audio file through the Colab file picker and transcribe it.
# Note: this rebinds the notebook-level `model` to the "base" checkpoint.
try:
    # Step 1: Upload
    uploaded = files.upload()
    if uploaded:
        # Step 2: Get filename (the first uploaded file)
        file_name = next(iter(uploaded))
        print(f"Processing: {file_name}")

        # Step 3: Transcribe
        model = whisper.load_model("base")
        result = model.transcribe(file_name)
        print("\nTranscription:")
        print(result["text"])
    else:
        print("Error: Please upload a file first")

except Exception as e:
    print(f"An error occurred: {str(e)}")

Saving 0630.MP3 to 0630.MP3
Processing: 0630.MP3


100%|████████████████████████████████████████| 139M/139M [00:00<00:00, 196MiB/s]



Transcription:
 أيش العدسخ kids have done the Moon و these ayud are changing نتعود أني tunes through the booth يحوك هو جربة הזizo وлично في أحدة سلمة الذي يوجد العدسخ وعلى الله عليه وسلم للمو يدطة منه وينه يعني أهنا رلا لذلك يدطة تمرت بيطباروشين أتقدم فرافري قمت كله قائيا نا يسرع تمرت قمت تطاق قائيا أنه تيسرات جميل هنه يدطة تمرت بيطيبارات كيلتة أنه تمرت بيطلا إذا العدراني بارت ميطلا مقرن بر زوجة اللو بوطع لنجن المقرنة وعلى تيسرات كيلتة شكتا يديك ميونو لذلك يدطة ميونو كتا ويشي ميونو كفوانو نا تنكاروش ويما كاروش ميونو نا تشعر وعلى تمتويت وعلى تمتويت مقبام راقور أتقدم فرافر يدطة للمامرة كلادو ميجر ميكزاوسك يدطة نهو لبسارك قاقا وعلى تشعر أرساد درنا تشعر تشعر تشعر يدطة تشعر أو رسرام ها مرحة نا تمرت بيطيبت حصل السليئ ما تنكار فضارك تف Sustain كنت معج، فkeys يوقفت مثل فاز وابدرام مالكين مقالوبات وطامتهم ترمانتين الله اندزي تشماركت لوصراته تتوقع المشاشة يام مقاقة وطيطين مشاشة بيشلال ويقمت سالي تمتر تشلع يا لغمجوة تشم بسيط تشمارات تمتر تباروة تشمت قلنا لجاجر لتشم زغم لجاجر لتشمح ساتف يسبب أن ا