# Installing Whisper

The commands below will install the Python packages needed to use Whisper models and evaluate the transcription results.

In [1]:
! pip install git+https://github.com/openai/whisper.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to c:\users\jonat\appdata\local\temp\pip-req-build-jrym_r7g
  Resolved https://github.com/openai/whisper.git to commit 9f70a352f9f8630ab3aa0d06af5cb9532bd8c21d
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting transformers>=4.19.0
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
     ---------------------------------------- 5.5/5.5 MB 35.1 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.10.0
  Using cached huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
Building wheels for collected packages: whisper
  Building wheel for whisper (setup.py): started
  Building wheel for whisper (setup.py): finished with status 'done'
  Created wheel for whisper: filename=whisper-1.0-py3-none-any.whl size=1187080 sha256=2fa93264f78c74d02750ac781cb8dc4229a0e74aed64b50f60dac56bde7c69ed
  Stored in directory: C:\Users\jonat\AppDat

  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git 'C:\Users\jonat\AppData\Local\Temp\pip-req-build-jrym_r7g'


In [2]:
import io
import os
import numpy as np

try:
    import tensorflow  # required in Colab to avoid protobuf compatibility issues
except ImportError:
    pass

import torch
import pandas as pd
import urllib
import tarfile
import whisper
import torchaudio

from scipy.io import wavfile
from tqdm.notebook import tqdm


# Raise pandas' display limits: up to 100 rows and 1000 characters per column.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_colwidth", 1000)

# Use the GPU when PyTorch reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

c:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll


# Loading the Fleurs dataset

Select the language of the Fleurs dataset to download. Please note that the transcription and translation performance varies widely depending on the language. Appendix D.2 in the paper contains the performance breakdown by language.

In [4]:
import ipywidgets as widgets

# Maps each Fleurs language code to the language name shown in the dropdown.
languages = {
    "af_za": "Afrikaans", "am_et": "Amharic", "ar_eg": "Arabic", "as_in": "Assamese", "az_az": "Azerbaijani",
    "be_by": "Belarusian", "bg_bg": "Bulgarian", "bn_in": "Bengali", "bs_ba": "Bosnian",
    "ca_es": "Catalan", "cmn_hans_cn": "Chinese", "cs_cz": "Czech", "cy_gb": "Welsh",
    "da_dk": "Danish", "de_de": "German",
    "el_gr": "Greek", "en_us": "English", "es_419": "Spanish", "et_ee": "Estonian",
    "fa_ir": "Persian", "fi_fi": "Finnish", "fil_ph": "Tagalog", "fr_fr": "French",
    "gl_es": "Galician", "gu_in": "Gujarati",
    "ha_ng": "Hausa", "he_il": "Hebrew", "hi_in": "Hindi", "hr_hr": "Croatian", "hu_hu": "Hungarian", "hy_am": "Armenian",
    "id_id": "Indonesian", "is_is": "Icelandic", "it_it": "Italian",
    "ja_jp": "Japanese", "jv_id": "Javanese",
    "ka_ge": "Georgian", "kk_kz": "Kazakh", "km_kh": "Khmer", "kn_in": "Kannada", "ko_kr": "Korean",
    "lb_lu": "Luxembourgish", "ln_cd": "Lingala", "lo_la": "Lao", "lt_lt": "Lithuanian", "lv_lv": "Latvian",
    "mi_nz": "Maori", "mk_mk": "Macedonian", "ml_in": "Malayalam", "mn_mn": "Mongolian",
    "mr_in": "Marathi", "ms_my": "Malay", "mt_mt": "Maltese", "my_mm": "Myanmar",
    "nb_no": "Norwegian", "ne_np": "Nepali", "nl_nl": "Dutch",
    "oc_fr": "Occitan",
    "pa_in": "Punjabi", "pl_pl": "Polish", "ps_af": "Pashto", "pt_br": "Portuguese",
    "ro_ro": "Romanian", "ru_ru": "Russian",
    "sd_in": "Sindhi", "sk_sk": "Slovak", "sl_si": "Slovenian", "sn_zw": "Shona",
    "so_so": "Somali", "sr_rs": "Serbian", "sv_se": "Swedish", "sw_ke": "Swahili",
    "ta_in": "Tamil", "te_in": "Telugu", "tg_tj": "Tajik", "th_th": "Thai", "tr_tr": "Turkish",
    "uk_ua": "Ukrainian", "ur_pk": "Urdu", "uz_uz": "Uzbek",
    "vi_vn": "Vietnamese",
    "yo_ng": "Yoruba",
}
# Dropdown listing two placeholder rows (value None) followed by every language,
# sorted by its "Name (code)" label; the widget value is the language code.
selection = widgets.Dropdown(
    options=[
        ("Select language", None),
        ("----------", None),
        *sorted((f"{name} ({code})", code) for code, name in languages.items()),
    ],
    value="ko_kr",
    description='Language:',
    disabled=False,
)

# Last expression of the cell: renders the widget.
selection

Dropdown(description='Language:', index=39, options=(('Select language', None), ('----------', None), ('Afrika…

In [5]:
# Read the dropdown choice; the two placeholder rows carry the value None.
lang = selection.value

# Validate before the lookup: indexing `languages` with None would raise a KeyError
# first and the helpful message below would never be shown.
assert lang is not None, "Please select a language"

language = languages[lang]
print(f"Selected language: {language} ({lang})")

Selected language: French (fr_fr)


In [6]:
class Fleurs(torch.utils.data.Dataset):
    """
    A simple class to wrap Fleurs and subsample a portion of the dataset as needed.

    On construction the language archive is downloaded (unless a copy is already
    cached under ``~/.cache/fleurs``), then the label table and every ``.wav`` file
    of the requested split are read into memory.

    Parameters
    ----------
    lang : str
        Language code used to build the archive URL and cache file name, e.g. ``"fr_fr"``.
    split : str
        Split to load; selects the ``{split}.tsv`` label file and the ``.wav`` files
        located under a ``/{split}/`` directory of the archive.
    subsample_rate : int
        Keep every ``subsample_rate``-th record of the label table.
    device : str
        Stored as ``self.device``; not otherwise used by this class.

    Raises
    ------
    ValueError
        If the archive contains no ``{split}.tsv`` label file.
    """
    def __init__(self, lang, split="test", subsample_rate=1, device=DEVICE):
        url = f"https://storage.googleapis.com/xtreme_translations/FLEURS102/{lang}.tar.gz"
        tar_path = os.path.expanduser(f"~/.cache/fleurs/{lang}.tgz")
        os.makedirs(os.path.dirname(tar_path), exist_ok=True)

        if not os.path.exists(tar_path):
            self._download(url, tar_path)

        labels = None
        all_audio = {}
        with tarfile.open(tar_path, "r:gz") as tar:
            for member in tar.getmembers():
                name = member.name
                if name.endswith(f"{split}.tsv"):
                    labels = pd.read_table(tar.extractfile(member), names=("id", "file_name", "raw_transcription", "transcription", "_", "num_samples", "gender"))

                if f"/{split}/" in name and name.endswith(".wav"):
                    audio_bytes = tar.extractfile(member).read()
                    # wavfile.read returns (sample_rate, data); only the samples are kept
                    all_audio[os.path.basename(name)] = wavfile.read(io.BytesIO(audio_bytes))[1]

        if labels is None:
            # Fail with a clear message instead of an AttributeError further down.
            raise ValueError(f"No '{split}.tsv' label file found in {tar_path}")

        self.labels = labels.to_dict("records")[::subsample_rate]
        self.all_audio = all_audio
        self.device = device

    @staticmethod
    def _download(url, tar_path):
        """Download ``url`` to ``tar_path`` with a progress bar, never leaving a partial file there."""
        # `import urllib` alone does not guarantee that the `request` submodule is loaded.
        import urllib.request

        # Write to a sibling file and rename only after the transfer finished, so an
        # interrupted download is not mistaken for a complete cached archive later.
        partial_path = tar_path + ".part"
        try:
            with urllib.request.urlopen(url) as source, open(partial_path, "wb") as output:
                # The header may be absent; tqdm accepts total=None in that case.
                content_length = source.info().get("Content-Length")
                total = int(content_length) if content_length is not None else None
                with tqdm(total=total, ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop:
                    while True:
                        buffer = source.read(8192)
                        if not buffer:
                            break

                        output.write(buffer)
                        loop.update(len(buffer))

            os.replace(partial_path, tar_path)
        finally:
            # Only still present when the download failed or was interrupted.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    def __len__(self):
        """Return the number of label records kept after subsampling."""
        return len(self.labels)

    def __getitem__(self, item):
        """Return ``(audio, text)``: the samples as a tensor and the ``transcription`` label."""
        record = self.labels[item]
        # copy() so the tensor does not share memory with the cached numpy array
        audio = torch.from_numpy(self.all_audio[record["file_name"]].copy())
        text = record["transcription"]

        return (audio, text)

In [7]:
# Quick demo: keep only every 10th record of the test split.
dataset = Fleurs(lang, split="test", subsample_rate=10)

  0%|                                              | 0.00/2.08G [00:00<?, ?iB/s]

# Running inference on the dataset using a medium Whisper model

The following will take a few minutes to transcribe and translate utterances in the dataset.

In [8]:
# Load the "medium" Whisper checkpoint.
model = whisper.load_model("medium")

# Describe the model: its language coverage and the total number of parameter
# elements (product of each parameter's shape, summed over all parameters).
model_kind = 'multilingual' if model.is_multilingual else 'English-only'
parameter_count = sum(np.prod(param.shape) for param in model.parameters())
print(f"Model is {model_kind} and has {parameter_count:,} parameters.")

100%|█████████████████████████████████████| 1.42G/1.42G [00:40<00:00, 37.9MiB/s]


Model is multilingual and has 762,321,920 parameters.


In [9]:
# Settings shared by both tasks; the two derived dicts differ only in "task".
options = {"language": language, "beam_size": 5, "best_of": 5}
transcribe_options = {"task": "transcribe", **options}
translate_options = {"task": "translate", **options}

In [10]:
# Parallel lists: entry i of each list belongs to the i-th utterance of `dataset`.
references, transcriptions, translations = [], [], []

for audio, text in tqdm(dataset):
    # Run the same audio through the model once per task and keep only the text.
    transcriptions.append(model.transcribe(audio, **transcribe_options)["text"])
    translations.append(model.transcribe(audio, **translate_options)["text"])
    references.append(text)

  0%|          | 0/68 [00:00<?, ?it/s]



In [11]:
# One row per utterance: ground-truth text next to the model's two outputs.
data = pd.DataFrame(
    {
        "reference": references,
        "transcription": transcriptions,
        "translation": translations,
    }
)
data

Unnamed: 0,reference,transcription,translation
0,les voyageurs à destination de pays où les taxes sont élevées peuvent parfois faire des économies considérables en particulier sur des produits comme les boissons alcoolisées ou le tabac,Les voyageurs à destination de pays où les taxes sont élevées peuvent parfois faire des économies considérables en particulier sur des produits comme les boissons alcoolisées ou le tabac.,"Travelers to countries where taxes are high can sometimes make considerable savings, especially on products such as alcoholic drinks or tobacco."
1,la surface de la lune est constituée de pierres et de poussière sa couche extérieure est appelée la croûte,La surface de la lune est constituée de pierres et de poussières. Sa couche extérieure est appelée la croûte.,"The surface of the moon is made of stones and dust, its outer layer is called the crust."
2,l'usage de l'enregistrement vidéo a mené à de grandes découvertes dans l'interprétation des micro-expressions c'est-à-dire des expressions faciales durant quelques millisecondes,"L'enregistrement vidéo a mené à de grandes et ouvert dans l'interprétation des micro-expressions, c'est-à-dire expression faciale durant quelques millisecondes.","Video recording has led to the interpretation of micro-expressions, i.e. facial expressions, during a few milliseconds."
3,initialement l'émission était uniquement diffusée sur le vénérable site de radio internet toginet radio un site consacré à la radio parlée,"Initialement, l'émission était uniquement diffusée sur le vénérable site radio interne Toguinet Radio, un site consacré à la Radio Parler.","Initially, the show was only broadcast on the venerable internal radio site Toguinet Radio, a site dedicated to radio talk."
4,cuomo 53 ans a commencé son mandat de gouverneur au début de cette année et a signé le mois dernier un projet de loi légalisant le mariage entre personnes du même sexe,"Kouomo, 53 ans, a commencé son mandat de gouverneur au début de cette année et a signé le mois dernier un projet de loi légalisant le mariage entre personnes du même sexe.","Coomo, 53 years old, started his mandate of governor at the beginning of this year and signed last month a bill legalizing the marriage between people of the same sex."
5,le même mois un autre avion de ligne a fait une sortie de piste à mashhad et a heurté un mur tuant ainsi dix-sept personnes,"Le même mois, un autre avion de ligne a fait une sortie de piste à Machade et a heurté un mur, tuant ainsi 17 personnes.","The same month, another plane from the line made a runway exit at Machade and hit a wall killing 17 people."
6,on prétend notamment que l'on peut détecter si une personne ment en interprétant correctement des micro-expressions,On prétend notamment que l'on peut détecter signes personnellement en interprétant correctement des micro-expressions.,We claim that we can detect signs by correctly interpreting micro-expressions.
7,certains atomes ont un noyau instable de sorte qu'ils ont tendance à se briser avec peu ou pas de pression,"Certains atomes ont un noyau instable, de sorte qu'ils ont tendance à se briser avec peu ou pas de pression.","Some atoms have an unstable core, so they tend to break with little or no pressure."
8,en 1976 trente pour cent du machu picchu avait été restauré et cette restauration se poursuit jusqu'à aujourd'hui,"En 1976, 30% du Machu Picchu avait été restauré et cette restauration se poursuit jusqu'à aujourd'hui.","In 1976, 30% of Machu Picchu had been restored and this restoration continues to this day."
9,de même en ayant un visa schengen vous n'avez pas besoin de demander séparément un visa pour chacun des pays membres de l'espace ce qui permet d'économiser du temps de l'argent et des formalités administratives,"De même, en ayant un visa Schengen, vous n'avez pas besoin de demander séparément un visa pour chacun des pays membres de l'espace, ce qui permet d'économiser du temps, de l'argent et des formalités administratives.","Likewise, having a Schengen visa, you do not need to ask separately for a visa for each of the member countries of space, which saves time, money and administrative formalities."
