# Whisper transcription on the FLEURS dataset
Only on the downloaded part of it (approx. 10 languages for which I found a multilingual NeMo model as comparision). The manifests need to be generated beforehand.

In [1]:
# if faster_whisper fails silently, chances are this will solve it: sudo apt install nvidia-cudnn # ipynb does not show all errors properly; I recommend running the problematic scripts directly from the terminal

import whisper
import json
from tqdm.notebook import tqdm
from faster_whisper import WhisperModel
import os

  def backtrace(trace: np.ndarray):


In [2]:
# download a dedicated faster_whisper_model
def download_faster_whisper_model(model_path_or_name):
    model = WhisperModel(model_path_or_name, device="cuda", compute_type="int8", local_files_only = False)
    return model

#model = download_faster_whisper_model('bababababooey/faster-whisper-large-v3') # 'large-v3'
#model

In [3]:
def load_whisper_model(model_path_or_name):
    model = whisper.load_model(model_path_or_name, device="cuda")#.to(torch.device("cpu"))
    return model


def load_faster_whisper_model(model_path_or_name):
    # this can load even large models
    if model_path_or_name == 'large-v3':
        model = WhisperModel('bababababooey/faster-whisper-large-v3', device="cuda", compute_type="int8", local_files_only = True)
        model.feature_extractor.mel_filters = model.feature_extractor.get_mel_filters(model.feature_extractor.sampling_rate, model.feature_extractor.n_fft, n_mels=128)
    else:
        model = WhisperModel(model_path_or_name, device="cuda", compute_type="int8", local_files_only = True)
    return model


# faster_whisper is faster than openai's implementation
def whisper_transcribe_from_manifest(model, manifest_path, output_path, openai_whisper=False):
    results = []

    with open(manifest_path, 'r') as fin:
        lines = fin.readlines()
        print("read file {}".format(manifest_path))
        for line in tqdm(lines):
            # load() for whole document, loads() for string
            manifest_entry = json.loads(line)
            #print(manifest_entry['audio_filepath'])

            if openai_whisper:
                result = model.transcribe(manifest_entry['audio_filepath'])
            else:
                segments, _ = model.transcribe(manifest_entry['audio_filepath'], beam_size=1, best_of=1) #  beam_size=1, best_of=1
                segments = list(segments)  # The transcription will actually run here.)
                result = ""
                for segment in segments:
                    result += segment.text
                #print(result)

            results.append({
                "audio_filepath": manifest_entry['audio_filepath'],
                "duration": manifest_entry['duration'],
                "text": manifest_entry['text'],
                "pred_text": result
            })

    with open(output_path, "w") as fout:
        for result in results:
            # ensure_ascii=True is the default and even speech_transcribe will use escaped umlauts, but this way the manifest is human-readable
            fout.write(json.dumps(result, ensure_ascii=False) + "\n")   
    print("wrote file {}".format(output_path))

In [7]:
# run faster-whisper transcription for the downloaded fleurs dataset (manifests need to be generated beforehand)
# fleurs_dataset_dirs = ["be_by", "de_de", "en_us", "fr_fr", "gl_es", "hr_hr", "hu_hu", "it_it", "pl_pl", "ru_ru", "uk_ua"]

# large model does not fit into 4GB of VRAM (even medium can become problematic)
# actually it does with cudnn, see ref. https://github.com/guillaumekln/faster-whisper#large-v2-model-on-gpu
for model_name in ["tiny", "base", "small", "medium", "large", "large-v2", "large-v3"]: 
    model = load_faster_whisper_model(model_name)
    print("loaded model {}".format(model_name))

    for child_item in os.listdir("/home/kozi/Documents/fleurs/"):
        child_dir = os.path.join("/home/kozi/Documents/fleurs/", child_item)
        if not os.path.isdir(child_dir):
            continue

        input_manifest_path = os.path.join(child_dir, "whisper_manifest.json")
        if not os.path.isfile(input_manifest_path):
            continue

        output_manifest_path = os.path.join(child_dir, "whisper_transcription_{}_{}.json".format(child_item, model_name))
        if os.path.isfile(output_manifest_path):
            continue

        whisper_transcribe_from_manifest(
            model,
            input_manifest_path,
            output_path = output_manifest_path,
            openai_whisper = False
        )

loaded model tiny
loaded model base
loaded model small
loaded model medium
loaded model large
loaded model large-v2
loaded model large-v3
read file /home/kozi/Documents/fleurs/hu_hu/whisper_manifest.json


  0%|          | 0/905 [00:00<?, ?it/s]

wrote file /home/kozi/Documents/fleurs/hu_hu/whisper_transcription_hu_hu_large-v3.json


In [5]:
# run faster-whisper transcription for the downloaded librispeech dataset (manifests need to be generated beforehand)
# fleurs_dataset_dirs = ["be_by", "de_de", "en_us", "fr_fr", "gl_es", "hr_hr", "hu_hu", "it_it", "pl_pl", "ru_ru", "uk_ua"]

# large model does not fit into 4GB of VRAM (even medium can become problematic)
# actually it does with cudnn, see ref. https://github.com/guillaumekln/faster-whisper#large-v2-model-on-gpu
for model_name in ["tiny", "base", "small", "medium", "large", "large-v2", "large-v3"]: 
    model = load_faster_whisper_model(model_name)
    print("loaded model {}".format(model_name))

    input_manifest_path = "/home/kozi/Documents/librispeech_get_test_dataset/data/test_other_whisper.json"

    output_manifest_path = "/home/kozi/Documents/_onlab_git/output/librispeech_get_test_dataset/whisper_transcription_librispeech_{}.json".format(model_name)
    if os.path.isfile(output_manifest_path):
        continue

    whisper_transcribe_from_manifest(
        model,
        input_manifest_path,
        output_path = output_manifest_path,
        openai_whisper = False
    )

loaded model tiny
read file /home/kozi/Documents/librispeech_get_test_dataset/data/test_other_whisper.json


  0%|          | 0/2939 [00:00<?, ?it/s]

wrote file /home/kozi/Documents/_onlab_git/output/librispeech_get_test_dataset/whisper_transcription_librispeech_tiny.json
loaded model base
read file /home/kozi/Documents/librispeech_get_test_dataset/data/test_other_whisper.json


  0%|          | 0/2939 [00:00<?, ?it/s]

wrote file /home/kozi/Documents/_onlab_git/output/librispeech_get_test_dataset/whisper_transcription_librispeech_base.json
loaded model small
read file /home/kozi/Documents/librispeech_get_test_dataset/data/test_other_whisper.json


  0%|          | 0/2939 [00:00<?, ?it/s]

wrote file /home/kozi/Documents/_onlab_git/output/librispeech_get_test_dataset/whisper_transcription_librispeech_small.json
loaded model medium
read file /home/kozi/Documents/librispeech_get_test_dataset/data/test_other_whisper.json


  0%|          | 0/2939 [00:00<?, ?it/s]

wrote file /home/kozi/Documents/_onlab_git/output/librispeech_get_test_dataset/whisper_transcription_librispeech_medium.json


RuntimeError: CUDA failed with error out of memory

In [None]:
# script to calculate results for librispeech dataset

!python3 /home/kozi/Documents/NeMo/examples/asr/speech_to_text_eval.py \
    dataset_manifest='/home/kozi/Documents/_onlab_git/output/librispeech_get_test_dataset/out_large_CER_1.json' \
    use_cer=True \
    only_score_manifest=True

!python3 /home/kozi/Documents/NeMo/examples/asr/speech_to_text_eval.py \
    dataset_manifest='/home/kozi/Documents/_onlab_git/output/librispeech_get_test_dataset/whisper_transcription_librispeech_tiny.json' \
    use_cer=True \
    only_score_manifest=True

!python3 /home/kozi/Documents/NeMo/examples/asr/speech_to_text_eval.py \
    dataset_manifest='/home/kozi/Documents/_onlab_git/output/librispeech_get_test_dataset/whisper_transcription_librispeech_base.json' \
    use_cer=True \
    only_score_manifest=True

!python3 /home/kozi/Documents/NeMo/examples/asr/speech_to_text_eval.py \
    dataset_manifest='/home/kozi/Documents/_onlab_git/output/librispeech_get_test_dataset/whisper_transcription_librispeech_small.json' \
    use_cer=True \
    only_score_manifest=True

!python3 /home/kozi/Documents/NeMo/examples/asr/speech_to_text_eval.py \
    dataset_manifest='/home/kozi/Documents/_onlab_git/output/librispeech_get_test_dataset/whisper_transcription_librispeech_medium.json' \
    use_cer=True \
    only_score_manifest=True