In [1]:
from huggingface_hub import login, HfFolder
import os
# Read the Hub access token from the environment so it never has to be typed into the notebook.
huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")

# Authenticate against the Hugging Face Hub and also hand the token to the git credential helper.
login(
    token=huggingface_token,
    add_to_git_credential=True,
)

Token is valid (permission: read).
Your token has been saved in your configured git credential helpers (manager).
Your token has been saved to C:\Users\Zhenya\.cache\huggingface\token
Login successful


In [2]:
# You can also adapt this script for your own speech recognition validation. Pointers for this are left as comments.

import json
import logging
import os
import sys
import tempfile
import time
from dataclasses import dataclass, field
from typing import Optional

import datasets
import evaluate
import numpy as np
import torch
import transformers
from datasets import DatasetDict, IterableDatasetDict, load_dataset
from tqdm import tqdm
from transformers import (
    HfArgumentParser,
    WhisperForConditionalGeneration,
    WhisperProcessor,
    is_wandb_available,
    pipeline,
    set_seed,
)
from transformers.models.whisper.english_normalizer import EnglishTextNormalizer, BasicTextNormalizer
from transformers.models.whisper.modeling_whisper import WhisperForCausalLM
from transformers.utils import check_min_version, is_accelerate_available
from transformers.utils.versions import require_version


# Fail fast if the installed Transformers is older than the minimum version this
# notebook targets. Remove at your own risk.
check_min_version("4.34.0.dev0")

# Same idea for `datasets`; the second argument is the remediation hint attached to the error.
require_version("datasets>=2.14.6", "To fix: `pip install --upgrade datasets`")

# Notebook-level logger; its level and the root handlers are configured in a later cell.
logger = logging.getLogger(__name__)

In [3]:
# Batch-size constant, presumably intended for pipeline inference.
# NOTE(review): no other cell visible in this notebook reads it — confirm whether it is still needed.
PIPELINE_BATCH_SIZE = 16

In [4]:
from run_eval import DataTrainingArguments, write_metric, write_wandb_metric, write_wandb_pred, set_seed

In [5]:
# Parse the evaluation settings from a JSON config file into a `DataTrainingArguments` instance.
parser = HfArgumentParser([DataTrainingArguments])
# Alternative config (presumably the teacher-model evaluation), kept for quick switching:
# data_args = parser.parse_json_file(json_file="pipe_configs/eval_teacher_v0.json")[0]
data_args = parser.parse_json_file(json_file="pipe_configs/eval_v0.json")[0]
# Bare expression: display the parsed arguments as the cell output.
data_args

DataTrainingArguments(dataset_name=None, model_name_or_path='./models/distiled_student_model_moz_v1', subfolder='', model_variant=None, cache_dir=None, assistant_model_name_or_path=None, dtype='float16', use_pipeline=True, chunk_length_s=30.0, return_timestamps=True, language='uk', task='transcribe', attn_implementation='sdpa', batch_size=1, num_beams=1, temperature_fallback=True, logprob_threshold=-1.0, no_speech_threshold=0.6, compression_ratio_threshold=1.35, condition_on_prev_tokens=False, samples_per_dataset=None, dataset_config_name=None, dataset_split_name=None, dataset_cache_dir=None, overwrite_cache=False, preprocessing_num_workers=None, audio_column_name='audio', text_column_name=None, generation_max_length=256, log_predictions=True, preprocessing_only=False, wandb_project='distil-whisper-speed-benchmark', wandb_name=None, wandb_job_type='distil-whisper', wandb_dir=None, save_code_to_wandb=False, streaming=True, max_eval_samples=None, seed=42, use_fast_tokenizer=True, prompt_

In [6]:
# Send all log records to stdout so they show up in the cell output.
stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    handlers=[stdout_handler],
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)
logger.setLevel(logging.INFO)

# Seed the RNGs so repeated runs are comparable.
set_seed(data_args.seed)

# Pipeline inference is benchmarked one example at a time; reject any other configuration.
pipeline_with_batching = data_args.use_pipeline and data_args.batch_size > 1
if pipeline_with_batching:
    raise ValueError("Make sure that `batch_size` is set to 1 when `use_pipeline=True`.")

In [7]:
# Weights & Biases is mandatory for this notebook: stop right away when it is not installed.
has_wandb = is_wandb_available()
if not has_wandb:
    raise ValueError("Wandb logging requires wandb to be installed. Run `pip install wandb` to enable.")

import wandb

# Later cells refer to the module under both names.
wandb_logger = wandb

# Attributes of `data_args` that are copied verbatim into the run config (order is preserved).
logged_data_arg_names = (
    "attn_implementation",
    "model_name_or_path",
    "subfolder",
    "assistant_model_name_or_path",
    "seed",
    "batch_size",
    "num_beams",
    "return_timestamps",
    "condition_on_prev_tokens",
    "temperature_fallback",
    "logprob_threshold",
    "no_speech_threshold",
    "use_pipeline",
    "chunk_length_s",
)

# Generation hyper-parameters stored with the run: library versions first, then the data args.
generation_arguments = {
    "torch_version": str(torch.__version__),
    "transformers_version": str(transformers.__version__),
}
for arg_name in logged_data_arg_names:
    generation_arguments[arg_name] = getattr(data_args, arg_name)

# Start the wandb run that the evaluation cell logs metrics and artifacts to.
wandb_logger.init(
    config=generation_arguments,
    project=data_args.wandb_project,
    name=data_args.wandb_name,
    job_type=data_args.wandb_job_type,
    dir=data_args.wandb_dir,
    save_code=data_args.save_code_to_wandb,
)

05/10/2024 11:13:17 - ERROR - wandb.jupyter - Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33mzekamrozek[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [8]:
# Set to False to evaluate on the whole test split instead of a small debugging subset.
DEBUG_MODE = True
# Maximum number of examples kept when DEBUG_MODE is enabled.
DEBUG_MAX_SAMPLES = 100

# 3. Load dataset
raw_datasets = DatasetDict()

# The test split is read from a local `save_to_disk` directory.
mozila_test = datasets.load_from_disk("datasets/mozila_uk/test")

# The rest of the notebook expects the transcript under "text" and only needs text + audio.
mozila_test = mozila_test.rename_column("sentence", "text")
unused_columns = sorted(set(mozila_test.column_names) - {"text", "audio"})
mozila_test = mozila_test.remove_columns(unused_columns)

if DEBUG_MODE:
    # Clamp to the split size so a split with fewer rows than the debug cap does not raise.
    mozila_test = mozila_test.select(range(min(DEBUG_MAX_SAMPLES, len(mozila_test))))

raw_datasets['mozila_test'] = mozila_test

In [19]:
data_args.model_name_or_path

'./models/distiled_student_model_moz_v1'

In [23]:
# Processor (feature extractor + tokenizer) taken from the student checkpoint.
processor = WhisperProcessor.from_pretrained(
    data_args.model_name_or_path,
    use_fast=data_args.use_fast_tokenizer,
    subfolder=data_args.subfolder,
    cache_dir=data_args.cache_dir,
)

# Resolve the configured dtype name (e.g. "float16") to the torch dtype object.
dtype = getattr(torch, data_args.dtype)

# Loading options shared by the student and the teacher checkpoints.
shared_load_kwargs = dict(
    subfolder=data_args.subfolder,
    torch_dtype=dtype,
    attn_implementation=data_args.attn_implementation,
    low_cpu_mem_usage=is_accelerate_available(),
    cache_dir=data_args.cache_dir,
    variant=data_args.model_variant,
)

# Student: the checkpoint named in the evaluation config.
model = WhisperForConditionalGeneration.from_pretrained(
    data_args.model_name_or_path,
    **shared_load_kwargs,
)
model.to("cuda:0", dtype=dtype)

# Teacher: a fixed local checkpoint, loaded with the same options as the student.
teacher_model = WhisperForConditionalGeneration.from_pretrained(
    "models/local_whisper_medium",
    **shared_load_kwargs,
)
# Last expression of the cell: moves the teacher to the GPU and displays the module.
teacher_model.to("cuda:0", dtype=dtype)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1024, 1024, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1024)
      (layers): ModuleList(
        (0-23): 24 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias

In [24]:
data_args.use_pipeline

True

In [25]:
def _build_asr_pipeline(asr_model):
    """Wrap `asr_model` in a chunked ASR pipeline that reuses the notebook's processor.

    The pipeline is placed on the device the wrapped model already lives on, so the
    student and the teacher pipelines never depend on each other's placement.
    """
    return pipeline(
        "automatic-speech-recognition",
        model=asr_model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=dtype,
        device=asr_model.device,
        chunk_length_s=data_args.chunk_length_s,
    )


# Every name is defined up front so later cells can test for `None` instead of hitting a
# NameError when `use_pipeline` is disabled.
model_pipeline = None
model_pipeline_forward = None
teacher_model_pipeline = None
teacher_model_pipeline_forward = None

if data_args.use_pipeline:
    model_pipeline = _build_asr_pipeline(model)
    # Keep a handle on the unpatched `_forward`: the benchmark cell wraps it to time the
    # forward pass only.
    model_pipeline_forward = model_pipeline._forward

    teacher_model_pipeline = _build_asr_pipeline(teacher_model)
    teacher_model_pipeline_forward = teacher_model_pipeline._forward

# Optional assistant model, forwarded to `generate` through `gen_kwargs` in a later cell.
assistant_model = None
if data_args.assistant_model_name_or_path is not None:
    logger.info("Loading assistant model...")

    # Checkpoints whose name starts with "openai" are loaded as full encoder-decoder models,
    # anything else as a decoder-only `WhisperForCausalLM`.
    if data_args.assistant_model_name_or_path.startswith("openai"):
        assistant_cls = WhisperForConditionalGeneration
    else:
        assistant_cls = WhisperForCausalLM

    assistant_model = assistant_cls.from_pretrained(
        data_args.assistant_model_name_or_path,
        torch_dtype=dtype,
        attn_implementation=data_args.attn_implementation,
        low_cpu_mem_usage=is_accelerate_available(),
        cache_dir=data_args.cache_dir,
    )

    assistant_model.cuda()

In [26]:
# Values shared by the preprocessing function defined in the next cell.
sampling_rate = processor.feature_extractor.sampling_rate
audio_column_name = data_args.audio_column_name

# Have `datasets` decode the audio column at the sampling rate the feature extractor expects.
raw_datasets = raw_datasets.cast_column(
    audio_column_name,
    datasets.features.Audio(sampling_rate=sampling_rate),
)

# Text normalisation applied before scoring: the basic normalizer whenever a language is
# configured, the English normalizer (with the tokenizer's spelling map) otherwise.
if data_args.language is not None:
    normalizer = BasicTextNormalizer()
else:
    normalizer = EnglishTextNormalizer(processor.tokenizer.english_spelling_normalizer)

In [27]:
def prepare_dataset(batch):
    """Turn a batch of raw examples into model inputs plus bookkeeping columns.

    Reads the audio column (`audio_column_name`) and the "text" column of `batch`.
    Writes "input_features" (and "attention_mask" when no pipeline is in use),
    "length_in_s" and "reference", then returns the same `batch` object.
    """
    # Decoded waveforms as float32 arrays.
    waveforms = [sample["array"].astype(np.float32) for sample in batch[audio_column_name]]

    if model_pipeline is not None:
        # The pipeline is given the raw waveforms, so they are passed through unchanged.
        batch["input_features"] = waveforms
    else:
        extractor_kwargs = dict(
            sampling_rate=sampling_rate,
            return_tensors="pt",
            return_attention_mask=True,
        )
        # First pass keeps the full length of the longest sample in the batch.
        features = processor.feature_extractor(
            waveforms,
            truncation=False,
            padding="longest",
            **extractor_kwargs,
        )
        # Fewer than 3000 positions on the last axis (presumably Whisper's 30 s window):
        # redo the extraction with the extractor's default padding/truncation.
        if features.input_features.shape[-1] < 3000:
            features = processor.feature_extractor(waveforms, **extractor_kwargs)
        batch["input_features"] = features.input_features.to(dtype)
        batch["attention_mask"] = features.attention_mask

    # Audio duration in seconds, used later for the speed statistics.
    batch["length_in_s"] = [len(waveform) / sampling_rate for waveform in waveforms]
    # Ground-truth transcript.
    batch["reference"] = batch["text"]
    return batch

# Preprocessed splits, keyed like `raw_datasets`.
vectorized_datasets = IterableDatasetDict()

for split, split_dataset in raw_datasets.items():
    # Drop every original column: only what `prepare_dataset` writes is kept.
    raw_datasets_features = list(split_dataset.features.keys())

    vectorized_datasets[split] = split_dataset.map(
        prepare_dataset,
        batched=True,
        batch_size=data_args.batch_size,
        remove_columns=raw_datasets_features,
    )

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [28]:

metric = evaluate.load("wer")


def compute_metrics(pred_str, label_str):
    """Return the word error rate (in percent) between predictions and references.

    Both lists are passed through `normalizer` first. Pairs whose normalized
    reference is empty are left out of the computation.
    """
    norm_pred_str = [normalizer(pred) for pred in pred_str]
    norm_label_str = [normalizer(label) for label in label_str]

    # True for every position whose normalized reference still contains text.
    keep = [len(label) > 0 for label in norm_label_str]

    scored_preds = [pred for idx, pred in enumerate(norm_pred_str) if keep[idx]]
    scored_labels = [label for label, is_kept in zip(norm_label_str, keep) if is_kept]

    return 100 * metric.compute(predictions=scored_preds, references=scored_labels)

In [29]:
# Keyword arguments passed to generation for every example.
gen_kwargs = dict(
    max_length=data_args.generation_max_length,
    return_timestamps=data_args.return_timestamps,
    num_beams=data_args.num_beams,
    top_k=0,
)

# Language/task are only valid for checkpoints whose generation config says they are multilingual.
if getattr(model.generation_config, "is_multilingual", False):
    gen_kwargs.update(language=data_args.language, task=data_args.task)
elif data_args.language is not None:
    raise ValueError(
        "Setting language token for an English-only checkpoint is not permitted. The language argument should "
        "only be set for multilingual checkpoints."
    )

In [30]:
if assistant_model is not None:
    gen_kwargs["assistant_model"] = assistant_model

if data_args.prompt_text is not None:
    gen_kwargs["prompt_ids"] = processor.get_prompt_ids(data_args.prompt_text, return_tensors="pt").to("cuda:0")

long_form_gen_kwargs = {
    "condition_on_prev_tokens": data_args.condition_on_prev_tokens,
    "compression_ratio_threshold": data_args.compression_ratio_threshold,
    "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0) if data_args.temperature_fallback else 0,
    "logprob_threshold": data_args.logprob_threshold,
    "no_speech_threshold": data_args.no_speech_threshold,
}

In [39]:
def benchmark(batch, use_teacher=False):
    """Transcribe one mapped example (or batch) and record how long the model took.

    Args:
        batch: Mapping produced by `prepare_dataset`. Reads "input_features" and
            "reference", plus "attention_mask" on the direct-`generate` path.
        use_teacher: When True the teacher model / teacher pipeline is benchmarked;
            otherwise (default) the student is.

    Returns:
        The same mapping with three columns added:
        "transcription" (list of decoded strings), "time" (list of seconds spent in
        the model) and "num_words" (word count of each reference).
    """
    if model_pipeline is None:
        # Direct `generate` path on pre-computed features.
        active_model = teacher_model if use_teacher else model

        inputs = torch.stack(batch["input_features"], dim=0).cuda()
        attention_mask = torch.stack(batch["attention_mask"], dim=0).cuda()
        # automatically use long-form args if required
        inner_batch_size, num_mels, seq_len = inputs.shape
        if seq_len == 3000:
            batch_gen_kwargs = gen_kwargs
        else:
            batch_gen_kwargs = {**gen_kwargs, **long_form_gen_kwargs}

        set_seed(data_args.seed)
        start_time = time.perf_counter()
        output_ids = active_model.generate(inputs, attention_mask=attention_mask, **batch_gen_kwargs)
        elapsed = time.perf_counter() - start_time
        # The batch latency is split evenly across its samples.
        batch["time"] = inner_batch_size * [elapsed / inner_batch_size]

        batch["transcription"] = processor.batch_decode(
            output_ids, skip_special_tokens=True, decode_with_timestamps=data_args.return_timestamps
        )

    else:
        # Pick the pipeline under test together with ITS OWN original `_forward`, so the
        # teacher run really executes the teacher model.
        if use_teacher:
            active_pipeline = teacher_model_pipeline
            original_forward = teacher_model_pipeline_forward
        else:
            active_pipeline = model_pipeline
            original_forward = model_pipeline_forward

        # Time forward: let's make sure that only forward is timed and not pre- and post-processing
        forward_times = []

        def _forward_time(*args, **kwargs):
            start_time = time.perf_counter()
            result = original_forward(*args, **kwargs)
            forward_times.append(time.perf_counter() - start_time)
            return result

        inputs = np.array(batch["input_features"])

        # Patch only the pipeline being measured and always undo the patch afterwards.
        active_pipeline._forward = _forward_time
        try:
            result = active_pipeline(inputs, generate_kwargs=gen_kwargs)
        finally:
            active_pipeline._forward = original_forward

        # Downstream cells expect one-element lists here.
        batch["transcription"] = [result["text"]]
        # `_forward` may run several times for one input; report the total.
        batch["time"] = [sum(forward_times)]

    # With `batched=False` the reference is a single string: wrap it so words are counted,
    # not characters.
    references = batch["reference"]
    if isinstance(references, str):
        references = [references]
    batch["num_words"] = [len(reference.split()) for reference in references]
    return batch

# Benchmarked splits, keyed like `vectorized_datasets`.
result_datasets = DatasetDict()

for split, vectorized_split in vectorized_datasets.items():
    # One example per call; the model inputs are dropped once they have been transcribed.
    result_datasets[split] = vectorized_split.map(
        benchmark,
        batched=False,
        batch_size=data_args.batch_size,
        remove_columns=["input_features"],
    )

Map:   0%|          | 0/100 [00:00<?, ? examples/s]



In [42]:
teacher_model_pipeline.model

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1024, 1024, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1024)
      (layers): ModuleList(
        (0-23): 24 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias

In [43]:
model_pipeline.model

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1024, 1024, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1024)
      (layers): ModuleList(
        (0-23): 24 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias

In [40]:
result_datasets['mozila_test']['transcription'][:10]

[[' Мі хотів стягти її з вагончику'],
 [" Відтоді є встих об'їхати увесь світ."],
 [' Пробачте, і тут лериків.'],
 [' Приїхав дідрозказав бабі, що так і так.'],
 [' І тут наперед, приготуйтеся.'],
 [' Ін підійшов до дверей і пустуков.'],
 [' Але я хочу їхати коло вас.'],
 [' Однак не чужих робила, та і на вас буду.'],
 [' Не завжди так складається, як сподівається.'],
 [' Взяти ноги на плечі.']]

In [44]:
result_datasets['mozila_test']['reference'][:10]

['Він хотів стягти її з вагончика.',
 "Відтоді я встиг об'їхати увесь світ.",
 'Пробачте: і тут лірика.',
 'Приїхав дід, розказав бабі, що так і так.',
 'І тут наперед приготуйтеся',
 'Він підійшов до дверей і постукав.',
 'Але я хочу їхати коло вас.',
 'Однак на чужих робила, то й на вас буду.',
 'Не завжди так складається, як сподівається.',
 'Взяти ноги на плечі.']

In [38]:
result_datasets['mozila_test']['transcription'][:10]

[[' Мі хотів стягти її з вагончику'],
 [" Відтоді є встих об'їхати увесь світ."],
 [' Пробачте, і тут лериків.'],
 [' Приїхав дідрозказав бабі, що так і так.'],
 [' І тут наперед, приготуйтеся.'],
 [' Ін підійшов до дверей і пустуков.'],
 [' Але я хочу їхати коло вас.'],
 [' Однак не чужих робила, та і на вас буду.'],
 [' Не завжди так складається, як сподівається.'],
 [' Взяти ноги на плечі.']]

In [35]:
def _as_text(transcription):
    """Return a transcription as a plain string.

    The benchmark stores pipeline output as a one-element list; unwrap it so string
    operations (prompt stripping, normalisation, wandb logging) see the text itself.
    """
    if isinstance(transcription, (list, tuple)):
        return transcription[0] if transcription else ""
    return transcription


stats_dataset = DatasetDict()

all_stats = {"rtf": 0, "wer": 0}
rtf_stats = {
    "times_audio_total": 0,
    "times_transcription_total": 0,
}

logger.info("***** Running Evaluation *****")
for key, value in generation_arguments.items():
    logger.info(f"  {key}: {value}")

datasets_evaluated_progress_bar = tqdm(result_datasets, desc="Datasets", position=0)
for split in datasets_evaluated_progress_bar:
    transcriptions = []
    references = []
    stats = {}
    times_audio_total = 0
    times_transcription_total = 0

    datasets_evaluated_progress_bar.write(f"Start benchmarking {split}...")
    # Iterate the split directly so the inner progress bar knows its total.
    for result in tqdm(result_datasets[split], desc="Samples", position=1):
        times_audio_total += result["length_in_s"]
        times_transcription_total += result["time"][0]

        transcription = _as_text(result["transcription"])
        # ensure prompt is removed from the transcription (awaiting fix in Transformers)
        if data_args.prompt_text is not None:
            transcription = transcription.replace(data_args.prompt_text, "")
        transcriptions.append(transcription)
        references.append(result["reference"])

    norm_transcriptions = [normalizer(pred) for pred in transcriptions]
    norm_references = [normalizer(label) for label in references]

    # Only score samples whose normalized reference is non-empty; the same mask is applied
    # to all four parallel lists so they stay aligned.
    keep = [len(norm_ref) > 0 for norm_ref in norm_references]
    transcriptions = [pred for pred, is_kept in zip(transcriptions, keep) if is_kept]
    references = [ref for ref, is_kept in zip(references, keep) if is_kept]
    norm_transcriptions = [pred for pred, is_kept in zip(norm_transcriptions, keep) if is_kept]
    norm_references = [ref for ref, is_kept in zip(norm_references, keep) if is_kept]

    stats["wer"] = compute_metrics(norm_transcriptions, norm_references)

    wer_per_sample = [
        compute_metrics([pred], [ref]) for pred, ref in zip(norm_transcriptions, norm_references)
    ]

    # Seconds of audio processed per second of model time (see the note on "rtf" below).
    stats["rtf"] = times_audio_total / times_transcription_total
    stats_dataset[split] = stats

    wer_desc = " ".join([f"Eval {key}: {value} |" for key, value in stats.items()])
    datasets_evaluated_progress_bar.write(wer_desc)

    write_wandb_metric(wandb_logger, stats, prefix=split)

    if data_args.log_predictions:
        write_wandb_pred(
            wandb_logger,
            transcriptions,
            references,
            norm_transcriptions,
            norm_references,
            wer_per_sample,
            prefix=split,
        )

    rtf_stats["times_audio_total"] += times_audio_total
    rtf_stats["times_transcription_total"] += times_transcription_total
    all_stats["wer"] += stats["wer"]

# Macro-average of the per-split WERs.
all_stats["wer"] = all_stats["wer"] / len(result_datasets)
# technically this is the reciprocal of the RTF (audio time / compute time, higher is faster),
# but it makes the scale easier to read on wandb
all_stats["rtf"] = rtf_stats["times_audio_total"] / rtf_stats["times_transcription_total"]

stats_dataset["all"] = all_stats

write_wandb_metric(wandb_logger, all_stats, prefix="all")

# Upload one JSON file of statistics per split (plus "all") as a wandb artifact. The upload
# happens inside the `with` block because the files vanish when the directory is cleaned up.
benchmark_artifact = wandb.Artifact("Benchmark", type="datasets")
with tempfile.TemporaryDirectory() as temp_dir:
    for split in stats_dataset:
        # Split names may contain "/", which is not valid inside a file name.
        file_name = os.path.join(temp_dir, f"{'_'.join(split.split('/'))}.json")

        with open(file_name, "w") as json_file:
            json.dump(stats_dataset[split], json_file)

        benchmark_artifact.add_file(file_name, split)

    wandb_logger.log_artifact(benchmark_artifact)

05/10/2024 11:20:14 - INFO - __main__ - ***** Running Evaluation *****
05/10/2024 11:20:14 - INFO - __main__ -   torch_version: 2.2.2
05/10/2024 11:20:14 - INFO - __main__ -   transformers_version: 4.40.1
05/10/2024 11:20:14 - INFO - __main__ -   attn_implementation: sdpa
05/10/2024 11:20:14 - INFO - __main__ -   model_name_or_path: ./models/distiled_student_model_moz_v1
05/10/2024 11:20:14 - INFO - __main__ -   subfolder: 
05/10/2024 11:20:14 - INFO - __main__ -   assistant_model_name_or_path: None
05/10/2024 11:20:14 - INFO - __main__ -   seed: 42
05/10/2024 11:20:14 - INFO - __main__ -   batch_size: 1
05/10/2024 11:20:14 - INFO - __main__ -   num_beams: 1
05/10/2024 11:20:14 - INFO - __main__ -   return_timestamps: True
05/10/2024 11:20:14 - INFO - __main__ -   condition_on_prev_tokens: False
05/10/2024 11:20:14 - INFO - __main__ -   temperature_fallback: True
05/10/2024 11:20:14 - INFO - __main__ -   logprob_threshold: -1.0
05/10/2024 11:20:14 - INFO - __main__ -   no_speech_thresh

Datasets:   0%|          | 0/1 [00:00<?, ?it/s]

Start benchmarking mozila_test...


Samples: 100it [00:00, 14288.21it/s]
Datasets:   0%|          | 0/1 [00:00<?, ?it/s]

Eval wer: 31.728045325779036 | Eval rtf: 35.65086328417897 |


Datasets: 100%|██████████| 1/1 [00:01<00:00,  1.15s/it]


In [33]:
# Eval wer: 31.728045325779036 | Eval rtf: 34.67377667224213 |

In [18]:
model

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1024, 1024, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1024)
      (layers): ModuleList(
        (0-23): 24 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias

In [18]:
# Start benchmarking mozila_test...
# Samples: 9791it [00:00, 17022.25it/s]
# Datasets:   0%|          | 0/1 [01:49<?, ?it/s]
# Eval wer: 29.481835266854127 | Eval rtf: 43.302215022844024 |
# Datasets: 100%|██████████| 1/1 [01:51<00:00, 111.21s/it]