### Валидация WER на данных LibriSpeech ASR (test-clean)

Перед запуском
```powershell
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
pip install transformers datasets jiwer psutil
```

In [None]:
import os
import tarfile
import io
import time
import psutil
import torch
import torchaudio
import numpy as np
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from jiwer import wer
from jiwer.transforms import Compose, RemovePunctuation, ToLowerCase, ReduceToListOfListOfWords

In [None]:
# Release unoccupied cached memory held by PyTorch's CUDA caching allocator.
torch.cuda.empty_cache()

CUDA

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

torch dtype: 
* torch.float32, 
* torch.float16, 
* torch.int8

In [None]:
# Precision used when loading models and casting input features in the cells
# below; change it here to switch between the FP16 and FP32 runs.
torch_dtype = torch.float16

#### Результаты

Machine config:

**CUDA 12.4, GPU: AD104 12GB VRAM, CPU: 12/24 4.4**

Model conf | WER | WER-time | Latency | VRAM | CPU |
--- |:---:|:---:|:---:|:---:|:---:|
Baseline FP32 (base model, torch.float32)|0.031|12 min 51 sec|x1.00|3.258 GB| 69% |
Baseline FP16 (base model, torch.float16)|0.031|9 min 12 sec|x0.72|1.597 GB| 71% |
**Quantized Int8** (torch, loaded weights dequantize to fp16)|0.030|8 min 59 sec|x0.70|1.597 GB| 79% |
ONNX FP16 (ORTModel (inference only))|0.031|25 min 5 sec|x1.95|73.36 MB| 36% |
ONNX QUInt8 (ORTModel + ORTQuantizer + AutoQuantization)|0.032|131 min 04 sec|x10.2|71.32 MB| 95% |
ONNX Int8 (Sherpa ONNX)|0.033|149 min 03 sec|x11.6|0.00 MB| 42% |
ONNX FP16 (Sherpa ONNX)|0.033|203 min 01 sec|x15.8|0.00 MB| 44% |
Faster-Whisper FP16 (Base Inference)|0.145|14 min 26 sec|x1.12|~ 1.900 GB| 7% |
Faster-Whisper Int8 (Base Inference)|0.114|13 min 57 sec|x1.09|~ 1.200 GB| 7% |
Faster-Whisper FP16 (Batched Inference)|0.039|12 min 50 sec|x1.00|~ 1.800 GB| 8% |
Faster-Whisper Int8 (Batched Inference)|0.039|12 min 19 sec|x0.96|~ 1.000 GB| 7% |


Дополнительные конфигурации, которые запустить не удалось (но по карточке модели они могут улучшить показатели):
* Использование torch.compile для ускорения в 4,5 раза (не удалось протестировать в Jupyter из-за ошибки синхронизации логирования; это распространённая ошибка в issues модели, решения от разработчиков пока нет);
* Использование Flash Attention 2 (сборка на Windows с учётом рекомендуемых параметров среды занимает более 6 часов, для запуска рекомендуется WSL или Linux); кроме того, в версиях torch 2.1.1+ для оптимизации используется Scaled Dot-Product Attention (SDPA).

#### WER

WER transformation (lowercase + remove punctuation + convert to str words list)

In [None]:
# Text normalisation applied to both references and hypotheses before scoring:
# remove punctuation, lowercase, then reduce each sentence to a list of words.
wer_transform = Compose([RemovePunctuation(), ToLowerCase(), ReduceToListOfListOfWords()])

In [None]:
# Archive with the LibriSpeech test-clean split (read directly, not unpacked).
tar_path = "test-clean.tar.gz"

# Parallel lists: ground-truth transcripts and model outputs, one entry per utterance.
references = []
hypotheses = []

# Accumulators filled by the evaluation loops and read by the summary cell.
# NOTE: they are not reset between runs; re-run this cell before evaluating
# another model, otherwise the results of several runs are mixed together.
total_cpu_time = 0.0
total_wall_time = 0.0
vram_usages = []

Обработка test-clean, может использоваться для:
* baseline
* quantized
* optimum
    <details><summary>другие</summary>
    обработка описана в разделе моделей
    </details>

In [None]:
# Evaluation loop for the Hugging Face model.
#
# Reads LibriSpeech test-clean directly from the tar archive, transcribes every
# FLAC utterance with `model.generate`, and accumulates into the notebook-level
# `references`, `hypotheses`, `total_cpu_time`, `total_wall_time` and
# `vram_usages`, which the summary cell reads afterwards.
# Requires `model` and `processor` to be defined by one of the model sections.

TARGET_SAMPLE_RATE = 16000

# Loop-invariant objects are created once instead of once per utterance.
forced_decoder_ids = processor.get_decoder_prompt_ids(language="en", task="transcribe")
process = psutil.Process()
use_cuda = device.startswith('cuda')
resamplers = {}  # source sample rate -> Resample transform

with tarfile.open(tar_path, "r:gz") as tar:
    members = tar.getmembers()

    # Collect all transcripts: every "*.trans.txt" line is "<utterance-id> <text>".
    transcriptions = {}
    for member in members:
        if not member.name.endswith(".trans.txt"):
            continue
        content = tar.extractfile(member).read().decode("utf-8")
        for line in content.splitlines():
            line = line.strip()
            if line:
                utterance_id, text = line.split(" ", 1)
                transcriptions[utterance_id] = text

    # Process the FLAC recordings.
    for member in members:
        if not member.name.endswith(".flac"):
            continue

        utterance_id = os.path.splitext(os.path.basename(member.name))[0]
        if utterance_id not in transcriptions:
            print(f"Skipping {utterance_id}: No transcript found.")
            continue

        audio_io = io.BytesIO(tar.extractfile(member).read())
        waveform, sample_rate = torchaudio.load(audio_io)

        # Down-mix to mono and resample to 16 kHz when needed.
        if waveform.size(0) > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        if sample_rate != TARGET_SAMPLE_RATE:
            if sample_rate not in resamplers:
                resamplers[sample_rate] = torchaudio.transforms.Resample(
                    orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE
                )
            waveform = resamplers[sample_rate](waveform)
        audio_array = waveform.squeeze().numpy().astype(np.float32)

        inputs = processor(
            audio_array, sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt"
        ).input_features.to(device, dtype=torch_dtype)

        if use_cuda:
            # Reset the peak-memory counter and drain pending GPU work so that
            # the timers below cover only this utterance's generation.
            torch.cuda.reset_peak_memory_stats()
            torch.cuda.synchronize()

        # CPU and wall time before generation (a single cpu_times() snapshot
        # keeps the user and system readings consistent with each other).
        cpu_before = process.cpu_times()
        start_cpu = cpu_before.user + cpu_before.system
        start_wall = time.perf_counter()

        with torch.no_grad():
            predicted_ids = model.generate(
                inputs,
                forced_decoder_ids=forced_decoder_ids,
                max_length=448
            )

        if use_cuda:
            # CUDA kernels run asynchronously; wait for them before stopping the clock.
            torch.cuda.synchronize()

        # CPU and wall time after generation.
        end_wall = time.perf_counter()
        cpu_after = process.cpu_times()
        end_cpu = cpu_after.user + cpu_after.system

        # Accumulate CPU and wall time.
        current_cpu = end_cpu - start_cpu
        current_wall = end_wall - start_wall
        total_cpu_time += current_cpu
        total_wall_time += current_wall

        # Peak VRAM allocated by PyTorch for this utterance.
        if use_cuda:
            current_vram = torch.cuda.max_memory_allocated()
            vram_usages.append(current_vram)

        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        references.append(transcriptions[utterance_id])
        hypotheses.append(transcription)
        print(f"Processed: {utterance_id}")

Расчет WER и использования ресурсов

In [None]:
# Summary cell: prints WER, the mean of the per-utterance peak VRAM readings and
# the average CPU utilisation accumulated by the evaluation loop that ran last.
if references and hypotheses:
    wer_score = wer(
        references,
        hypotheses,
        truth_transform=wer_transform,
        hypothesis_transform=wer_transform
    )
    print(f"Word Error Rate (WER): {wer_score:.3f}")

    # VRAM usage: mean of the per-utterance peak allocations, in MiB.
    if device.startswith('cuda') and vram_usages:
        average_vram = sum(vram_usages) / len(vram_usages)
        average_vram_mb = average_vram / (1024 ** 2)
        print(f"VRAM usage during inference: {average_vram_mb:.2f} MB")
    elif not device.startswith('cuda'):
        print("CPU Inference")

    # CPU usage: process CPU time divided by wall time is the average number of
    # busy cores. Scaling by 100 and dividing by the logical core count gives a
    # percentage of the whole machine (0-100 %). The previous factor of 10 was
    # not a percentage under any normalisation.
    if total_wall_time > 0:
        logical_cpus = psutil.cpu_count() or 1  # cpu_count() may return None
        average_cpu_usage_percent = (total_cpu_time / total_wall_time) * 100 / logical_cpus
        print(f"CPU usage during inference: {average_cpu_usage_percent:.2f}%")
    else:
        print("No CPU data")

else:
    print("Распознаваний не обнаружено.")

---

#### Baseline

In [None]:
# Checkpoint evaluated in the baseline section and its processor, which
# provides the feature extractor and tokenizer used by the cells below.
model_id = "openai/whisper-large-v3-turbo"
processor = AutoProcessor.from_pretrained(model_id)
# NOTE(review): batch_size is not read by the baseline cells visible here - confirm it is still needed.
batch_size = 8

##### FP16/FP32 CUDA

Для выбора FP16 или FP32 необходимо задать соответствующее значение `torch_dtype` в 4-й ячейке

In [None]:
# Load the Whisper checkpoint in the selected precision and move it to the target device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# NOTE(review): the same processor is already created in the previous cell;
# this reload looks redundant unless the cell is run on its own.
processor = AutoProcessor.from_pretrained(model_id)

# ASR pipeline built from the same model, tokenizer and feature extractor.
# NOTE(review): the test-clean evaluation loop calls model.generate directly,
# so `pipe` is not used there - confirm whether it is still needed.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
    return_timestamps=True
)

Проверка dtype

In [None]:
# Sanity check: dtype of the first model parameter, to confirm the requested precision was applied.
print(f"Model parameter dtype: {next(model.parameters()).dtype}")

In [None]:
# Sanity check: dtype reported by the pipeline object.
print(f"Pipe parameter dtype: {pipe.torch_dtype}")

Data processing and WER calc

In [None]:
from html import escape

from IPython.display import HTML, display

# JavaScript that finds the first notebook cell tagged "target_cell" and runs it.
# NOTE(review): relies on the global `Jupyter.notebook` object; presumably this
# only exists in the classic Notebook front end - confirm before using elsewhere.
js_code = """
var cells = Jupyter.notebook.get_cells();
for (var i = 0; i < cells.length; i++) {
    if (cells[i].metadata.tags && cells[i].metadata.tags.includes("target_cell")) {
        cells[i].execute();
        break;
    }
}
"""
# The script contains double quotes, '<' and '&&', so it must be HTML-escaped
# before being placed inside the double-quoted onclick attribute; otherwise the
# attribute ends at the first '"' and the handler is broken.
# `return false;` stops the "#" link from scrolling the page.
html = f'<a href="#" onclick="{escape(js_code, quote=True)}return false;">Click to Run Target Cell</a>'
display(HTML(html))

----

#### Torch compile

In [None]:
import torch
from torch.nn.attention import SDPBackend, sdpa_kernel
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset
from tqdm import tqdm

# Set PyTorch's float32 matmul precision mode to "high" for the torch.compile experiment below.
torch.set_float32_matmul_precision("high")

In [None]:
model_id = "openai/whisper-large-v3-turbo"

# Load the model in the globally selected precision and move it to the target device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
).to(device)

# Enable static cache and compile the forward pass
model.generation_config.cache_implementation = "static"
model.generation_config.max_new_tokens = 256
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)

processor = AutoProcessor.from_pretrained(model_id)

# ASR pipeline around the model whose forward pass was wrapped by torch.compile above.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
    return_timestamps=True
)

In [None]:
import torch._dynamo
# Ask TorchDynamo to suppress compilation errors instead of raising them
# (presumably falling back to eager execution - confirm against the PyTorch docs).
torch._dynamo.config.suppress_errors = True

In [None]:
# Take the first sample of the long-form LibriSpeech validation set as test audio.
dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]

# Warm-up is currently disabled; kept for reference.
# # 2 warmup steps
# for _ in tqdm(range(2), desc="Warm-up step"):
#     with sdpa_kernel(SDPBackend.MATH):
#         result = pipe(sample.copy(), generate_kwargs={"min_new_tokens": 256, "max_new_tokens": 256})

# fast run: a single pipeline call with the MATH SDPA backend selected
with sdpa_kernel(SDPBackend.MATH):
    result = pipe(sample.copy())

print(result["text"])

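Ошибка возникает здесь: запуск ячейки выше завершается ошибкой (см. примечание о torch.compile в разделе «Результаты»).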

----

#### Self-Quantized Int8

Квантизация модели описана в файле quantization.ipynb

In [None]:
import torch
from transformers import AutoProcessor, pipeline, AutoModelForSpeechSeq2Seq
from datasets import load_dataset

In [None]:
class QuantizedWhisperForSpeechSeq2Seq(AutoModelForSpeechSeq2Seq):
    """Loader that overwrites a base model's weights with de-quantized int8 weights."""

    @classmethod
    def from_pretrained_quantized(cls, pretrained_model_name_or_path, quantized_checkpoint_path):
        """
        Load the base model via from_pretrained() and overwrite its weights
        with de-quantized weights from a checkpoint.

        The checkpoint is a dict with the keys:
          - "state_dict": maps parameter names to int8 tensors
          - "scales": maps parameter names to their scale factor.

        Args:
            pretrained_model_name_or_path: Hub id or local path of the base model.
            quantized_checkpoint_path: path to the quantized checkpoint file.

        Returns:
            The base model with every checkpoint parameter replaced by its
            de-quantized value. The model is loaded with the notebook-level
            `torch_dtype`; it is not moved to any device here.
        """
        # NOTE(security): torch.load unpickles the file, so only load checkpoints
        # from a trusted source (e.g. produced by quantization.ipynb).
        checkpoint = torch.load(quantized_checkpoint_path, map_location="cpu")
        quantized_state_dict = checkpoint["state_dict"]
        scales = checkpoint["scales"]

        # Original model from the Hugging Face Hub, in the globally selected precision.
        model = cls.from_pretrained(
            pretrained_model_name_or_path,
            torch_dtype=torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        )

        # De-quantize in the dtype the model actually uses rather than a
        # hard-coded float16, so an FP32 run does not lose precision by
        # passing through half precision first.
        target_dtype = next(model.parameters()).dtype

        # New state dict: int8 tensor -> float tensor multiplied by its scale
        # factor; entries without a scale are copied through unchanged.
        new_state_dict = {
            name: param.to(target_dtype) * scales[name] if name in scales else param
            for name, param in quantized_state_dict.items()
        }

        # strict=False tolerates missing or mismatched keys, but report them so
        # a partially loaded model does not go unnoticed.
        load_result = model.load_state_dict(new_state_dict, strict=False)
        if load_result.missing_keys or load_result.unexpected_keys:
            print(
                f"Quantized checkpoint mismatch: "
                f"{len(load_result.missing_keys)} missing, "
                f"{len(load_result.unexpected_keys)} unexpected keys"
            )
        return model

In [None]:
# Base checkpoint and the int8 checkpoint file loaded on top of it.
model_id = "openai/whisper-large-v3-turbo"
quantized_checkpoint_path = "whisper_large_v3_turbo_int8.pth"

In [None]:
# Build the base model and overwrite its weights with the de-quantized checkpoint.
quantized_model = QuantizedWhisperForSpeechSeq2Seq.from_pretrained_quantized(
    model_id, quantized_checkpoint_path
)
# quantized_model.eval()

In [None]:
# Processor (feature extractor + tokenizer) of the base checkpoint.
processor = AutoProcessor.from_pretrained(model_id)
# NOTE(review): batch_size is not read by the quantized-model cells visible here - confirm it is still needed.
batch_size = 8

In [None]:
# ASR pipeline around the de-quantized model.
pipe = pipeline(
    "automatic-speech-recognition",
    model=quantized_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
    return_timestamps=True
)

In [None]:
# Sanity check: dtype of the first model parameter and the dtype reported by the pipeline.
print(f"Model parameter dtype: {next(quantized_model.parameters()).dtype}", f"Pipeline dtype: {pipe.torch_dtype}")

In [None]:
# Expose the quantized model under the name `model`, which the test-clean
# evaluation loop reads, and move it to the target device.
model = quantized_model
model.to(device)

---

#### Self-ONNX with Optimum

Конвертация модели в onnx описана в файле onnx.ipynb

Перед использованием - рекомендация для CUDAExecutionProvider
```powershell
pip uninstall onnx
pip uninstall onnxruntime
pip install optimum[onnxruntime-gpu]
```

In [None]:
# Base checkpoint and its processor for the ONNX experiments.
model_id = "openai/whisper-large-v3-turbo"
processor = AutoProcessor.from_pretrained(model_id)
# NOTE(review): batch_size is not read by the ONNX cells visible here - confirm it is still needed.
batch_size = 8

In [None]:
# Local directories with the exported ONNX variants (export is described in onnx.ipynb).
onnx_v1 = 'onnxf/whisper_lv3t_onnx_v1' #QUInt8
onnx_v2 = 'onnxf/whisper_lv3t_onnx_v2' #FP16
onnx_v3 = 'onnxf/whisper_lv3t_onnx_v3' #FP16 optimized

In [None]:
from optimum.onnxruntime import ORTModelForSpeechSeq2Seq
from transformers import pipeline, AutoTokenizer, AutoFeatureExtractor

In [None]:
def onnxmodel(path, provider=None):
    """Load an exported ONNX Whisper model together with its tokenizer and feature extractor.

    Args:
        path: directory containing the exported ONNX model and its
            tokenizer / feature-extractor files.
        provider: optional ONNX Runtime execution provider name, e.g.
            "CUDAExecutionProvider". When None (the default) the argument is
            not passed at all and the library default is used, exactly as
            before this parameter existed.

    Returns:
        A ``(model, tokenizer, feature_extractor)`` tuple.
    """
    # Only forward `provider` when it was requested explicitly, so the default
    # call is identical to the original one.
    model_kwargs = {} if provider is None else {"provider": provider}
    model = ORTModelForSpeechSeq2Seq.from_pretrained(path, **model_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(path)
    feature_extractor = AutoFeatureExtractor.from_pretrained(path)
    return model, tokenizer, feature_extractor

In [None]:
# Load the QUInt8 variant; pass onnx_v2 / onnx_v3 instead to evaluate the FP16 exports.
model, tokenizer, feature_extractor = onnxmodel(onnx_v1)

In [None]:
# ASR pipeline around the ONNX model; no device or dtype is passed here.
pipe = pipeline("automatic-speech-recognition", model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)

In [None]:
# Sanity check: dtype reported by the pipeline object.
print(f"Pipeline dtype: {pipe.torch_dtype}")

---

#### ONNX with Sherpa-onnx

Перед использованием:
* посмотреть версии [моделей](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/export-onnx.html#available-models);
* склонировать репозиторий выбранной модели;
* установить инструменты с помощью

    ```powershell
    pip install torch openai-whisper onnxruntime onnx librosa soundfile

    git clone https://github.com/k2-fsa/sherpa-onnx/
    cd sherpa-onnx/scripts/whisper
    ```

* и далее конвертация в onnx, либо тестирование с помощью

    ```powershell
    python3 ./export-onnx.py --model modelName
    ```

* или

    ```powershell
    python3 ./test.py \
    --encoder ./modelName-encoder.onnx \
    --decoder ./modelName-decoder.onnx \
    --tokens ./modelName-tokens.txt \
    ./path-to-audio.wav
    ```

Оригинальные [файлы](https://github.com/k2-fsa/sherpa-onnx/tree/master/scripts/whisper).

Скрипт запуска модели из консоли (powershell tested)

In [None]:
!python ./sherpa_onnx_test.py --encoder ../sherpa-onnx-whisper-turbo/turbo-encoder.onnx --decoder ../sherpa-onnx-whisper-turbo/turbo-decoder.onnx --tokens ../sherpa-onnx-whisper-turbo/turbo-tokens.txt ./test_wavs-0.wav

Эмуляция запуска из консоли через подмену `sys.argv`

In [None]:
# Audio file used for the single-file Sherpa-ONNX smoke test.
test = './test_wavs-0.wav'

In [None]:
import sys

# Emulate a command-line invocation: the argument list is written to sys.argv
# before sherpa_onnx_test.main() is called in the next cell.
sys.argv = [
    'test.py',
    '--encoder', '../sherpa-onnx-whisper-turbo/turbo-encoder.onnx',
    '--decoder', '../sherpa-onnx-whisper-turbo/turbo-decoder.onnx',
    '--tokens', '../sherpa-onnx-whisper-turbo/turbo-tokens.txt',
    test
]

In [None]:
import sherpa_onnx_test
# Run the Sherpa-ONNX script entry point with the emulated arguments and keep what it returns.
result_list = sherpa_onnx_test.main()

In [None]:
# Show the value returned by sherpa_onnx_test.main().
print(str(result_list))

Эксперименты с тестированием модели:
* рекомендация к использованию - распаковать архив (не удаляя tar.gz) и запустить

In [None]:
import sherpa_onnx_test
import sys

In [None]:
# Evaluation loop for the Sherpa-ONNX int8 model.
#
# Transcripts are read from the tar archive, but the audio is read by
# sherpa_onnx_test from the unpacked 'test-clean/' directory, so the archive
# must be extracted next to this notebook first. Results are accumulated into
# the notebook-level references / hypotheses / timing variables.

# Optional cap on the number of utterances, e.g. 10 for a quick smoke run.
# None processes the whole split.
max_utterances = None

process = psutil.Process()
use_cuda = device.startswith('cuda')

k = 0  # number of utterances processed so far
with tarfile.open(tar_path, "r:gz") as tar:
    members = tar.getmembers()

    # Collect all transcripts: every "*.trans.txt" line is "<utterance-id> <text>".
    transcriptions = {}
    for member in members:
        if not member.name.endswith(".trans.txt"):
            continue
        content = tar.extractfile(member).read().decode("utf-8")
        for line in content.splitlines():
            line = line.strip()
            if line:
                utterance_id, text = line.split(" ", 1)
                transcriptions[utterance_id] = text

    # Process the FLAC recordings.
    for member in members:
        if max_utterances is not None and k >= max_utterances:
            break
        if not member.name.endswith(".flac"):
            continue

        utterance_id = os.path.splitext(os.path.basename(member.name))[0]
        if utterance_id not in transcriptions:
            print(f"Skipping {utterance_id}: No transcript found.")
            continue

        # Emulate the command line expected by the Sherpa-ONNX test script.
        filename = f'test-clean/{member.name}'
        sys.argv = [
            'test.py',
            '--encoder', '../sherpa-onnx-whisper-turbo/turbo-encoder.int8.onnx',
            '--decoder', '../sherpa-onnx-whisper-turbo/turbo-decoder.int8.onnx',
            '--tokens', '../sherpa-onnx-whisper-turbo/turbo-tokens.txt',
            filename
        ]

        # Reset the CUDA peak-memory counter before inference.
        if use_cuda:
            torch.cuda.reset_peak_memory_stats()

        # CPU and wall time before inference.
        cpu_before = process.cpu_times()
        start_cpu = cpu_before.user + cpu_before.system
        start_wall = time.perf_counter()

        # The inference call must sit inside the timed region; previously the
        # timers were stopped before it ran, so nothing was measured.
        transcription = str(sherpa_onnx_test.main())

        # CPU and wall time after inference.
        end_wall = time.perf_counter()
        cpu_after = process.cpu_times()
        end_cpu = cpu_after.user + cpu_after.system

        # Accumulate CPU and wall time.
        current_cpu = end_cpu - start_cpu
        current_wall = end_wall - start_wall
        total_cpu_time += current_cpu
        total_wall_time += current_wall

        # VRAM usage.
        # NOTE(review): this counter presumably covers only memory allocated
        # through PyTorch, so ONNX Runtime allocations would not appear here
        # (consistent with the 0.00 MB rows in the results table) - confirm.
        if use_cuda:
            current_vram = torch.cuda.max_memory_allocated()
            vram_usages.append(current_vram)

        references.append(transcriptions[utterance_id])
        hypotheses.append(transcription)
        print(f"Processed: {utterance_id}")
        k += 1

---

#### Faster-Whisper

Перед запуском
```powershell
pip install faster-whisper
```

In [None]:
from faster_whisper import WhisperModel, BatchedInferencePipeline

In [None]:
# Faster-Whisper model name and the batch size for the (optional) batched pipeline.
model_size = "large-v3-turbo"
batch_size = 8

In [None]:
# Run on GPU with FP16
model = WhisperModel(model_size, device="cuda", compute_type="float16")

# or run on GPU with INT8
# model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
# or run on CPU with INT8
# model = WhisperModel(model_size, device="cpu", compute_type="int8")

Base Inference

In [None]:
# Single-file smoke test of the plain (non-batched) transcription with beam size 5.
segments, info = model.transcribe("test_wavs-0.wav", beam_size=5)

print("Detected language '%s' with probability %f" % (info.language, info.language_probability))

# Print the text of every returned segment.
for segment in segments:
    print((segment.text))

Batched Inference

In [None]:
# Single-file smoke test of the batched pipeline wrapped around the same model.
batched_model = BatchedInferencePipeline(model=model)
segments, info = batched_model.transcribe("test_wavs-0.wav", batch_size=16)

# Print the text of every returned segment.
for segment in segments:
    print((segment.text))

Эксперимент с тестированием модели

In [None]:
# Evaluation loop for Faster-Whisper.
#
# Reads LibriSpeech test-clean directly from the tar archive, transcribes every
# FLAC utterance and accumulates results into the notebook-level
# references / hypotheses / timing variables read by the summary cell.

TARGET_SAMPLE_RATE = 16000

process = psutil.Process()
use_cuda = device.startswith('cuda')
resamplers = {}  # source sample rate -> Resample transform

k = 0  # number of utterances processed so far
with tarfile.open(tar_path, "r:gz") as tar:
    members = tar.getmembers()

    # Collect all transcripts: every "*.trans.txt" line is "<utterance-id> <text>".
    transcriptions = {}
    for member in members:
        if not member.name.endswith(".trans.txt"):
            continue
        content = tar.extractfile(member).read().decode("utf-8")
        for line in content.splitlines():
            line = line.strip()
            if line:
                utterance_id, text = line.split(" ", 1)
                transcriptions[utterance_id] = text

    # Process the FLAC recordings.
    for member in members:
        # if k == 10: break
        if not member.name.endswith(".flac"):
            continue

        utterance_id = os.path.splitext(os.path.basename(member.name))[0]
        if utterance_id not in transcriptions:
            print(f"Skipping {utterance_id}: No transcript found.")
            continue

        audio_io = io.BytesIO(tar.extractfile(member).read())
        waveform, sample_rate = torchaudio.load(audio_io)

        # Down-mix to mono and resample to 16 kHz when needed.
        if waveform.size(0) > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        if sample_rate != TARGET_SAMPLE_RATE:
            if sample_rate not in resamplers:
                resamplers[sample_rate] = torchaudio.transforms.Resample(
                    orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE
                )
            waveform = resamplers[sample_rate](waveform)
        audio_array = waveform.squeeze().numpy().astype(np.float32)

        # Reset the CUDA peak-memory counter before inference.
        if use_cuda:
            torch.cuda.reset_peak_memory_stats()

        # CPU and wall time before inference.
        cpu_before = process.cpu_times()
        start_cpu = cpu_before.user + cpu_before.system
        start_wall = time.perf_counter()

        # Batched inference (alternative):
        # batched_model = BatchedInferencePipeline(model=model)
        # segments, info = batched_model.transcribe(audio_array, batch_size=batch_size)

        # Base inference.
        segments, info = model.transcribe(audio_array, beam_size=5)

        # `segments` is a lazy generator: decoding only happens while it is
        # iterated, so it has to be consumed inside the timed region.
        segments = list(segments)

        # CPU and wall time after inference.
        end_wall = time.perf_counter()
        cpu_after = process.cpu_times()
        end_cpu = cpu_after.user + cpu_after.system

        # Accumulate CPU and wall time.
        current_cpu = end_cpu - start_cpu
        current_wall = end_wall - start_wall
        total_cpu_time += current_cpu
        total_wall_time += current_wall

        # VRAM usage.
        # NOTE(review): this counter presumably covers only memory allocated
        # through PyTorch, so the Faster-Whisper backend's allocations may not
        # appear here - confirm before relying on the number.
        if use_cuda:
            current_vram = torch.cuda.max_memory_allocated()
            vram_usages.append(current_vram)

        # Join the text of all segments. Keeping only the last segment dropped
        # most of every multi-segment utterance and reused a stale value when
        # no segment was produced.
        transcription = "".join(segment.text for segment in segments).strip()

        references.append(transcriptions[utterance_id])
        hypotheses.append(transcription)
        print(f"Processed: {utterance_id}")
        k += 1

---