<a href="https://colab.research.google.com/github/EdwardFang09/IEE4912/blob/main/whisper_benchmark_(documentation).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1> Benchmark (no early stopping)

Suara direkam di lab ramai untuk simulasi.

In [1]:
#Library for quick-start on google colab
!pip install faster-whisper jiwer nvidia-ml-py3



In [10]:
import time
from faster_whisper import WhisperModel, BatchedInferencePipeline
import pandas as pd
import jiwer
import torch
from jiwer import transforms
import nvidia_smi

# Define audio file and its ground truth transcript
audio_file = "edwardmentimeter.m4a"  # Replace with your audio file
ground_truth_transcript = "sora, open mentimeter"  # Replace with the actual transcript

# Model sizes and compute types to benchmark
model_configs = [
    {"size": "tiny", "compute": "float32"},
    {"size": "tiny", "compute": "float16"},
    {"size": "tiny", "compute": "int8"},  # CPU INT8
    {"size": "tiny", "compute": "int8_float16"}, # GPU INT8
    {"size": "base", "compute": "float32"},
    {"size": "base", "compute": "float16"},
    {"size": "base", "compute": "int8_float16"}, # GPU INT8
    {"size": "small", "compute": "float32"},
    {"size": "small", "compute": "float16"},
    {"size": "small", "compute": "int8_float16"}, # GPU INT8
    {"size": "medium", "compute": "float32"},
    {"size": "medium", "compute": "float16"},
    {"size": "medium", "compute": "int8_float16"}, # GPU INT8
    {"size": "large-v2", "compute": "float32"},
    {"size": "large-v2", "compute": "float16"},
    {"size": "large-v2", "compute": "int8_float16"}, # GPU INT8
    {"size": "large-v3", "compute": "float32"},
    {"size": "large-v3", "compute": "float16"},
    {"size": "large-v3", "compute": "int8_float16"}, # GPU INT8
    {"size": "turbo", "compute": "float16"},  # Turbo only supports FP16
]

batch_sizes =  [8, 16, 32]# Experiment with batch sizes

results = []

for config in model_configs:
    for batch_size in batch_sizes:
        if config["size"] in ["base", "small", "medium", "large-v2", "large-v3"]:
            effective_batch_size = min(batch_size, 4)  # Reduce for larger models
        elif config["size"] == "turbo":
             effective_batch_size = min(batch_size, 8)
        else:
            effective_batch_size = batch_size

        start_time = time.time()

        try:
            device = "cuda" if config["compute"]!= "int8" else "cpu"
            model = WhisperModel(config["size"], device=device, compute_type=config["compute"])

            if device == "cuda":
                nvidia_smi.nvmlInit()
                handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
                # No memory info printed here

            if config["size"] == "turbo":
                batched_model = BatchedInferencePipeline(model=model)
                segments, info = batched_model.transcribe(audio_file, batch_size=effective_batch_size, language='en')
            else:
                segments, info = model.transcribe(audio_file, beam_size=5, language='en')

            segments_list = list(segments)

            transform = transforms.Compose([
                transforms.ToLowerCase(),
                transforms.RemovePunctuation(),
                transforms.RemoveMultipleSpaces(),
                transforms.Strip(),
            ])

            ground_truth_transformed = transform(ground_truth_transcript)
            predicted_transcript = " ".join([segment.text for segment in segments_list])
            predicted_transcript_transformed = transform(predicted_transcript)


            wer = jiwer.wer(ground_truth_transformed, predicted_transcript_transformed)

            end_time = time.time()
            inference_time = end_time - start_time

            if device == "cuda":
                infoo = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)

            results.append({
                "model_size": config["size"],
                "compute_type": config["compute"],
                "batch_size": effective_batch_size,
                "inference_time": inference_time,
                "language": info.language,
                "language_probability": info.language_probability,
                "num_segments": len(segments_list),
                "wer": wer,
                "predicted_transcript": predicted_transcript, #added predicted transcript
                "Used memory (GB)": infoo.used / (1024 ** 3),
            })

            print(f"Model: {config['size']}, Compute: {config['compute']}, Batch: {effective_batch_size}, Time: {inference_time:.2f}s, WER: {wer:.2f}, Text:, {predicted_transcript}, Used memory (GB): {infoo.used / (1024 ** 3)}") # added predicted transcript to the output

        except Exception as e:
            print(f"Error with Model: {config['size']}, Compute: {config['compute']}, Batch: {effective_batch_size}: {e}")
            results.append({
                "model_size": config["size"],
                "compute_type": config["compute"],
                "batch_size": effective_batch_size,
                "inference_time": "Error",
                "error": str(e),
                "wer": "Error",
                "predicted_transcript": "Error" # added predicted transcript in case of error
            })

        finally:
            del model
            if 'batched_model' in locals():
                del batched_model
            torch.cuda.empty_cache()

# Print or save results (e.g., to a CSV file)
df = pd.DataFrame(results)
df.to_csv("no_stop_whisper_benchmark.csv", index=False)
print(df)

Model: tiny, Compute: float32, Batch: 8, Time: 1.29s, WER: 3.00, Text:,  So, now we are going to connect the metter., Used memory (GB): 1.1522216796875
Model: tiny, Compute: float32, Batch: 16, Time: 1.41s, WER: 3.00, Text:,  So, now we are going to connect the metter., Used memory (GB): 0.6092529296875
Model: tiny, Compute: float32, Batch: 32, Time: 1.34s, WER: 3.00, Text:,  So, now we are going to connect the metter., Used memory (GB): 0.7030029296875
Model: tiny, Compute: float16, Batch: 8, Time: 1.64s, WER: 3.00, Text:,  So, now we are going to connect the metter., Used memory (GB): 0.6405029296875
Model: tiny, Compute: float16, Batch: 16, Time: 0.95s, WER: 3.00, Text:,  So, now we are going to connect the metter., Used memory (GB): 0.6092529296875
Model: tiny, Compute: float16, Batch: 32, Time: 1.23s, WER: 3.00, Text:,  So, now we are going to connect the metter., Used memory (GB): 0.5780029296875
Model: tiny, Compute: int8, Batch: 8, Time: 3.74s, WER: 3.00, Text:,  So, now we are

Kesimpulan: tidak perlu model besar. Model small bahkan sudah cukup untuk akurasi 100% dengan pemakaian memori ~1GB juga dengan waktu proses <2 detik

<h1 >with early stopping (all models)

In [11]:
import time
from faster_whisper import WhisperModel, BatchedInferencePipeline
import pandas as pd
import jiwer
import nvidia_smi
from jiwer import transforms
import os
import torch

# Define audio files and their ground truth transcripts (dictionary)
audio_ground_truth = {
    "edwardchrome.m4a": "hey sora, open chrome.",
    "edwardkahoot.m4a": "hey sora, open kahoot.",
    #... more audio files and transcripts
}

# Model sizes and compute types to benchmark
model_configs = [
    {"size": "tiny", "compute": "float32"},
    {"size": "tiny", "compute": "float16"},
    {"size": "tiny", "compute": "int8"},  # CPU INT8
    {"size": "base", "compute": "float32"},
    {"size": "base", "compute": "float16"},
    {"size": "small", "compute": "float32"},
    {"size": "small", "compute": "float16"},
    {"size": "medium", "compute": "float32"},
    {"size": "medium", "compute": "float16"},
    {"size": "turbo", "compute": "float16"},  # Turbo only supports FP16
    {"size": "large-v2", "compute": "float32"},
    {"size": "large-v2", "compute": "float16"},
    {"size": "large-v3", "compute": "float32"},
    {"size": "large-v3", "compute": "float16"},
]

results = []

print(f"CUDA available: {torch.cuda.is_available()}")

i = 0
for audio_file, ground_truth_transcript in audio_ground_truth.items():
    for config in model_configs:
        start_time = time.time()

        try:
            device = "cuda" if config["compute"]!= "int8" else "cpu"

            if device == "cuda":
                nvidia_smi.nvmlInit()
                handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
                # No memory info printed here

            model = WhisperModel(config["size"], device=device, compute_type=config["compute"])

            if config["size"] == "turbo":
                batched_model = BatchedInferencePipeline(model=model)
                segments, info = batched_model.transcribe(audio_file, language='en')
            else:
                segments, info = model.transcribe(audio_file, beam_size=5, language='en')

            segments_list = list(segments)

            transform = transforms.Compose([
                transforms.ToLowerCase(),
                transforms.RemovePunctuation(),
                transforms.RemoveMultipleSpaces(),
                transforms.Strip(),
            ])

            ground_truth_transformed = transform(ground_truth_transcript)
            predicted_transcript = " ".join([segment.text for segment in segments_list])
            predicted_transcript_transformed = transform(predicted_transcript)

            wer = jiwer.wer(ground_truth_transformed, predicted_transcript_transformed)

            end_time = time.time()
            inference_time = end_time - start_time

            if device == "cuda":
                infoo = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)

            results.append({
                "audio_file": audio_file,
                "model_size": config["size"],
                "compute_type": config["compute"],
                "inference_time": inference_time,
                "language": info.language,
                "language_probability": info.language_probability,
                "num_segments": len(segments_list),
                "wer": wer,
                "predicted_transcript": predicted_transcript,
                "Used memory (GB)": infoo.used / (1024 ** 3)
            })

            print(
                f"Audio: {audio_file}, Model: {config['size']}, Compute: {config['compute']}, Time: {inference_time:.2f}s, WER: {wer:.2f}, Text:, {predicted_transcript}, Used memory (GB): {infoo.used / (1024 ** 3)}"
            )

            if wer == 0:
                if i == 0:
                  i += 1 #add one more iteration for safety measure
                else:
                  print(f"Skipping remaining models for this audio")
                  break

        except Exception as e:
            print(
                f"Error with Audio: {audio_file}, Model: {config['size']}, Compute: {config['compute']}: {e}"
            )
            results.append({
                "audio_file": audio_file,
                "model_size": config["size"],
                "compute_type": config["compute"],
                "inference_time": "Error",
                "error": str(e),
                "wer": "Error",
                "predicted_transcript": "Error",
            })

        finally:
            del model
            if config["size"] == "turbo":
                del batched_model
            torch.cuda.empty_cache()

# Print or save results (e.g., to a CSV file)
df = pd.DataFrame(results)
df.to_csv("whisper_benchmark_results.csv", index=False)
print(df)

CUDA available: True
Audio: edwardchrome.m4a, Model: tiny, Compute: float32, Time: 1.57s, WER: 0.00, Text:,  Hey Sora, open Chrome., Used memory (GB): 0.6092529296875
Audio: edwardchrome.m4a, Model: tiny, Compute: float16, Time: 0.90s, WER: 0.00, Text:,  Hey Sora, open Chrome., Used memory (GB): 0.5780029296875
Skipping remaining models for this audio
Audio: edwardkahoot.m4a, Model: tiny, Compute: float32, Time: 0.80s, WER: 0.50, Text:,  Hey Sora, open the hood., Used memory (GB): 0.6405029296875
Audio: edwardkahoot.m4a, Model: tiny, Compute: float16, Time: 0.66s, WER: 0.50, Text:,  Hey Sora, open the hood., Used memory (GB): 0.6405029296875
Audio: edwardkahoot.m4a, Model: tiny, Compute: int8, Time: 2.36s, WER: 0.50, Text:,  Hey Sora, open the hood., Used memory (GB): 0.6405029296875
Audio: edwardkahoot.m4a, Model: base, Compute: float32, Time: 2.66s, WER: 0.25, Text:,  Hey Sora, open Kakut., Used memory (GB): 0.7342529296875
Audio: edwardkahoot.m4a, Model: base, Compute: float16, Time

<h1> turbo model (special model - optimized large)

In [12]:
import time
from faster_whisper import WhisperModel, BatchedInferencePipeline
import pandas as pd
import jiwer
import nvidia_smi
from jiwer import transforms
import os
import torch

# Define audio files and their ground truth transcripts (dictionary)
audio_ground_truth = {
    "edwardchrome.m4a": "hey sora, open chrome.",
    "edwardkahoot.m4a": "hey sora, open kahoot.",
    #... more audio files and transcripts
}

# Model sizes and compute types to benchmark
model_configs = [
    {"size": "turbo", "compute": "float16"},  # Turbo only supports FP16
]

results = []

print(f"CUDA available: {torch.cuda.is_available()}")

for audio_file, ground_truth_transcript in audio_ground_truth.items():
    for config in model_configs:
        start_time = time.time()

        try:
            device = "cuda" if config["compute"]!= "int8" else "cpu"

            if device == "cuda":
                nvidia_smi.nvmlInit()
                handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
                # No memory info printed here

            model = WhisperModel(config["size"], device=device, compute_type=config["compute"])

            if config["size"] == "turbo":
                batched_model = BatchedInferencePipeline(model=model)
                segments, info = batched_model.transcribe(audio_file, language='en')
            else:
                segments, info = model.transcribe(audio_file, beam_size=5, language='en')

            segments_list = list(segments)

            transform = transforms.Compose([
                transforms.ToLowerCase(),
                transforms.RemovePunctuation(),
                transforms.RemoveMultipleSpaces(),
                transforms.Strip(),
            ])

            ground_truth_transformed = transform(ground_truth_transcript)
            predicted_transcript = " ".join([segment.text for segment in segments_list])
            predicted_transcript_transformed = transform(predicted_transcript)

            wer = jiwer.wer(ground_truth_transformed, predicted_transcript_transformed)

            end_time = time.time()
            inference_time = end_time - start_time

            if device == "cuda":
                infoo = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)

            results.append({
                "audio_file": audio_file,
                "model_size": config["size"],
                "compute_type": config["compute"],
                "inference_time": inference_time,
                "language": info.language,
                "language_probability": info.language_probability,
                "num_segments": len(segments_list),
                "wer": wer,
                "predicted_transcript": predicted_transcript,
                "Used memory (GB)": infoo.used / (1024 ** 3)
            })

            print(
                f"Audio: {audio_file}, Model: {config['size']}, Compute: {config['compute']}, Time: {inference_time:.2f}s, WER: {wer:.2f}, Text:, {predicted_transcript}, Used memory (GB): {infoo.used / (1024 ** 3)}"
            )

            if wer == 0:
                print(f"Skipping remaining models for this audio")
                break

        except Exception as e:
            print(
                f"Error with Audio: {audio_file}, Model: {config['size']}, Compute: {config['compute']}: {e}"
            )
            results.append({
                "audio_file": audio_file,
                "model_size": config["size"],
                "compute_type": config["compute"],
                "inference_time": "Error",
                "error": str(e),
                "wer": "Error",
                "predicted_transcript": "Error",
            })

        finally:
            del model
            if config["size"] == "turbo":
                del batched_model
            torch.cuda.empty_cache()

# Print or save results (e.g., to a CSV file)
df = pd.DataFrame(results)
df.to_csv("whisper_benchmark_results_turbo.csv", index=False)
print(df)

CUDA available: True
Audio: edwardchrome.m4a, Model: turbo, Compute: float16, Time: 2.24s, WER: 0.00, Text:,  Hey Sora, open Chrome., Used memory (GB): 2.1092529296875
Skipping remaining models for this audio
Audio: edwardkahoot.m4a, Model: turbo, Compute: float16, Time: 2.50s, WER: 0.25, Text:,  Hey Sora, open kahut., Used memory (GB): 2.1092529296875
         audio_file model_size compute_type  inference_time language  \
0  edwardchrome.m4a      turbo      float16        2.240465       en   
1  edwardkahoot.m4a      turbo      float16        2.495633       en   

   language_probability  num_segments   wer     predicted_transcript  \
0                     1             1  0.00   Hey Sora, open Chrome.   
1                     1             1  0.25    Hey Sora, open kahut.   

   Used memory (GB)  
0          2.109253  
1          2.109253  


Kesimpulan: turbo tidak digunakan karena memakan memori >2GB, yaitu melebihi kapasitas jetson nano 2GB

<h1 >Trained benchmark

In [13]:
!pip install transformers torch jiwer datasets



In [14]:

import time
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset
import jiwer
from jiwer import transforms
import librosa
import numpy as np

# Load the processor and model
processor = WhisperProcessor.from_pretrained("EdwardFang09/whisper-base-TA-2025_v2")
model = WhisperForConditionalGeneration.from_pretrained("EdwardFang09/whisper-base-TA-2025_v2").to("cuda")

# Load your audio data and ground truth transcripts (replace with your data)
audio_ground_truth = {
    "edwardchrome.m4a": "sora, open chrome.",
    "edwardkahoot.m4a": "sora, open kahoot.",
    # ... more audio files and transcripts
}

results = []

for audio_file, ground_truth_transcript in audio_ground_truth.items():
    start_time = time.time()

    try:
        device = "cuda" if config["compute"]!= "int8" else "cpu"

        if device == "cuda":
          nvidia_smi.nvmlInit()
          handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
          # No memory info printed here

        # Load audio data using librosa
        audio_data, sr = librosa.load(audio_file, sr=16000)  # Load audio at 16kHz

        # Use the audio data (NumPy array) as input to the processor
        input_features = processor(audio_data, sampling_rate=sr, return_tensors="pt").input_features.to("cuda")

        # Generate predictions
        predicted_ids = model.generate(input_features)
        predicted_transcript = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        # Calculate WER
        transform = transforms.Compose([
            transforms.ToLowerCase(),
            transforms.RemovePunctuation(),
            transforms.RemoveMultipleSpaces(),
            transforms.Strip(),
        ])

        ground_truth_transformed = transform(ground_truth_transcript)
        predicted_transcript_transformed = transform(predicted_transcript)
        wer = jiwer.wer(ground_truth_transformed, predicted_transcript_transformed)

        end_time = time.time()
        inference_time = end_time - start_time

        if device == "cuda":
                infoo = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)

        results.append({
            "audio_file": audio_file,
            "inference_time": inference_time,
            "wer": wer,
            "predicted_transcript": predicted_transcript,
            "Used memory (GB)": infoo.used / (1024 ** 3)
        })

        print(f"Audio: {audio_file}, Time: {inference_time:.2f}s, WER: {wer:.2f}, Text: {predicted_transcript}, Used memory (GB): {infoo.used / (1024 ** 3)}")

    except Exception as e:
        print(f"Error with Audio: {audio_file}: {e}")
        results.append({
            "audio_file": audio_file,
            "inference_time": "Error",
            "error": str(e),
            "wer": "Error",
            "predicted_transcript": "Error",
            "Used memory (GB)": infoo.used / (1024 ** 3)
        })

# Save or print the results
import pandas as pd
df = pd.DataFrame(results)
df.to_csv("huggingface_whisper_benchmark_trained.csv", index=False)
print(df)

  audio_data, sr = librosa.load(audio_file, sr=16000)  # Load audio at 16kHz
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Audio: edwardchrome.m4a, Time: 2.36s, WER: 0.25, Text: sora open chrome, Used memory (GB): 0.7440185546875


  audio_data, sr = librosa.load(audio_file, sr=16000)  # Load audio at 16kHz
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Audio: edwardkahoot.m4a, Time: 0.30s, WER: 0.25, Text: sora open kahoot, Used memory (GB): 0.7440185546875
         audio_file  inference_time   wer predicted_transcript  \
0  edwardchrome.m4a        2.355231  0.25     sora open chrome   
1  edwardkahoot.m4a        0.302079  0.25     sora open kahoot   

   Used memory (GB)  
0          0.744019  
1          0.744019  


buat csvnya dengan screen shot

Dengan dataset yang sama, perhatikan perbedaan dengan model biasa.

Solusi:
1. Model di finetune supaya akurasi meningkat. <2 detik sudah relatif cepat.
  - contoh: kahut, kehut, kuhut, dll.
2. Cari banyak data untuk train dan testing. Pakai mikrofon ampas.


need a code that fetch a whole data inside a folder

optimized

In [15]:
torch.cuda.get_device_properties(0)

_CudaDeviceProperties(name='Tesla T4', major=7, minor=5, total_memory=15095MB, multi_processor_count=40, uuid=27ca1096-5681-b38d-b6fd-8d8f4b4ad7d2, L2_cache_size=4MB)

In [16]:
torch.cuda.get_device_properties(0).total_memory

15828320256

penemuan: turbo mendeteksi aksen bahasa indonesia walau sudah dicoding english dan berbicara bahasa inggris.