In [1]:
!pip install -U bitsandbytes peft datasets[audio] jiwer



In [2]:
!pip install -q git+https://github.com/huggingface/peft.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone


In [3]:
!pip install torch



In [15]:
from transformers import (
    AutomaticSpeechRecognitionPipeline,
    WhisperForConditionalGeneration,
    WhisperTokenizer,
    WhisperProcessor,
)
from peft import PeftModel, PeftConfig
import torch
class SpeechRecognizer:
    def __init__(self, peft_model_id, language="malay", task="transcribe"):
        # Store the language and task
        self.language = language
        self.task = task

        # Load PeftConfig
        self.peft_model_id = peft_model_id
        peft_config = PeftConfig.from_pretrained(self.peft_model_id)

        # Load the model, tokenizer, processor, and other necessary components
        self.model = WhisperForConditionalGeneration.from_pretrained(
            peft_config.base_model_name_or_path, load_in_8bit=True, device_map="auto"
        )
        self.model = PeftModel.from_pretrained(self.model, self.peft_model_id)
        self.tokenizer = WhisperTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=self.language, task=self.task)
        self.processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, language=self.language, task=self.task)
        self.feature_extractor = self.processor.feature_extractor
        self.forced_decoder_ids = self.processor.get_decoder_prompt_ids(language=self.language, task=self.task)

        # Initialize the pipeline for ASR
        self.pipe = AutomaticSpeechRecognitionPipeline(
            model=self.model, tokenizer=self.tokenizer, feature_extractor=self.feature_extractor
        )

    def transcribe(self, audio_file):
        # Perform transcription with automatic mixed precision if CUDA is available
        if torch.cuda.is_available():
            with torch.cuda.amp.autocast():
                result = self.pipe(audio_file, generate_kwargs={"forced_decoder_ids": self.forced_decoder_ids}, max_new_tokens=255)
        else:
            result = self.pipe(audio_file, generate_kwargs={"forced_decoder_ids": self.forced_decoder_ids}, max_new_tokens=255)

        # Return the transcribed text
        return result["text"]

In [16]:
peft_model_id_1 = "clt013/whisper-large-v3-ft-malay-peft-v1"
perf_model_id_2 = "clt013/whisper-small-ft-malay-peft-v1"
whisper_large_v3_recognizer = SpeechRecognizer(peft_model_id=peft_model_id_1)
whisper_small_recognizer = SpeechRecognizer(peft_model_id=perf_model_id_2)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [17]:
# save to drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
import pandas as pd
import time
from datasets import load_dataset, concatenate_datasets

def transcribe_and_compare(dataset_name, whisper_small_recognizer, whisper_large_v3_recognizer, limit=10, limited=False):
    """
    Transcribe audio samples from the dataset using two different Whisper models, compare the transcriptions,
    and measure the processing time for each transcription.

    Args:
    - dataset_name (str): The name of the dataset to load.
    - whisper_small_recognizer: The recognizer object for the small Whisper model.
    - whisper_large_v3_recognizer: The recognizer object for the large Whisper model.
    - limit (int): The number of samples to process if 'limited' is True.
    - limited (bool): Whether to limit the number of samples processed.

    Returns:
    - df (DataFrame): A pandas DataFrame containing the reference sentence, transcriptions,
                      and processing times for the two Whisper models.
    """
    # Load the dataset
    dataset = load_dataset(dataset_name)

    # Concatenate train and test sets
    concatenated_dataset = concatenate_datasets([dataset["train"], dataset["test"]])

    # Prepare to store the results
    df_list = []

    # Loop through the dataset, limited by the specified amount if necessary
    for i in range(limit if limited else len(concatenated_dataset)):
        audio_sample = concatenated_dataset[i]['audio']
        samples = audio_sample['array']
        reference = concatenated_dataset[i]['sentence']

        # Transcribe using whisper_small_recognizer and measure processing time
        start_time_small = time.time()
        transcribed_1 = whisper_small_recognizer.transcribe(audio_sample)
        end_time_small = time.time()
        process_time_small = end_time_small - start_time_small

        # Transcribe using whisper_large_v3_recognizer and measure processing time
        start_time_large = time.time()
        audio_sample = concatenated_dataset[i]['audio']
        transcribed_2 = whisper_large_v3_recognizer.transcribe(audio_sample)
        end_time_large = time.time()
        process_time_large = end_time_large - start_time_large

        # Append the results to the list, including processing times
        df_list.append(pd.DataFrame({
            'reference': [reference],
            'whisper_small_prediction': [transcribed_1],
            'whisper_large_v3_prediction': [transcribed_2],
            'whisper_small_process_time': [process_time_small],
            'whisper_large_v3_process_time': [process_time_large]
        }))

        print(f"Processed sample {i}: small model time = {process_time_small:.2f}s, large model time = {process_time_large:.2f}s")

    # Combine the results into a single DataFrame
    df = pd.concat(df_list, ignore_index=True)

    return df

In [21]:
df_results = transcribe_and_compare("clt013/malay-speech-3k-rows-dataset_v2", whisper_small_recognizer, whisper_large_v3_recognizer)
df_results

  with torch.cuda.amp.autocast():


Processed sample 0: small model time = 0.91s, large model time = 1.89s
Processed sample 1: small model time = 1.70s, large model time = 2.97s
Processed sample 2: small model time = 1.36s, large model time = 2.91s
Processed sample 3: small model time = 0.57s, large model time = 1.16s
Processed sample 4: small model time = 1.07s, large model time = 2.48s
Processed sample 5: small model time = 1.18s, large model time = 1.83s
Processed sample 6: small model time = 1.01s, large model time = 2.02s
Processed sample 7: small model time = 1.48s, large model time = 2.97s
Processed sample 8: small model time = 1.43s, large model time = 2.54s
Processed sample 9: small model time = 0.75s, large model time = 2.34s
Processed sample 10: small model time = 1.78s, large model time = 2.45s
Processed sample 11: small model time = 0.94s, large model time = 1.80s
Processed sample 12: small model time = 1.63s, large model time = 3.40s
Processed sample 13: small model time = 0.48s, large model time = 0.93s
Pr

Unnamed: 0,reference,whisper_small_prediction,whisper_large_v3_prediction,whisper_small_process_time,whisper_large_v3_process_time
0,hai weh,hai weh,hai weh,0.906096,1.893994
1,ah makanan bagi aku macam struggle kan,ah makanan bagi aku macam struggles kan,ah makanan bagi aku macam sages kan,1.695222,2.967040
2,makanan yang sedap kat Malaysia ni tu,makanan yang sedap kat Malaysia ni tu,makanan yang setiap orang Malaysia ni tu,1.364464,2.907638
3,kenapa,kenapa,kenapa,0.571702,1.163654
4,dia punya dia punya,dia punya dia punya,dia punya dia punya,1.072436,2.483804
...,...,...,...,...,...
3143,okey aku pun tu je la makanan yang,okey aku pun terjelah makanan yang,okay aku pun tu je lah makanan yang,1.736142,3.742666
3144,makanan kegemaran aku,makanan kegemaran aku,makanan kegemaran aku,1.142110,2.224437
3145,so,so,so,0.437891,0.932857
3146,jumpa lagi,semua lagi,jumpa lagi,0.656888,1.561898


In [22]:
df_results.to_csv("peft_evaluate.csv", index=False)

!cp {"peft_evaluate.csv"} "/content/drive/MyDrive/"