In [2]:
!pip install -q bitsandbytes peft datasets[audio] jiwer

In [3]:
!pip install torch



In [4]:
import torch
from peft import PeftModel, PeftConfig
from transformers import (
    AutomaticSpeechRecognitionPipeline,
    BitsAndBytesConfig,
    WhisperForConditionalGeneration,
    WhisperTokenizer,
    WhisperProcessor,
)
class SpeechRecognizer:
    """Speech-to-text wrapper around a Whisper model with a PEFT adapter.

    The base checkpoint named in the adapter's PEFT config is loaded in 8-bit,
    the adapter weights are applied on top of it, and the result is wrapped in
    an ``AutomaticSpeechRecognitionPipeline``.
    """

    def __init__(self, peft_model_id, language="malay", task="transcribe"):
        """Load the base model, the PEFT adapter and the ASR pipeline.

        Args:
            peft_model_id: Identifier passed to ``PeftConfig.from_pretrained``
                and ``PeftModel.from_pretrained`` to locate the adapter.
            language: Language given to the tokenizer/processor and used for
                the forced decoder prompt.
            task: Whisper task given to the tokenizer/processor and used for
                the forced decoder prompt.
        """
        # Store the language and task
        self.language = language
        self.task = task

        # The adapter config tells us which base checkpoint it was trained on.
        self.peft_model_id = peft_model_id
        peft_config = PeftConfig.from_pretrained(self.peft_model_id)
        base_model_id = peft_config.base_model_name_or_path

        # Load the base model in 8-bit. The bare `load_in_8bit=True` argument is
        # deprecated; the quantization settings go through `quantization_config`.
        self.model = WhisperForConditionalGeneration.from_pretrained(
            base_model_id,
            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
            device_map="auto",
        )
        # Apply the fine-tuned adapter weights on top of the base model.
        self.model = PeftModel.from_pretrained(self.model, self.peft_model_id)

        self.tokenizer = WhisperTokenizer.from_pretrained(
            base_model_id, language=self.language, task=self.task
        )
        self.processor = WhisperProcessor.from_pretrained(
            base_model_id, language=self.language, task=self.task
        )
        self.feature_extractor = self.processor.feature_extractor
        # Decoder prompt ids for the chosen language/task, passed to every generate call.
        self.forced_decoder_ids = self.processor.get_decoder_prompt_ids(
            language=self.language, task=self.task
        )

        # Initialize the pipeline for ASR
        self.pipe = AutomaticSpeechRecognitionPipeline(
            model=self.model, tokenizer=self.tokenizer, feature_extractor=self.feature_extractor
        )

    def _run_pipeline(self, audio_file, max_new_tokens):
        """Run the ASR pipeline once with the forced decoder prompt."""
        return self.pipe(
            audio_file,
            generate_kwargs={"forced_decoder_ids": self.forced_decoder_ids},
            max_new_tokens=max_new_tokens,
        )

    def transcribe(self, audio_file, max_new_tokens=255):
        """Transcribe one audio input and return the recognised text.

        Args:
            audio_file: Input handed unchanged to the ASR pipeline.
                NOTE(review): callers in this notebook pass a dataset "audio"
                entry; confirm other accepted forms against the pipeline docs.
            max_new_tokens: Forwarded to the pipeline call; defaults to the
                previously hard-coded 255.

        Returns:
            The ``"text"`` entry of the pipeline result.
        """
        if torch.cuda.is_available():
            # Mixed precision on GPU. `torch.cuda.amp.autocast()` is deprecated
            # in favour of `torch.amp.autocast("cuda")`.
            with torch.amp.autocast("cuda"):
                result = self._run_pipeline(audio_file, max_new_tokens)
        else:
            result = self._run_pipeline(audio_file, max_new_tokens)

        # Return the transcribed text
        return result["text"]

In [5]:
# Identifiers of the fine-tuned Malay Whisper PEFT adapters handed to SpeechRecognizer.
peft_model_id_1 = "clt013/whisper-large-v3-ft-malay-peft-v1"
peft_model_id_2 = "clt013/whisper-small-ft-malay-peft-epoch-20"
# Only the small model is loaded here; uncomment to load the large-v3 adapter as well.
# whisper_large_v3_recognizer = SpeechRecognizer(peft_model_id=peft_model_id_1)
whisper_small_recognizer = SpeechRecognizer(peft_model_id=peft_model_id_2)

adapter_config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/14.2M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

In [6]:
# Mount Google Drive (Colab only) at /content/drive so that result files can be
# copied there later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
import pandas as pd
import time
from datasets import load_dataset, concatenate_datasets

def transcribe_and_compare(dataset_name, whisper_small_recognizer,
                           limit=10, limited=False):
    """
    Transcribe audio samples from a dataset with the small Whisper recognizer and
    record, for every sample, the reference sentence, the prediction and the time
    the transcription took.

    The "train" and "test" splits of the dataset are concatenated and processed
    in order. Only the small model is evaluated; no large-model columns are produced.

    Args:
    - dataset_name (str): The name of the dataset to load via `load_dataset`.
    - whisper_small_recognizer: Object exposing `transcribe(audio)` that returns the
                                predicted text for one sample's 'audio' entry.
    - limit (int): The maximum number of samples to process if 'limited' is True.
                   Values larger than the dataset are clamped to its length.
    - limited (bool): Whether to limit the number of samples processed.

    Returns:
    - df (DataFrame): A pandas DataFrame with the columns 'reference',
                      'whisper_small_prediction' and 'whisper_small_process_time'
                      (seconds), one row per processed sample. The frame is empty
                      (but keeps these columns) when no sample is processed.
    """
    # Load the dataset
    dataset = load_dataset(dataset_name)

    # Concatenate train and test sets
    concatenated_dataset = concatenate_datasets([dataset["train"], dataset["test"]])

    # Never ask for more rows than exist, otherwise indexing would run off the end.
    total_samples = len(concatenated_dataset)
    num_samples = min(limit, total_samples) if limited else total_samples

    columns = ['reference', 'whisper_small_prediction', 'whisper_small_process_time']
    records = []

    for i in range(num_samples):
        # Look the row up once and reuse it for both the audio and the reference text.
        row = concatenated_dataset[i]
        audio_sample = row['audio']
        reference = row['sentence']

        # perf_counter is a monotonic clock intended for measuring intervals.
        start_time_small = time.perf_counter()
        transcribed_1 = whisper_small_recognizer.transcribe(audio_sample)
        process_time_small = time.perf_counter() - start_time_small

        records.append({
            'reference': reference,
            'whisper_small_prediction': transcribed_1,
            'whisper_small_process_time': process_time_small,
        })

        print(f"Processed sample {i}: small model time = {process_time_small:.2f}s")

    # Build the frame once from all records; explicit columns keep the schema
    # stable even when there are no records.
    df = pd.DataFrame(records, columns=columns)

    return df

In [12]:
# Evaluate the small recognizer on every sample of the dataset (no sample limit)
# and display the resulting table.
df_results = transcribe_and_compare(
    dataset_name="clt013/malay-speech-3k-rows-dataset_v2",
    whisper_small_recognizer=whisper_small_recognizer,
)
df_results

README.md:   0%|          | 0.00/460 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/59.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2544 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/604 [00:00<?, ? examples/s]

  with torch.cuda.amp.autocast():
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Processed sample 0: small model time = 3.55s
Processed sample 1: small model time = 1.38s
Processed sample 2: small model time = 1.31s
Processed sample 3: small model time = 0.55s
Processed sample 4: small model time = 0.88s
Processed sample 5: small model time = 1.40s
Processed sample 6: small model time = 1.32s
Processed sample 7: small model time = 2.90s
Processed sample 8: small model time = 1.50s


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Processed sample 9: small model time = 0.79s
Processed sample 10: small model time = 1.06s
Processed sample 11: small model time = 1.29s
Processed sample 12: small model time = 1.82s
Processed sample 13: small model time = 0.69s
Processed sample 14: small model time = 0.69s
Processed sample 15: small model time = 3.49s
Processed sample 16: small model time = 1.40s
Processed sample 17: small model time = 0.58s
Processed sample 18: small model time = 1.06s
Processed sample 19: small model time = 0.94s
Processed sample 20: small model time = 2.42s
Processed sample 21: small model time = 0.80s
Processed sample 22: small model time = 0.65s
Processed sample 23: small model time = 0.78s
Processed sample 24: small model time = 2.37s
Processed sample 25: small model time = 2.75s
Processed sample 26: small model time = 3.39s
Processed sample 27: small model time = 2.20s
Processed sample 28: small model time = 0.98s
Processed sample 29: small model time = 3.30s
Processed sample 30: small model ti

Unnamed: 0,reference,whisper_small_prediction,whisper_small_process_time
0,hai weh,hai weh,3.553121
1,ah makanan bagi aku macam struggle kan,ah makanan bagi aku macam struggle kan,1.381811
2,makanan yang sedap kat Malaysia ni tu,makanan yang sedap kat Malaysia ni tu,1.314597
3,kenapa,kenapa,0.551974
4,dia punya dia punya,dia punya dia punya,0.882010
...,...,...,...
3143,okey aku pun tu je la makanan yang,okey aku pun tercelah makanan yang,1.890374
3144,makanan kegemaran aku,makanan kegemaran aku,1.101784
3145,so,so,0.414639
3146,jumpa lagi,sempa lagi,0.632356


In [13]:
df_results.to_csv("peft_evaluate_small_epoch_20.csv", index=False)

!cp {"peft_evaluate_small_epoch_20.csv"} "/content/drive/MyDrive/"