In [1]:
!pip install --upgrade datasets[audio] jiwer transformers==4.41.2

Collecting jiwer
  Downloading jiwer-3.0.4-py3-none-any.whl.metadata (2.6 kB)
Collecting transformers==4.41.2
  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets[audio]
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets[audio])
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets[audio])
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests (from transformers==4.41.2)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets[audio])
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets[audio])
  Downloading multiprocess-0.70.16-py310-none-any.whl.

In [2]:
from datasets import load_dataset, concatenate_datasets
from transformers import pipeline, WhisperForConditionalGeneration, WhisperProcessor
import torch
from jiwer import wer
import pandas as pd
import string
import time

In [3]:
# save to drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
dataset_name = "clt013/malay-speech-1.6-million-rows-dataset"
model_before_finetuning_name = "openai/whisper-large-v3"
model_after_finetuning_name = "clt013/whisper-large-v3-ft-malay-test-1"
eval_dataset_name = "whisper_large_v3_eval_results"

In [5]:
class MalayEvalSpeechRecognition:
    def __init__(self, dataset_name, model_before_finetuning_name, model_after_finetuning_name):
        self.dataset = load_dataset(dataset_name)
        self.concatenated_dataset = concatenate_datasets([self.dataset['train'], self.dataset['test']])
        self.pipe = self.get_pipe_model(model_before_finetuning_name, "malay")
        self.pipe2 = self.get_pipe_model(model_after_finetuning_name, "malay")
        self.df = pd.DataFrame(columns=['reference', 'before_fine_tuning_prediction', 'after_fine_tuning_prediction', 'before_fine_tuning_wer', 'after_fine_tuning_wer', 'process_time'])
        self.total_dataset_hours = 0.0

    def get_dataset(self):
        return self.dataset

    def get_concatenated_dataset(self):
        return self.concatenated_dataset

    def get_pipe(self):
        return self.pipe

    def get_pipe2(self):
        return self.pipe2

    def get_df(self):
        return self.df

    def get_pipe_model(self, model_id, language):
        model = WhisperForConditionalGeneration.from_pretrained(model_id)
        processor = WhisperProcessor.from_pretrained(model_id)
        model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")
        model.generation_config.forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")
        # print(model.config.forced_decoder_ids)
        # print(model.generation_config.forced_decoder_ids)
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        device = torch.device(device)
        return pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            max_new_tokens=128,
            chunk_length_s=30,
            batch_size=16,
            torch_dtype=torch_dtype,
            device=device,
        )

    def get_total_dataset_hours(self):
        total_duration = 0.0
        for i in range(len(self.concatenated_dataset)):
            audio_sample = self.concatenated_dataset[i]['audio']
            samples = audio_sample['array']
            sampling_rate = audio_sample['sampling_rate']
            duration = len(samples) / sampling_rate
            total_duration += duration

        self.total_dataset_hours = total_duration / 3600
        return self.total_dataset_hours

    def get_trascribe(self, pipe, audio1):
        return pipe(audio1)["text"]

    # filter out less than 1 sec and more than 30 sec
    def filter_dataset(self):
        self.concatenated_dataset = self.concatenated_dataset.filter(lambda x: 1.0 < (len(x["audio"]["array"])/x["audio"]["sampling_rate"]) < 30.0)

    def run(self, limited=False, limited_amount=10):
        df_list = []
        for i in range(len(self.concatenated_dataset.take(limited_amount) if limited else self.concatenated_dataset)):
            print("Processing num " + str(i))
            audio_sample = asr_dataset[i]['audio']
            samples = audio_sample['array']
            sampling_rate = audio_sample['sampling_rate']
            duration = len(samples) / sampling_rate
            if duration < 1.0 or duration > 30.0:
                continue
            start_time = time.time()
            reference = self.concatenated_dataset[i]['sentence']
            prediction = self.get_trascribe(self.pipe, self.concatenated_dataset[i]['audio']).translate(str.maketrans('', '', string.punctuation)).lower()
            prediction2 = self.get_trascribe(self.pipe2, self.concatenated_dataset[i]['audio']).translate(str.maketrans('', '', string.punctuation)).lower()
            wer1 = wer(reference, prediction)
            wer2 = wer(reference, prediction2)
            process_time = time.time() - start_time
            print("Process time: %s seconds" % (process_time))
            df_list.append(pd.DataFrame({'reference': [reference],
                                         'before_fine_tuning_prediction': [prediction],
                                         'after_fine_tuning_prediction': [prediction2],
                                         'before_fine_tuning_wer': [wer1],
                                         'after_fine_tuning_wer': [wer2],
                                         'process_time': [process_time]}))

        self.df = pd.concat(df_list, ignore_index=True)


In [6]:
asr = MalayEvalSpeechRecognition(dataset_name, model_before_finetuning_name, model_after_finetuning_name)

Downloading readme:   0%|          | 0.00/485 [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/86 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/86 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/86 [00:00<?, ?files/s]

Downloading data:   0%|          | 0/22 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/1308479 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/327120 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/86 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/22 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/112k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/2.99k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
asr.get_concatenated_dataset()

Dataset({
    features: ['audio', 'sentence'],
    num_rows: 1635599
})

In [9]:
asr.filter_dataset()
asr.get_concatenated_dataset()

Filter:   0%|          | 0/1635599 [00:00<?, ? examples/s]

Dataset({
    features: ['audio', 'sentence'],
    num_rows: 1632978
})

In [7]:
asr_dataset = asr.get_concatenated_dataset()

In [12]:
asr_dataset

Dataset({
    features: ['audio', 'sentence'],
    num_rows: 1632978
})

In [13]:
total_duration = 0.0
for i in range(len(asr_dataset)):
    audio_sample = asr_dataset[i]['audio']
    samples = audio_sample['array']
    sampling_rate = audio_sample['sampling_rate']
    duration = len(samples) / sampling_rate
    total_duration += duration
    print("Processing num " + str(i))

total_dataset_hours = total_duration / 3600
print("Total dataset hours: " + str(round(total_dataset_hours, 3)))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processing num 1627979
Processing num 1627980
Processing num 1627981
Processing num 1627982
Processing num 1627983
Processing num 1627984
Processing num 1627985
Processing num 1627986
Processing num 1627987
Processing num 1627988
Processing num 1627989
Processing num 1627990
Processing num 1627991
Processing num 1627992
Processing num 1627993
Processing num 1627994
Processing num 1627995
Processing num 1627996
Processing num 1627997
Processing num 1627998
Processing num 1627999
Processing num 1628000
Processing num 1628001
Processing num 1628002
Processing num 1628003
Processing num 1628004
Processing num 1628005
Processing num 1628006
Processing num 1628007
Processing num 1628008
Processing num 1628009
Processing num 1628010
Processing num 1628011
Processing num 1628012
Processing num 1628013
Processing num 1628014
Processing num 1628015
Processing num 1628016
Processing num 1628017
Processing num 1628018
Processing num 

In [8]:
asr.concatenated_dataset = asr_dataset.shuffle()
asr.run(limited=True, limited_amount=10000)

Processing num 0
Process time: 3.6259377002716064 seconds
Processing num 1
Process time: 1.3686833381652832 seconds
Processing num 2
Process time: 1.985039234161377 seconds
Processing num 3
Process time: 2.3462724685668945 seconds
Processing num 4
Process time: 1.7631447315216064 seconds
Processing num 5
Process time: 1.7241849899291992 seconds
Processing num 6
Process time: 1.4561777114868164 seconds
Processing num 7
Process time: 1.6266343593597412 seconds
Processing num 8
Process time: 1.8975143432617188 seconds
Processing num 9


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Process time: 2.264840841293335 seconds
Processing num 7500
Process time: 2.2490768432617188 seconds
Processing num 7501
Process time: 2.2604711055755615 seconds
Processing num 7502
Process time: 2.4603590965270996 seconds
Processing num 7503
Process time: 2.1881747245788574 seconds
Processing num 7504
Process time: 3.0424745082855225 seconds
Processing num 7505
Process time: 2.1228995323181152 seconds
Processing num 7506
Process time: 1.4782230854034424 seconds
Processing num 7507
Process time: 2.1762843132019043 seconds
Processing num 7508
Process time: 1.9702298641204834 seconds
Processing num 7509
Process time: 2.382267951965332 seconds
Processing num 7510
Process time: 1.8887042999267578 seconds
Processing num 7511
Process time: 1.360469102859497 seconds
Processing num 7512
Process time: 1.6790151596069336 seconds
Processing num 7513
Process time: 1.6487653255462646 seconds
Processing num 7514
Process time: 2.3454432

In [9]:
asr.get_df()

Unnamed: 0,reference,before_fine_tuning_prediction,after_fine_tuning_prediction,before_fine_tuning_wer,after_fine_tuning_wer,process_time
0,saya ingin mengunjungi luxembourg,terima kasih kerana menonton,saya ingin mengundang hidup sumber,1.000000,0.750000,3.625938
1,perak member gaji,perakmin macam manusia dikaji setidaknya,perak min macam manusia dikaji,1.666667,1.333333,1.368683
2,apanya bila tadi ada orang kenal dia tapi dia ...,kenapa bila ada orang snap dia macam melepask...,bila jadi ada orang kena pi macam lebih rumah,1.076923,0.615385,1.985039
3,kita harus juga melihat bagaimana yuran ajar p...,kita harus juga melihat bagaimana yuran pajar...,kita harus juga melihat bagaimana yuran pajar ...,0.400000,0.533333,2.346272
4,menemui kesukaran dalam memutuskan rantaian wa...,menemui kesukaran dalam memutuskan rantaian w...,menemui kesukaran dalam mengutuskan rantaian w...,0.142857,0.142857,1.763145
...,...,...,...,...,...,...
9984,nabi apa dalam surah balik tulisan dalam seora...,dalam surah asharali dalam surah annamali jug...,nabi saw dalam surah as harali seorang di dala...,0.571429,0.571429,2.368849
9985,tuan tuan dan puan puan insya allah kut kurang...,tuantuan dan perempuan insyaallah kot orang h...,tuan tuan dan perempuan insya allah kut orang ...,0.785714,0.285714,2.283926
9986,selain itu bersukan juga dapat mengisi masa se...,selain itu persukan juga dapat mengisi masa s...,selain itu bersukan juga dapat mengisi masa se...,0.111111,0.000000,1.723747
9987,bukan tak bagi family adalah salah satu soalan...,podcast ni nak argue okay ni last lah banyak ...,takkan tak lagi malam jadilah salah saya banya...,1.076923,0.692308,2.180114


In [10]:
# get average WER
average_wer_before_fine_tuning = asr.get_df()['before_fine_tuning_wer'].mean()
average_wer_after_fine_tuning = asr.get_df()['after_fine_tuning_wer'].mean()
average_process_time = asr.get_df()['process_time'].mean()

print("Average process time: " + str(round(average_process_time, 3)) + " seconds")
print("Average WER before fine tuning: " + str(round(average_wer_before_fine_tuning * 100, 2)) + " %")
print("Average WER after fine tuning: " + str(round(average_wer_after_fine_tuning * 100, 2)) + " %")

Average process time: 2.014 seconds
Average WER before fine tuning: 73.79 %
Average WER after fine tuning: 41.4 %


In [11]:
asr.get_df().to_csv(eval_dataset_name + ".csv", index=False)

!cp {eval_dataset_name + ".csv"} "/content/drive/MyDrive/"