In [1]:
!pip install --upgrade datasets[audio] jiwer

Collecting jiwer
  Downloading jiwer-3.0.4-py3-none-any.whl.metadata (2.6 kB)
Collecting datasets[audio]
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets[audio])
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets[audio])
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets[audio])
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets[audio])
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets[audio])
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets[audio])
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Do

In [2]:
from datasets import load_dataset, concatenate_datasets
from transformers import pipeline, WhisperForConditionalGeneration, WhisperProcessor
import torch
from jiwer import wer
import pandas as pd
import string

In [11]:
# save to drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
dataset_name = "clt013/malay-speech-3k-rows-dataset"
model_before_finetuning_name = "openai/whisper-small"
model_after_finetuning_name = "clt013/whisper-small-ft-malay-test-3"
eval_dataset_name = "whisper_small_eval_results"

In [4]:
class MalayEvalSpeechRecognition:
    def __init__(self, dataset_name, model_before_finetuning_name, model_after_finetuning_name):
        self.dataset = load_dataset(dataset_name)
        self.concatenated_dataset = concatenate_datasets([self.dataset['train'], self.dataset['test']])
        self.pipe = self.get_pipe_model(model_before_finetuning_name, "malay")
        self.pipe2 = self.get_pipe_model(model_after_finetuning_name, "malay")
        self.df = pd.DataFrame(columns=['reference', 'before_fine_tuning_prediction', 'after_fine_tuning_prediction', 'before_fine_tuning_wer', 'after_fine_tuning_wer'])
        self.total_dataset_hours = 0.0

    def get_dataset(self):
        return self.dataset

    def get_concatenated_dataset(self):
        return self.concatenated_dataset

    def get_pipe(self):
        return self.pipe

    def get_pipe2(self):
        return self.pipe2

    def get_df(self):
        return self.df

    def get_pipe_model(self, model_id, language):
        model = WhisperForConditionalGeneration.from_pretrained(model_id)
        processor = WhisperProcessor.from_pretrained(model_id)
        model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")
        model.generation_config.forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")
        # print(model.config.forced_decoder_ids)
        # print(model.generation_config.forced_decoder_ids)
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        device = torch.device(device)
        return pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            max_new_tokens=128,
            chunk_length_s=30,
            batch_size=16,
            torch_dtype=torch_dtype,
            device=device,
        )

    def get_total_dataset_hours(self):
        return self.total_dataset_hours

    def get_trascribe(self, pipe, audio1):
        return pipe(audio1)["text"]

    def run(self, limited=False):
        df_list = []
        for i in range(len(self.concatenated_dataset.take(10) if limited else self.concatenated_dataset)):
            audio_sample = self.concatenated_dataset[i]['audio']
            samples = audio_sample['array']
            sampling_rate = audio_sample['sampling_rate']
            duration = len(samples) / sampling_rate
            self.total_dataset_hours += duration

            print("Processing num " + str(i))
            reference = self.concatenated_dataset[i]['sentence']
            prediction = self.get_trascribe(self.pipe, self.concatenated_dataset[i]['audio']).translate(str.maketrans('', '', string.punctuation)).lower()
            prediction2 = self.get_trascribe(self.pipe2, self.concatenated_dataset[i]['audio']).translate(str.maketrans('', '', string.punctuation)).lower()
            wer1 = wer(reference, prediction)
            wer2 = wer(reference, prediction2)
            df_list.append(pd.DataFrame({'reference': [reference], 'before_fine_tuning_prediction': [prediction], 'after_fine_tuning_prediction': [prediction2], 'before_fine_tuning_wer': [wer1], 'after_fine_tuning_wer': [wer2]}))
        self.df = pd.concat(df_list, ignore_index=True)
        self.total_dataset_hours /= 3600


In [5]:
asr = MalayEvalSpeechRecognition(dataset_name, model_before_finetuning_name, model_after_finetuning_name)

Downloading readme:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/216M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/59.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2545 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/604 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/2.42k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
asr.run()

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Processing num 0




Processing num 1
Processing num 2
Processing num 3
Processing num 4
Processing num 5
Processing num 6
Processing num 7
Processing num 8
Processing num 9
Processing num 10
Processing num 11
Processing num 12
Processing num 13
Processing num 14
Processing num 15
Processing num 16
Processing num 17
Processing num 18
Processing num 19
Processing num 20
Processing num 21
Processing num 22
Processing num 23
Processing num 24
Processing num 25
Processing num 26
Processing num 27
Processing num 28
Processing num 29
Processing num 30
Processing num 31
Processing num 32
Processing num 33
Processing num 34
Processing num 35
Processing num 36
Processing num 37
Processing num 38
Processing num 39
Processing num 40
Processing num 41
Processing num 42
Processing num 43
Processing num 44
Processing num 45
Processing num 46
Processing num 47
Processing num 48
Processing num 49
Processing num 50
Processing num 51
Processing num 52
Processing num 53
Processing num 54
Processing num 55
Processing num 56
P

In [14]:
total_duration = asr.get_total_dataset_hours()
print("Total dataset hours: " + str(round(total_duration, 3)))

Total dataset hours: 2.405


In [15]:
asr.get_df()

Unnamed: 0,reference,before_fine_tuning_prediction,after_fine_tuning_prediction,before_fine_tuning_wer,after_fine_tuning_wer
0,hai weh,hai wee,hai wei,0.500000,0.500000
1,ah makanan bagi aku macam struggle kan,makanan bagi aku macam sagas kan,ah makanan bagi aku macam struggle kan,0.285714,0.000000
2,makanan yang sedap kat Malaysia ni doh,makanan yang setahuk nasi ini doh,makanan yang sedap kat malaysia ni doh,0.571429,0.142857
3,kenapa,kenapa,kenapa,0.000000,0.000000
4,dia punya dia punya,dia punya dia punya,dia punya dia punya,0.000000,0.000000
...,...,...,...,...,...
3144,okey aku pun tu je la makanan yang,aku pun terserah makanan yang,okey aku pun tu je la makanan yang,0.500000,0.000000
3145,makanan kegemaran aku,makan yang kegumaran aku,makan akan gemaran aku,1.000000,1.000000
3146,so,so,so,0.000000,0.000000
3147,jumpa lagi,jumpa lagi,sempur lagi,0.000000,0.500000


In [16]:
# get average WER
average_wer_before_fine_tuning = asr.get_df()['before_fine_tuning_wer'].mean()
average_wer_after_fine_tuning = asr.get_df()['after_fine_tuning_wer'].mean()

print("Average WER before fine tuning: " + str(round(average_wer_before_fine_tuning * 100, 2)) + " %")
print("Average WER after fine tuning: " + str(round(average_wer_after_fine_tuning * 100, 2)) + " %")

Average WER before fine tuning: 68.74 %
Average WER after fine tuning: 15.56 %


In [17]:
asr.get_df().to_csv(eval_dataset_name + ".csv", index=False)

!cp {eval_dataset_name + ".csv"} "/content/drive/MyDrive/"