# **Fine-tuning a Speech-To-Text model for a specific language**
Please refer to the respective sections in the book for further details.


## **Step 1. Installing Libraries and Data loading**

In [None]:
!pip install datasets
!pip install jiwer
!pip install evaluate
!pip install transformers[torch]
!pip install accelerate -U

Collecting datasets
  Downloading datasets-2.17.1-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.17.1 dill-0.3.8 multiprocess-0.70.16
Collecting jiwer
  Downloading jiwer-3.0.3-py3-none-any.whl (21 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━

In [None]:
# Authenticate this session with the Hugging Face Hub via the interactive login widget.
# Presumably required because later cells upload the model to the Hub (`push_to_hub=True`) — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from dataclasses import dataclass
from typing import Any, List, Dict, Union
import re
import evaluate
import torch
from datasets import load_dataset, Audio, metric, DatasetDict
from transformers import WhisperProcessor, WhisperForConditionalGeneration,Seq2SeqTrainingArguments, Seq2SeqTrainer, WhisperFeatureExtractor, WhisperTokenizer, pipeline
from dataclasses import dataclass
from typing import Any, Dict, List, Union

## **Step 2. Data pre-processing**

In [None]:
# Load the "mrj" configuration of Common Voice 13.0 (train and test splits) into one DatasetDict.
# `trust_remote_code=True` is passed explicitly: without it `datasets` prints a warning saying
# the flag will become mandatory for this dataset in its next major release.
common_voice = DatasetDict(
    {
        split_name: load_dataset(
            "mozilla-foundation/common_voice_13_0",
            "mrj",
            split=split_name,
            trust_remote_code=True,
        )
        for split_name in ("train", "test")
    }
)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [None]:
# Keep only the two columns used by the rest of the notebook: the audio clip and its sentence text.
common_voice = common_voice.select_columns(["audio", "sentence"])

In [None]:
# Re-declare the "audio" column as Audio with a 16 kHz sampling rate
# (presumably the rate the Whisper feature extractor expects — confirm).
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

In [None]:
# Module-level helpers used by data_preprocessing and compute_wer_metrics.
# Both come from the pretrained "openai/whisper-small" checkpoint.
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
# NOTE(review): the dataset configuration is "mrj" while the tokenizer language is "Russian";
# presumably Russian is used as a stand-in because the target language is not a Whisper language — confirm.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Russian", task="transcribe")

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

In [None]:
def data_preprocessing(batch):
    """Convert one raw example into the fields consumed during training.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries) and
    ``batch["sentence"]``. Writes two new entries onto the same mapping and returns it:

    * ``"input_features"`` - first item of the module-level ``feature_extractor`` output
    * ``"labels"`` - ``input_ids`` produced by the module-level ``tokenizer``
    """
    clip = batch["audio"]
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    encoded_sentence = tokenizer(batch["sentence"])
    batch["labels"] = encoded_sentence.input_ids

    return batch

# Run data_preprocessing over every example of every split, using 4 worker processes.
# All original columns (taken from the train split's column names) are removed, so after this
# rebinding `common_voice` only holds what data_preprocessing added ("input_features" and
# "labels"); the "audio" and "sentence" columns are no longer available on `common_voice`.
common_voice = common_voice.map(data_preprocessing, remove_columns=common_voice.column_names["train"], num_proc=4)

Map (num_proc=4):   0%|          | 0/7272 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4428 [00:00<?, ? examples/s]

## **Step 3. Model training (fine-tuning)**

In [None]:
@dataclass
class SpeechToTextDataCollator:
    """Collate preprocessed speech examples into one padded batch of tensors.

    Each sample must provide ``"input_features"`` and ``"labels"``. Audio features are
    padded with ``speech_processor.feature_extractor.pad`` and label ids with
    ``speech_processor.tokenizer.pad``. Padded label positions are replaced by ``-100``.

    Attributes:
        speech_processor: object exposing ``feature_extractor`` and ``tokenizer``.
        decoder_start_token_id: id of the leading token to strip from the labels when
            every row starts with it. Defaults to ``None``, which keeps the previous
            behaviour of using ``speech_processor.tokenizer.bos_token_id``.
            NOTE(review): for Whisper tokenizers ``bos_token_id`` may not be the token
            that actually starts the labels; if so, pass
            ``model.config.decoder_start_token_id`` here — confirm against the tokenizer.
    """

    speech_processor: Any
    decoder_start_token_id: Union[int, None] = None

    def __call__(self, samples: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Return the padded batch for ``samples`` with a ``"labels"`` entry added."""
        audio_input_features = [{"input_features": sample["input_features"]} for sample in samples]
        processed_batch = self.speech_processor.feature_extractor.pad(audio_input_features, return_tensors="pt")

        text_label_features = [{"input_ids": sample["labels"]} for sample in samples]
        processed_labels_batch = self.speech_processor.tokenizer.pad(text_label_features, return_tensors="pt")

        # Positions where the attention mask is not 1 are padding; mark them with -100.
        processed_labels = processed_labels_batch["input_ids"].masked_fill(
            processed_labels_batch.attention_mask.ne(1), -100
        )

        # Explicit start-token id wins; otherwise fall back to the tokenizer's BOS id.
        start_token_id = self.decoder_start_token_id
        if start_token_id is None:
            start_token_id = self.speech_processor.tokenizer.bos_token_id

        # Drop the leading token only when every row starts with it. The extra guards avoid
        # indexing a zero-width label tensor and comparing against a missing id.
        if (
            start_token_id is not None
            and processed_labels.shape[1] > 0
            and bool((processed_labels[:, 0] == start_token_id).all())
        ):
            processed_labels = processed_labels[:, 1:]

        processed_batch["labels"] = processed_labels
        return processed_batch

In [None]:
# Load the combined Whisper processor from the same checkpoint and with the same
# language/task settings as the standalone tokenizer; it is handed to the data collator
# and supplies the feature extractor passed to the trainer.
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Russian", task="transcribe")

In [None]:
# Build the collator that pads audio features and label ids into batches for the trainer.
speech_to_text_data_collator = SpeechToTextDataCollator(speech_processor=processor)

In [None]:
def normalize_text(text):
    """Return ``text`` lower-cased with punctuation removed, for WER comparison.

    The text is first converted to Unicode NFC. Without that step a letter written as
    base character + combining mark (e.g. ``а`` + U+0308) would lose its mark, because
    combining marks are not matched by ``\\w`` and would be stripped as punctuation.

    Args:
        text: the string to normalize.

    Returns:
        The normalized string; whitespace is left as it is.
    """
    import unicodedata  # local import keeps this cell runnable on its own

    text = unicodedata.normalize("NFC", text)
    text = text.lower()
    # Drop everything that is neither a word character nor whitespace.
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [None]:
wer_metric = evaluate.load("wer")


def compute_wer_metrics(prediction):
    """Compute the word error rate, as a percentage, for one evaluation pass.

    ``prediction`` must expose ``predictions`` (generated token ids) and ``label_ids``
    (reference token ids). Both are decoded with the module-level ``tokenizer``,
    passed through ``normalize_text`` and scored with ``wer_metric``.

    Note: ``prediction.label_ids`` is modified in place (-100 entries are overwritten).

    Returns:
        ``{"wer": <100 * word error rate>}``
    """
    reference_ids = prediction.label_ids
    # -100 marks ignored label positions; overwrite them with the pad id before decoding.
    reference_ids[reference_ids == -100] = tokenizer.pad_token_id

    def _decode_and_normalize(token_ids):
        decoded = tokenizer.batch_decode(token_ids, skip_special_tokens=True)
        return [normalize_text(item) for item in decoded]

    hypotheses = _decode_and_normalize(prediction.predictions)
    references = _decode_and_normalize(reference_ids)

    score = wer_metric.compute(predictions=hypotheses, references=references)
    return {"wer": 100 * score}

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [None]:
# Load the pretrained "openai/whisper-small" checkpoint that will be fine-tuned.
whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
# Clear the forced decoder ids and the suppressed-token list on the model config
# (presumably so generation is not constrained by the checkpoint's defaults — confirm).
whisper_model.config.forced_decoder_ids = None
whisper_model.config.suppress_tokens = []

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

In [None]:
# Training configuration, grouped by concern. Keyword order has no effect on the result.
whisper_hill_mari_finetuning_args = Seq2SeqTrainingArguments(
    # --- Output location: the local directory and the Hub repository share one name ---
    output_dir="dkhublani/whisper_small_model_fine_tuned",
    push_to_hub=True,
    hub_model_id="dkhublani/whisper_small_model_fine_tuned",
    hub_strategy="every_save",
    # --- Optimisation ---
    max_steps=500,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    lr_scheduler_type="constant_with_warmup",
    warmup_steps=20,
    gradient_checkpointing=True,
    fp16=True,
    # --- Evaluation: every 250 steps, scoring generated text ---
    evaluation_strategy="steps",
    eval_steps=250,
    per_device_eval_batch_size=16,
    fp16_full_eval=True,
    predict_with_generate=True,
    generation_max_length=225,
    # --- Checkpointing and best-model selection (lower WER is better) ---
    save_steps=250,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    # --- Logging ---
    logging_steps=25,
    report_to=["tensorboard"],
)

In [None]:
# Write the processor's files into the training output directory
# (presumably so they are stored and uploaded next to the model checkpoints — confirm).
processor.save_pretrained(whisper_hill_mari_finetuning_args.output_dir)

[]

In [None]:
# Wire the model, data, collator and metric together into the trainer.
whisper_hill_mari_trainer = Seq2SeqTrainer(
    model=whisper_model,
    args=whisper_hill_mari_finetuning_args,
    data_collator=speech_to_text_data_collator,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    compute_metrics=compute_wer_metrics,
    # The feature extractor (not the text tokenizer) is what gets passed in the `tokenizer` slot.
    tokenizer=processor.feature_extractor,
)

In [None]:
# Run fine-tuning. With the training arguments defined above this runs for max_steps=500,
# evaluating and saving a checkpoint every 250 steps.
whisper_hill_mari_trainer.train()

`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss,Wer
250,0.3034,0.363153,41.128277
500,0.1523,0.314081,30.813046


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}
There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


TrainOutput(global_step=500, training_loss=0.4903016772270203, metrics={'train_runtime': 4645.9188, 'train_samples_per_second': 1.722, 'train_steps_per_second': 0.108, 'total_flos': 2.30637451935744e+18, 'train_loss': 0.4903016772270203, 'epoch': 1.1})

In [None]:
# Upload the final model to the Hub repository configured through `hub_model_id`.
# The first positional argument of `push_to_hub` is the commit message, not a repository id
# (passing the repo name there made it the commit message), so pass a real message by keyword.
whisper_hill_mari_trainer.push_to_hub(
    commit_message="End of training: whisper-small fine-tuned on Common Voice 13.0 (mrj)"
)

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


events.out.tfevents.1708549062.e0537022bc02.1266.0:   0%|          | 0.00/9.42k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dkhublani/whisper_small_model_fine_tuned/commit/e0923a10f3c91387943be8136cb81d109b2b0ff0', commit_message='dkhublani/whisper_small_model_fine_tuned', commit_description='', oid='e0923a10f3c91387943be8136cb81d109b2b0ff0', pr_url=None, pr_revision=None, pr_num=None)

## **Step 4. Model Evaluation & Inference**

In [None]:
# Build a speech-recognition pipeline from the fine-tuned model published on the Hub.
# NOTE(review): `pipeline` is already imported in the imports cell, so this re-import is redundant.
from transformers import pipeline
pipe = pipeline("automatic-speech-recognition", model="dkhublani/whisper_small_model_fine_tuned")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
from itertools import islice

from jiwer import wer

# `common_voice` was rebound to the preprocessed dataset, whose "audio" and "sentence" columns
# were removed by the map step, so iterating it here fails on a fresh run. Load the raw test
# split again (same dataset, configuration and 16 kHz audio cast as in the preprocessing section).
raw_test_split = load_dataset(
    "mozilla-foundation/common_voice_13_0", "mrj", split="test", trust_remote_code=True
).cast_column("audio", Audio(sampling_rate=16000))

# Transcribing the whole split clip by clip is slow, so only a few examples are shown.
# Set to None to iterate over the entire test split.
NUM_EXAMPLES_TO_SHOW = 5

# `normalize_text` is the helper defined earlier in the notebook; it is reused here rather
# than redefined, so training-time and inference-time normalization cannot drift apart.
for sample in islice(raw_test_split, NUM_EXAMPLES_TO_SHOW):
    reference = normalize_text(sample["sentence"])
    if not reference.strip():
        # A reference with no words left after normalization cannot be scored; skip it.
        continue
    # The audio dict is copied before being handed to the pipeline, leaving `sample` untouched.
    hypothesis = pipe(sample["audio"].copy(), generate_kwargs={"task": "transcribe"})
    hypothesis = normalize_text(hypothesis['text'])
    print(f"Reference: {reference}")
    print(f"Hypothesis: {hypothesis}")
    print(f"WER: {wer(reference, hypothesis)}")

Reference: колжы миде сеткӓшкӹ
Hypothesis: колжы миде сет кӓшкӹ
WER: 0.6666666666666666
Reference: йынгы йыла стихотворени отважный морякреволюционер макаров лӹмеш сирӹмӹ
Hypothesis: йынгы йыла стихотворени утважный моряк революционер макаров лӹмеш сирӹмӹ
WER: 0.375
Reference: кырыквлӓ ӓнгӹрвлӓм йоктарат
Hypothesis: карыквлӓ ӓнгӹрвлӓм йыктарат
WER: 0.6666666666666666
Reference: а микитӓм ӓштет тӹнь 
Hypothesis: абикитӓм ӓштет тӹнь
WER: 0.5
Reference: райжы вет уке ылеш
Hypothesis: райжы вет у келеш
WER: 0.5


KeyboardInterrupt: 