# Downloading The Required Packages:
- *datasets*: To download and prepare our data.
- *transformers* and *accelerate*: To load and train our Whisper model.
- *soundfile* and *librosa*: To pre-process audio files. (used internally)
- *evaluate* and *jiwer*: To measure the performance of our model. (used internally)

In [None]:
# Each `!` line is handed to the shell, so pip runs once per package, in this order.
!pip install datasets
!pip install transformers
!pip install accelerate
!pip install soundfile
!pip install librosa
!pip install evaluate
!pip install jiwer

# Loading The Dataset:

In [None]:
!cd /content/

You need to log in to Hugging Face to download models and datasets that require accepting their terms.

In [None]:
from huggingface_hub import notebook_login

notebook_login()

## Loading Common Voice 13:

In [None]:
from datasets import load_dataset, DatasetDict

# The Arabic subset is small, so training uses the train and validation splits merged together.
common_voice = DatasetDict(
    {
        split_name: load_dataset("mozilla-foundation/common_voice_13_0", "ar", split=split_expr)
        for split_name, split_expr in (("train", "train+validation"), ("test", "test"))
    }
)

In [None]:
print(common_voice)

In [None]:
# Only the audio and its transcription are needed; all other metadata columns are dropped.
unused_columns = [
    "client_id",
    "path",
    "up_votes",
    "down_votes",
    "age",
    "gender",
    "accent",
    "locale",
    "segment",
    "variant",
]
common_voice = common_voice.remove_columns(unused_columns)

print(common_voice)

In [None]:
print(common_voice["train"][0])

# Preparing The dataset:

## Preparing Feature Extractor & Tokenizer:



* Feature Extractor:
  * Brings every audio sample to exactly 30s: samples longer than 30s are truncated, and shorter ones are padded with silence.
  This is essential because audio files have different durations, which would otherwise make the extracted features differ in length from one sample to the next.
  * Transforms the audio into a log-mel spectrogram, which is what the model expects as input.







* Tokenizer:
  * Converts transcription text into token IDs (used as the training labels) and converts the output of the model (token IDs) back to their respective text.





In [None]:
from transformers import WhisperProcessor

# WhisperProcessor bundles the feature extractor and the tokenizer in a single object
# (used below as processor.feature_extractor and processor.tokenizer)
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny", language="Arabic", task="transcribe")

## Getting Dataset Ready:

In [None]:
# Whisper expects 16kHz audio, so the 48kHz recordings have to be resampled
from datasets import Audio

# cast_column does not resample right away: datasets resamples on the fly whenever a row's audio is loaded
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

In [None]:
# loading it into memory like this will automatically cast it to 16KHz
print(common_voice["train"][0])

In [None]:
# Round-trip one transcription through the tokenizer, decoding it with and without special tokens.
sentence = common_voice["train"][0]["sentence"]
labels = processor.tokenizer(sentence)
token_ids = labels.input_ids
decoded_with_special, decoded = (
    processor.tokenizer.decode(token_ids, skip_special_tokens=skip_special)
    for skip_special in (False, True)
)
print(decoded_with_special, '*' * 100, decoded, sep="\n")

In [None]:
def prepare_dataset(data_item):
    """Add model inputs and target token ids to one dataset row.

    Reads the decoded audio and the transcription from ``data_item`` and
    stores two new keys on it: ``input_features`` (log-Mel features computed
    from the audio array) and ``labels`` (token ids of the transcription).

    The row is updated in place and returned. It still carries its original
    ``audio`` and ``sentence`` keys; dropping those is left to the caller.
    """
    waveform = data_item["audio"]
    transcript = data_item["sentence"]

    # The feature extractor returns a batch; this row is its only element.
    extracted = processor.feature_extractor(
        waveform["array"], sampling_rate=waveform["sampling_rate"]
    )
    data_item["input_features"] = extracted["input_features"][0]

    # Tokenize the transcription to get the label ids.
    encoded = processor.tokenizer(transcript)
    data_item["labels"] = encoded["input_ids"]

    return data_item

In [None]:
# apply prepare_dataset to every split of the DatasetDict (train and test), using 2 worker processes
# remove_columns drops the columns that the train split had before mapping (audio and sentence), leaving input_features and labels
common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=2)

In [None]:
# Every row has the same number of input features, while label lengths vary with the sentence.
for column in ("input_features", "labels"):
    for row_index in range(3):
        print(len(common_voice["train"][row_index][column]))

In [None]:
# debugging data collator
def testing_func(data_item):
    """Walk through the collation steps on a small batch, printing every intermediate.

    Debugging aid that mirrors the data collator: pad the audio features,
    pad the label ids, replace label padding with -100, and drop the first
    label column when every sequence starts with the tokenizer's BOS token.
    Nothing is returned; all results are printed.

    Args:
        data_item: List of at least two prepared rows, each a mapping with
            ``input_features`` and ``labels`` keys (rows 0 and 1 are printed).
    """
    separator = '*' * 100

    input_features = [{"input_features": feature["input_features"]} for feature in data_item]

    print(len(input_features))
    print(type(input_features))
    print(type(input_features[0]))
    print(input_features[0].keys())
    print(separator)

    batch = processor.feature_extractor.pad(input_features, return_tensors="pt")

    print(f"size of audio feature vector before padding: {len(data_item[0]['input_features'])}")
    print(f"size of audio feature vector after padding: {len(batch['input_features'][0])}")
    print(separator)

    print(len(batch["input_features"]))
    print(type(batch))
    print(type(batch["input_features"]))
    print(batch.keys())
    print(batch["input_features"])
    print(separator)

    label_features = [{"input_ids": feature["labels"]} for feature in data_item]
    labels_batch = processor.tokenizer.pad(label_features, return_tensors="pt")

    print(f"size of labels vector 1 before padding: {len(data_item[0]['labels'])}")
    print(f"size of labels vector 2 before padding: {len(data_item[1]['labels'])}")
    # These two lengths come from the padded batch, so they are labelled "after padding".
    print(f"size of labels vector 1 after padding: {len(labels_batch['input_ids'][0])}")
    print(f"size of labels vector 2 after padding: {len(labels_batch['input_ids'][1])}")
    print(separator)

    print(labels_batch.keys())
    print(type(labels_batch["input_ids"]))
    print(labels_batch["input_ids"][0])
    print(labels_batch["input_ids"][1])
    print(type(labels_batch["attention_mask"]))
    print(labels_batch["attention_mask"][0])
    print(labels_batch["attention_mask"][1])
    print(separator)

    # Positions where the attention mask is not 1 are padding; mark them with -100.
    labels = labels_batch["input_ids"].masked_fill(labels_batch["attention_mask"].ne(1), -100)

    print(labels[0])
    print(labels[1])
    print(separator)

    print(labels[:, 0])
    print(processor.tokenizer.bos_token_id)
    print(separator)

    # Drop the first column only when every sequence in the batch starts with the BOS token.
    if (labels[:, 0] == processor.tokenizer.bos_token_id).all().cpu().item():
        print("Entered")
        labels = labels[:, 1:]
    print(labels[0])
    print(labels[1])
    print(processor.tokenizer.bos_token_id)
    print(separator)

    batch["labels"] = labels
    print(batch.keys())
    print(batch["labels"][0])
testing_func([common_voice["train"][0], common_voice["train"][1]])

In [None]:
# creating a class to get the data and batch it
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorWhisper:
    """Collate prepared rows into one batch of PyTorch tensors.

    Audio features and label ids are handled separately because they need
    different padding: the features go through ``feature_extractor.pad`` and
    the labels through ``tokenizer.pad``.

    Pass ``decoder_start_token_id=model.config.decoder_start_token_id`` to
    strip the token the model prepends again when it builds decoder inputs
    from the labels. When it is left as ``None`` the collator compares against
    ``tokenizer.bos_token_id``, exactly as it did before the field existed.
    """

    # Object exposing `feature_extractor.pad` and `tokenizer.pad` (e.g. a WhisperProcessor).
    processor: Any
    # Token id to strip from the front of the labels when every sequence starts with it.
    # None falls back to `processor.tokenizer.bos_token_id`.
    decoder_start_token_id: Union[int, None] = None

    def __call__(self, data_batch: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build the batch for one training or evaluation step.

        Args:
            data_batch: Rows with ``input_features`` and ``labels`` keys.

        Returns:
            The mapping returned by ``feature_extractor.pad`` with an extra
            ``labels`` entry holding the padded label ids, where padding
            positions are set to -100.
        """
        # Audio features: wrap each row the way pad() expects and get PyTorch tensors back ("pt").
        input_features = [{"input_features": feature["input_features"]} for feature in data_batch]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Labels: pad every sequence to the longest one in the batch.
        # The returned attention_mask holds 1 for real tokens and 0 for padded positions,
        # e.g. for label lengths 16 and 23 the first mask row has 0s at indices 16..22.
        label_features = [{"input_ids": feature["labels"]} for feature in data_batch]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace padding with -100 so those positions are ignored when the loss is computed.
        labels = labels_batch["input_ids"].masked_fill(labels_batch["attention_mask"].ne(1), -100)

        start_token_id = self.decoder_start_token_id
        if start_token_id is None:
            start_token_id = self.processor.tokenizer.bos_token_id

        # If the tokenization step already put the start token first in every sequence,
        # remove it here because it is added again later.
        # .all() requires the condition to hold for the whole batch; .cpu().item() turns
        # the one-element tensor into a plain bool for the `if`.
        if (labels[:, 0] == start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [None]:
data_collator = DataCollatorWhisper(processor=processor)

# Fine-tuning:

## Evaluation Metric:
We will use WER for evaluation.

In [None]:
import evaluate

metric = evaluate.load("wer")

In [None]:
print(metric)

In [None]:
def compute_metrics(pred):
    """Compute the word error rate, in percent, for one evaluation pass.

    Args:
        pred: Object with ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, with -100 at ignored positions).

    Returns:
        A dict with a single ``"wer"`` entry: 100 times the value returned by
        the WER metric for the decoded predictions against the decoded labels.
    """
    pred_ids = pred.predictions

    # Work on a copy so the caller's label array keeps its -100 markers.
    label_ids = pred.label_ids.copy()

    # -100 is not a real token id, so swap it for the pad token before decoding;
    # the pad token is then dropped again by skip_special_tokens.
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

## Loading Pre-trained Model:

In [None]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

Since we are fine tuning on a specific language, we need to remove some of the restrictions of the model's generation.



* Forced Decoder IDs:
  These control the transcription language and task for zero-shot automatic speech recognition (ASR). They are a mechanism for forcing specific token IDs to appear at the start of the model's output, before it continues generating the rest autoregressively (one token at a time). In effect they tell the model, "Start your output with these specific tokens, and then continue generating the rest." For example, the forced tokens can select the language to generate or the type of task, such as transcription. By setting this to None, we configure the model not to force any specific token IDs during generation, that is, whenever it predicts token IDs that are then decoded into text (including the evaluation runs during training).

* Suppress Tokens:
This deals with tokens that the model should never generate, like harmful and inappropriate tokens. When setting it to an empty list, we're indicating that no tokens should be suppressed. This is a form of configuration that ensures all possible tokens can be sampled during the generation, which is often desired during the fine-tuning process.






In [None]:
# do not force any token ids at the start of generation
model.config.forced_decoder_ids = None
# empty list: no token is suppressed during generation
model.config.suppress_tokens = []

## Defining The Training Configuration:

In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    # NOTE(review): relative path, resolved against the current working directory; if that is /content the checkpoints land in /content/content/whisper-tiny-ar — confirm this is intended
    output_dir="./content/whisper-tiny-ar",
    per_device_train_batch_size=16, # batch size for training per GPU/CPU
    learning_rate=1e-5,
    warmup_steps=500, # linear warmup (from 0 to learning_rate)
    max_steps=4000, # one step processes one batch of data and then updates the model parameters; training runs for 4000 steps regardless of how many epochs that covers
    gradient_checkpointing=True, # saves memory by recomputing some activations during the backward pass instead of storing them all; this takes more time
    fp16=True, # use 16-bit mixed precision during training instead of 32-bit
    # NOTE(review): newer transformers releases rename this argument to eval_strategy — confirm against the installed version
    evaluation_strategy="steps", # evaluation is done every eval_steps
    eval_steps=1000,
    per_device_eval_batch_size=8, # batch size for evaluation per GPU/CPU
    predict_with_generate=True, # lets the model generate entire sequences for evaluation, instead of just single tokens
    generation_max_length=225, # the max number of tokens to be generated during evaluation
    save_steps=1000, # a checkpoint is saved every 1000 steps
    logging_steps=25, # logs are reported every 25 steps
    load_best_model_at_end=True, # the best model will be loaded at the end of training
    metric_for_best_model="wer", # this metric is used to compare checkpoints during training and pick the best model
    greater_is_better=False, # a lower WER means a better model
)

In [None]:
from transformers import Seq2SeqTrainer

# wire together everything defined above: training configuration, model, prepared splits, collator and metric function
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"], # merged train+validation split
    eval_dataset=common_voice["test"],
    data_collator=data_collator, # pads each batch and builds the labels tensor
    compute_metrics=compute_metrics, # reports WER in percent
    tokenizer=processor.tokenizer,
)

In [None]:
processor.save_pretrained(training_args.output_dir)

In [None]:
trainer.train()