# Downloading The Required Packages:
- *datasets*: To download and prepare our data.
- *transformers*: To load and train our Whisper model.
- *soundfile*: To pre-process audio files.
- *evaluate* and *jiwer*: To measure the performance of our model.

In [None]:
!pip install datasets
!pip install transformers
!pip install accelerate
!pip install soundfile
!pip install librosa
!pip install evaluate

# Loading The Dataset:

In [2]:
!cd /content/

You need to log in to Hugging Face to download some models and datasets that require accepting their terms.

In [3]:
from huggingface_hub import notebook_login

# Display the Hugging Face login widget; logging in is needed for models and
# datasets that require accepting their terms before download.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Loading Common Voice 13:

In [4]:
from datasets import load_dataset, DatasetDict

# The Arabic dataset is small, so the training and validation splits are merged
# into a single "train" split; the "test" split is kept as it is.
common_voice = DatasetDict(
    {
        split_name: load_dataset("mozilla-foundation/common_voice_13_0", "ar", split=split_spec)
        for split_name, split_spec in (("train", "train+validation"), ("test", "test"))
    }
)

Downloading builder script:   0%|          | 0.00/8.18k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.7k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.65k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/717M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/299M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/311M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/995M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/450M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/6.95M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.53M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.62M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.80M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]


Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 8357it [00:00, 83558.52it/s][A
Reading metadata...: 28167it [00:00, 120894.82it/s]


Generating validation split: 0 examples [00:00, ? examples/s]


Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 10409it [00:00, 96596.68it/s] 


Generating test split: 0 examples [00:00, ? examples/s]


Reading metadata...: 10445it [00:00, 149267.30it/s]


Generating other split: 0 examples [00:00, ? examples/s]


Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 15655it [00:00, 156539.70it/s][A
Reading metadata...: 36192it [00:00, 167572.70it/s]


Generating invalidated split: 0 examples [00:00, ? examples/s]


Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 14994it [00:00, 128611.85it/s]


In [5]:
# Inspect the available splits with their feature names and row counts.
print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 38576
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 10445
    })
})


In [6]:
# Drop every metadata column: only the audio and its transcription are needed.
common_voice = common_voice.remove_columns(
    [
        "client_id",
        "path",
        "up_votes",
        "down_votes",
        "age",
        "gender",
        "accent",
        "locale",
        "segment",
        "variant",
    ]
)

print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 38576
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 10445
    })
})


In [7]:
# Look at one raw training sample: the decoded audio (path, array, sampling rate) and its sentence.
print(common_voice["train"][0])

{'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/16dcf9a265646675365f5a6dca5286bafd824e745b8f99fad6d37251d933b4cd/ar_train_0/common_voice_ar_24082672.mp3', 'array': array([-2.13162821e-14, -1.01252340e-13, -3.55271368e-14, ...,
        1.26173454e-08,  1.54317320e-07,  1.32340631e-07]), 'sampling_rate': 48000}, 'sentence': 'وما أدراك ما يوم الدين'}


# Preparing The dataset:

## Preparing Feature Extractor & Tokenizer:



* Feature Extractor:
  * Transforms audio into 30s clips, either by truncating them if they are longer than 30s or by padding them with silence if they are shorter.
  This is essential since audio files can have different durations, which would otherwise give the extracted features a different length for each audio file.
  * Transforms audio into a log-Mel spectrogram, which is what the model expects as input.







* Tokenizer:
  * Transforms the output of the model (token IDs) to their respective text.





In [8]:
from transformers import WhisperProcessor

# WhisperProcessor combines both the feature extractor and the tokenizer.
# The language and task are passed along so the tokenizer is configured for Arabic transcription.
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny", language="Arabic", task="transcribe")

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

## Getting Dataset Ready:

In [9]:
# We need to change the sampling rate from 48 kHz to 16 kHz since this is what Whisper expects
from datasets import Audio

# cast_column does not rewrite the audio files: datasets performs the resampling
# on the fly whenever a sample is loaded
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

In [10]:
# Accessing a sample loads its audio into memory, which automatically resamples it to 16 kHz
print(common_voice["train"][0])

{'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/16dcf9a265646675365f5a6dca5286bafd824e745b8f99fad6d37251d933b4cd/ar_train_0/common_voice_ar_24082672.mp3', 'array': array([ 8.73114914e-11, -4.36557457e-11,  1.67347025e-10, ...,
       -2.33121682e-08, -2.25205440e-07,  4.55183908e-08]), 'sampling_rate': 16000}, 'sentence': 'وما أدراك ما يوم الدين'}


In [16]:
# Round-trip check: tokenize one transcription, then decode it back twice,
# first keeping the special tokens and then skipping them.
sentence = common_voice["train"][0]["sentence"]
labels = processor.tokenizer(sentence)
decoded_with_special, decoded = (
    processor.tokenizer.decode(labels.input_ids, skip_special_tokens=skip_special)
    for skip_special in (False, True)
)
print(decoded_with_special, '*' * 100, decoded, sep="\n")

<|startoftranscript|><|ar|><|transcribe|><|notimestamps|>وما أدراك ما يوم الدين<|endoftext|>
****************************************************************************************************
وما أدراك ما يوم الدين


In [17]:
def prepare_dataset(data_item):
    """Add the model inputs and the target label ids to a single dataset item.

    Args:
        data_item: mapping with an "audio" entry (providing "array" and
            "sampling_rate") and a "sentence" entry holding the transcription.

    Returns:
        The same mapping, updated in place with two new entries:
        "input_features" (log-Mel features of the audio) and "labels"
        (token ids of the sentence). The original entries are still present;
        dropping them is left to the caller.
    """
    waveform = data_item["audio"]
    transcription = data_item["sentence"]

    # the feature extractor is given a single audio array, so the features of
    # this item are the first (and only) entry of the returned batch
    extracted = processor.feature_extractor(waveform["array"], sampling_rate=waveform["sampling_rate"])
    data_item["input_features"] = extracted["input_features"][0]

    # the target text becomes a sequence of label ids
    encoded = processor.tokenizer(transcription)
    data_item["labels"] = encoded["input_ids"]

    return data_item

In [36]:
# input features are the same length, but labels aren't
# The dataset-wide map only runs in a later cell, so on a fresh kernel the samples
# have no "input_features"/"labels" entries yet and indexing them raises KeyError.
# Prepare the three preview samples on demand; samples that are already prepared
# (cell re-run after the map) are used as they are.
preview_items = []
for index in range(3):
    item = common_voice["train"][index]
    if "input_features" not in item:
        item = prepare_dataset(item)
    preview_items.append(item)

for item in preview_items:
    print(len(item["input_features"]))
for item in preview_items:
    print(len(item["labels"]))

80
80
80
16
23
13


In [None]:
# apply prepare_dataset to every split of the DatasetDict (train and test) and remove
# the original columns (audio and sentence), so only input_features and labels remain
# num_proc=2 runs the map in two worker processes
common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=2)

In [96]:
# debugging data collator
def testing_func(data_item):
    """Trace, step by step, how a small batch is collated.

    Args:
        data_item: despite its name, a list of prepared examples (a batch);
            each example provides "input_features" and "labels". At least two
            examples are required because the second one is printed as well.

    The intermediate result of every stage is printed; nothing is returned.
    """
    # --- audio inputs -------------------------------------------------------
    input_features = [{"input_features": feature["input_features"]} for feature in data_item]

    print(len(input_features))
    print(type(input_features))
    print(type(input_features[0]))
    print(input_features[0].keys())
    print('*' * 100)

    batch = processor.feature_extractor.pad(input_features, return_tensors="pt")

    print("size of audio feature vector before padding: " + str(len(data_item[0]["input_features"])))
    print("size of audio feature vector after padding: " + str(len(batch["input_features"][0])))
    print('*' * 100)

    print(len(batch["input_features"]))
    print(type(batch))
    print(type(batch["input_features"]))
    print(batch.keys())
    print(batch["input_features"])
    print('*' * 100)

    # --- labels -------------------------------------------------------------
    label_features = [{"input_ids": feature["labels"]} for feature in data_item]
    labels_batch = processor.tokenizer.pad(label_features, return_tensors="pt")

    print("size of labels vector 1 before padding: " + str(len(data_item[0]["labels"])))
    print("size of labels vector 2 before padding: " + str(len(data_item[1]["labels"])))
    # these two sizes are read from the padded batch, so they are "after padding"
    print("size of labels vector 1 after padding: " + str(len(labels_batch["input_ids"][0])))
    print("size of labels vector 2 after padding: " + str(len(labels_batch["input_ids"][1])))
    print('*' * 100)

    print(labels_batch.keys())
    print(type(labels_batch["input_ids"]))
    print(labels_batch["input_ids"][0])
    print(labels_batch["input_ids"][1])
    print(type(labels_batch["attention_mask"]))
    print(labels_batch["attention_mask"][0])
    print(labels_batch["attention_mask"][1])
    print('*' * 100)

    # positions whose attention mask is not 1 are padding; they are replaced by -100
    labels = labels_batch["input_ids"].masked_fill(labels_batch["attention_mask"].ne(1), -100)

    print(labels[0])
    print(labels[1])
    print('*' * 100)

    print(labels[:,0])
    print(processor.tokenizer.bos_token_id)
    print('*' * 100)

    # NOTE(review): the decoded sample earlier in the notebook starts with
    # <|startoftranscript|>; confirm that bos_token_id is that token's id,
    # otherwise "Entered" is never printed and the first token is kept.
    if (labels[:, 0] == processor.tokenizer.bos_token_id).all().cpu().item():
        print("Entered")
        labels = labels[:, 1:]
    print(labels[0])
    print(labels[1])
    print(processor.tokenizer.bos_token_id)
    print('*' * 100)

    batch["labels"] = labels
    print(batch.keys())
testing_func([common_voice["train"][0], common_voice["train"][1]])

2
<class 'list'>
<class 'dict'>
dict_keys(['input_features'])
****************************************************************************************************
size of audio feature vector before padding: 80
size of audio feature vector after padding: 80
****************************************************************************************************
2
<class 'transformers.feature_extraction_utils.BatchFeature'>
<class 'torch.Tensor'>
dict_keys(['input_features'])
tensor([[[-0.5928, -0.5928, -0.5928,  ..., -0.5928, -0.5928, -0.5928],
         [-0.5928, -0.5928, -0.5928,  ..., -0.5928, -0.5928, -0.5928],
         [-0.5928, -0.5928, -0.5928,  ..., -0.5928, -0.5928, -0.5928],
         ...,
         [-0.5928, -0.5928, -0.5928,  ..., -0.5928, -0.5928, -0.5928],
         [-0.5928, -0.5928, -0.5928,  ..., -0.5928, -0.5928, -0.5928],
         [-0.5928, -0.5928, -0.5928,  ..., -0.5928, -0.5928, -0.5928]],

        [[-0.6717, -0.6717, -0.6717,  ..., -0.6717, -0.6717, -0.6717],
         [-0

In [97]:
# creating a class to get the data and batch it
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate prepared examples into one padded batch of tensors.

    Each example must provide "input_features" (audio features) and "labels"
    (token ids). Audio features and labels are padded separately because they
    have different lengths and need different padding methods.

    Attributes are documented next to their declarations below.
    """

    # Object exposing `feature_extractor.pad(...)` and `tokenizer.pad(...)`.
    processor: Any
    # Id of the token that starts every label sequence and must be stripped
    # because it is prepended again later. When left as None it is looked up
    # in the tokenizer as "<|startoftranscript|>", the token the label
    # sequences in this notebook start with. The previous comparison against
    # tokenizer.bos_token_id only strips that token if both ids coincide.
    decoder_start_token_id: Union[int, None] = None

    def __call__(self, data_batch: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build the batch.

        Args:
            data_batch: list of examples, each with "input_features" and "labels".

        Returns:
            The padded feature batch (as returned by the feature extractor's
            `pad`) with an added "labels" entry in which padded positions are
            set to -100 and a common leading start token has been removed.
        """
        # audio inputs: wrap each feature under the key the feature extractor expects
        # and let it stack them into PyTorch tensors ("pt"); the features were already
        # brought to a fixed size during preparation
        input_features = [{"input_features": feature["input_features"]} for feature in data_batch]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # labels: pad every tokenized sequence to the longest one in the batch.
        # The returned attention_mask has 1 for real tokens and 0 for padded positions,
        # e.g. for lengths 16 and 23 the first mask holds 0s from index 16 to 22.
        label_features = [{"input_ids": feature["labels"]} for feature in data_batch]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 so these positions are ignored when computing the loss
        labels = labels_batch["input_ids"].masked_fill(labels_batch["attention_mask"].ne(1), -100)

        start_token_id = self.decoder_start_token_id
        if start_token_id is None:
            start_token_id = self.processor.tokenizer.convert_tokens_to_ids("<|startoftranscript|>")

        # if every sequence in the batch begins with the start token, cut it off here
        # since it is prepended again later;
        # .all() checks the whole batch, .cpu().item() turns the tensor into a Python bool
        if (labels[:, 0] == start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [98]:
# Instantiate the collator with the processor loaded earlier (feature extractor + tokenizer).
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

# Fine-tuning:

## Evaluation Metric:
We will use WER for evaluation.

In [None]:
import evaluate

# Load the word error rate (WER) metric used to evaluate the model.
metric = evaluate.load("wer")

In [None]:
# Show the description of the loaded metric.
print(metric)