# -- Whisper Fine-Tuning for Arabic Speech Recognition --

This notebook demonstrates **fine-tuning OpenAI's Whisper model** on the **Common Voice Arabic dataset** using Hugging Face Transformers.

📌 **Notebook Steps:**

1. Install dependencies and import libraries  
2. Load and prepare Common Voice Arabic dataset  
3. Preprocess text (clean and normalize Arabic)  
4. Convert audio + labels into model inputs  
5. Define evaluation metric (Word Error Rate)  
6. Load Whisper model and processor  
7. Define training configuration and data collator  
8. Train the model  
9. Evaluate on test data  
10. Save and reload model for inference  
11. Transcribe sample audio from test set


# Install Required Libraries

In [None]:
!pip install datasets
!pip install transformers
!pip install jiwer
!pip install evaluate



# Hugging Face Login

In [None]:
from huggingface_hub import login
secret_key = ""
login(token = secret_key)

#  Load Dataset

In [None]:
from datasets import load_dataset, DatasetDict

dataset = DatasetDict()
dataset['train'] = load_dataset("mozilla-foundation/common_voice_12_0", "ar", split="train[57%:]", trust_remote_code=True)
dataset['validation'] = load_dataset("mozilla-foundation/common_voice_12_0", "ar", split="validation[90%:]", trust_remote_code=True)
dataset['test'] = load_dataset("mozilla-foundation/common_voice_12_0", "ar", split="test[65%:]", trust_remote_code=True)

# Keep only audio and sentence columns
columns_to_keep = ['audio', 'sentence']
dataset = dataset.remove_columns([col for col in dataset['train'].column_names if col not in columns_to_keep])


## Dataset shape

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 12124
    })
    validation: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 1035
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 3652
    })
})

## Filter and Normalize Text
### - Text Cleaning and Normalization

This step cleans the Arabic text by:

- Removing diacritics and unwanted symbols  
- Normalizing similar characters (e.g., "أ", "إ", "آ" → "ا")  
- Removing non-Arabic characters and extra whitespace

This ensures that the text is consistent for model training.


In [None]:
import re

def filter_empty(example):
    return example["sentence"].strip() != ""

def text_preprocessing(batch):
    batch['sentence'] = re.sub(r'[\u0617-\u061A\u064B-\u0652]', '', batch['sentence'])
    batch['sentence'] = re.sub(r'[^\u0621-\u063A\u0641-\u064A\s]', '', batch['sentence'])
    batch['sentence'] = re.sub(r'[إأآا]', 'ا', batch['sentence'])
    batch['sentence'] = re.sub(r'ى', 'ي', batch['sentence'])
    batch['sentence'] = re.sub(r'ؤ', 'و', batch['sentence'])
    batch['sentence'] = re.sub(r'ئ', 'ي', batch['sentence'])
    batch['sentence'] = re.sub(r'ة', 'ه', batch['sentence'])
    batch['sentence'] = re.sub(r'\s+', ' ', batch['sentence']).strip()
    return batch

dataset = dataset.filter(filter_empty)
dataset = dataset.map(text_preprocessing)

Filter:   0%|          | 0/12124 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3624 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4173 [00:00<?, ? examples/s]

Map:   0%|          | 0/12124 [00:00<?, ? examples/s]

Map:   0%|          | 0/3624 [00:00<?, ? examples/s]

Map:   0%|          | 0/4173 [00:00<?, ? examples/s]

#  Load Whisper Model and Processor

In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="ar", task="transcribe")

# Print basic model architecture info
print("Model base:", model.base_model_prefix)
print("Model architecture:", model.__class__.__name__)
print("Model size (parameters):", sum(p.numel() for p in model.parameters()) / 1e6, "M")

# Print tokenizer info
print("Tokenizer vocab size:", processor.tokenizer.vocab_size)

2025-05-06 15:33:09.074542: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746545589.259646      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746545589.313556      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Model base: model
Model architecture: WhisperForConditionalGeneration
Model size (parameters): 241.734912 M
Tokenizer vocab size: 50258


In [None]:
model

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
        

## Audio & Text Preprocessing
### - Audio and Text Preprocessing: `prepare_dataset(batch)`

This function processes each audio-text pair into model-ready inputs:

1. Converts audio waveform into input features using Whisper's processor  
2. Tokenizes the normalized text into label IDs  
3. Returns `input_features` and `labels` ready for training

This mapping is applied to the full dataset.


In [None]:
import librosa  # Ensure you have librosa for resampling

def prepare_dataset(batch):
    audio = batch["audio"]
    # Resample audio from 48kHz to 16kHz (Whisper's expected sampling rate)
    audio_resampled = librosa.resample(audio["array"], orig_sr=audio["sampling_rate"], target_sr=16000)
    # Process the resampled audio with Whisper's feature extractor
    inputs = processor(audio_resampled, sampling_rate=16000, return_tensors="pt")
    # Store input features and labels
    batch["input_features"] = inputs.input_features[0]
    batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids

    return batch

dataset = dataset.map(prepare_dataset, remove_columns=dataset["train"].column_names)


Map:   0%|          | 0/282 [00:00<?, ? examples/s]

Map:   0%|          | 0/518 [00:00<?, ? examples/s]

Map:   0%|          | 0/522 [00:00<?, ? examples/s]

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 282
    })
    validation: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 518
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 522
    })
})

# Define Data Collator
### - What is a Data Collator?

The **data collator** is used to dynamically pad input and label sequences within each training batch.
Since audio inputs and their corresponding tokenized labels can have variable lengths, the data collator:

- Pads all `input_features` (audio representations) to the length of the longest in the batch.
- Pads all `labels` (token IDs) to the length of the longest label sequence in the batch.
- Ensures consistent tensor shapes for efficient training.

This is essential for feeding batched data into the model correctly during training and evaluation.


In [None]:
import torch

def data_collator(features):
    input_features = [torch.tensor(f["input_features"]) for f in features]
    input_features = torch.nn.utils.rnn.pad_sequence(input_features, batch_first=True, padding_value=0)

    labels = [torch.tensor(f["labels"]) for f in features]
    labels = torch.nn.utils.rnn.pad_sequence(
        labels, batch_first=True, padding_value=processor.tokenizer.pad_token_id or -100
    )

    return {
        "input_features": input_features,
        "labels": labels
    }


# Evaluation Metric (WER)

**Word Error Rate (WER)** is the standard metric for evaluating speech recognition models.  
It measures how many words were incorrectly predicted, and is calculated as:

`[
text{WER} = frac{S + D + I}{N}
]`

Where:
- **S** = Substitutions  
- **D** = Deletions  
- **I** = Insertions  
- **N** = Total words in the reference (ground truth)

Lower WER means better transcription performance.


In [None]:
import evaluate
wer_metric = evaluate.load('wer')
def get_wer(reference, prediction):
  wer_results = wer_metric.compute(predictions=predictions, references=references)
  return wer_results

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

# Define Training Arguments

In [None]:
from transformers import Seq2SeqTrainingArguments
from transformers import TrainerCallback

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-ar-finetuned",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    eval_strategy="epoch",  # No evaluation during training
    save_strategy="epoch",
    num_train_epochs=3,
    gradient_accumulation_steps=2,
    fp16=True,
    save_steps=500,
    logging_steps=500,
    logging_strategy="steps",
    learning_rate=1e-4,
    warmup_steps=500,
    save_total_limit=1,
    push_to_hub=False,
    report_to="none",
    load_best_model_at_end=False,
    length_column_name="input_length",
    remove_unused_columns=False,
    predict_with_generate=False,
    eval_accumulation_steps=8,
)


class PrintLossCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        print(f">>> Step {state.global_step}: {logs}")


class WEREvery2EpochsCallback(TrainerCallback):  ## * note -> I don't use this class because it still have OOM ERROR :'-(
    def on_epoch_begin(self, args, state, control, **kwargs):
        if state.epoch % 2 < 1e-6:
            trainer.args.predict_with_generate = True
        else:
            trainer.args.predict_with_generate = False

#  Create Trainer and Start Training

In [None]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=processor.tokenizer,
    data_collator=data_collator,
    #compute_metrics = get_wer,
    callbacks = [TrainerCallback()]
)

  trainer = Seq2SeqTrainer(


## Training Progress

In [8]:
trainer.train()

epoch,train_loss,eval_loss
1,0.6326,0.6956
2,0.2641,0.6268
3,0.1291,0.5529


In [None]:
trainer.args.predict_with_generate = True # Make sure generation is enable

# Save Fine-Tuned Model and Processor

In [None]:
# Save the fine-tuned Whisper model and processor
model_path = "./whisper-arabic-finetuned-best"
trainer.save_model(model_path)
processor.save_pretrained(model_path)

print(f"\n >>> Model and processor saved to: {model_path}")


# Final Evaluation on Training Set

In [None]:
import pandas as pd
import numpy as np

print("----------------------")
predictions = trainer.predict(dataset["train"])
pred_str = processor.batch_decode(predictions.predictions, skip_special_tokens=True)
wer_metric = evaluate.load("wer")

print("----------------------")

# Decode references (labels)
label_ids = predictions.label_ids
label_ids[label_ids == -100] = processor.tokenizer.pad_token_id  # Masking
label_str = processor.batch_decode(label_ids, skip_special_tokens=True)
wers = [wer_metric.compute(predictions=[p], references=[r]) for p, r in zip(pred_str, label_str)]
df_train = pd.DataFrame({
    "reference": label_str,
    "prediction": pred_str,
    "wer": wers
})

df_train.to_csv("wer_train.csv", index=False)
df_train.head()

In [None]:
print("\n--- Final Evaluation on Training Set ---\n")
print(f'\n TRAINING WER : {df_train['wer'].mean()}')

# Final Evaluation on Validation Set

In [None]:
print("----------------------")
predictions = trainer.predict(dataset["validation"])
pred_str = processor.batch_decode(predictions.predictions, skip_special_tokens=True)
wer_metric = evaluate.load("wer")

print("----------------------")

# Decode references (labels)
label_ids = predictions.label_ids
label_ids[label_ids == -100] = processor.tokenizer.pad_token_id  # Masking
label_str = processor.batch_decode(label_ids, skip_special_tokens=True)
wers = [wer_metric.compute(predictions=[p], references=[r]) for p, r in zip(pred_str, label_str)]
df_valid = pd.DataFrame({
    "reference": label_str,
    "prediction": pred_str,
    "wer": wers
})

df_valid.to_csv("wer_valid.csv", index=False)
df_valid.head()

In [None]:
print("\n--- Final Evaluation on Validation Set ---\n")
print(f'\n VALIDATION WER : {df_valid['wer'].mean()}')

#  Final Evaluation on Test Set

In [None]:
import pandas as pd
print("----------------------")
predictions = trainer.predict(dataset["test"])
pred_str = processor.batch_decode(predictions.predictions, skip_special_tokens=True)
wer_metric = evaluate.load("wer")

print("----------------------")

# Decode references (labels)
label_ids = predictions.label_ids
label_ids[label_ids == -100] = processor.tokenizer.pad_token_id  # Masking
label_str = processor.batch_decode(label_ids, skip_special_tokens=True)
wers = [wer_metric.compute(predictions=[p], references=[r]) for p, r in zip(pred_str, label_str)]
df_test = pd.DataFrame({
    "reference": label_str,
    "prediction": pred_str,
    "wer": wers
})

df_test.to_csv("wer_test.csv", index=False)
df_test.head()

In [None]:
print("\n--- Final Evaluation on Test Set ---\n")
print(f'\n TEST WER : {df_test['wer'].mean()}')

# Load Saved Model For Inference

In [None]:
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Load the fine-tuned model and processor
model = WhisperForConditionalGeneration.from_pretrained(model_path).to("cuda")
processor = WhisperProcessor.from_pretrained(model_path)

print("... Model and processor successfully reloaded for inference.")


# INFERENCE FUNCTION

In [None]:
def transcribe_audio(audio_array, sampling_rate=16000):
    inputs = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt")
    input_features = inputs.input_features.to("cuda")

    # Generate tokens
    predicted_ids = model.generate(input_features)

    # Decode to text
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription


In [None]:
# **--------------------- The End --------------------**
#Code by Mohammed Mossad