This notebook is designed to be run in Google Colab, which provides a pre-configured Python environment with many common data science and machine learning libraries already installed.
Package Installation Syntax:
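In Colab, you can install additional Python packages by starting a line in a code cell with `!`, which runs the rest of that line as a shell command (for example, `!pip install transformers`).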
This is different from Jupyter notebooks run locally, where you might use %pip install or run pip in a terminal.

Some imports may not have explicit installation commands in the notebook because they are already available in the Colab environment by default.

In [None]:
# Installing dependencies
!pip -q install transformers datasets jiwer accelerate sentencepiece ipywidgets soundfile librosa torch torchaudio

In [None]:
# Importing required libraries for data processing and model training
import gc, random, torch, inspect
from itertools import islice
from random import randint

# Importing Hugging Face datasets and model components
from datasets import load_dataset, load_dataset_builder, Dataset, interleave_datasets, Audio, Features, Value
from transformers import WhisperForConditionalGeneration, WhisperProcessor, Seq2SeqTrainingArguments, Seq2SeqTrainer
from huggingface_hub import login
import torchaudio.transforms as T

# Authenticating with Hugging Face.
# The access token is read from the HF_TOKEN environment variable rather than
# being written into the notebook. When the variable is unset, `login` receives
# None and falls back to its interactive prompt.
# NOTE(review): the token that used to be hardcoded on this line has been
# exposed to everyone with access to the notebook and should be revoked.
import os

login(token=os.environ.get("HF_TOKEN"))

In [10]:
# Pick the GPU when PyTorch can see one, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Every language that goes into the corpus, and the subset that is treated
# as priority when the dataset is assembled
languages = ["en", "bg", "uk", "ru", "ar", "it", "pl", "pt"]
priority_languages = ["bg", "uk", "ru", "ar", "pl"]

# Per-split sample caps: CAP_HR is applied to the non-priority languages,
# CAP_LR to the priority languages
CAP_HR = 20_000
CAP_LR = 50_000

# Dataset schema: audio decoded at 16 kHz plus its text transcription
AUDIO_FT = Audio(sampling_rate=16_000)
FEATS = Features({"audio": AUDIO_FT, "sentence": Value("string")})

In [11]:
def stream_to_ds(lang: str, split: str, cap: int) -> Dataset:
    """
    Materialises up to `cap` shuffled Common Voice examples as a `Dataset`.

    The requested split is opened in streaming mode, shuffled through a
    10,000-example buffer with a fixed seed, and the first `cap` examples
    are copied into a regular dataset that keeps only the audio and the
    transcription.

    Args:
        lang (str): Common Voice language/config code (e.g. "bg")
        split (str): Dataset split to read (e.g. "validation", "train")
        cap (int): Maximum number of samples to include; values below
            zero are treated as zero

    Returns:
        Dataset: Hugging Face dataset with "audio" and "sentence" columns
        following the FEATS schema
    """
    # Streaming avoids downloading the whole split up front
    stream = load_dataset(
        "mozilla-foundation/common_voice_13_0",
        lang,
        split=split,
        streaming=True,
        trust_remote_code=True,
    ).shuffle(buffer_size=10_000, seed=42)  # Fixed seed for reproducibility

    def generator():
        # islice stops pulling from the stream after exactly `cap` examples.
        # Pairing the stream with range(cap) in zip() would fetch one more
        # example before noticing that the range is exhausted.
        for ex in islice(stream, max(cap, 0)):
            yield {"audio": ex["audio"], "sentence": ex["sentence"]}

    return Dataset.from_generator(generator, features=FEATS)

def split_available(lang: str, split: str) -> bool:
    """
    Reports whether Common Voice has a non-empty `split` for `lang`.

    Only the dataset builder's metadata is consulted; no examples are read.

    Args:
        lang (str): Language code
        split (str): Name of the split to look for

    Returns:
        bool: True when the builder metadata lists the split with at least
        one example, False otherwise
    """
    builder = load_dataset_builder(
        "mozilla-foundation/common_voice_13_0",
        lang,
        trust_remote_code=True,
    )
    known_splits = builder.info.splits

    # No split metadata at all
    if not known_splits:
        return False
    # Metadata exists but does not mention the requested split
    if split not in known_splits:
        return False
    return known_splits[split].num_examples > 0

In [None]:
# Building multi-language dataset
from datasets import concatenate_datasets

print("Building multi-language dataset...")
small_sets = []
EXTRA_TRAIN = 20_000  # Additional training samples for languages with limited data
EVAL_FRACTION = 0.05  # Share of the combined corpus held out for evaluation

for lang in languages:
    if lang in priority_languages:
        # For priority languages, use validation set and either 'other' split or extra training data
        val = stream_to_ds(lang, "validation", CAP_LR)

        parts = [val]
        if split_available(lang, "other"):
            # Use 'other' split if available
            parts.append(stream_to_ds(lang, "other", CAP_LR))
        else:
            # Fallback: use a small portion of training data
            parts.append(stream_to_ds(lang, "train", EXTRA_TRAIN))

        # NOTE(review): interleave_datasets defaults to
        # stopping_strategy="first_exhausted", so the result ends as soon as
        # the shorter part runs out and the remainder of the longer part is
        # dropped — confirm this truncation is intended.
        ds = interleave_datasets(parts)
        small_sets.append(ds)
    else:
        # For non-priority languages (EN, IT, PT), use validation set with higher cap
        val = stream_to_ds(lang, "validation", CAP_HR)
        small_sets.append(val)

# The training cell consumes `train_ds` and `eval_ds`, which nothing else in
# the notebook defines, so they are derived here from the per-language sets.
# NOTE(review): a seeded random hold-out of EVAL_FRACTION is an assumption —
# replace it if a dedicated evaluation set is intended.
splits = concatenate_datasets(small_sets).train_test_split(
    test_size=EVAL_FRACTION, seed=42
)
train_ds, eval_ds = splits["train"], splits["test"]
print(f"train: {len(train_ds):,} examples | eval: {len(eval_ds):,} examples")

In [None]:
# Noise augmentation setup
def get_noise_sampler(bank_size: int = 200, sampling_rate: int = 16_000):
    """
    Creates a noise sampler that provides background noise for data augmentation.
    Attempts to use CAIMAN noise dataset, falls back to UrbanSound8K if unavailable.

    Args:
        bank_size (int): Maximum number of noise clips read from the stream
            and kept in memory
        sampling_rate (int): Rate the noise clips are decoded at; it should
            match the rate of the speech the noise is mixed into

    Returns:
        function: A zero-argument function that returns one randomly chosen
        noise clip from the bank

    Raises:
        RuntimeError: If the selected noise dataset yields no clips
    """
    try:
        # Try loading CAIMAN background noise dataset
        ns = load_dataset(
            "Myrtle/CAIMAN-ASR-BackgroundNoise",
            split="train",
            streaming=True,
        )
    except Exception:
        # Deliberate best-effort: any failure to open the primary dataset
        # switches to the fallback instead of aborting the notebook
        print("CAIMAN noise missing -> UrbanSound8K fallback")
        ns = load_dataset(
            "danavery/urbansound8K",
            split="train",
            streaming=True,
        )

    # Decode the noise at the same rate as the speech. Without this, clips
    # recorded at another rate would be mixed in sample-by-sample and end up
    # time-stretched relative to the speech.
    ns = ns.cast_column("audio", Audio(sampling_rate=sampling_rate))

    # Keep a fixed-size bank in memory so sampling during training is cheap
    bank = [row["audio"]["array"] for row in islice(ns, bank_size)]
    if not bank:
        # Fail here rather than with an opaque IndexError inside
        # random.choice in the middle of training
        raise RuntimeError("Noise dataset yielded no samples; cannot build a noise bank")

    def _sample():
        return random.choice(bank)

    return _sample

# Initialize noise sampler
get_noise = get_noise_sampler()

def match_length(sig: torch.Tensor, tgt_len: int) -> torch.Tensor:
    """
    Adjusts the length of an audio signal to match a target length.

    Length is measured, tiled and cut along the last dimension, so both
    1-D signals and signals with leading dimensions (for example
    channels x time) are handled the same way.

    Args:
        sig (torch.Tensor): Input audio signal with time on the last dimension
        tgt_len (int): Target length in samples

    Returns:
        torch.Tensor: `sig` itself when it already has the target length;
        otherwise a tensor whose last dimension is `tgt_len`, built by
        repeating a shorter signal, truncating a longer one, or filling
        with zeros when the input has no samples at all
    """
    cur = sig.shape[-1]
    if cur == tgt_len:
        return sig
    if cur == 0:
        # Nothing to repeat (and cur would be a zero divisor below):
        # return silence of the requested length
        return sig.new_zeros((*sig.shape[:-1], tgt_len))
    if cur < tgt_len:
        # If signal is too short, repeat it along the time axis only
        reps = tgt_len // cur + 1
        tiling = [1] * (sig.dim() - 1) + [reps]
        return sig.repeat(*tiling)[..., :tgt_len]
    # If signal is too long, truncate it
    return sig[..., :tgt_len]


In [None]:
# Initialize the Whisper processor.
# HF_MODEL names the pretrained checkpoint; the same name is reused later in
# the notebook when the model weights are loaded.
HF_MODEL = "openai/whisper-base"
processor = WhisperProcessor.from_pretrained(HF_MODEL)


def whisper_data_collator(batch, snr_db: int = 15, noise_prob: float = 0.1):
    """
    Collates and preprocesses batches of data for Whisper model training.
    Includes noise augmentation for improved robustness.

    Args:
        batch: List of examples, each with ex["audio"]["array"] (waveform)
            and ex["sentence"] (transcription)
        snr_db (int): Signal-to-noise ratio in decibels for noise augmentation
        noise_prob (float): Probability that an example gets background
            noise mixed in

    Returns:
        The padded feature batch from the feature extractor with an added
        "labels" entry: padded token ids where padding positions are -100
        and a shared leading start-of-transcript token has been removed
    """
    tokenizer = processor.tokenizer
    feature_extractor = processor.feature_extractor
    start_id = tokenizer.convert_tokens_to_ids("<|startoftranscript|>")

    feats, ids = [], []
    for ex in batch:
        # Convert audio to tensor and ensure float32 type
        wav = torch.tensor(ex["audio"]["array"]).float()
        # Keep only the first row of a 2-D waveform on every path, so clean
        # and augmented examples reach the feature extractor with the same shape
        if wav.dim() == 2:
            wav = wav[0]

        # Apply noise augmentation to a random subset of samples
        if random.random() < noise_prob:
            noise = torch.tensor(get_noise()).float()
            if noise.dim() == 2:
                noise = noise[0]
            # Match noise length to audio length
            noise = match_length(noise, wav.shape[-1])
            # Scale the noise so speech std / noise std equals 10^(snr_db/20);
            # the epsilon guards against division by zero for silent noise
            wav = wav + wav.std() / (10 ** (snr_db / 20) * noise.std() + 1e-9) * noise

        # Extract features and tokenize text
        feats.append(
            feature_extractor(
                wav.numpy(), sampling_rate=16_000, return_tensors="pt"
            ).input_features[0]
        )
        ids.append(
            tokenizer(ex["sentence"], return_tensors="pt").input_ids[0]
        )

    # Pad features and labels to same length within batch
    batch_in = feature_extractor.pad(
        {"input_features": feats}, return_tensors="pt"
    )
    batch_lab = tokenizer.pad(
        {"input_ids": ids}, return_tensors="pt", padding=True
    )
    # Replace padding tokens with -100 so the loss ignores them
    labels = batch_lab["input_ids"].masked_fill(
        batch_lab["attention_mask"] == 0, -100
    )

    # The model builds its decoder inputs by shifting the labels right and
    # prepending the start-of-transcript token. If the tokenizer already put
    # that token at the front of every label sequence it would appear twice,
    # so it is removed here.
    if labels.shape[1] > 0 and bool((labels[:, 0] == start_id).all()):
        labels = labels[:, 1:]

    batch_in["labels"] = labels
    return batch_in

In [None]:
# Loading the model
model = WhisperForConditionalGeneration.from_pretrained(HF_MODEL)
model.freeze_encoder()  # Freeze every encoder parameter first

# Re-enable training for the last (up to) 12 encoder transformer layers.
# NOTE(review): the slice selects every layer when the checkpoint has 12 or
# fewer of them; whisper-base is believed to have fewer than 12 — confirm
# that unfreezing all encoder layers is what is wanted for this checkpoint.
for p in model.model.encoder.layers[-12:].parameters():
    p.requires_grad = True

# Enable gradient checkpointing for memory efficiency, in non-reentrant mode.
# Reentrant checkpointing only produces gradients for a checkpointed segment
# when at least one of its input tensors requires grad. The parts of the
# encoder that stay frozen presumably feed the first encoder layer, so in
# reentrant mode the layers unfrozen above would receive no gradients.
model.gradient_checkpointing_enable(
    gradient_checkpointing_kwargs={"use_reentrant": False}
)

# Reset generation-related config for multilingual support
for cfg in (model.config, model.generation_config):
    cfg.forced_decoder_ids = None
    cfg.suppress_tokens = None
    cfg.begin_suppress_tokens = None

In [None]:
# Training arguments
base_kwargs = dict(
    output_dir = "/content/whisper-ft", # Directory to save model checkpoints
    per_device_train_batch_size = 4, # Batch size per GPU/CPU
    gradient_accumulation_steps = 8, # Accumulate gradients over multiple steps
    num_train_epochs = 8, # Total number of training epochs
    learning_rate = 1e-5, # Initial learning rate
    warmup_steps = 500, # Number of warmup steps for learning rate scheduler
    lr_scheduler_type = "cosine", # Learning rate scheduler type
    eval_steps = 1000, # Evaluate every N steps
    save_steps = 1000, # Save checkpoint every N steps
    save_total_limit = 1, # Keep only N most recent checkpoints
    logging_steps = 50, # Log training metrics every N steps
    fp16 = torch.cuda.is_available(), # Use mixed precision training if CUDA available
    remove_unused_columns = False, # Keep all columns in dataset
)

# Filter training arguments based on available parameters
sig = inspect.signature(Seq2SeqTrainingArguments.__init__)

# eval_steps has no effect unless the evaluation strategy is "steps" (the
# default strategy performs no evaluation). The argument was renamed between
# transformers releases, so use whichever name the installed version accepts.
eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in sig.parameters else "evaluation_strategy"
)
base_kwargs[eval_strategy_key] = "steps"

args = Seq2SeqTrainingArguments(
    **{k: v for k, v in base_kwargs.items() if k in sig.parameters}
)

# Newer transformers releases take the tokenizer as `processing_class`;
# older ones only know `tokenizer`. Pick the name the installed trainer accepts.
trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
tokenizer_key = "processing_class" if "processing_class" in trainer_params else "tokenizer"

# Training the model
trainer = Seq2SeqTrainer(
    model = model, # Fine-tuned Whisper model
    args = args, # Training arguments
    train_dataset = train_ds, # Training dataset
    eval_dataset = eval_ds, # Evaluation dataset
    data_collator = whisper_data_collator, # Custom data collator with augmentation
    **{tokenizer_key: processor.tokenizer}, # Tokenizer for text processing
)
trainer.train()


In [None]:
# Write the trainer's model, the model weights and the processor files,
# in that order, into the same output directory
for save_fn in (trainer.save_model, model.save_pretrained, processor.save_pretrained):
    save_fn("/content/whisper-ft")


In [None]:
from google.colab import files
import shutil
# Zip the output directory and hand the archive to the browser for download
shutil.make_archive(
    base_name="whisper-ft",
    format="zip",
    root_dir="/content/whisper-ft",
)
files.download("whisper-ft.zip")
