- PEFT
- bitsandbytes
- accelerate

In [1]:
!pip install -q transformers datasets librosa evaluate jiwer gradio bitsandbytes==0.37 accelerate 
!pip install -q git+https://github.com/huggingface/peft.git@main

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
from huggingface_hub import notebook_login

In [3]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from datasets import load_dataset, DatasetDict

In [5]:
common_voice = DatasetDict()

In [6]:
language_abbr = "pa-IN"

In [7]:
common_voice["train"] = load_dataset("mozilla-foundation/common_voice_13_0", language_abbr, split="train+validation", use_auth_token=True)
common_voice["test"] = load_dataset("mozilla-foundation/common_voice_13_0", language_abbr, split="test", use_auth_token=True)



In [8]:
print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 997
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 459
    })
})


For ASR, we only need input audio samples (audio) and the corresponding transcribed text (sentence). Hence, we can remove additional metadata information, such as "accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes", "variant".

In [9]:
common_voice = common_voice.remove_columns([ "accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes", "variant"])

An ASR pipeline consists of three components:


1. A feature extractor which pre-processes the raw audio-inputs
2. The model which performs the sequence-to-sequence mapping
3. A tokenizer which post-processes the model outputs to text format

In [10]:
from transformers import WhisperFeatureExtractor

In [11]:
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-large-v2")

In [12]:
from transformers import WhisperTokenizer

task = "transcribe"

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large-v2", language="punjabi", task=task)

In [13]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2", language="punjabi", task="transcribe")

In [14]:
print(common_voice["train"][0])

{'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/0ae81662e85c06e585d3ad85aecd7c518d5e2785aed4c0b5154156b935ad582c/common_voice_pa-IN_23337849.mp3', 'array': array([0., 0., 0., ..., 0., 0., 0.]), 'sampling_rate': 48000}, 'sentence': 'ਕਲੱਬਾਂ ਦੇ ਬੈਨਰ ਲੱਗੇ ਹੋਏ ਸਨ'}


**Upsampling :** Increasing the sample rate of the audio signal

**Advantages:** High Resolution

**Disadvantages:**

- does not create new information - so upsampling cannot recover details that were not captured in the original recording.

- takes more storage space and requires more computational resources to process.

**Downsampling:** Decreasing the sample rate of the audio signal.

**Advantages:** Reduced Storage and Computational Processing

**Disadvantages:**

- Loss of Details - low resolution
- Aliasing: This occurs when high frequencies are mapped to lower frequencies during the downsampling process. To prevent this, a low-pass filter (also known as an anti-aliasing filter) is often applied before downsampling.

Since our input audio is sampled at 48kHz, we need to downsample it to 16kHz prior to passing it to the Whisper feature extractor, 16kHz being the sampling rate expected by the Whisper model.

In [15]:
from datasets import Audio

common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

In [16]:
common_voice

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 997
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 459
    })
})

In [17]:
print(common_voice["train"][0])

{'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/0ae81662e85c06e585d3ad85aecd7c518d5e2785aed4c0b5154156b935ad582c/common_voice_pa-IN_23337849.mp3', 'array': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        1.57673874e-12, -5.00932629e-13, -3.45390383e-13]), 'sampling_rate': 16000}, 'sentence': 'ਕਲੱਬਾਂ ਦੇ ਬੈਨਰ ਲੱਗੇ ਹੋਏ ਸਨ'}


In [18]:
def prepare_dataset(batch):
    """Convert one dataset example into model inputs and labels.

    Reads the decoded audio from ``batch["audio"]`` and the transcript from
    ``batch["sentence"]``, then stores the results back on ``batch`` under
    ``"input_features"`` and ``"labels"``. Relies on the notebook-level
    ``feature_extractor`` and ``tokenizer`` objects.
    """
    sample = batch["audio"]
    waveform, rate = sample["array"], sample["sampling_rate"]

    # The feature extractor returns a batched result; keep its single entry.
    extracted = feature_extractor(waveform, sampling_rate=rate)
    batch["input_features"] = extracted.input_features[0]

    # Tokenize the transcript to obtain the label ids.
    encoded = tokenizer(batch["sentence"])
    batch["labels"] = encoded.input_ids
    return batch

We can apply the data preparation function to all of our training examples using dataset's .map method. The argument num_proc specifies how many CPU cores to use. Setting num_proc > 1 will enable multiprocessing. If the .map method hangs with multiprocessing, set num_proc=1 and process the dataset sequentially.

In [19]:
common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=2)



In [21]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate pre-processed speech examples into a single padded batch.

    Every example must provide ``"input_features"`` and ``"labels"``. The two
    are padded independently: features through
    ``processor.feature_extractor.pad`` and labels through
    ``processor.tokenizer.pad``.
    """

    # Object exposing ``feature_extractor`` and ``tokenizer`` attributes.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Inputs and labels have different lengths and padding rules, so they
        # are gathered into two separate lists.
        audio_items = []
        label_items = []
        for example in features:
            audio_items.append({"input_features": example["input_features"]})
            label_items.append({"input_ids": example["labels"]})

        # Audio side: the feature extractor's pad call returns torch tensors.
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        # Label side: pad the token id sequences to a common length.
        padded_labels = self.processor.tokenizer.pad(label_items, return_tensors="pt")

        # Padded positions become -100 so the loss ignores them.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # If every sequence already starts with the bos token, drop it here
        # because it is appended again later.
        bos_id = self.processor.tokenizer.bos_token_id
        every_row_starts_with_bos = (labels[:, 0] == bos_id).all().cpu().item()
        if every_row_starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [22]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [24]:
import evaluate

metric = evaluate.load("wer")

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

To reduce our model's memory footprint, we load the model in 8-bit. This means we quantize the model to use 1/4th of the precision (compared to float32), with minimal loss in performance.

In [25]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("DrishtiSharma/whisper-large-v2-punjabi", load_in_8bit=True, device_map="auto")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/6.17G [00:00<?, ?B/s]

In [28]:
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

Since the Whisper model uses convolutional layers in the encoder, checkpointing disables grad computation. To avoid this, we specifically need to make the inputs trainable.

In [30]:
def make_inputs_require_grad(module, input, output):
    """Forward hook that flags the hooked module's output as requiring grad.

    ``module`` and ``input`` are accepted only because the forward-hook
    signature supplies them; they are not used.
    """
    # requires_grad_() defaults to True, matching the explicit argument.
    output.requires_grad_()

model.model.encoder.conv1.register_forward_hook(make_inputs_require_grad)

<torch.utils.hooks.RemovableHandle at 0x7fbe510a4df0>

register_forward_hook method is used to attach a forward hook to the conv1 layer of the model.model.encoder module. The purpose of the forward hook is to modify the output of the conv1 layer by setting requires_grad to True.

Setting requires_grad to True means the output of the conv1 layer is included in the computation of gradients during the backward pass. This allows you to compute gradients with respect to the output of conv1 and perform further gradient-based optimization or analysis.

Apply Low-rank adapters (LoRA) to the model

In [31]:
from peft import LoraConfig, LoraModel, PeftModel, get_peft_model

# LoRA settings: adapters are attached only to modules named "q_proj" and
# "v_proj"; bias="none" is passed through unchanged.
config = LoraConfig(r=32, lora_alpha=64, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none")

# Wrap the prepared model with the adapters, then report how many parameters
# remain trainable.
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 15,728,640 || all params: 1,559,033,600 || trainable%: 1.0088711365810203


In [41]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="DrishtiSharma/whisper-large-v2-pa-IN-PEFT",  # change to a repo name of your choice
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-3,
    warmup_steps=50,
    #num_train_epochs=5,
    evaluation_strategy="steps",
    fp16=True,
    per_device_eval_batch_size=8,
    generation_max_length=128,
    logging_steps=50,
    max_steps=100, # only for testing purposes, remove this from your final run :)
    remove_unused_columns=False,  # required as the PeftModel forward doesn't have the signature of the wrapped model's forward
    label_names=["labels"],  # same reason as above
)

Fine-tuning a model with PEFT comes with a few caveats.

We need to explicitly set remove_unused_columns=False and label_names=["labels"] as the PeftModel's forward doesn't inherit the signature of the base model's forward.

Since INT8 training requires autocasting, we cannot use the native predict_with_generate call in Trainer as it doesn't automatically cast.

Similarly, since we cannot autocast, we cannot pass the compute_metrics to Seq2SeqTrainer so we'll comment it out whilst instantiating the Trainer.

In [39]:
import os

In [42]:
from transformers import Seq2SeqTrainer, TrainerCallback, TrainingArguments, TrainerState, TrainerControl
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

# This callback helps to save only the adapter weights and remove the base model weights.
class SavePeftModelCallback(TrainerCallback):
    """Trainer callback that stores the PEFT adapter inside each checkpoint.

    On every save it writes the model via ``save_pretrained`` into an
    ``adapter_model`` sub-folder of the current checkpoint directory, then
    deletes the checkpoint's ``pytorch_model.bin`` if that file exists.
    """

    def on_save(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        # Checkpoint directory for the current global step.
        checkpoint_name = f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}"
        checkpoint_dir = os.path.join(args.output_dir, checkpoint_name)

        # The model being trained is supplied through the keyword arguments.
        kwargs["model"].save_pretrained(os.path.join(checkpoint_dir, "adapter_model"))

        # Remove the full weights file when one was written alongside.
        full_weights = os.path.join(checkpoint_dir, "pytorch_model.bin")
        if os.path.exists(full_weights):
            os.remove(full_weights)
        return control


# Build the trainer. No compute_metrics is passed here; WER is computed later
# in the hand-written evaluation loop.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    # The feature extractor (not the text tokenizer) is passed in the tokenizer slot.
    tokenizer=processor.feature_extractor,
    # The callback class itself is passed, not an instance.
    callbacks=[SavePeftModelCallback],
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

In [43]:
trainer.train()



Step,Training Loss,Validation Loss
50,0.0097,0.301387
100,0.025,0.270052


TrainOutput(global_step=100, training_loss=0.01735192000865936, metrics={'train_runtime': 717.6026, 'train_samples_per_second': 1.115, 'train_steps_per_second': 0.139, 'total_flos': 1.71665620992e+18, 'train_loss': 0.01735192000865936, 'epoch': 0.8})

In [44]:
peft_model_id = "DrishtiSharma/whisper-large-v2-punjabi-100-steps-LoRA"
model.push_to_hub(peft_model_id)



adapter_model.bin:   0%|          | 0.00/63.1M [00:00<?, ?B/s]

adapter_model.bin:   0%|          | 0.00/63.1M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/DrishtiSharma/whisper-large-v2-punjabi-100-steps-LoRA/commit/f0d9f385fd9e08305550e387407108d85d52c681', commit_message='Upload model', commit_description='', oid='f0d9f385fd9e08305550e387407108d85d52c681', pr_url=None, pr_revision=None, pr_num=None)

Evaluation and Inference
On to the fun part: we've successfully fine-tuned our model. Now let's put it to the test and calculate the WER on the test set.

As with training, we do have a few caveats to pay attention to:

Since we cannot use the predict_with_generate function, we will hand-roll our own eval loop with torch.cuda.amp.autocast(); you can check it out below.
Since the base model is frozen, the PEFT model sometimes fails to recognise the language while decoding. To fix that, we force the starting tokens to mention the language we are transcribing. This is done via forced_decoder_ids = processor.get_decoder_prompt_ids(language="punjabi", task="transcribe") and passing that to the model.generate call.

In [45]:
from peft import PeftModel, PeftConfig
from transformers import WhisperForConditionalGeneration, Seq2SeqTrainer

# Reload the model for evaluation: read the adapter config from the Hub,
# load the base checkpoint it names in 8-bit, then attach the adapter weights.
# Note that this rebinds the notebook-level `model`.
peft_model_id = "DrishtiSharma/whisper-large-v2-punjabi-100-steps-LoRA" # Use the same model ID as before.
peft_config = PeftConfig.from_pretrained(peft_model_id)
model = WhisperForConditionalGeneration.from_pretrained(
    peft_config.base_model_name_or_path, load_in_8bit=True, device_map="auto"
)
model = PeftModel.from_pretrained(model, peft_model_id)
# The cache was switched off for training; turn it back on for generation.
model.config.use_cache = True

Downloading (…)/adapter_config.json:   0%|          | 0.00/430 [00:00<?, ?B/s]

Downloading adapter_model.bin:   0%|          | 0.00/63.1M [00:00<?, ?B/s]

Evaluation loop

In [47]:
import gc
import numpy as np
from tqdm import tqdm
from torch.utils.data import DataLoader
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

# Batch the test split with the same collator that was given to the trainer.
eval_dataloader = DataLoader(common_voice["test"], batch_size=8, collate_fn=data_collator)
# Decoder prompt ids for Punjabi; `task` is the notebook-level task string.
forced_decoder_ids = processor.get_decoder_prompt_ids(language="punjabi", task=task)
normalizer = BasicTextNormalizer()

# Raw and normalized transcripts are collected separately so that both WER
# variants can be reported.
predictions = []
references = []
normalized_predictions = []
normalized_references = []

model.eval()
for step, batch in enumerate(tqdm(eval_dataloader)):
    # Generation runs under autocast and without gradient tracking.
    with torch.cuda.amp.autocast():
        with torch.no_grad():
            generated_tokens = (
                model.generate(
                    input_features=batch["input_features"].to("cuda"),
                    forced_decoder_ids=forced_decoder_ids,
                    max_new_tokens=255,
                )
                .cpu()
                .numpy()
            )
            labels = batch["labels"].cpu().numpy()
            # The collator wrote -100 at padded positions; put the pad token id
            # back before decoding.
            labels = np.where(labels != -100, labels, processor.tokenizer.pad_token_id)
            decoded_preds = processor.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
            decoded_labels = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)
            predictions.extend(decoded_preds)
            references.extend(decoded_labels)
            normalized_predictions.extend([normalizer(pred).strip() for pred in decoded_preds])
            normalized_references.extend([normalizer(label).strip() for label in decoded_labels])
        # Drop per-batch objects before moving on to the next batch.
        del generated_tokens, labels, batch
    gc.collect()
# Scale both metric values by 100 to report them as percentages.
wer = 100 * metric.compute(predictions=predictions, references=references)
normalized_wer = 100 * metric.compute(predictions=normalized_predictions, references=normalized_references)
eval_metrics = {"eval/wer": wer, "eval/normalized_wer": normalized_wer}

print(f"{wer=} and {normalized_wer=}")
print(eval_metrics)

100%|██████████| 58/58 [41:12<00:00, 42.62s/it]

wer=37.84283513097073 and normalized_wer=22.52461951656222
{'eval/wer': 37.84283513097073, 'eval/normalized_wer': 22.52461951656222}





Inference with Pipeline

In [50]:
from transformers import (
    AutomaticSpeechRecognitionPipeline,
    WhisperForConditionalGeneration,
    WhisperTokenizer,
    WhisperProcessor,
)
from peft import PeftModel, PeftConfig


# Rebuild everything the inference pipeline needs from the published adapter.
# This rebinds the notebook-level `model`, `tokenizer`, `processor`,
# `feature_extractor`, `task` and `forced_decoder_ids`.
peft_model_id = "DrishtiSharma/whisper-large-v2-punjabi-100-steps-LoRA" # Use the same model ID as before.
language = "punjabi"
task = "transcribe"
peft_config = PeftConfig.from_pretrained(peft_model_id)
# Load the base checkpoint named in the adapter config, in 8-bit.
model = WhisperForConditionalGeneration.from_pretrained(
    peft_config.base_model_name_or_path, load_in_8bit=True, device_map="auto"
)

# Attach the adapter weights on top of the base model.
model = PeftModel.from_pretrained(model, peft_model_id)
# Tokenizer and processor are loaded from the same base checkpoint as the model.
tokenizer = WhisperTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
feature_extractor = processor.feature_extractor
# Decoder prompt ids for the chosen language and task; used by transcribe().
forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task=task)
pipe = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)


def transcribe(audio):
    """Run the notebook-level ASR pipeline on ``audio`` and return its text.

    ``audio`` is forwarded unchanged to ``pipe``. The call is made under
    ``torch.cuda.amp.autocast()`` with the notebook-level
    ``forced_decoder_ids`` supplied as a generation keyword argument.
    """
    generation_options = {"forced_decoder_ids": forced_decoder_ids}
    with torch.cuda.amp.autocast():
        result = pipe(audio, generate_kwargs=generation_options, max_new_tokens=255)
    return result["text"]



The model 'PeftModel' is not supported for . Supported models are ['SpeechEncoderDecoderModel', 'Speech2TextForConditionalGeneration', 'SpeechT5ForSpeechToText', 'WhisperForConditionalGeneration', 'Data2VecAudioForCTC', 'HubertForCTC', 'MCTCTForCTC', 'SEWForCTC', 'SEWDForCTC', 'UniSpeechForCTC', 'UniSpeechSatForCTC', 'Wav2Vec2ForCTC', 'Wav2Vec2ConformerForCTC', 'WavLMForCTC'].


In [None]:
transcribe("test_file.mp3")