In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Low Rank Adaptation or LoRA

The LoRA paper https://arxiv.org/abs/2106.09685 introduced a wild idea for training foundational models that can **"reduce the number of trainable parameters by 10,000 times and the GPU memory requirement by 3 times"**. It is ubiquitous in language modeling, is being rapidly adopted in generative modeling, and is also used in, you guessed it, speech rec! Just to understand how powerful it can be, check out the following table (from the paper) showing the performance of standard fine-tuning vs LoRA fine-tuning of GPT3 on the WikiSQL task. LoRA requires training only ~5M params for the 175B param GPT3!! Perfect for low compute settings!

![image.png](attachment:c01573d5-d586-4592-b3fc-c79b5122ba0e.png)

# The trick

As the name suggests, instead of training whole weight matrices, LoRA keeps the original weights frozen and 'adapts' them by adding a low-rank matrix to the original weights. Suppose you have an input $x$ to any weight matrix $W$ (e.g., the key or query projection matrix) that produces the output $h = Wx$. LoRA freezes $W$ and adds two new trainable matrices $A$ and $B$ so that the layer computes $h = (W + AB)x$. If the inner dimension $r$ of $A$ and $B$ is smaller than the dimensionalities of $x$ and $h$, then $r$ upper-bounds the rank of the resulting matrix $AB$. Hence, by controlling $r$ we can control the number of parameters we are actually learning! I'm skipping all the additional motivations as they can be found in the paper. A low number of parameters is good enough motivation for us atm xD

# Training Whisper-Large

We will do mixed precision training of LoRA + Whisper-Large. Small detail: LoRA falls under the umbrella of Parameter-Efficient Fine-Tuning or PEFT. We'll be using the PEFT library to implement LoRA. First we will import the model in 8-bit and add the LoRA adapter (note: the code cells below actually load `openai/whisper-base.en` with a 4-bit bitsandbytes configuration). Then we will keep only the LoRA weights trainable and train on a part of the training dataset (for this example).

The code below is adapted from [here](https://github.com/huggingface/peft/blob/main/examples/int8_training/peft_bnb_whisper_large_v2_training.ipynb). Bits and pieces of code are adapted from Nicholas Broad's Whisper starter kit notebook for the competition: https://www.kaggle.com/code/nbroad/whisper-training-starter-kit. It also uses the parquet files by Nicholas containing extracted spectrograms.  

#### Disclaimer
This notebook is still not working as expected. I had to use `autocast` to make it work. I will try to find time to fix the bugs. Please feel free to copy and use/improve it yourself!! I would be super grateful!


# Dependencies

In [None]:
import os

# Set CUDA_VISIBLE_DEVICES to "0" for this process before the GPU libraries are imported.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
# %%capture

# Install the libraries this notebook depends on.
# NOTE(review): none of these installs is version-pinned, so a fresh runtime may
# resolve to different releases than the ones the recorded outputs came from.
!pip install --upgrade bitsandbytes
!pip install --upgrade transformers
!pip install peft
!pip install accelerate
!pip install -q datasets librosa evaluate jiwer gradio #accelerate #bitsandbytes==0.37




import os

import huggingface_hub

# Authenticate against the Hugging Face Hub without embedding a secret in the
# notebook. The token is read from the HF_TOKEN environment variable; when that
# variable is unset, `login(token=None)` falls back to an interactive prompt.
# SECURITY: the token that used to be hard-coded here was published with the
# notebook and must be revoked at https://huggingface.co/settings/tokens.
huggingface_hub.login(token=os.environ.get("HF_TOKEN"))


# !apt install git-lfs

# !pip install -U datasets
from datasets import load_dataset, Audio, load_metric
# from datasets import ClassLabel
import random
import pandas as pd
import numpy as np
# from IPython.display import display, HTML
import re
import json

import os

# Same CUDA_VISIBLE_DEVICES assignment as in the first cell of the Dependencies
# section; it is repeated here so this cell also sets it when run on its own.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"



The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Import the data and preprocess it


In [None]:
# Download the "hvb" configuration of asapp/slue-phase-2 and immediately drop
# the metadata columns that the rest of the notebook never reads.
dataset = load_dataset("asapp/slue-phase-2", "hvb").remove_columns(
    [
        'issue_id',
        'speaker_id',
        'utt_index',
        'channel',
        'role',
        'start_ms',
        'duration_ms',
        'intent',
        'dialog_acts',
    ]
)

# Character class listing the punctuation stripped from every transcript.
# Written as a raw string so the backslashes reach `re` untouched; the previous
# non-raw literal relied on invalid escape sequences (`\,`, `\?`, ...), which
# newer Python versions warn about. The runtime value of the pattern is unchanged.
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:"]'

def remove_special_characters(batch):
    """Normalise the transcript of a single example.

    Removes every character matched by `chars_to_ignore_regex` from
    `batch["text"]`, lower-cases the result and appends one trailing space.

    Args:
        batch: mapping with a string under the key "text".

    Returns:
        The same mapping, with "text" replaced by the normalised string.
    """
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower() + " "
    return batch

# Run the transcript normalisation over every example of every split.
dataset = dataset.map(remove_special_characters)

def extract_all_chars(batch):
    """Concatenate a batch of transcripts and list the distinct characters in it.

    Args:
        batch: mapping whose "text" entry is a list of strings.

    Returns:
        A dict with two single-element lists: "vocab" holds the list of unique
        characters (in arbitrary order) and "all_text" holds the transcripts
        joined by single spaces.
    """
    joined = " ".join(batch["text"])
    return dict(vocab=[list(set(joined))], all_text=[joined])


# Gather the character inventory of each split in one batched pass
# (batch_size=-1 hands the whole split to extract_all_chars at once).
vocabs = dataset.map(
    extract_all_chars,
    batched=True,
    batch_size=-1,
    keep_in_memory=True,
    remove_columns=dataset.column_names["train"],
)

# Union of the train and test character sets, numbered in list order.
vocab_list = list(set(vocabs["train"]["vocab"][0]).union(vocabs["test"]["vocab"][0]))
vocab_dict = {char: index for index, char in enumerate(vocab_list)}

# The space character is stored under "|" and keeps its index.
vocab_dict["|"] = vocab_dict.pop(" ")

# Two extra entries appended after the characters.
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)

# NOTE(review): nothing else in this section reads vocab.json back — confirm it is still needed.
with open('vocab.json', 'w') as vocab_file:
    vocab_file.write(json.dumps(vocab_dict))




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/11344 [00:00<?, ? examples/s]

Map:   0%|          | 0/1690 [00:00<?, ? examples/s]

Map:   0%|          | 0/6121 [00:00<?, ? examples/s]

In [None]:
from transformers import WhisperFeatureExtractor, WhisperProcessor, WhisperTokenizer

# Hub repository the tokenizer is pushed to, the checkpoint everything is loaded
# from, and the Whisper task used for the tokenizer and processor.
repo_name = "whisper-base-lora"
model_name_or_path = "openai/whisper-base.en"
task = "transcribe"

# Converts raw audio arrays into the model's input features.
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name_or_path)

# Encodes the target transcripts into label ids.
tokenizer = WhisperTokenizer.from_pretrained(model_name_or_path, task=task)

# Upload using the credentials stored by the earlier `huggingface_hub.login()`
# call; no access token is embedded in the notebook any more.
# SECURITY: the token that used to be passed here was published with the
# notebook and must be revoked.
tokenizer.push_to_hub(repo_name)

# Bundles a feature extractor and a tokenizer; the data collator uses it for padding.
# The checkpoint is the English-only ".en" model and the transcripts are English,
# so no `language` is passed (the former language='bn' was a leftover and did not
# match the `tokenizer` created above).
processor = WhisperProcessor.from_pretrained(model_name_or_path, task=task)




preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.41M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.83k [00:00<?, ?B/s]



In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor ([`WhisperProcessor`])
            The processor used for processing the data. Its `feature_extractor`
            pads the audio features and its `tokenizer` pads the labels.
        decoder_start_token_id (`int`, *optional*)
            Id of the token that opens every decoder sequence. When it is the
            first label of every example in the batch it is cut off, because it
            is prepended again when the decoder inputs are built from the labels.
            If omitted, the id of `<|startoftranscript|>` is looked up in the
            processor's tokenizer.
    """

    processor: Any
    decoder_start_token_id: Union[int, None] = None

    def __call__(
        self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        """Collate `features` into one padded batch.

        Args:
            features: one dict per example, holding the model input under the
                processor's first `model_input_names` entry and the tokenized
                transcript under "labels".

        Returns:
            The padded feature batch with an added "labels" tensor in which
            padded positions are set to -100.
        """
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        model_input_name = self.processor.model_input_names[0]
        input_features = [
            {model_input_name: feature[model_input_name]} for feature in features
        ]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.feature_extractor.pad(
            input_features, return_tensors="pt"
        )
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch.attention_mask.ne(1), -100
        )

        # The previous check compared against `tokenizer.bos_token_id`, which is not
        # the token Whisper label sequences begin with, so the leading start token
        # was never removed. Compare against the decoder start token instead.
        start_token_id = self.decoder_start_token_id
        if start_token_id is None:
            start_token_id = self.processor.tokenizer.convert_tokens_to_ids(
                "<|startoftranscript|>"
            )

        # if the start token was added in the previous tokenization step,
        # cut it here as it is prepended again later anyway
        if (labels[:, 0] == start_token_id).all().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

# Collator instance handed to the trainer; only `processor` is supplied here,
# the two keyword arguments below are left disabled.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
#     decoder_start_token_id=model.config.decoder_start_token_id,
#     forward_attention_mask=forward_attention_mask,
)

In [None]:
def prepare_dataset(batch):
    """Turn one raw example into model inputs and label ids.

    Reads the decoded clip from `batch["audio"]`, runs the module-level
    `feature_extractor` on its sample array (passing along the clip's own
    sampling rate; no resampling is done here) and tokenizes `batch["text"]`
    with the module-level `tokenizer`.

    Returns:
        The same mapping with "input_features" and "labels" added.
    """
    clip = batch["audio"]
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])

    # the extractor returns a batch of size one; keep its only element
    batch["input_features"] = extracted.input_features[0]

    # target transcript -> label ids
    encoded_text = tokenizer(batch["text"])
    batch["labels"] = encoded_text.input_ids
    return batch

In [None]:
# Featurise every split in a single worker process; all original columns are
# dropped so that only "input_features" and "labels" remain.
dataset = dataset.map(
    prepare_dataset,
    num_proc=1,
    remove_columns=dataset.column_names["train"],
)

Map:   0%|          | 0/11344 [00:00<?, ? examples/s]

Map:   0%|          | 0/1690 [00:00<?, ? examples/s]

Map:   0%|          | 0/6121 [00:00<?, ? examples/s]

In [None]:
# Length limit (in seconds) intended for filtering the training split.
# NOTE(review): only the disabled filter below references this value, so no
# examples are actually dropped in this section.
max_input_length_in_sec = 4.0
# dataset["train"] = dataset["train"].filter(lambda x: x < max_input_length_in_sec * processor.feature_extractor.sampling_rate, input_columns=["input_features"])

In [None]:
# Word-error-rate metric object used by compute_metrics.
# NOTE(review): the recorded output of this cell shows a warning that
# `trust_remote_code=True` becomes mandatory for `load_metric` in the next major
# `datasets` release; the `evaluate` package installed earlier presumably offers
# the replacement loader — confirm before re-running on a fresh environment.
metric = load_metric("wer")

  metric = load_metric("wer")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [None]:
def compute_metrics(pred):
    """Compute the word error rate of a prediction output.

    Args:
        pred: object exposing `predictions` (logits, or a tuple whose first
            element is the logits) and `label_ids` (label ids, with -100
            marking positions that were ignored by the loss).

    Returns:
        dict with the single key "wer".
    """
    pred_logits = pred.predictions
    # NOTE(review): when the model returns more than one tensor the trainer
    # presumably passes a tuple here; the logits are taken to be its first entry.
    if isinstance(pred_logits, tuple):
        pred_logits = pred_logits[0]
    pred_ids = np.argmax(pred_logits, axis=-1)

    # Swap the -100 markers for the pad token on a copy, leaving the caller's
    # `label_ids` array untouched.
    label_ids = np.where(
        pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids
    )

    # Special tokens (including the pad tokens inserted above) are dropped while
    # decoding so they do not count as words in the WER. The `group_tokens`
    # argument of the former call belongs to CTC tokenizers and was removed.
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    wer = metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [None]:
# for param in model.parameters():
#     param.requires_grad = False  # freeze the model - train adapters later
#     if param.ndim == 1:
#     # cast the small parameters (e.g. layernorm) to fp32 for stability
#         param.data = param.data.to(torch.float32)

# model.gradient_checkpointing_enable()  # reduce number of stored activations
# model.enable_input_require_grads()

# class CastOutputToFloat(torch.nn.Sequential):
#     def forward(self, x): return super().forward(x).to(torch.float32)
# model.proj_out = CastOutputToFloat(model.proj_out)

In [None]:


import torch
from transformers import WhisperForConditionalGeneration, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training

# bitsandbytes settings passed to `from_pretrained`: load the weights in 4-bit.
# (8-bit alternative: BitsAndBytesConfig(load_in_8bit=True))
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model_name_or_path = "openai/whisper-base.en"

# Load the quantized checkpoint and hand it straight to PEFT's k-bit
# training preparation helper.
model = prepare_model_for_kbit_training(
    WhisperForConditionalGeneration.from_pretrained(
        model_name_or_path,
        device_map="auto",
        torch_dtype=torch.float16,
        quantization_config=quantization_config,
    )
)

def make_inputs_require_grad(module, input, output):
    """Forward hook that marks the hooked module's output as requiring gradients."""
    output.requires_grad_()


# Attach the hook to the encoder's first convolution layer.
model.model.encoder.conv1.register_forward_hook(make_inputs_require_grad)

<torch.utils.hooks.RemovableHandle at 0x7b89266a7880>

In [None]:
# Report whether CUDA is usable and, if so, which device sits at index 0.
cuda_available = torch.cuda.is_available()
print(cuda_available)
# `get_device_name(0)` raises when no CUDA device is present, so only query it
# when CUDA is actually available.
print(torch.cuda.get_device_name(0) if cuda_available else "No CUDA device detected")

True
Tesla T4


# LoRA config

In [None]:
from peft import LoraConfig, PeftModel, LoraModel, LoraConfig, get_peft_model

# LoRA adapter settings: inner dimension r=16 with scaling alpha=64 and 5%
# dropout, attached to the modules named "q_proj" and "v_proj"; no bias terms
# are trained.
config = LoraConfig(
    r=16,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "v_proj"],
)

# Wrap the model with the adapters and print how many parameters are trainable.
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 589,824 || all params: 73,183,232 || trainable%: 0.8060


In [None]:
# NOTE(review): this re-defines the hook function from the model-loading cell with
# an identical body and registers it a second time, now reaching `conv1` through
# the PEFT wrapper's attribute path. Presumably this is the same `conv1` module as
# before, which would make the second registration redundant — confirm.
def make_inputs_require_grad(module, input, output):
    """Forward hook that marks the hooked module's output as requiring gradients."""
    output.requires_grad_(True)

model.base_model.model.model.encoder.conv1.register_forward_hook(make_inputs_require_grad)

<torch.utils.hooks.RemovableHandle at 0x7b8918923850>

In [None]:
from transformers import Seq2SeqTrainingArguments

# Hyper-parameters and bookkeeping options for the Seq2SeqTrainer.
# NOTE(review): installs are unpinned; verify that `evaluation_strategy` is still
# the accepted keyword in the installed transformers release.
training_args = Seq2SeqTrainingArguments(
    # --- where results go -------------------------------------------------
    output_dir="/content/drive/MyDrive/NLP",  # change to a repo name of your choice
    report_to="none",  ### comment this out to login to wandb
    # --- optimisation -----------------------------------------------------
    learning_rate=1e-4,
    warmup_steps=50,
    num_train_epochs=10,
    fp16=True,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    per_device_eval_batch_size=8,
    # --- evaluation / checkpoint / logging cadence (all every 500 steps) ---
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    logging_steps=500,
    # generation_max_length=128,
    # max_steps=100,  # only for testing purposes, remove this from your final run :)
    # --- PEFT specifics ---------------------------------------------------
    remove_unused_columns=False,  # required as the PeftModel forward doesn't have the signature of the wrapped model's forward
    label_names=["labels"],  # same reason as above
)




In [None]:
from transformers import Seq2SeqTrainer, TrainerCallback, TrainingArguments, TrainerState, TrainerControl
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

class SavePeftModelCallback(TrainerCallback):
    """Checkpoint hook that stores the adapter and discards the full model weights.

    On every save event the model passed in `kwargs["model"]` is written with
    `save_pretrained` to `<output_dir>/<checkpoint prefix>-<global_step>/adapter_model`,
    and a `pytorch_model.bin` file in that checkpoint folder is deleted if present.
    """

    def on_save(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Save the adapter for the current step and return `control` unchanged."""
        step_dir_name = f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}"
        checkpoint_dir = os.path.join(args.output_dir, step_dir_name)

        kwargs["model"].save_pretrained(os.path.join(checkpoint_dir, "adapter_model"))

        full_weights_file = os.path.join(checkpoint_dir, "pytorch_model.bin")
        if os.path.exists(full_weights_file):
            os.remove(full_weights_file)
        return control


# Assemble the trainer: model and arguments, the padding collator, the train and
# test splits, and the adapter-saving callback. Metric computation stays disabled.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=processor.feature_extractor,
    callbacks=[SavePeftModelCallback],
    # compute_metrics=compute_metrics,
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

In [None]:
# Run the training loop, then write the final model to the trainer's output directory.
# The autocast wrapper mentioned in the disclaimer is currently disabled:
# with torch.autocast("cuda"):
trainer.train()
trainer.save_model()



Step,Training Loss,Validation Loss
500,1.2688,0.745957
1000,0.7108,0.722395
1500,0.6956,0.709532
2000,0.6708,0.706399


