In [1]:
# Necessary installations
!pip install transformers[torch]
!pip install datasets
!pip install evaluate
!pip install jiwer
!pip install -U openai-whisper

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting more-itertools
  Downloading more_itertools-10.2.0-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.0/57.0 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: more-itertools
Successfully installed more-itertools-10.2.0


## Dataset preparation

In [2]:
import os

# A bare `CUDA_VISIBLE_DEVICES=0,1,2,3,4` statement only binds a Python tuple;
# the GPU selection is read from the process environment, so export it there too.
# This has to run before CUDA is first initialised in this kernel.
CUDA_VISIBLE_DEVICES = 0, 1, 2, 3, 4
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(device_id) for device_id in CUDA_VISIBLE_DEVICES)

In [None]:
# Download dataset and unzip
# !wget https://www.cse.iitb.ac.in/~pjyothi/cs753/dataset.zip
# !unzip dataset.zip

In [3]:
import datasets
from datasets import load_dataset

from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor, WhisperConfig, WhisperForConditionalGeneration

import torch
from torch import nn
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import evaluate
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

import whisper

In [4]:
# One JSON manifest per split, loaded together into a single dataset dictionary
data_files = dict(
    train="dataset/CodeSwitched_Data/train.json",
    validation="dataset/CodeSwitched_Data/valid.json",
    test="dataset/CodeSwitched_Data/test.json",
)
dataset = load_dataset("json", data_files=data_files)

# Update the audio paths to include appropriate folder name
def prepend_folder_name(row, folder_name='dataset/CodeSwitched_Data/'):
    """Prefix the row's audio path with the dataset folder.

    Args:
        row: Mapping whose "audio" entry is a path string; it is updated in place.
        folder_name: Text concatenated verbatim in front of the path, so it must
            already end with a path separator. Defaults to the folder used by
            this notebook.

    Returns:
        The same ``row`` object with ``row["audio"]`` rewritten.
    """
    row["audio"] = folder_name + row["audio"]
    return row
# Rewrite the audio path of every row, one split at a time
for split_name, split_rows in dataset.items():
    dataset[split_name] = split_rows.map(prepend_folder_name)

# Declare the column schema: two string columns plus an audio column at 16 kHz
column_types = {
    "id": datasets.Value("string"),
    "transcription": datasets.Value("string"),
    "audio": datasets.Audio(sampling_rate=16000),
}
features = datasets.Features(column_types)

# Re-encode every example against that schema so the columns carry these feature types
dataset = dataset.map(features.encode_example, features=features)

In [5]:
# Feature extractor, tokenizer and combined processor all come from the same checkpoint
checkpoint_name = "openai/whisper-small"
feature_extractor = WhisperFeatureExtractor.from_pretrained(checkpoint_name)
tokenizer = WhisperTokenizer.from_pretrained(checkpoint_name, language="Hindi", task="transcribe")
processor = WhisperProcessor.from_pretrained(checkpoint_name, language="Hindi", task="transcribe")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
def prepare_dataset(batch):
    """Add model inputs and target token ids to a single example.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries)
    and ``batch["transcription"]``, then stores ``batch["input_features"]`` and
    ``batch["labels"]`` on the same mapping and returns it.
    """
    # fetch the audio entry once; it carries the waveform and its sampling rate
    clip = batch["audio"]

    # log-Mel input features for the waveform (first and only item of the output)
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # token ids of the reference transcription
    encoded_text = tokenizer(batch["transcription"])
    batch["labels"] = encoded_text.input_ids
    return batch

# Run the preprocessing over every split using two worker processes
dataset = dataset.map(function=prepare_dataset, num_proc=2)

In [8]:
# Show one validation transcription as a quick sanity check
sample_transcription = dataset['validation'][10]['transcription']
print(f"Sample example:{sample_transcription}")

Sample example:cycle चला सकते हो आराम से पैदल घूम सकते हो



In [9]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate preprocessed speech examples into one padded batch.

    Every feature must provide ``"input_features"`` and ``"labels"``. Inputs are
    padded with the processor's feature extractor, labels with its tokenizer, and
    padded label positions are replaced by -100.

    Attributes:
        processor: Object exposing ``feature_extractor.pad`` and ``tokenizer.pad``.
        decoder_start_token_id: Id of the start token to strip from the front of
            the labels when every sequence begins with it. When ``None`` (the
            default) the candidates are taken from the tokenizer: the first entry
            of ``prefix_tokens`` if the tokenizer has that attribute, then
            ``bos_token_id``.
    """

    processor: Any
    # Optional and defaulted so existing `DataCollator...(processor=processor)` calls keep working.
    decoder_start_token_id: Union[int, None] = None

    def _start_token_candidates(self) -> List[int]:
        """Return the token ids treated as a leading start token, in priority order."""
        if self.decoder_start_token_id is not None:
            return [self.decoder_start_token_id]

        tokenizer = self.processor.tokenizer
        candidates = []
        # Whisper tokenizers begin each encoded sequence with their first prefix
        # token, which is not the same id as `bos_token_id`; checking only the bos
        # id therefore never strips it and the start token would be duplicated
        # when the model prepends its own.
        prefix_tokens = getattr(tokenizer, "prefix_tokens", None)
        if prefix_tokens:
            candidates.append(prefix_tokens[0])
        bos_token_id = getattr(tokenizer, "bos_token_id", None)
        if bos_token_id is not None:
            candidates.append(bos_token_id)
        return candidates

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad inputs and labels separately and return them as one batch mapping."""
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if a start token was added in the previous tokenization step,
        # cut it here as it's appended later anyway
        if labels.shape[1] > 0:
            first_tokens = labels[:, 0]
            for start_id in self._start_token_candidates():
                if (first_tokens == start_id).all().cpu().item():
                    labels = labels[:, 1:]
                    break

        batch["labels"] = labels

        return batch

In [10]:
# Batch builder handed to the trainer; it pads with the shared Whisper processor
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor)

## Fine-tuning the Whisper model

In [11]:
# Word error rate scorer used by `compute_metrics`
metric = evaluate.load("wer")


def compute_metrics(pred):
    """Return the word error rate, as a percentage, for one set of predictions.

    ``pred.predictions`` and ``pred.label_ids`` are decoded with the tokenizer.
    ``pred.label_ids`` is modified in place: entries equal to -100 are replaced
    by the tokenizer's pad token id before decoding.

    Returns:
        A dict with the single key ``"wer"``.
    """
    predicted_ids, reference_ids = pred.predictions, pred.label_ids

    # replace -100 with the pad_token_id
    ignored_positions = reference_ids == -100
    reference_ids[ignored_positions] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    hypotheses = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    references = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    error_rate = metric.compute(predictions=hypotheses, references=references)
    return {"wer": 100 * error_rate}

In [12]:
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
model.generation_config.language = "hi"  # assign the language of choice
print(f'Total number of parameters: {sum([p.numel() for p in model.parameters() if p.requires_grad])/1e6:.2f}M')

# STEP 1: Freeze all parameters of whisper base model.
model.model.requires_grad_(False)

# STEP 2 + 3: Unfreeze the last 2 layers and the final layer norm,
# first for the encoder and then for the decoder.
for stack in (model.model.encoder, model.model.decoder):
    for trainable_module in (*stack.layers[-2:], stack.layer_norm):
        trainable_module.requires_grad_(True)

print(f'Number of trainable parameters: {sum([p.numel() for p in model.parameters() if p.requires_grad])/1e6:.2f}M')

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

Total number of parameters: 240.58M
Number of trainable parameters: 33.08M


In [13]:
training_args = Seq2SeqTrainingArguments(
    # where checkpoints are written, and how many are kept
    output_dir="runs/whisper-small-hi",
    save_strategy="epoch",
    save_total_limit=5,
    # optimisation schedule
    # NOTE(review): 3e-3 is far higher than the ~1e-5 commonly used when
    # fine-tuning Whisper - confirm this value is intentional.
    learning_rate=3e-3,
    warmup_steps=10,
    num_train_epochs=30,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=1,
    # memory / speed trade-offs
    gradient_checkpointing=True,
    fp16=True,
    # evaluation: generate transcripts once per epoch
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",
    per_device_eval_batch_size=32,
    predict_with_generate=True,
    generation_max_length=225,
    # logging and best-checkpoint selection (lower WER is better)
    logging_steps=25,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
)


# Wire the model, data splits, collator and WER metric into the trainer.
# Note that the feature extractor is what gets passed via the `tokenizer` argument.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

# The former `dataloader_config = DataLoaderConfiguration(...)` statement was removed:
# `DataLoaderConfiguration` is never imported in this notebook, so the line raised a
# NameError on a fresh kernel, and its result was not used anywhere.


In [None]:
# Start fine-tuning with the trainer configured above
trainer.train()

In [None]:
# Create the reverse mapping adapting it from the original `WHISPER_MAPPING` in
# the `convert_openai_to_hf.py` script.
# Each entry maps a substring of a Hugging Face parameter name to the substring
# that replaces it in the OpenAI Whisper name. `reverse_rename_keys` walks the
# entries in this insertion order, so do not reorder them casually.
REVERSE_WHISPER_MAPPING = {
    "layers": "blocks",
    "fc1": "mlp.0",
    "fc2": "mlp.2",
    "final_layer_norm": "mlp_ln",
    # `.self_attn*` names map to `.attn*`
    ".self_attn.q_proj": ".attn.query",
    ".self_attn.k_proj": ".attn.key",
    ".self_attn.v_proj": ".attn.value",
    ".self_attn_layer_norm": ".attn_ln",
    ".self_attn.out_proj": ".attn.out",
    # `.encoder_attn*` names map to `.cross_attn*`
    ".encoder_attn.q_proj": ".cross_attn.query",
    ".encoder_attn.k_proj": ".cross_attn.key",
    ".encoder_attn.v_proj": ".cross_attn.value",
    ".encoder_attn_layer_norm": ".cross_attn_ln",
    ".encoder_attn.out_proj": ".cross_attn.out",
    # top-level layer norms and embeddings of the decoder / encoder
    "decoder.layer_norm.": "decoder.ln.",
    "encoder.layer_norm.": "encoder.ln_post.",
    "embed_tokens": "token_embedding",
    "encoder.embed_positions.weight": "encoder.positional_embedding",
    "decoder.embed_positions.weight": "decoder.positional_embedding",
}


def reverse_rename_keys(s_dict: dict) -> dict:
    """
    Translate Hugging Face parameter names into OpenAI Whisper names.

    The dictionary is modified in place (each entry is popped and re-inserted
    under its translated name) and the same object is returned.
    """
    # snapshot the names first because the dictionary changes while we loop
    for hf_name in tuple(s_dict):
        openai_name = hf_name
        for hf_fragment, openai_fragment in REVERSE_WHISPER_MAPPING.items():
            # matching is done against the untouched name, replacement on the running result
            if hf_fragment in hf_name:
                openai_name = openai_name.replace(hf_fragment, openai_fragment)

        s_dict[openai_name] = s_dict.pop(hf_name)
    return s_dict


def make_emb_from_linear(linear: nn.Linear) -> nn.Embedding:
    """
    Build an embedding layer that uses a linear layer's weight tensor.

    The weight has shape ``(out_features, in_features)``; its first dimension
    becomes the number of embeddings (vocabulary size) and its second the
    embedding size.
    """
    weight = linear.weight.data
    num_embeddings, embedding_dim = weight.shape
    return nn.Embedding(num_embeddings, embedding_dim, _weight=weight)


def extract_dims_from_hf(config: WhisperConfig) -> dict:
    """
    Build the OpenAI Whisper ``dims`` dictionary from a Hugging Face config.

    Each OpenAI dimension name is filled from the config attribute paired with
    it below; both ``n_audio_state`` and ``n_text_state`` read ``d_model``.
    """
    # (OpenAI dimension name, Hugging Face config attribute), in output order
    dim_sources = (
        ("n_vocab", "vocab_size"),
        ("n_mels", "num_mel_bins"),
        ("n_audio_state", "d_model"),
        ("n_text_ctx", "max_target_positions"),
        ("n_audio_layer", "encoder_layers"),
        ("n_audio_head", "encoder_attention_heads"),
        ("n_text_layer", "decoder_layers"),
        ("n_text_head", "decoder_attention_heads"),
        ("n_text_state", "d_model"),
        ("n_audio_ctx", "max_source_positions"),
    )
    return {dim_name: getattr(config, attribute) for dim_name, attribute in dim_sources}


def convert_tfms_to_openai_whisper(hf_model, whisper_dump_path):
    """
    Converts a Whisper model from the Hugging Face to the OpenAI format.

    Takes in a Hugging Face Whisper model, extracts its state_dict, renames keys as needed, and then saves
    the model in OpenAI's format.

    Args:
        hf_model: Model providing ``state_dict()`` and ``config``.
        whisper_dump_path: Destination handed to ``torch.save``.
    """

    # Read everything from the model that was passed in, not from a notebook-level
    # global, so the function converts exactly the model the caller asked for.
    state_dict = hf_model.state_dict()

    # Use a reverse mapping to rename state_dict keys
    state_dict = reverse_rename_keys(state_dict)

    # Extract configurations and other necessary metadata
    dims = extract_dims_from_hf(hf_model.config)

    # Remove the proj_out weights from state dictionary; tolerate their absence.
    # (presumably the OpenAI model reuses the token embedding for the output
    # projection - TODO confirm)
    state_dict.pop("proj_out.weight", None)

    # Construct the Whisper checkpoint structure: drop the first "model." from each key
    state_dict = {k.replace("model.", "", 1): v for k, v in state_dict.items()}
    whisper_checkpoint = {"dims": dims, "model_state_dict": state_dict}

    # Save in Whisper's format
    torch.save(whisper_checkpoint, whisper_dump_path)

In [None]:
# Write the fine-tuned model out as an OpenAI-format checkpoint
model_save_file = "whisper-small-finetuned.pt"
convert_tfms_to_openai_whisper(hf_model=model, whisper_dump_path=model_save_file)