In [1]:
import torch
from transformers import (
    pipeline,
    WhisperProcessor,
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from audio_desilencer.audio_processor import AudioProcessor
from torch.nn.utils import prune
import io
from datasets import Audio
from datasets import load_dataset, DatasetDict
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import evaluate


## Fine-tuning Whisper (tiny) on Common Voice 17.0 (Portuguese)

In [2]:
# Assemble the Portuguese Common Voice splits: the published train and
# validation splits are merged for training, the test split is held out.
common_voice = DatasetDict(
    {
        "train": load_dataset(
            "mozilla-foundation/common_voice_17_0",
            "pt",
            split="train+validation",
            trust_remote_code=True,
        ),
        "test": load_dataset(
            "mozilla-foundation/common_voice_17_0",
            "pt",
            split="test",
            trust_remote_code=True,
        ),
    }
)

# Drop the listed metadata columns, then have the audio column decoded at
# 16 kHz (the sampling rate passed to the feature extractor later on).
common_voice = (
    common_voice
    .remove_columns(
        [
            "accent",
            "age",
            "client_id",
            "down_votes",
            "gender",
            "locale",
            "path",
            "segment",
            "up_votes",
        ]
    )
    .cast_column("audio", Audio(sampling_rate=16_000))
)



In [3]:
# Pre-processing components, all loaded from the same "openai/whisper-tiny"
# checkpoint. The tokenizer and the processor are configured for Portuguese
# transcription.
feature_extractor: WhisperFeatureExtractor = WhisperFeatureExtractor.from_pretrained(
    "openai/whisper-tiny",
    padding=True,
)
tokenizer: WhisperTokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-tiny",
    language="pt",
    task="transcribe",
    padding=True,
)
processor: WhisperProcessor = WhisperProcessor.from_pretrained(
    "openai/whisper-tiny",
    language="pt",
    task="transcribe",
    padding=True,
)

In [16]:
def prepare_dataset(batch):
    """Turn a batch of raw examples into model inputs and label token ids.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        batch: Mapping with an ``"audio"`` column (each entry exposing an
            ``"array"`` waveform) and a ``"sentence"`` column of transcripts.

    Returns:
        The same ``batch`` mapping with two added columns:
        ``"input_features"`` (one feature tensor per example, padded or
        truncated to 30 s of audio) and ``"labels"`` (one *unpadded* list of
        token ids per example, truncated to at most 448 tokens).
    """
    # Audio: pad/truncate every waveform to 30 s so all examples share a shape.
    input_features = feature_extractor(
        [audio["array"] for audio in batch["audio"]],
        padding="max_length",
        sampling_rate=16000,
        return_tensors="pt",
        max_length=480000,  # 30 seconds of audio at 16 kHz
        truncation=True,
    ).input_features

    # Text: tokenize WITHOUT padding. Padding is the data collator's job: it
    # pads each training batch to its longest label and uses the resulting
    # attention mask to replace padding with -100. If every label were already
    # padded to the same fixed length here, the collator would have nothing to
    # pad, its attention mask would be all ones, and the loss would be computed
    # on the padding tokens as well.
    labels = tokenizer(
        batch["sentence"],
        max_length=448,
        truncation=True,
    ).input_ids

    # One entry per example, as expected by a batched `map`.
    batch["input_features"] = list(input_features)
    batch["labels"] = labels

    return batch

In [17]:
# Run `prepare_dataset` over every split in batches, replacing the original
# columns (taken from the train split) with the columns the function adds.
common_voice = common_voice.map(
    prepare_dataset,
    batched=True,
    batch_size=32,  # Adjust this value based on your memory constraints
    num_proc=4,
    remove_columns=common_voice.column_names["train"],
)

In [6]:
# Load the pretrained checkpoint that will be fine-tuned.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
# Generate Portuguese transcriptions (same language/task given to the tokenizer).
model.generation_config.language = "pt"
model.generation_config.task = "transcribe"

# Clear any forced decoder ids stored in the generation config.
# NOTE(review): presumably so the language/task settings above steer generation — confirm.
model.generation_config.forced_decoder_ids = None

In [7]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate prepared examples into a batch of input features and labels.

    Input features are stacked as-is; labels are padded to the longest
    sequence in the batch and their padding positions are set to -100.
    """

    # Processor whose tokenizer is used to pad the label sequences.
    processor: WhisperProcessor
    # Token id that is stripped from the front of the labels when every
    # sequence in the batch starts with it.
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build one batch.

        Args:
            features: Examples, each with ``"input_features"`` (a tensor or a
                nested list) and ``"labels"`` (token ids).

        Returns:
            Dict with ``"input_features"`` (stacked tensor) and ``"labels"``
            (padded tensor with -100 on padding positions).
        """
        # Audio side: make sure every example is a tensor, then stack.
        spectrograms = []
        for example in features:
            feats = example["input_features"]
            if isinstance(feats, list):
                feats = torch.tensor(feats)
            spectrograms.append(feats)

        batch = {"input_features": torch.stack(spectrograms)}

        # Text side: let the tokenizer pad the label sequences to equal length.
        padded = self.processor.tokenizer.pad(
            [{"input_ids": example["labels"]} for example in features],
            padding=True,
        )
        token_ids = torch.tensor(padded["input_ids"])
        is_padding = torch.tensor(padded["attention_mask"]).ne(1)

        # -100 on padding positions so they are ignored by the loss.
        labels = token_ids.masked_fill(is_padding, -100)

        # If every sequence already begins with the start token, drop it here
        # because it is added again later.
        starts_with_bos = (labels[:, 0] == self.decoder_start_token_id).all().cpu().item()
        if starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [8]:
# The collator strips the first label token only when it equals
# `decoder_start_token_id`, so it must be given the model's decoder start
# token. Passing the tokenizer's pad token id instead makes that check compare
# against the wrong token, and the start token is left in the labels.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [9]:
# Load the "wer" metric; `compute_metrics` calls `metric.compute` on decoded text.
metric = evaluate.load("wer")

In [10]:
def compute_metrics(pred):
    """Decode predictions and references and score them with the WER metric.

    Args:
        pred: Prediction object exposing ``predictions`` (generated token
            ids) and ``label_ids`` (reference token ids, with -100 on
            ignored positions).

    Returns:
        ``{"wer": value}`` where ``value`` is the metric result times 100.
    """
    predicted_ids = pred.predictions
    reference_ids = pred.label_ids

    # -100 cannot be decoded; swap it (in place) for the pad token id so
    # those positions are dropped with the other special tokens below.
    ignored_positions = reference_ids == -100
    reference_ids[ignored_positions] = tokenizer.pad_token_id

    hypotheses = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    references = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    score = metric.compute(predictions=hypotheses, references=references)
    return {"wer": 100 * score}



In [11]:
args = Seq2SeqTrainingArguments(
    # Output and reporting
    output_dir="./whisper-tiny-pt-bpra",
    report_to=["tensorboard"],
    logging_steps=25,
    push_to_hub=True,
    # Optimisation schedule
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    # Memory / speed trade-offs
    gradient_checkpointing=True,
    fp16=True,
    # Evaluation: generate transcriptions every 1000 steps
    eval_strategy="steps",
    eval_steps=1000,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    # Checkpointing: save on the same cadence and keep the lowest-WER model
    save_steps=1000,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
)

In [12]:
# Wire the model, the prepared splits, the collator and the WER callback
# into a single trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=processor.feature_extractor,
)


In [13]:
# Run fine-tuning with the arguments configured on `trainer`.
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss


KeyboardInterrupt: 