In [1]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
from datasets import load_dataset, load_metric, DatasetDict, Audio
from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import evaluate
from functools import partial

In [3]:
# Load the entire dataset
dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")

# Split the dataset into train and test subsets
train_test_split = dataset.train_test_split(test_size=0.2)

# Create a DatasetDict to store train and test datasets
common_voice = DatasetDict()
common_voice["train"] = train_test_split["train"]
common_voice["test"] = train_test_split["test"]

common_voice = common_voice.select_columns(["audio", "transcription"])
print(common_voice)

Found cached dataset minds14 (C:/Users/Jason Lee/.cache/huggingface/datasets/PolyAI___minds14/en-US/1.0.0/dbb7ed8d7a009916cc6561b16095b37bb4815461a20c26fb2c2d37a634bb9e37)


DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 450
    })
    test: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 113
    })
})


In [4]:
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny", language="english", task="transcribe")

sampling_rate = processor.feature_extractor.sampling_rate
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=sampling_rate))

In [5]:
def prepare_dataset(example):
    audio = example["audio"]
    example = processor(
        audio=audio["array"],
        sampling_rate=audio["sampling_rate"],
        text=example["transcription"],
    )
    example["input_length"] = len(audio["array"]) / audio["sampling_rate"]
    return example

In [6]:
common_voice = common_voice.map(prepare_dataset, remove_columns=['audio', 'transcription'], num_proc=1)

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/113 [00:00<?, ? examples/s]

In [7]:
max_input_length = 30.0

def is_audio_in_length_range(length):
    return length < max_input_length

In [8]:
common_voice["train"] = common_voice["train"].filter(is_audio_in_length_range, input_columns=["input_length"])

Filter:   0%|          | 0/450 [00:00<?, ? examples/s]

In [9]:
class DataCollatorSpeechSeq2SeqWithPadding:
    def __init__(self, processor): 
        self.processor = processor

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_features": feature["input_features"][0]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [10]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
metric = evaluate.load("wer")
normalizer = BasicTextNormalizer()

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    wer_ortho = 100 * metric.compute(predictions=pred_str, references=label_str)
    pred_str_norm = [normalizer(pred) for pred in pred_str]
    label_str_norm = [normalizer(label) for label in label_str]
    pred_str_norm = [pred_str_norm[i] for i in range(len(pred_str_norm)) if len(label_str_norm[i]) > 0]
    label_str_norm = [label_str_norm[i] for i in range(len(label_str_norm)) if len(label_str_norm[i]) > 0]

    wer = 100 * metric.compute(predictions=pred_str_norm, references=label_str_norm)
    return {"wer_ortho": wer_ortho, "wer": wer}

In [11]:
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

In [12]:
model.config.use_cache = False
model.generate = partial(model.generate, language="english", task="transcribe", use_cache=True)

In [13]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-tiny-english",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    lr_scheduler_type="constant_with_warmup",
    warmup_steps=50,
    max_steps=350,
    gradient_checkpointing=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=500,
    eval_steps=500,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,  # Set to False to avoid pushing to hub
    remove_unused_columns=False
)

In [14]:
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor,
)

In [15]:
trainer.train()



  0%|          | 0/350 [00:00<?, ?it/s]

{'loss': 2.9446, 'learning_rate': 5e-06, 'epoch': 0.89}
{'loss': 1.1001, 'learning_rate': 1e-05, 'epoch': 1.79}
{'loss': 0.3606, 'learning_rate': 1e-05, 'epoch': 2.68}
{'loss': 0.2592, 'learning_rate': 1e-05, 'epoch': 3.57}
{'loss': 0.1574, 'learning_rate': 1e-05, 'epoch': 4.46}
{'loss': 0.0871, 'learning_rate': 1e-05, 'epoch': 5.36}
{'loss': 0.0541, 'learning_rate': 1e-05, 'epoch': 6.25}
{'loss': 0.0248, 'learning_rate': 1e-05, 'epoch': 7.14}
{'loss': 0.0131, 'learning_rate': 1e-05, 'epoch': 8.04}
{'loss': 0.0054, 'learning_rate': 1e-05, 'epoch': 8.93}
{'loss': 0.0037, 'learning_rate': 1e-05, 'epoch': 9.82}
{'loss': 0.0023, 'learning_rate': 1e-05, 'epoch': 10.71}
{'loss': 0.0015, 'learning_rate': 1e-05, 'epoch': 11.61}
{'loss': 0.0013, 'learning_rate': 1e-05, 'epoch': 12.5}
{'train_runtime': 14153.8302, 'train_samples_per_second': 0.396, 'train_steps_per_second': 0.025, 'train_loss': 0.3582215352888618, 'epoch': 12.5}


TrainOutput(global_step=350, training_loss=0.3582215352888618, metrics={'train_runtime': 14153.8302, 'train_samples_per_second': 0.396, 'train_steps_per_second': 0.025, 'train_loss': 0.3582215352888618, 'epoch': 12.5})