## Installation 

In [None]:
# Upgrade pip itself first, then install/upgrade the third-party packages
# listed below (quietly, to keep the cell output short).
!pip install --upgrade --quiet pip
!pip install --upgrade --quiet datasets[audio] transformers accelerate evaluate jiwer tensorboard gradio huggingface_hub

In [None]:
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub from inside the notebook; the training
# configuration later in this notebook sets push_to_hub=True.
notebook_login()

## Load Dataset

### loader

In [None]:
import os
from datasets import Dataset, DatasetDict, Audio

def load_asr_split(folder_path):
    """Build an audio dataset from a folder of paired ``.wav``/``.txt`` files.

    Every ``<name>.wav`` directly inside ``folder_path`` is paired with
    ``<name>.txt`` from the same folder; the stripped contents of the text
    file become the transcript. Recordings without a matching transcript
    file are skipped.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A ``Dataset`` with an "audio" column (file paths cast to
        ``Audio(sampling_rate=16000)``) and a "transcript" column, ordered
        by file name.
    """
    wav_suffix = ".wav"
    data = {"audio": [], "transcript": []}

    for file in sorted(os.listdir(folder_path)):
        if not file.endswith(wav_suffix):
            continue

        # Swap only the trailing extension. str.replace would also rewrite a
        # ".wav" that appears earlier in the name (e.g. "a.wav.take2.wav").
        txt_name = file[: -len(wav_suffix)] + ".txt"
        txt_path = os.path.join(folder_path, txt_name)

        # EAFP: open directly instead of exists()-then-open().
        try:
            with open(txt_path, "r", encoding="utf-8") as f:
                text = f.read().strip()
        except FileNotFoundError:
            # No transcript for this recording: leave it out of the split.
            continue

        data["audio"].append(os.path.join(folder_path, file))
        data["transcript"].append(text)

    dataset = Dataset.from_dict(data)
    return dataset.cast_column("audio", Audio(sampling_rate=16000))


In [None]:
# Assemble the train and test splits from the Kaggle input folders into a
# single DatasetDict.
children_voice = DatasetDict({
    "train": load_asr_split("/kaggle/input/childasr/train"),
    "test": load_asr_split("/kaggle/input/childasr/test"),
})

# Bare expression: as the last statement of a notebook cell it displays the
# DatasetDict.
children_voice


## Load WhisperProcessor (provides the feature extractor)

In [None]:
from transformers import WhisperProcessor

# Load the processor for the "openai/whisper-tiny" checkpoint, configured for
# English transcription. Later cells use its .feature_extractor and
# .tokenizer attributes.
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-tiny",
    language="English",
    task="transcribe"
)

## Load WhisperTokenizer

In [None]:
from transformers import WhisperTokenizer

# Standalone tokenizer for the same checkpoint and language/task settings.
# NOTE(review): no later cell visible in this notebook references `tokenizer`;
# they go through processor.tokenizer instead.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="English", task="transcribe")

## WhisperProcessor

In [None]:
# The package is named `transformers`; the misspelled `transfomer` raises
# ModuleNotFoundError before the processor can be created.
from transformers import WhisperProcessor

# Processor for "openai/whisper-tiny", configured for English transcription.
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny", language="English", task="transcribe")

### Prepare Data

In [None]:
# Peek at the first training example (shown as the cell output).
children_voice["train"][0]

In [None]:
def prepare_dataset(batch):
    """Turn one raw example into the fields used for training.

    Reads ``batch["audio"]`` (its "array" and "sampling_rate" entries) and
    ``batch["transcript"]``, then stores the feature extractor's first
    ``input_features`` entry under "input_features" and the tokenizer's
    ``input_ids`` under "labels". The same ``batch`` object is returned.
    """
    sample = batch["audio"]
    waveform = sample["array"]
    rate = sample["sampling_rate"]

    # Audio -> model input features (one example, so take element 0).
    extracted = processor.feature_extractor(waveform, sampling_rate=rate)
    batch["input_features"] = extracted.input_features[0]

    # Transcript text -> token ids used as training labels.
    encoded = processor.tokenizer(batch["transcript"])
    batch["labels"] = encoded.input_ids

    return batch

In [None]:
# Run prepare_dataset over every split. The columns present in the train
# split are removed afterwards, leaving the fields prepare_dataset adds.
# num_proc=2 requests two worker processes for the mapping.
children_voice = children_voice.map(
    prepare_dataset,
    remove_columns=children_voice["train"].column_names,
    num_proc=2
)

# Training and Evaluation

### Load Pre-Trained Checkpoint

In [None]:
from transformers import WhisperForConditionalGeneration

# Start from the pretrained "openai/whisper-tiny" checkpoint.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

# Configure generation for English transcription and clear any
# forced_decoder_ids stored on the generation config.
model.generation_config.language = "English"
model.generation_config.task = "transcribe"
model.generation_config.forced_decoder_ids = None


### Define a Data Collator

In [None]:
import torch
from dataclasses import dataclass

from typing import Any


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate prepared examples into one padded batch of tensors.

    Each feature is expected to carry "input_features" and "labels" entries
    (as produced by ``prepare_dataset``). Inputs and labels are padded
    separately, with the processor's feature extractor and tokenizer
    respectively.
    """

    # Object exposing `.feature_extractor.pad(...)` and `.tokenizer.pad(...)`.
    # Annotated with typing.Any: the builtin `any` is a function, not a type.
    processor: Any
    # Token id compared against the first label column in __call__.
    decoder_start_token_id: int

    def __call__(self, features):
        """Pad ``features`` and return the batch with a "labels" entry.

        Args:
            features: Sequence of dicts with "input_features" and "labels".

        Returns:
            The padded input batch from the feature extractor, with the
            padded and masked label ids stored under "labels".
        """
        inputs = [{"input_features": f["input_features"]} for f in features]
        batch = self.processor.feature_extractor.pad(inputs, return_tensors="pt")

        labels = [{"input_ids": f["labels"]} for f in features]
        labels_batch = self.processor.tokenizer.pad(labels, return_tensors="pt")

        # Replace padded positions (attention_mask != 1) with -100, the same
        # sentinel compute_metrics later maps back to the pad token.
        # NOTE(review): presumably the loss ignores -100; confirm.
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch.attention_mask.ne(1), -100
        )

        # If every sequence starts with the decoder start token, drop that
        # first column. NOTE(review): presumably the model prepends this
        # token itself during training; confirm.
        if (labels[:, 0] == self.decoder_start_token_id).all():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [None]:
# Instantiate the collator with the shared processor and the model's
# decoder start token id.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

### Evaluation Metrics

In [None]:
import evaluate
# Load the "wer" metric; compute_metrics below reports it scaled by 100.
metric = evaluate.load("wer")

def compute_metrics(pred):
    """Score predictions against labels and return WER as a percentage.

    ``pred.label_ids`` is modified in place: every -100 entry is replaced by
    the tokenizer's pad token id so the labels can be decoded. Predictions
    and labels are both decoded with special tokens skipped.

    Returns:
        ``{"wer": <metric value * 100>}``.
    """
    reference_ids = pred.label_ids
    tok = processor.tokenizer

    # Undo the -100 masking before decoding.
    padding_mask = reference_ids == -100
    reference_ids[padding_mask] = tok.pad_token_id

    hypotheses = tok.batch_decode(pred.predictions, skip_special_tokens=True)
    references = tok.batch_decode(reference_ids, skip_special_tokens=True)

    score = metric.compute(predictions=hypotheses, references=references)
    return {"wer": 100 * score}


### Define the Training Configuration

In [None]:
from transformers import Seq2SeqTrainingArguments

# Hyper-parameters and bookkeeping for the fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-tiny-en",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    max_steps=2000,
    fp16=True,
    # `evaluation_strategy` is the deprecated spelling of this option; the
    # notebook installs the latest transformers, which uses `eval_strategy`.
    eval_strategy="steps",
    # compute_metrics decodes pred.predictions as token ids, so evaluation
    # must call generate() instead of returning raw logits.
    predict_with_generate=True,
    save_steps=500,
    eval_steps=500,
    logging_steps=25,
    push_to_hub=True,
    report_to=["tensorboard"],
)


In [None]:
from transformers import Seq2SeqTrainer

# Wire the model, prepared splits, collator and metric function into the
# trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=children_voice["train"],
    eval_dataset=children_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # `processing_class` supersedes the deprecated `tokenizer` keyword in
    # current transformers releases (the notebook installs the latest).
    processing_class=processor,
)


In [None]:
# Write the processor files into the training output directory; the demo
# pipeline at the end of the notebook loads from that same directory.
processor.save_pretrained(training_args.output_dir)

## Training

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

## Building a Demo

In [None]:
from transformers import pipeline
import gradio as gr

# Build a speech-recognition pipeline from whatever was saved in the
# training output directory.
pipe = pipeline(
    "automatic-speech-recognition",
    model=training_args.output_dir
)

def transcribe(audio):
    """Transcribe one audio file with the speech-recognition pipeline.

    Args:
        audio: Path to the audio file (the demo's audio input uses
            ``type="filepath"``), or ``None`` when nothing was recorded or
            uploaded before submitting.

    Returns:
        The recognised text, or an empty string when no audio was given.
    """
    # Without this guard an empty submission would be forwarded to the
    # pipeline as None and fail there.
    if audio is None:
        return ""
    return pipe(audio)["text"]

# Create and launch the demo UI: an audio input handed to transcribe() as a
# file path, with the returned text shown as the output.
gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Whisper Tiny (Fine-tuned)",
).launch()