## Installation

In [4]:
!pip install --upgrade --quiet pip
!pip install --upgrade --quiet datasets[audio] transformers accelerate evaluate jiwer tensorboard gradio huggingface_hub

In [1]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Load Dataset

### loader

In [19]:
import os
from datasets import Dataset, DatasetDict, Audio

def load_asr_split(folder_path):
    data = {"audio": [], "transcript": []}

    for file in sorted(os.listdir(folder_path)):
        if file.endswith(".wav"):
            txt = file.replace(".wav", ".txt")
            txt_path = os.path.join(folder_path, txt)

            if os.path.exists(txt_path):
                with open(txt_path, "r", encoding="utf-8") as f:
                    text = f.read().strip()

                data["audio"].append(os.path.join(folder_path, file))
                data["transcript"].append(text)

    dataset = Dataset.from_dict(data)
    return dataset.cast_column("audio", Audio(sampling_rate=16000))


In [3]:
children_voice = DatasetDict({
    "train": load_asr_split("/content/data/train"),
    "test": load_asr_split("/content/data/test"),
})

children_voice

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcript'],
        num_rows: 7
    })
    test: Dataset({
        features: ['audio', 'transcript'],
        num_rows: 2
    })
})

## Load WhisperFeatureExtractor

In [4]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained(
    "openai/whisper-tiny",
    language="English",
    task="transcribe"
)

## Load WhisperTokenizer

In [5]:
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="English", task="transcribe")

## WhisperProcessor

In [6]:
from transformers import WhisperProcessor
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny", language="English", task="transcribe")

### Prepare Data

In [7]:
children_voice["train"][0]

{'audio': <datasets.features._torchcodec.AudioDecoder at 0x7e83417f7fe0>,
 'transcript': 'She jumped rope'}

In [8]:
def prepare_dataset(batch):
    audio = batch["audio"]

    batch["input_features"] = processor.feature_extractor(
        audio["array"],
        sampling_rate=audio["sampling_rate"]
    ).input_features[0]

    batch["labels"] = processor.tokenizer(batch["transcript"]).input_ids
    return batch

In [9]:
children_voice = children_voice.map(
    prepare_dataset,
    remove_columns=children_voice["train"].column_names,
    num_proc=2
)

Map (num_proc=2):   0%|          | 0/7 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/2 [00:00<?, ? examples/s]

# Training and Evaluation

### Load Pre-Trained Checkpoint

In [10]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

model.generation_config.language = "English"
model.generation_config.task = "transcribe"
model.generation_config.forced_decoder_ids = None


### Define a Data Collator

In [11]:
import torch
from dataclasses import dataclass

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: any
    decoder_start_token_id: int

    def __call__(self, features):
        inputs = [{"input_features": f["input_features"]} for f in features]
        batch = self.processor.feature_extractor.pad(inputs, return_tensors="pt")

        labels = [{"input_ids": f["labels"]} for f in features]
        labels_batch = self.processor.tokenizer.pad(labels, return_tensors="pt")

        labels = labels_batch["input_ids"].masked_fill(
            labels_batch.attention_mask.ne(1), -100
        )

        if (labels[:, 0] == self.decoder_start_token_id).all():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [12]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

### Evaluation Metrics

In [13]:
import evaluate
metric = evaluate.load("wer")

def compute_metrics(pred):
    label_ids = pred.label_ids
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    return {"wer": 100 * metric.compute(predictions=pred_str, references=label_str)}


Downloading builder script: 0.00B [00:00, ?B/s]

### Define the Training Configuration

In [16]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-tiny-en",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    max_steps=2000,
    fp16=True,
    #evaluation_strategy="steps",
    save_steps=500,
    eval_steps=500,
    logging_steps=25,
    push_to_hub=False, # Changed to False to prevent the 403 error
    report_to=["tensorboard"],
)


In [17]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=children_voice["train"],
    eval_dataset=children_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor,
)


  trainer = Seq2SeqTrainer(


In [None]:
processor.save_pretrained(training_args.output_dir)

## Training

In [18]:
trainer.train()

You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,5.1274
50,2.1529
75,1.6096
100,0.2457
125,0.0002
150,0.0001
175,0.0
200,0.0
225,0.0
250,0.0




TrainOutput(global_step=2000, training_loss=0.11421347099468403, metrics={'train_runtime': 1870.9089, 'train_samples_per_second': 17.104, 'train_steps_per_second': 1.069, 'total_flos': 3.4466439168e+17, 'train_loss': 0.11421347099468403, 'epoch': 2000.0})

## Building a Demo

In [22]:
from transformers import pipeline
import gradio as gr
import os

# Construct the path to the final model checkpoint
# Assuming the last checkpoint is checkpoint-2000 based on max_steps=2000
model_path = os.path.join(training_args.output_dir, "checkpoint-2000")

pipe = pipeline(
    "automatic-speech-recognition",
    model=model_path
)

def transcribe(audio):
    return pipe(audio)["text"]

gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"), # Changed 'type' to 'filepath' for file uploads
    outputs="text",
    title="Whisper Tiny (Fine-tuned)",
).launch()

Device set to use cuda:0


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a6440912fc2393889d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


