In [2]:
!pip install -q datasets transformers accelerate torchaudio librosa

import json
import numpy as np
import torch
from datasets import Dataset, Audio
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# ✅ Load JSONL manually (avoiding filesystem error)
with open("dataset.jsonl", "r") as f:
    data = [json.loads(line) for line in f]

dataset = Dataset.from_list(data)
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

# ✅ Load Whisper processor and model
model_name = "openai/whisper-small"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

# ✅ Preprocessing: pad or truncate to 3000 frames
def preprocess(batch):
    audio = batch["audio"]
    mel = processor.feature_extractor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_tensors="np"
    )["input_features"][0]  # (80, T)

    desired_len = 3000
    current_len = mel.shape[1]
    if current_len < desired_len:
        mel = np.pad(mel, ((0, 0), (0, desired_len - current_len)), mode="constant")
    else:
        mel = mel[:, :desired_len]

    batch["input_features"] = torch.tensor(mel, dtype=torch.float32)

    # ✅ Truncate labels to 448 tokens
    tokens = processor.tokenizer(batch["transcription"]).input_ids
    batch["labels"] = tokens[:448]
    return batch

dataset = dataset.map(preprocess)

# ✅ Custom data collator
class WhisperDataCollator:
    def __call__(self, features):
        input_features = [torch.tensor(f["input_features"]) if not isinstance(f["input_features"], torch.Tensor) else f["input_features"] for f in features]
        labels = [torch.tensor(f["labels"]) for f in features]

        return {
            "input_features": torch.stack(input_features),
            "labels": torch.nn.utils.rnn.pad_sequence(
                labels, batch_first=True, padding_value=-100
            ),
        }

# ✅ Training args
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-finetuned",
    per_device_train_batch_size=1,
    learning_rate=1e-5,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_steps=1,
    save_strategy="epoch",
    fp16=False,
)

# ✅ Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=WhisperDataCollator(),
    tokenizer=processor,  # use the full processor here
)

# ✅ Train
trainer.train()


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m55.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33malahmadfahed[0m ([33malahmadfahed-fahed[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
1,3.3987
2,3.623
3,2.5515
4,3.1896
5,2.1716
6,2.9457
7,2.8469
8,1.9004
9,2.7055
10,1.8219




TrainOutput(global_step=10, training_loss=2.715475845336914, metrics={'train_runtime': 1104.9444, 'train_samples_per_second': 0.009, 'train_steps_per_second': 0.009, 'total_flos': 2885854003200000.0, 'train_loss': 2.715475845336914, 'epoch': 5.0})

In [19]:
!pip install -q transformers torchaudio librosa jiwer

import os
import torch
import torchaudio
from jiwer import wer
from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoProcessor

# Load original processor and fine-tuned model
processor = AutoProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("./whisper-finetuned/checkpoint-10")
model.generation_config.forced_decoder_ids = None
model.eval()

# ✅ Directory where test audio files (.wav) are stored
audio_dir = "tests"
os.makedirs(audio_dir, exist_ok=True)

# ✅ Transcribe each audio file
results = []
for file_name in sorted(os.listdir(audio_dir)):
    if not file_name.endswith(".wav"):
        continue

    file_path = os.path.join(audio_dir, file_name)
    speech_array, sr = torchaudio.load(file_path)
    if sr != 16000:
        speech_array = torchaudio.transforms.Resample(sr, 16000)(speech_array)

    inputs = processor(
        speech_array.squeeze().numpy(),
        sampling_rate=16000,
        return_tensors="pt",
        return_attention_mask=True
    )
    inputs["decoder_input_ids"] = torch.tensor([[50259]])  # 'en' token for Whisper

    # Then generate as usual
    with torch.no_grad():
        predicted_ids = model.generate(
            inputs["input_features"],
            max_length=448
        )
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    results.append((file_name, transcription))
    print(f"{file_name} → {transcription}")

output_path = "inference_results.txt"
with open(output_path, "w") as f:
    for fname, text in results:
        f.write(f"{fname}\t{text.strip()}\n")

print(f"\n✅ Transcriptions saved to: {output_path}")


test1.wav →  Do you have any questions? Here, this is our topic again, because afterwards, just imagine, and conclusion, our key findings are
test2.wav →  They are on slide. I'm not sure. And I mean, most of the floors are just to look nice.
test3.wav →  Were you the ones who were watching this? Yes, yes. You have to put your hands over your ears. Okay, bye.

✅ Transcriptions saved to: inference_results.txt


In [20]:
from jiwer import wer

# Load predictions
with open("inference_results.txt", "r") as f:
    pred_dict = dict(line.strip().split("\t", 1) for line in f if line.strip())

# Load ground truth
with open("ground_truth.txt", "r") as f:
    truth_dict = dict(line.strip().split("\t", 1) for line in f if line.strip())

# Compute WER per file
total_wer = 0
for fname in pred_dict:
    pred = pred_dict[fname].lower()
    truth = truth_dict.get(fname, "").lower()
    file_wer = wer(truth, pred)
    total_wer += file_wer
    print(f"{fname}: WER = {file_wer:.3f}")

# Average WER
avg_wer = total_wer / len(pred_dict)
print(f"\nAverage WER: {avg_wer:.3f}")

test1.wav: WER = 0.368
test2.wav: WER = 0.294
test3.wav: WER = 1.750

Average WER: 0.804
