# Speech-to-Text with Whisper Transfer Learning

**Objective:** Fine-tune a Whisper base model on the United-Syn-Med dataset to improve medical speech transcription accuracy in a live teleconsultation context.

In [1]:
# Installing required packages

!pip install git+https://github.com/openai/whisper.git
!pip install jiwer datasets torchaudio transformers accelerate soundfile

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-piic1u2b
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-piic1u2b
  Resolved https://github.com/openai/whisper.git to commit dd985ac4b90cafeef8712f2998d62c59c3e62d22
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->openai-whisper==20240930)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->openai-whisper==20240930)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch->openai-whisper==20240930)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-many

In [2]:
# import dependent libraries

import os
import torch
import whisper
import pandas as pd
import soundfile as sf
from datasets import Dataset, DatasetDict
from jiwer import wer, cer
from transformers import WhisperProcessor, WhisperForConditionalGeneration, TrainingArguments, Trainer
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import torchaudio
from glob import glob
from tqdm import tqdm  # for progress bar

2025-06-14 17:32:02.587986: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749922323.043193      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749922323.170143      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [16]:
# Preview the input directory: print the first three file paths found under /kaggle/input
n = 0
for dirname, _, filenames in os.walk('/kaggle/input'):
    # Only take as many files from this directory as are still needed to reach three
    for filename in filenames[:3 - n]:
        print(os.path.join(dirname, filename))
        n += 1
    if n >= 3:
        break

/kaggle/input/unitedsnymedsmall/unitedsynmed_small/transcript/validation.csv
/kaggle/input/unitedsnymedsmall/unitedsynmed_small/transcript/train.csv
/kaggle/input/unitedsnymedsmall/unitedsynmed_small/transcript/test.csv


In [17]:
# Paths to the dataset
# NOTE: the "unitedsnymedsmall" spelling matches the directory names printed by the
# file listing earlier in this notebook; it is the real path, not a typo to correct.
audio_root = "/kaggle/input/unitedsnymedsmall/unitedsynmed_small/audio"
# Directory holding the per-split transcript CSVs; load_split joins "<split>.csv" onto it.
transcript_root = "/kaggle/input/unitedsnymedsmall/unitedsynmed_small/transcript/"

# Load CSVs and match them with audio paths
def load_split(split):
    """Read the transcript CSV for one split and attach the audio path of every row.

    Args:
        split: Split name; used both as the CSV base name ("<split>.csv" under
            ``transcript_root``) and as the sub-directory under ``audio_root``.

    Returns:
        The DataFrame read from the CSV with an extra "path" column, built by joining
        ``audio_root``, the split name and each row's "file_name" value.
    """
    transcripts = pd.read_csv(os.path.join(transcript_root, f"{split}.csv"))
    split_audio_dir = os.path.join(audio_root, split)

    def to_audio_path(file_name):
        return os.path.join(split_audio_dir, file_name)

    transcripts["path"] = transcripts["file_name"].map(to_audio_path)
    return transcripts

# Read the three splits into DataFrames (same order as before: train, test, validation)
train_df, test_df, val_df = (
    load_split(name) for name in ("train", "test", "validation")
)

# Wrap each DataFrame as a Hugging Face Dataset, keyed by its split name
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in (
        ("train", train_df),
        ("test", test_df),
        ("validation", val_df),
    )
})


In [18]:
# Inspect the first five training rows (file name, transcription, resolved audio path)
dataset["train"][:5]

{'file_name': ['drug-male-0b01f9d4-980d-451f-a8f1-18e899158859.wav',
  'drug-male-d58aac86-05d3-40ea-a61d-e1cbb7f3e790.wav',
  'drug-female-06c23421-e597-4cf4-a912-1d44c187a4f3.wav',
  'drug-male-9300288f-77c3-4c42-a0f6-166877f7f965.wav',
  'drug-female-86945722-12e1-4983-bf51-6aa27b196dc9.wav'],
 'transcription': ['Iron calx is a commonly used medicine to treat iron deficiency anemia.',
  'If you experience nausea or vomiting, DOMPAR may help alleviate your symptoms.',
  'AGROBEN-I is a reliable medicine for treating infections in plants.',
  "Make sure to follow your healthcare provider's instructions carefully while taking FEVIBID for optimal results.",
  'Clinical trials have shown favorable results with maralixibat chloride in pediatric patients.'],
 'path': ['/kaggle/input/unitedsnymedsmall/unitedsynmed_small/audio/train/drug-male-0b01f9d4-980d-451f-a8f1-18e899158859.wav',
  '/kaggle/input/unitedsnymedsmall/unitedsynmed_small/audio/train/drug-male-d58aac86-05d3-40ea-a61d-e1cbb7f3

In [19]:
# Pick the compute device: the CUDA GPU when PyTorch can see one, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [20]:
# Report which device was selected so the run log records whether a GPU was used
print(f"Using device: {device}")

Using device: cuda


In [None]:

# # Load Whisper processor
# processor = WhisperProcessor.from_pretrained("openai/whisper-base")

# # Set target sample rate
# target_sample_rate = 16000

# def preprocess(batch):
#     audio_input, sr = sf.read(batch["path"])
    
#     # If the sample rate is not 16kHz, resample it
#     # if sr != target_sample_rate:
#     waveform = torch.tensor(audio_input, dtype=torch.float32).float().to(device) 
#     if len(waveform.shape) > 1 and waveform.shape[0] > 1:
#         waveform = waveform.mean(dim=0)  # Convert to mono
#     resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sample_rate).to(device)
#     audio_input = resampler(waveform).cpu().numpy()
    
#     inputs = processor(audio_input, sampling_rate=target_sample_rate, return_tensors="pt").to(device)
#     batch["input_features"] = inputs.input_features[0].to
#     batch["labels"] = processor.tokenizer(batch["transcription"]).input_ids
#     return batch

# # Apply preprocessing
# dataset = dataset.map(preprocess)

In [23]:
import torchaudio
from transformers import WhisperProcessor
import torch

# Initialize processor: load the pretrained "openai/whisper-base" WhisperProcessor.
# preprocess() below reads this module-level `processor`.
processor = WhisperProcessor.from_pretrained("openai/whisper-base")
# NOTE(review): this rebinds `device` to a plain string ("cuda"/"cpu"); the earlier
# device cell bound the same name to a torch.device object.
device = "cuda" if torch.cuda.is_available() else "cpu"

def preprocess(batch):
    """Turn one dataset row into Whisper training inputs.

    Reads the audio file at ``batch["path"]``, checks that it is sampled at 16 kHz,
    down-mixes it to mono, and runs it through the module-level ``processor`` to get
    the model's input features. The row's transcription is tokenized into label ids.

    Everything stays on the CPU: the processor consumes a NumPy array, so moving the
    waveform to the GPU only to copy it straight back added a transfer per example
    without speeding anything up.

    Args:
        batch: A single example with at least the keys "path" (audio file path) and
            "transcription" (reference text).

    Returns:
        The same ``batch`` mapping, updated in place with:
            "input_features": NumPy array taken from ``processor(...).input_features[0]``.
            "labels": token ids from ``processor.tokenizer(batch["transcription"])``.

    Raises:
        ValueError: If the audio file's sample rate is not 16000 Hz.
    """
    # 1. Load audio file (the code below treats dim 0 of the tensor as the channel axis)
    waveform, sr = torchaudio.load(batch["path"])

    # 2. Fail fast on an unexpected sample rate; the processor call below declares 16 kHz
    if sr != 16000:
        raise ValueError(f"Invalid sample rate {sr}Hz (expected 16000Hz)")

    # 3. Convert to mono if needed by averaging the channels
    if waveform.dim() > 1 and waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # 4. Drop only the channel axis; a bare squeeze() would also collapse a
    #    one-sample time axis into a 0-d array
    audio_array = waveform.squeeze(0).numpy()

    # 5. Generate features
    inputs = processor(
        audio_array,
        sampling_rate=16000,
        return_tensors="pt"
    )

    # 6. Prepare output (each value is assigned exactly once)
    batch["input_features"] = inputs.input_features[0].numpy()
    batch["labels"] = processor.tokenizer(batch["transcription"]).input_ids

    return batch
        

# Apply preprocessing to every split of the DatasetDict.
# No `batched=True` is passed, so preprocess is presumably called with one example at a time.
# NOTE: this rebinds `dataset`; the un-preprocessed DatasetDict is no longer reachable by that name.
dataset = dataset.map(preprocess)

Map:   0%|          | 0/9500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

In [24]:
# Save the preprocessed dataset so the feature extraction above does not have to be
# repeated after a kernel restart (see the reload cell that follows).

# Output directory inside Kaggle's working directory
output_dir = "/kaggle/working/preprocessed_dataset"

# Create the directory if it doesn't exist (exist_ok=True makes re-running this cell safe)
os.makedirs(output_dir, exist_ok=True)

# Write all splits of the DatasetDict to disk
dataset.save_to_disk(output_dir)

# Confirm where the data was written
print(f"✅ Dataset saved to {output_dir}")

Saving the dataset (0/19 shards):   0%|          | 0/9500 [00:00<?, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

✅ Dataset saved to /kaggle/working/preprocessed_dataset


In [None]:
# # Reload the dataset in case the kernel dies

# from datasets import load_from_disk

# # Load the saved dataset
# dataset = load_from_disk("/kaggle/working/preprocessed_dataset")

# # Verify it loaded correctly
# print(dataset)
# print(dataset["train"][0])  # Check a sample

In [25]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate preprocessed speech examples into one padded training batch.

    Audio features and label token ids are padded separately, because they go
    through different components of the processor (feature extractor vs. tokenizer).
    """

    # Object exposing `.feature_extractor` and `.tokenizer`, each providing a `pad` method.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples and build the label tensor.

        Args:
            features: Examples, each with "input_features" (audio features) and
                "labels" (token ids of the reference transcription).

        Returns:
            The batch returned by ``feature_extractor.pad`` with an added "labels"
            entry, in which every padding position is replaced by -100 (the label
            value conventionally ignored by the loss).
        """
        input_features = [{"input_features": f["input_features"]} for f in features]
        label_features = [{"input_ids": f["labels"]} for f in features]

        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        label_ids = labels_batch["input_ids"]

        # Identify padding from the attention mask rather than by comparing token ids
        # to pad_token_id. When the tokenizer reuses its end-of-text token as the pad
        # token (as the Whisper tokenizer does, per the Hugging Face fine-tuning guide),
        # an id comparison would also mask the genuine final token of every transcript,
        # and the model would never be trained to stop.
        attention_mask = labels_batch.get("attention_mask")
        if attention_mask is not None:
            padding_positions = attention_mask.ne(1)
        else:
            # Fallback for tokenizers that return no attention mask: previous behavior.
            padding_positions = label_ids == self.processor.tokenizer.pad_token_id

        batch["labels"] = label_ids.masked_fill(padding_positions, -100)

        return batch

In [26]:
# Load the pretrained Whisper base checkpoint
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")

# Freeze encoder layers: turn off gradients for every encoder parameter so that
# training only updates the remaining (non-encoder) weights
for encoder_weight in model.model.encoder.parameters():
    encoder_weight.requires_grad_(False)

config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/290M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.81k [00:00<?, ?B/s]

In [28]:

# Training configuration: 5 epochs, evaluating and checkpointing at the end of each epoch
training_args = TrainingArguments(
    output_dir="./whisper-medical",
    per_device_train_batch_size=8,
    eval_strategy="epoch",  # Changed from evaluation_strategy
    save_strategy="epoch",
    num_train_epochs=5,
    logging_dir="./logs",
    learning_rate=1e-4,
    warmup_steps=500,
    fp16=True,
    push_to_hub=False,
)

# Pads audio features and label ids separately for each batch
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    # Monitor training on the validation split; the test split stays untouched
    # so it can serve as an unbiased final evaluation.
    eval_dataset=dataset["validation"],
    # `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
    processing_class=processor.feature_extractor,
    data_collator=data_collator,
)

  trainer = Trainer(
