In [None]:
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q transformers datasets jiwer torchaudio soundfile accelerate evaluate tqdm pandas audiomentations

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.1/86.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.4/109.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m248.5/248.5 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os, re, json, requests, torch
import pandas as pd
from tqdm import tqdm
from datasets import Dataset, load_dataset
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import evaluate
import torchaudio

# Locations on the mounted Drive: input CSV, fine-tuned model output, and the
# folders that hold downloaded audio / transcription files.
CSV_PATH = "/content/drive/MyDrive/hindi_asr_finetuning/FT_data.csv"
OUTPUT_DIR = "/content/drive/MyDrive/hindi_asr_finetuning/whisper-small-hi-ft"
AUDIO_DIR = "/content/drive/MyDrive/hindi_asr_finetuning/Data/audio"
TEXT_DIR = "/content/drive/MyDrive/hindi_asr_finetuning/Data/text"

# Make sure both download folders exist before anything is written to them.
for _data_dir in (AUDIO_DIR, TEXT_DIR):
    os.makedirs(_data_dir, exist_ok=True)

# Report whether CUDA is usable and, if so, which device is attached.
_cuda_ok = torch.cuda.is_available()
print("GPU available:", _cuda_ok)
if _cuda_ok:
    print("GPU:", torch.cuda.get_device_name(0))


GPU available: True
GPU: Tesla T4


In [None]:
def download_file(url, out_path):
    """Download ``url`` to ``out_path`` unless ``out_path`` already exists.

    The response is streamed into ``<out_path>.part`` and renamed onto
    ``out_path`` only after the whole body has been written. An interrupted
    download therefore never leaves a truncated file that the existence check
    below would later mistake for a finished one.

    Args:
        url: HTTP(S) address of the file.
        out_path: Destination path on disk.

    Returns:
        ``out_path`` when the file is present (already cached or freshly
        downloaded), ``None`` when the download failed. Failures are printed,
        not raised, so one bad URL does not stop a batch.
    """
    if os.path.exists(out_path):
        return out_path

    tmp_path = f"{out_path}.part"
    try:
        # stream=True avoids holding the whole audio file in memory.
        with requests.get(url, timeout=60, stream=True) as r:
            r.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
        # Atomic rename: out_path appears only once it is complete.
        os.replace(tmp_path, out_path)
        return out_path
    except Exception as e:
        print("Download failed:", url, e)
        # Best-effort cleanup of the partial file.
        try:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        except OSError:
            pass
        return None

def _first_url(row, *columns):
    """Return the first non-empty string among ``columns`` of ``row``, else None."""
    for column in columns:
        value = row.get(column)
        # An empty CSV cell is NaN, which is a *truthy* float; an `a or b`
        # chain would stop on it, so test for a real string instead.
        if isinstance(value, str) and value.strip():
            return value.strip()
    return None


def _read_transcript(path):
    """Return the transcript text stored in the JSON file at ``path``, or ""."""
    try:
        with open(path, "r", encoding="utf-8") as f:
            payload = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both invalid JSON and undecodable bytes.
        print("Could not read transcription:", path, e)
        return ""

    if isinstance(payload, list):
        # A list of segments: join the "text" field of each segment.
        parts = [seg.get("text", "") for seg in payload if isinstance(seg, dict)]
        return " ".join(p for p in parts if isinstance(p, str))
    if isinstance(payload, dict):
        value = payload.get("text")
        return value if isinstance(value, str) else ""
    return ""


# Build the training manifest: one record per recording that has both a local
# audio file and a non-empty transcript.
df = pd.read_csv(CSV_PATH)
records = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    rec_id = str(row["recording_id"])
    audio_url = _first_url(row, "rec_url_gcp", "rec_url", "rec_url_s3")
    trans_url = _first_url(row, "transcription_url", "transcription_url_gcp")

    audio_local = os.path.join(AUDIO_DIR, f"{rec_id}.wav")
    text_local = os.path.join(TEXT_DIR, f"{rec_id}.json")

    if audio_url:
        download_file(audio_url, audio_local)
    if trans_url:
        download_file(trans_url, text_local)

    # Prefer the downloaded transcription file ...
    text = _read_transcript(text_local) if os.path.exists(text_local) else ""

    # ... and fall back to an inline "text" column, skipping NaN cells so the
    # literal string "nan" never becomes a training label.
    if not text and "text" in row and pd.notna(row["text"]):
        text = str(row["text"])

    if os.path.exists(audio_local) and text.strip():
        records.append({"audio_path": audio_local, "text": text.strip()})

manifest_df = pd.DataFrame(records)
print("Total usable samples:", len(manifest_df))


100%|██████████| 104/104 [14:20<00:00,  8.27s/it]

Total usable samples: 104





In [None]:
manifest_csv = "/content/drive/MyDrive/hindi_asr_finetuning/processed_manifest.csv"

# No other cell writes this file, so on a fresh run create it from the manifest
# built above. An existing file is left untouched.
if not os.path.exists(manifest_csv):
    manifest_df.to_csv(manifest_csv, index=False)

# Reload the manifest from disk and show its size, columns and first rows.
df = pd.read_csv(manifest_csv)
print("Rows:", len(df))
print("Columns:", list(df.columns))
df.head(10)

Rows: 104
Columns: ['audio_path', 'text']


Unnamed: 0,audio_path,text
0,/content/drive/MyDrive/hindi_asr_finetuning/Da...,अब काफी अच्छा होता है क्योंकि उनकी जनसंख्या बह...
1,/content/drive/MyDrive/hindi_asr_finetuning/Da...,जी जी जी जी जी । जी जी जी हां उधर हां जी हा हा...
2,/content/drive/MyDrive/hindi_asr_finetuning/Da...,लेकिन हम लोग इसे छुपछुप के लोगों के घर जाकर खे...
3,/content/drive/MyDrive/hindi_asr_finetuning/Da...,जी जी जी जी जी मेरे तो जैसे बहुत सारी यादे हैं...
4,/content/drive/MyDrive/hindi_asr_finetuning/Da...,हां जी पहले बात करते हैं विवाह की तो इस मुवी म...
5,/content/drive/MyDrive/hindi_asr_finetuning/Da...,जी जी जी। जी जी जीजी जी जी जी जी जी बिल्कुल अच...
6,/content/drive/MyDrive/hindi_asr_finetuning/Da...,हेलो हम्म हां जी जी आवाज आ रही है अब जी। जी जी...
7,/content/drive/MyDrive/hindi_asr_finetuning/Da...,नहीं ऐसी तो कोई कहानी नहीं बोल्ड आउटफिट को लेक...
8,/content/drive/MyDrive/hindi_asr_finetuning/Da...,हम सर लता मंगेशकर के गाना सुनना पसंद करते हैं ...
9,/content/drive/MyDrive/hindi_asr_finetuning/Da...,लता मंगेश्वर के गाने सुनना पसंद करते हैं हां व...


In [None]:
def _is_repeat_of(word: str, unit: str) -> bool:
    """Return True when ``word`` is ``unit`` written one or more times in a row."""
    if not unit or len(word) % len(unit):
        return False
    return unit * (len(word) // len(unit)) == word


def clean_repeated_hindi_phrases(text: str) -> str:
    """Keep only Devanagari text and collapse stuttered word repetitions.

    Steps:
      1. Every character outside the Devanagari block (U+0900-U+097F) and the
         danda marks (U+0964, U+0965) is replaced by a space.
      2. A word is dropped when it repeats the previously kept word, either
         exactly ("जी जी") or glued together ("जी जीजी", "जीजी जी").

    Words are never split or matched by prefix, so ordinary words that contain
    a doubled letter ("सुनना") or merely share a prefix with their neighbour
    ("आवाज आ", "जी जीवन") are preserved. A genuine reduplicated word that
    directly follows its own half (e.g. "ना नाना") is still treated as a
    stutter; that trade-off keeps the "जी जीजी" handling.

    Args:
        text: Raw transcript. Non-string input yields "".

    Returns:
        The cleaned transcript with single spaces between words.
    """
    if not isinstance(text, str):
        return ""

    # Non-Devanagari characters and danda punctuation become word separators.
    text = re.sub(r"[^\u0900-\u097F\s]|[\u0964\u0965]", " ", text)

    cleaned = []
    for word in text.split():
        if cleaned:
            prev = cleaned[-1]
            if _is_repeat_of(word, prev) or _is_repeat_of(prev, word):
                continue
        cleaned.append(word)

    return " ".join(cleaned)

# Clean every transcript from the reloaded manifest, then keep only the rows
# whose cleaned text is non-empty.
manifest_df["text"] = df["text"].astype(str).map(clean_repeated_hindi_phrases)
_has_text = manifest_df["text"].str.strip().ne("")
manifest_df = manifest_df[_has_text]

In [None]:
# Quick look at the cleaned manifest: column names, row count, first ten rows.
print(manifest_df.columns, len(manifest_df), sep="\n")
manifest_df.head(10)

Index(['audio_path', 'text'], dtype='object')
104


Unnamed: 0,audio_path,text
0,/content/drive/MyDrive/hindi_asr_finetuning/Da...,अब काफी अच्छा होता है क्योंकि उनकी जनसंख्या बह...
1,/content/drive/MyDrive/hindi_asr_finetuning/Da...,जी । जी हां उधर हां जी हा बार था पहली बार थाक्...
2,/content/drive/MyDrive/hindi_asr_finetuning/Da...,लेकिन हम लोग इसे छुपछुप के लोगों के घर जाकर खे...
3,/content/drive/MyDrive/hindi_asr_finetuning/Da...,जी मेरे तो जैसे बहुत सारी यादे हैं कि मैं बताओ...
4,/content/drive/MyDrive/hindi_asr_finetuning/Da...,हां जी पहले बात करते हैं विवाह की तो इस मुवी म...
5,/content/drive/MyDrive/hindi_asr_finetuning/Da...,जी बिल्कुल अच्छा आ जी मैम वैसे मैं एक बेब सीरी...
6,/content/drive/MyDrive/hindi_asr_finetuning/Da...,हेलो हम्म हां जी आवाज रही है अब जी। बताइए आपकी...
7,/content/drive/MyDrive/hindi_asr_finetuning/Da...,नहीं ऐसी तो कोई कहानी नहीं बोल्ड आउटफिट को लेक...
8,/content/drive/MyDrive/hindi_asr_finetuning/Da...,हम सर लता मंगेशकर के गाना सुन ना पसंद करते हैं...
9,/content/drive/MyDrive/hindi_asr_finetuning/Da...,लता मंगेश्वर के गाने सुन ना पसंद करते हैं हां ...


In [None]:
# Write the cleaned manifest to its own CSV (without the pandas index) and
# report where it was saved. Note that this rebinds `manifest_csv`.
manifest_csv = "/content/drive/MyDrive/hindi_asr_finetuning/processed_manifest_clean.csv"
manifest_df.to_csv(manifest_csv, index=False)
print("Cleaned manifest saved:", manifest_csv)


Cleaned manifest saved: /content/drive/MyDrive/hindi_asr_finetuning/processed_manifest_clean.csv


In [None]:
# Convert the manifest to a Hugging Face Dataset. preserve_index=False keeps
# the pandas index out of the dataset: once rows have been filtered the index
# is no longer a plain range and would otherwise appear as an extra
# `__index_level_0__` column.
ds = Dataset.from_pandas(manifest_df, preserve_index=False)

# 90/10 train/eval split with a fixed seed so the split is reproducible.
split_ds = ds.train_test_split(test_size=0.1, seed=42)
train_ds, eval_ds = split_ds["train"], split_ds["test"]

print(train_ds.shape, eval_ds.shape)

(93, 2) (11, 2)


In [None]:
MODEL_NAME = "openai/whisper-small"
LANG_CODE = "hi"
# Set to False to fine-tune the encoder as well (slower, more memory).
FREEZE_ENCODER = True

# Processor (feature extractor + tokenizer) configured for Hindi transcription.
processor = WhisperProcessor.from_pretrained(MODEL_NAME, language=LANG_CODE, task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)

# Force the Hindi/transcribe prompt tokens at generation time and clear the
# list of suppressed tokens.
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language=LANG_CODE, task="transcribe")
model.config.suppress_tokens = []

# Freeze the encoder for faster fine-tuning: parameters with
# requires_grad=False receive no gradient and are not updated. The previous
# code set requires_grad=True here, which left the encoder fully trainable.
for param in model.model.encoder.parameters():
    param.requires_grad = not FREEZE_ENCODER


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

In [None]:
def preprocess_function(batch):
    """Turn one manifest row into Whisper model inputs.

    Loads the audio at ``batch["audio_path"]``, averages multiple channels into
    one, resamples to 16 kHz when the file uses another rate, and runs the
    processor's feature extractor. ``batch["text"]`` is tokenized (truncated to
    448 tokens) to produce the labels.

    Returns:
        dict with ``input_features`` (first item of the extractor output) and
        ``labels`` (token ids).
    """
    target_rate = 16000
    waveform, source_rate = torchaudio.load(batch["audio_path"])

    # More than one channel: average them, keeping the channel dimension.
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    if source_rate != target_rate:
        waveform = torchaudio.transforms.Resample(source_rate, target_rate)(waveform)

    samples = waveform.squeeze().numpy()

    # NOTE(review): the Whisper feature extractor presumably pads/truncates to
    # a fixed 30 s window by default - confirm clips are not longer than that,
    # otherwise the labels cover speech the model never sees.
    extracted = processor.feature_extractor(samples, sampling_rate=target_rate)
    token_ids = processor.tokenizer(batch["text"], truncation=True, max_length=448).input_ids

    return {"input_features": extracted.input_features[0], "labels": token_ids}

# Featurize both splits (train first, then eval), dropping the original
# columns so only the model inputs produced by preprocess_function remain.
train_ds, eval_ds = (
    split.map(preprocess_function, remove_columns=split.column_names)
    for split in (train_ds, eval_ds)
)

Map:   0%|          | 0/93 [00:00<?, ? examples/s]

  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Map:   0%|          | 0/11 [00:00<?, ? examples/s]

In [None]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Pad a list of preprocessed examples into one training batch.

    Audio features and label token ids are padded separately. Label padding is
    replaced by -100 so those positions are ignored by the loss, and a leading
    decoder-start token shared by every row is removed from the labels.

    Attributes are documented inline below.
    """

    # Processor exposing `.feature_extractor.pad` and `.tokenizer.pad`.
    processor: Any
    # Token id that starts every label sequence. When left as None it is looked
    # up from the tokenizer as "<|startoftranscript|>".
    decoder_start_token_id: Union[int, None] = None

    def __post_init__(self):
        if self.decoder_start_token_id is None:
            self.decoder_start_token_id = self.processor.tokenizer.convert_tokens_to_ids(
                "<|startoftranscript|>"
            )

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate ``features`` (dicts with "input_features" and "labels").

        Returns:
            The padded feature batch with an added "labels" tensor.
        """
        input_features = [{"input_features": f["input_features"]} for f in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        label_features = [{"input_ids": f["labels"]} for f in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
        # Padded positions (attention_mask != 1) must not contribute to the loss.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # The model prepends the decoder-start token itself when it builds the
        # decoder inputs from the labels, so strip it here when every row
        # starts with it. The tokenizer's bos token is also accepted, which is
        # what the earlier version of this check compared against; for Whisper
        # that is not the token the labels start with, so it never matched.
        if labels.shape[1] > 0:
            first_tokens = labels[:, 0]
            start_ids = (self.decoder_start_token_id, self.processor.tokenizer.bos_token_id)
            if any(tid is not None and bool((first_tokens == tid).all()) for tid in start_ids):
                labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

# Collator instance built around the Whisper processor loaded earlier.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [None]:
# Load the "wer" metric through the evaluate library.
wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    """Compute word error rate between generated and reference transcripts.

    Args:
        pred: Object with ``predictions`` (token ids, or a tuple whose first
            item holds them) and ``label_ids`` (token ids with -100 marking
            ignored positions).

    Returns:
        ``{"wer": value}`` as returned by the WER metric (a fraction, not a
        percentage). ``nan`` when no reference has any text after
        normalization.
    """
    pred_ids = pred.predictions
    label_ids = pred.label_ids
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    # -100 is a masking value, not a vocabulary id; replace it with the pad
    # token before decoding. Predictions are masked as well because they may
    # carry -100 padding too (presumably added when batches of different
    # length are concatenated - TODO confirm for the installed version).
    pad_id = processor.tokenizer.pad_token_id
    pred_ids[pred_ids == -100] = pad_id
    label_ids[label_ids == -100] = pad_id

    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    def normalize_hindi(s):
        # Same normalization for hypotheses and references: Devanagari only,
        # single spaces, then the repetition cleaner used on the training text.
        s = s.lower().strip()
        s = re.sub(r"[^\u0900-\u097F\s]", "", s)
        s = re.sub(r"\s+", " ", s).strip()
        s = clean_repeated_hindi_phrases(s)
        return s

    pred_str = [normalize_hindi(s) for s in pred_str]
    label_str = [normalize_hindi(s) for s in label_str]

    # WER is undefined for an empty reference, so score only pairs whose
    # reference still has text.
    scored = [(p, r) for p, r in zip(pred_str, label_str) if r]
    if not scored:
        return {"wer": float("nan")}

    wer = wer_metric.compute(
        predictions=[p for p, _ in scored],
        references=[r for _, r in scored],
    )
    return {"wer": wer}

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# Training configuration. Effective train batch size is
# per_device_train_batch_size * gradient_accumulation_steps = 4.
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    # eval_steps has no effect unless an evaluation strategy is set; without
    # this line no evaluation ran and compute_metrics was never called.
    eval_strategy="steps",
    save_strategy="steps",
    eval_steps=100,
    save_steps=100,
    logging_steps=25,
    learning_rate=1e-5,
    warmup_steps=100,
    max_steps=1000,
    predict_with_generate=True,
    generation_max_length=225,
    fp16=torch.cuda.is_available(),
    # Keep the checkpoint with the lowest eval WER instead of the last one.
    # Checkpoints and evaluations are both every 100 steps, so each checkpoint
    # has a score.
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=data_collator,
    processing_class=processor,
    compute_metrics=compute_metrics,
)


In [None]:
# Run fine-tuning, then save the resulting model under OUTPUT_DIR.
trainer.train()
trainer.save_model(OUTPUT_DIR)

Step,Training Loss
25,1.5485
50,1.4201
75,1.2801
100,1.1244
125,0.9847
150,0.892
175,0.8298
200,0.7243
225,0.6493
250,0.5816


