In [1]:
import torch
print(torch.cuda.is_available(), torch.cuda.get_device_name(0) if torch.cuda.is_available() else None)


False None


In [None]:
!pip install -q transformers datasets jiwer soundfile accelerate


In [None]:
import os
import re
import csv
import time
import soundfile as sf
import numpy as np
from datasets import load_dataset, Audio, Features, Value
from transformers import pipeline
import torch
from jiwer import wer

# ---------------------------------------------
# Text Normalization (Hindi-friendly)
# ---------------------------------------------
def normalize_text_hindi(t: str) -> str:
    if t is None:
        t = ""
    t = t.strip()

    # Remove punctuation + Hindi danda
    t = re.sub(r"[।,.\-?!”\"'`:;()\[\]{}…]", "", t)

    # Collapse multiple spaces
    t = re.sub(r"\s+", " ", t)

    return t.lower()


# ---------------------------------------------
# Safe audio loader (avoids torchcodec)
# ---------------------------------------------
def read_audio_from_path(path):
    audio, sr = sf.read(path)

    # Convert stereo → mono
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    # Convert int WAV → float32 [-1,1]
    if np.issubdtype(audio.dtype, np.integer):
        maxv = float(2 ** (8 * audio.dtype.itemsize - 1))
        audio = audio.astype("float32") / maxv

    return audio.astype("float32"), sr


In [None]:
# Safe Colab cell: load FLEURS Hindi test split via parquet and cast audio->Audio(decode=False)
from datasets import load_dataset, Audio
import os
import shutil
import traceback

PARQUET_URI = "hf://datasets/google/fleurs@refs%2Fconvert%2Fparquet/hi_in/test/*.parquet"

def try_load_and_cast(parquet_uri):
    print("Attempting to load FLEURS test split from parquet URI:")
    print(" ", parquet_uri)
    ds = load_dataset("parquet", data_files={"test": parquet_uri})
    fleurs_test = ds["test"]
    print("Loaded dataset. Raw features:\n", fleurs_test.features)
    # Cast the audio column to metadata-only (path) to avoid torchcodec
    fleurs_test = fleurs_test.cast_column("audio", Audio(decode=False))
    print("Casted 'audio' column to Audio(decode=False). Dataset length:", len(fleurs_test))
    # show a few audio path examples
    for i in range(min(5, len(fleurs_test))):
        ainfo = fleurs_test[i]["audio"]
        if isinstance(ainfo, dict):
            print(f"{i} audio.path ->", ainfo.get("path"))
        else:
            print(f"{i} audio entry ->", ainfo)
    return fleurs_test

# Try once, if it fails attempt to clear local cache for this dataset and retry once
try:
    fleurs_test = try_load_and_cast(PARQUET_URI)
except Exception as e:
    print("\nFirst load attempt failed with exception:")
    traceback.print_exc()
    # Clear possible local cache for the google/fleurs parquet snapshot and retry
    cache_root = os.path.expanduser("~/.cache/huggingface/hub")
    print("\nAttempting to clear local HF hub cache (this may help if the cached shards are corrupted).")
    # Be conservative: remove only matching fleurs directories if present
    removed_any = False
    for root, dirs, files in os.walk(cache_root):
        for d in dirs:
            if "fleurs" in d.lower() or "datasets--google--fleurs" in d.lower():
                full = os.path.join(root, d)
                try:
                    shutil.rmtree(full)
                    print("Removed cache dir:", full)
                    removed_any = True
                except Exception as e2:
                    print("Failed to remove", full, ":", e2)
    if not removed_any:
        print("No fleurs cache dir found under", cache_root, "- skipping cache cleanup.")
    print("\nRetrying load once more...")
    try:
        fleurs_test = try_load_and_cast(PARQUET_URI)
    except Exception:
        print("\nSecond attempt failed. Please restart the Colab runtime (Runtime → Restart runtime) and run this cell again.")
        raise


Attempting to load FLEURS test split from parquet URI:
  hf://datasets/google/fleurs@refs%2Fconvert%2Fparquet/hi_in/test/*.parquet
Loaded dataset. Raw features:
 {'id': Value('int32'), 'num_samples': Value('int32'), 'path': Value('string'), 'audio': Audio(sampling_rate=16000, decode=True, stream_index=None), 'transcription': Value('string'), 'raw_transcription': Value('string'), 'gender': ClassLabel(names=['male', 'female', 'other']), 'lang_id': ClassLabel(names=['af_za', 'am_et', 'ar_eg', 'as_in', 'ast_es', 'az_az', 'be_by', 'bg_bg', 'bn_in', 'bs_ba', 'ca_es', 'ceb_ph', 'ckb_iq', 'cmn_hans_cn', 'cs_cz', 'cy_gb', 'da_dk', 'de_de', 'el_gr', 'en_us', 'es_419', 'et_ee', 'fa_ir', 'ff_sn', 'fi_fi', 'fil_ph', 'fr_fr', 'ga_ie', 'gl_es', 'gu_in', 'ha_ng', 'he_il', 'hi_in', 'hr_hr', 'hu_hu', 'hy_am', 'id_id', 'ig_ng', 'is_is', 'it_it', 'ja_jp', 'jv_id', 'ka_ge', 'kam_ke', 'kea_cv', 'kk_kz', 'km_kh', 'kn_in', 'ko_kr', 'ky_kg', 'lb_lu', 'lg_ug', 'ln_cd', 'lo_la', 'lt_lt', 'luo_ke', 'lv_lv', 'mi_n

In [None]:
device = 0 if torch.cuda.is_available() else -1
print("Using device:", "GPU" if device == 0 else "CPU")

print("Loading openai/whisper-small...")
pipe_base = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    chunk_length_s=30,
    device=device
)

print("✓ Whisper-small loaded!")


Using device: GPU
Loading openai/whisper-small...


Device set to use cuda:0


✓ Whisper-small loaded!


In [None]:
import os, glob, tempfile, shutil
import time

base_predictions = []
references = []
paths = []
failed_idx = []

n = len(fleurs_test)
print(f"Starting inference on {n} samples...")

start = time.time()

# Helpers to locate audio
def find_in_hf_cache(filename):
    """Search common huggingface cache dirs for filename and return full path or None."""
    cache_roots = [
        os.path.expanduser("~/.cache/huggingface/hub"),
        os.path.expanduser("~/.cache/huggingface/datasets"),
        os.path.expanduser("~/.cache/huggingface/modules"),
    ]
    for root in cache_roots:
        if not os.path.exists(root):
            continue
        # glob search (could be heavy but filenames are short) — stop early on first match
        for path in glob.iglob(os.path.join(root, "**", filename), recursive=True):
            if os.path.isfile(path):
                return path
    return None

# main loop with robust audio resolution
for idx in range(n):
    if idx % 50 == 0:
        print(f"Progress: {idx}/{n} | Elapsed: {(time.time()-start)/60:.1f} min")

    sample = fleurs_test[idx]
    audio_info = sample.get("audio", {})

    audio_path = None
    audio_temp_written = None

    # 1) Try the path field directly
    if isinstance(audio_info, dict):
        audio_path = audio_info.get("path", None)
    elif isinstance(audio_info, str) and audio_info:
        audio_path = audio_info

    # 2) If audio_path doesn't exist on disk, try to locate it in HF cache
    if audio_path:
        # If it's not absolute, make an absolute candidate
        if not os.path.isabs(audio_path):
            # Sometimes path is just a basename like '12345.wav'
            candidate = os.path.abspath(audio_path)
            if os.path.exists(candidate):
                audio_path = candidate
            else:
                # search cache
                basename = os.path.basename(audio_path)
                found = find_in_hf_cache(basename)
                if found:
                    audio_path = found
                else:
                    # also try if the path itself (as given) exists relative to cache root
                    found2 = find_in_hf_cache(audio_path)
                    if found2:
                        audio_path = found2
                    else:
                        # mark as not found for now (we will try bytes fallback below)
                        audio_path = None

    # 3) If path is still None, but audio bytes present, write temp file
    if audio_path is None and isinstance(audio_info, dict) and audio_info.get("bytes", None):
        try:
            b = audio_info["bytes"]
            # bytes may be a Python bytes object or memoryview; normalize
            if isinstance(b, memoryview):
                b = b.tobytes()
            if not isinstance(b, (bytes, bytearray)):
                # may be returned as a list of ints — convert
                try:
                    b = bytes(b)
                except Exception:
                    b = None
            if b:
                tmpfd, tmpname = tempfile.mkstemp(suffix=".wav", prefix="fleurs_")
                os.close(tmpfd)
                with open(tmpname, "wb") as fh:
                    fh.write(b)
                audio_temp_written = tmpname
                audio_path = tmpname
        except Exception as e:
            print(f"Warning: failed to write audio bytes for idx={idx}: {e}")
            audio_path = None

    # 4) If we still don't have an audio_path, skip sample
    if not audio_path or not os.path.exists(audio_path):
        print(f"Skipping idx={idx}: audio file not found (original audio.path maybe '{audio_info}').")
        base_predictions.append("")   # keep alignment
        references.append(sample.get("transcription", "") or "")
        paths.append(None)
        failed_idx.append(idx)
        # cleanup temp if any
        if audio_temp_written and os.path.exists(audio_temp_written):
            os.remove(audio_temp_written)
        continue

    # 5) Read audio safely and run pipeline
    try:
        audio_array, sr = read_audio_from_path(audio_path)
    except Exception as e:
        print(f"Warning: soundfile failed to read idx={idx}, path={audio_path}: {e}")
        base_predictions.append("")
        references.append(sample.get("transcription", "") or "")
        paths.append(audio_path)
        failed_idx.append(idx)
        if audio_temp_written and os.path.exists(audio_temp_written):
            os.remove(audio_temp_written)
        continue

    # 6) Run model inference
    try:
        out = pipe_base(audio_array)
        text = out.get("text", "") if isinstance(out, dict) else str(out)
    except Exception as e:
        print(f"Pipeline error idx={idx}: {e}")
        text = ""

    base_predictions.append(text)
    references.append(sample.get("transcription", "") or "")
    paths.append(audio_path)

    # cleanup temp file if we wrote one
    if audio_temp_written and os.path.exists(audio_temp_written):
        os.remove(audio_temp_written)

elapsed_total = (time.time()-start)/60.0
print(f"\n✓ Inference finished. Processed {len(base_predictions)} samples in {elapsed_total:.1f} minutes.")
print(f"Skipped/failed samples: {len(failed_idx)}; indices: {failed_idx[:20]}")


Starting inference on 418 samples...
Progress: 0/418 | Elapsed: 0.0 min


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Progress: 50/418 | Elapsed: 4.0 min
Progress: 100/418 | Elapsed: 6.9 min
Progress: 150/418 | Elapsed: 10.0 min
Progress: 200/418 | Elapsed: 13.3 min
Progress: 250/418 | Elapsed: 16.0 min
Progress: 300/418 | Elapsed: 18.8 min
Progress: 350/418 | Elapsed: 21.7 min
Progress: 400/418 | Elapsed: 24.1 min

✓ Inference finished. Processed 418 samples in 25.0 minutes.
Skipped/failed samples: 0; indices: []


In [None]:
# Cell 6 (Corrected)
print("Normalizing texts and computing WER...")

refs_norm = [normalize_text_hindi(t) for t in references]
base_norm = [normalize_text_hindi(t) for t in base_predictions]   # FIXED

# Calculate WER using jiwer.wer (returns float fraction)
wer_base = wer(refs_norm, base_norm)

# Print summary table
print("\n" + "="*60)
print("Results on FLEURS Hindi test split")
print("="*60)
print(f"Model: openai/whisper-small (pretrained baseline)")
print(f"  WER (fraction): {wer_base:.6f}")
print(f"  WER (percent) : {wer_base*100:.2f}%")
print("="*60)

# Save detailed CSV: idx, path, reference, prediction, ref_norm, pred_norm
out_csv = "fleurs_whisper_small_results.csv"
with open(out_csv, "w", newline="", encoding="utf-8") as fh:
    writer = csv.writer(fh)
    writer.writerow(["idx","audio_path","reference_raw","prediction_raw","reference_norm","prediction_norm"])
    for i, (p, r, ap) in enumerate(zip(base_predictions, references, paths)):
        writer.writerow([i, ap or "", r, p, refs_norm[i], base_norm[i]])
print("Wrote detailed results to", out_csv)


Normalizing texts and computing WER...

Results on FLEURS Hindi test split
Model: openai/whisper-small (pretrained baseline)
  WER (fraction): 0.660154
  WER (percent) : 66.02%
Wrote detailed results to fleurs_whisper_small_results.csv


In [None]:
print("Normalizing texts & computing WER...")

refs_norm = [normalize_text_hindi(t) for t in references]
preds_norm = [normalize_text_hindi(t) for t in base_predictions]

wer_value = wer(refs_norm, preds_norm)

print("\n" + "="*60)
print("WHISPER-SMALL BASELINE — FLEURS HINDI TEST")
print("="*60)
print(f"WER (fraction): {wer_value:.5f}")
print(f"WER (percent) : {wer_value*100:.2f}%")
print("="*60)


Normalizing texts & computing WER...

WHISPER-SMALL BASELINE — FLEURS HINDI TEST
WER (fraction): 0.66015
WER (percent) : 66.02%


In [None]:
csv_path = "whisper_small_fleurs_hi_results.csv"

with open(csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["idx","audio_path","reference","prediction","ref_norm","pred_norm"])
    for i in range(n):
        writer.writerow([
            i,
            paths[i],
            references[i],
            base_predictions[i],
            refs_norm[i],
            preds_norm[i]
        ])

print("Saved CSV →", csv_path)


Saved CSV → whisper_small_fleurs_hi_results.csv


In [None]:
for i in range(5):
    print(f"\nSample {i}")
    print("Audio:", paths[i])
    print("Reference:", references[i])
    print("Prediction:", base_predictions[i])
    print("Normalized ref:", refs_norm[i])
    print("Normalized pred:", preds_norm[i])
    print("-"*60)



Sample 0
Audio: /tmp/fleurs_nvs9zw0l.wav
Reference: कुछ अणुओं में अस्थिर केंद्रक होता है जिसका मतलब यह है कि उनमें थोड़े या बिना किसी झटके से टूटने की प्रवृत्ति होती है
Prediction:  कुछ आँगो में आज्टर केंद्रक होता है, जिस का मतलब यहां की उन में थोडे या बिना किसी जटके से तुटनें की प्रवत्ती होती है.
Normalized ref: कुछ अणुओं में अस्थिर केंद्रक होता है जिसका मतलब यह है कि उनमें थोड़े या बिना किसी झटके से टूटने की प्रवृत्ति होती है
Normalized pred: कुछ आँगो में आज्टर केंद्रक होता है जिस का मतलब यहां की उन में थोडे या बिना किसी जटके से तुटनें की प्रवत्ती होती है
------------------------------------------------------------

Sample 1
Audio: /tmp/fleurs_i9885jm5.wav
Reference: ग्रीनलैंड को बहुत कम जगह बसाया गया था नॉर्स सगास में वे कहते हैं कि एरिक रेड हत्या के लिए आइसलैंड से निर्वासित किया गया था और आगे पश्चिम की यात्रा करते समय ग्रीनलैंड मिला जिसे ग्रीनलैंड नाम दिया गया
Prediction:  ग्रीन लेंड को बहुत कुम जगर बसाया गया ता नोर्ष सगास में भे कहते हैं कि एरेक रेड रेद हत्या के लिए आइस लेंड से न

## minimal optimization

In [None]:
import pandas as pd

In [None]:
df=pd.read_csv('/content/FT Data - data.csv')

In [None]:
df

Unnamed: 0,user_id,recording_id,language,duration,rec_url_gcp,transcription_url_gcp,metadata_url_gcp
0,245746,825780,hi,443,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...
1,291038,825727,hi,443,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...
2,246004,988596,hi,475,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...
3,93626,990175,hi,475,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...
4,286851,526266,hi,522,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...
...,...,...,...,...,...,...,...
99,278010,753435,hi,589,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...
100,413240,1021370,hi,1194,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...
101,11057,1020918,hi,1194,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...
102,93299,840793,hi,1146,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...


In [None]:
import pandas as pd
import re

# assuming your CSV is called 'josh_data.csv'
df = pd.read_csv("/content/FT Data - data.csv")

def convert_url(old_url):
    """
    Convert old JoshTalks GCP URLs to new upload_goai bucket format.
    Example input:
        https://storage.googleapis.com/joshtalks-data-collection/hq_data/hi/967179/825780_audio.wav
    Output:
        https://storage.googleapis.com/upload_goai/967179/825780_audio.wav
    """
    # Extract "<folder>/<id_filename>" part
    # Example: "967179/825780_audio.wav"
    match = re.search(r"/hi/(\d+/\d+_.+)$", old_url)
    if not match:
        return old_url  # fallback, in case the format is unexpected

    tail = match.group(1)  # e.g. "967179/825780_audio.wav"
    new_url = f"https://storage.googleapis.com/upload_goai/{tail}"
    return new_url

# Apply for all three URL columns
df["rec_url_gcp"] = df["rec_url_gcp"].apply(convert_url)
df["transcription_url_gcp"] = df["transcription_url_gcp"].apply(convert_url)
df["metadata_url_gcp"] = df["metadata_url_gcp"].apply(convert_url)

# Save cleaned dataset
df.to_csv("josh_data_cleaned.csv", index=False)
print("✓ Cleaned URLs and saved to josh_data_cleaned.csv")

# Display preview
df.head()


✓ Cleaned URLs and saved to josh_data_cleaned.csv


Unnamed: 0,user_id,recording_id,language,duration,rec_url_gcp,transcription_url_gcp,metadata_url_gcp
0,245746,825780,hi,443,https://storage.googleapis.com/upload_goai/967...,https://storage.googleapis.com/upload_goai/967...,https://storage.googleapis.com/upload_goai/967...
1,291038,825727,hi,443,https://storage.googleapis.com/upload_goai/967...,https://storage.googleapis.com/upload_goai/967...,https://storage.googleapis.com/upload_goai/967...
2,246004,988596,hi,475,https://storage.googleapis.com/upload_goai/114...,https://storage.googleapis.com/upload_goai/114...,https://storage.googleapis.com/upload_goai/114...
3,93626,990175,hi,475,https://storage.googleapis.com/upload_goai/114...,https://storage.googleapis.com/upload_goai/114...,https://storage.googleapis.com/upload_goai/114...
4,286851,526266,hi,522,https://storage.googleapis.com/upload_goai/639...,https://storage.googleapis.com/upload_goai/639...,https://storage.googleapis.com/upload_goai/639...
