In [None]:
# Install or upgrade the third-party packages for this notebook (Colab-style shell install).
# NOTE(review): versions are unpinned, so a re-run may pull newer releases than the ones
# captured in the output below; jiwer is not used in the cells visible here.
!pip install -U openai-whisper jiwer librosa soundfile


Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/803.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.14.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading rapidfuzz-3.14.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m118.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding 

In [None]:
import numpy as np
import pandas as pd
import whisper
import os,math,re,io
import soundfile as sf
import librosa
from tqdm import tqdm

In [None]:
# Pickled pandas DataFrame with the recordings (read with pd.read_pickle further down).
DATASET_PKL = "/content/drive/MyDrive/joshtalk_content/ft_dataset.pkl"
# Excel workbook holding the disfluency lexicon (one column per disfluency category).
LEXICON_XLSX = "/content/drive/MyDrive/joshtalk_content/Speech Disfluencies List.xlsx"
# Root folder for everything this notebook writes (clips sub-folder and metadata CSV).
OUT_DIR = "/content/drive/MyDrive/joshtalk_content/disfluency_dataset"

In [None]:
# Read the disfluency lexicon workbook into a DataFrame.
lex_df = pd.read_excel(LEXICON_XLSX)

In [None]:
# Preview the first lexicon rows; each column is one disfluency category.
lex_df.head()

Unnamed: 0,Filled Pause,Repetition,False Start,Prolongation,Self-Correction
0,अं,मैं-मैं,जा—,अच्छ्छ्छा,कल—
1,अँ,वो-वो,कर—,हम्म्म,नहीं—
2,उम्,ये-ये,ले—,आाा,परसों—
3,,जी-जी,कह—,अरे रे रे,माफ़—
4,हम्,उह उह,वो तो,अह,नहीं तो हां तो ये


Unnamed: 0,audio,text,info
0,"[5.660588e-05, 0.000104444494, 6.564668e-05, 3...",अब काफी अच्छा होता है क्योंकि उनकी जनसंख्या बह...,"{'user_id': 245746, 'recording_id': 825780, 'l..."
1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",जी जी जी जी जी । जी जी जी हां उधर हां जी हा हा...,"{'user_id': 291038, 'recording_id': 825727, 'l..."
2,"[5.3611493e-05, 4.5976667e-05, 3.947672e-05, 6...",लेकिन हम लोग इसे छुपछुप के लोगों के घर जाकर खे...,"{'user_id': 246004, 'recording_id': 988596, 'l..."
3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",जी जी जी जी जी मेरे तो जैसे बहुत सारी यादे हैं...,"{'user_id': 93626, 'recording_id': 990175, 'la..."
4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",हां जी पहले बात करते हैं विवाह की तो इस मुवी म...,"{'user_id': 286851, 'recording_id': 526266, 'l..."


In [None]:
# Folder under OUT_DIR that receives the extracted disfluency audio clips.
CLIPS_DIR = os.path.join(OUT_DIR, "clips")
# CSV under OUT_DIR that receives one metadata row per extracted clip.
META_CSV = os.path.join(OUT_DIR, "metadata.csv")

In [None]:
# Name of the Whisper checkpoint passed to whisper.load_model.
WHISPER_SIZE = "small"
# Sample rate (Hz) that audio is resampled to and clips are written at.
TARGET_SR = 16000

In [None]:
# Create the clips folder (and any missing parents); a pre-existing folder is not an error.
os.makedirs(CLIPS_DIR, exist_ok=True)


In [None]:
# Every non-null lexicon column label is treated as a disfluency category.
categories = [col_name for col_name in lex_df.columns if pd.notna(col_name)]
# Maps category name -> compiled regex; populated by the lexicon-compilation loop.
disfluency_dict = dict()

In [None]:
# Compile one alternation regex per lexicon category into disfluency_dict.
#
# Entries made only of word/Devanagari characters must match as whole tokens.
# `\b` cannot be used for that: Devanagari vowel signs, virama and anusvara are
# combining marks, which Python's `\w` does not cover, so `\b` sees a "boundary"
# in the middle of a word and none after a word that ends in such a mark
# (e.g. r"\bहम्\b" misses a standalone "हम्" but fires inside "हम्म").
# Explicit lookarounds over the same character set the classifier uses avoid this.
_TOKEN_START = r"(?<![\w\u0900-\u097F])"
_TOKEN_END = r"(?![\w\u0900-\u097F])"

for cat in categories:
    # Flatten the column into clean, non-empty strings.
    values = lex_df[cat].dropna().astype(str).tolist()
    values = [v.strip() for v in values if v and v.strip()]

    patterns = []
    for v in values:
        v_esc = re.escape(v)
        if re.search(r"[^\w\u0900-\u097F]", v):
            # Entry contains a hyphen, dash, space, etc.: match it anywhere as a substring.
            patterns.append(v_esc)
        else:
            # Plain token: require that it is not embedded in a longer word.
            patterns.append(_TOKEN_START + v_esc + _TOKEN_END)

    # Categories with no usable entries get no regex at all.
    if patterns:
        disfluency_dict[cat] = re.compile("|".join(patterns))

In [None]:
# Sanity check: list the categories for which a regex was compiled.
print("Loaded disfluency lexicon categories:", list(disfluency_dict.keys()))


Loaded disfluency lexicon categories: ['Filled Pause', 'Repetition', 'False Start', 'Prolongation', 'Self-Correction']


In [None]:
def ensure_np_audio(x):
    """Coerce audio samples (list or array-like) to a float32 numpy array.

    Any input with more than one dimension is flattened to 1-D in row-major
    order; inputs that are already 0-D or 1-D keep their shape.
    """
    samples = np.asarray(x, dtype=np.float32)
    return samples.reshape(-1) if samples.ndim > 1 else samples


In [None]:
def resample_if_needed(y, orig_sr, target_sr=TARGET_SR):
    """Return `y` resampled from `orig_sr` to `target_sr`.

    When the two rates are equal, `y` is returned unchanged (same object).

    Args:
        y: audio samples as accepted by ``librosa.resample``.
        orig_sr: sample rate of `y` in Hz.
        target_sr: desired sample rate in Hz (defaults to TARGET_SR).
    """
    if orig_sr == target_sr:
        return y
    # The rates must be passed by keyword: they are keyword-only in librosa >= 0.10,
    # where the positional form raises TypeError.
    return librosa.resample(y, orig_sr=orig_sr, target_sr=target_sr)


In [None]:
def clean_text(t):
    """Normalize a transcript string.

    None becomes "". Any other value is converted with str(), the Hindi danda
    ("।") is replaced by a space, runs of whitespace collapse to a single
    space, and leading/trailing whitespace is removed.
    """
    if t is None:
        return ""
    without_danda = str(t).replace("।", " ")
    collapsed = re.sub(r"\s+", " ", without_danda)
    return collapsed.strip()

In [None]:
# Load the pickled dataset and report its size and columns.
# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.
print("Loading dataset:", DATASET_PKL)
dataset_df = pd.read_pickle(DATASET_PKL)
print("Dataset rows:", len(dataset_df))
display_cols = list(dataset_df.columns)
print("Columns in dataset:", display_cols)


Loading dataset: /content/drive/MyDrive/joshtalk_content/ft_dataset.pkl
Dataset rows: 104
Columns in dataset: ['audio', 'text', 'info']


In [None]:
# Load the Whisper checkpoint named by WHISPER_SIZE; the processing loop uses it to get
# timestamped segments for recordings that carry no segment information of their own.
print("Loading Whisper model:", WHISPER_SIZE)
whisper_model = whisper.load_model(WHISPER_SIZE)


Loading Whisper model: small


100%|███████████████████████████████████████| 461M/461M [00:11<00:00, 43.7MiB/s]


In [None]:
# Extract one audio clip per (segment, disfluency category) hit and collect metadata rows.
#
# For every dataset row: read the waveform, bring it to TARGET_SR, obtain timed segments
# (from the row, from info, or from Whisper), search each segment's text with the lexicon
# regexes, and write the matching segment audio to CLIPS_DIR.


def _normalize_segments(segments, recording_id):
    """Convert raw segments into dicts with segment_id / start / end / text.

    Accepts dict segments (keys start|start_time, end|end_time,
    text|transcript|utterance) or (start, end, text) sequences. Anything else,
    and any segment missing a start or end, is dropped. The segment index used
    in segment_id is the position in the raw list, so dropped items leave gaps.
    """
    normalized = []
    for si, seg in enumerate(segments):
        if isinstance(seg, dict):
            s = seg.get("start", seg.get("start_time", None))
            e = seg.get("end", seg.get("end_time", None))
            t = seg.get("text", seg.get("transcript", seg.get("utterance", "")))
        elif isinstance(seg, (list, tuple)) and len(seg) >= 3:
            s, e, t = seg[0], seg[1], seg[2]
        else:
            continue
        if s is None or e is None:
            continue
        normalized.append({
            "segment_id": f"{recording_id}_{si}",
            "start": float(s),
            "end": float(e),
            "text": clean_text(t),
        })
    return normalized


def _find_disfluencies(text):
    """Return [(category, matched_text), ...] with one entry per category found in `text`.

    matched_text holds that category's own distinct matches joined by " | ", so a
    label is never paired with text matched by a different category.
    """
    hits = []
    for cat, regex in disfluency_dict.items():
        matches = [m.group(0) for m in regex.finditer(text) if m.group(0)]
        if matches:
            hits.append((cat, " | ".join(dict.fromkeys(matches))))
    return hits


def _transcribe_segments(audio_np, sample_rate, row_idx):
    """Transcribe the waveform with Whisper via a temporary WAV and return its segments."""
    temp_wav = f"temp_record_{row_idx}.wav"
    try:
        # The WAV header carries the real sample rate of the array being written.
        sf.write(temp_wav, audio_np, sample_rate)
        res = whisper_model.transcribe(temp_wav, language="hi", word_timestamps=False)
        return res.get("segments", [])
    finally:
        if os.path.exists(temp_wav):
            os.remove(temp_wav)


metadata = []
clip_count = 0

for row_idx, row in tqdm(dataset_df.iterrows(), total=len(dataset_df), desc="Rows"):
    # --- identify the recording ---
    info = row.get("info", {})
    recording_id = None
    if isinstance(info, dict):
        recording_id = info.get("recording_id") or info.get("record_id") or info.get("user_id") or row_idx
    if recording_id is None:
        recording_id = row_idx

    # --- load the waveform ---
    audio_raw = row.get("audio", None)
    if audio_raw is None:
        continue
    audio_np = ensure_np_audio(audio_raw)

    # Sample rate from info when present, otherwise assume TARGET_SR.
    sample_rate = None
    if isinstance(info, dict):
        sample_rate = info.get("sampling_rate") or info.get("sr")
    if sample_rate is None:
        sample_rate = TARGET_SR

    if sample_rate != TARGET_SR:
        try:
            audio_np = resample_if_needed(audio_np, sample_rate, TARGET_SR)
            sample_rate = TARGET_SR
        except Exception as e:
            # Best effort: keep the un-resampled audio. sample_rate still holds the true
            # rate, so durations, slice indices and written files below stay consistent.
            print(f"Resample failed for row {row_idx}: {e}. Proceeding with orig_sr={sample_rate}")

    duration_s = len(audio_np) / sample_rate
    full_text = clean_text(row.get("text", "") or "")

    # --- obtain segments: row['segments'], then info['segments'], then Whisper ---
    # `row` is a pandas Series (from iterrows), so it is queried with .get rather than
    # tested with isinstance(row, dict), which would never be true.
    row_segments = row.get("segments", None)
    if isinstance(row_segments, (list, tuple, np.ndarray)) and len(row_segments) > 0:
        segments = list(row_segments)
    elif isinstance(info, dict) and info.get("segments"):
        segments = info.get("segments")
    else:
        segments = _transcribe_segments(audio_np, sample_rate, row_idx)

    if not segments:
        # No segmentation available: treat the whole recording as a single segment.
        segments = [{"id": f"{row_idx}_0", "start": 0.0, "end": duration_s, "text": full_text}]

    rows_before = len(metadata)

    # --- detect disfluencies per segment and cut the clips ---
    for seg in _normalize_segments(segments, recording_id):
        # A segment without text falls back to the recording-level transcript.
        seg_text = seg["text"] or full_text

        # Clamp the segment to the audio and skip empty spans.
        start_t = max(0.0, seg["start"])
        end_t = min(seg["end"], duration_s)
        if end_t <= start_t:
            continue
        sidx = int(math.floor(start_t * sample_rate))
        eidx = int(math.ceil(end_t * sample_rate))
        clip_audio = audio_np[sidx:eidx]
        if clip_audio.size == 0:
            continue

        # A separate word-by-word pass is unnecessary: any match inside a single token
        # is also found by searching the whole segment text.
        for label, match_text in _find_disfluencies(seg_text):
            clip_name = f"{recording_id}_seg{seg['segment_id']}_t{int(start_t*1000)}_{label.replace(' ','_')}_{clip_count}.wav"
            clip_path = os.path.join(CLIPS_DIR, clip_name)
            sf.write(clip_path, clip_audio, sample_rate)
            clip_count += 1

            metadata.append({
                "recording_id": recording_id,
                "segment_id": seg["segment_id"],
                "clip_id": clip_name,
                "label": label,
                "matched_text": match_text,
                "start_time": start_t,
                "end_time": end_t,
                "clip_path": clip_path,
                "original_text": full_text
            })

    # Checkpoint after each row that produced clips, so an interrupted run keeps
    # the metadata for clips that are already on disk.
    if len(metadata) > rows_before:
        pd.DataFrame(metadata).to_csv(META_CSV, index=False)


Rows:  52%|█████▏    | 54/104 [3:15:24<1:33:29, 112.19s/it]

In [None]:
# Persist the collected clip metadata (one row per extracted clip) and report where
# the outputs were written.
meta_df = pd.DataFrame(metadata)
meta_df.to_csv(META_CSV, index=False)
print(f"\nDone. Extracted {len(meta_df)} disfluency clips.")
print("Clips folder:", CLIPS_DIR)
print("Metadata CSV:", META_CSV)