In [None]:
# --- Mount Drive
from google.colab import drive
drive.mount('/content/drive')

# --- Point to your Parquet folder (adjust if needed)
PARQUET_DIR = "/content/drive/MyDrive/problem_statement_6/training_data/english"

import os, json, itertools
import pyarrow.parquet as pq
import pandas as pd

# Find parquet files
parquet_files = [os.path.join(PARQUET_DIR, f) for f in os.listdir(PARQUET_DIR) if f.endswith(".parquet")]
assert parquet_files, f"No .parquet files found in {PARQUET_DIR}"
print(f"Found {len(parquet_files)} parquet files.")
for f in parquet_files[:5]:
    print("•", os.path.basename(f))

def show_schema_and_samples(pq_path, n=5):
    print("\n=== FILE:", os.path.basename(pq_path), "===")
    pf = pq.ParquetFile(pq_path)
    print("\n--- Arrow Schema ---")
    print(pf.schema)

    # Read first n rows as pandas for easy viewing
    df = pf.read_row_groups([0], columns=None).to_pandas()
    print("\n--- Columns ---")
    print(list(df.columns))
    print("\n--- dtypes ---")
    print(df.dtypes)

    # Show a few rows (truncate long fields)
    def trunc(x):
        s = str(x)
        return s if len(s) < 200 else s[:200] + " ...[truncated]..."
    print("\n--- Sample rows ---")
    for i in range(min(n, len(df))):
        row = {c: trunc(df.iloc[i][c]) for c in df.columns}
        print(json.dumps(row, ensure_ascii=False))

# Probe first file (repeat if partitions differ)
show_schema_and_samples(parquet_files[0], n=5)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Found 3 parquet files.
• test-00001-of-00003.parquet
• test-00002-of-00003.parquet
• test-00000-of-00003.parquet

=== FILE: test-00001-of-00003.parquet ===

--- Arrow Schema ---
<pyarrow._parquet.ParquetSchema object at 0x7a73cc71e540>
required group field_id=-1 schema {
  optional group field_id=-1 audio_filepath {
    optional binary field_id=-1 bytes;
    optional binary field_id=-1 path (String);
  }
  optional double field_id=-1 duration;
  optional binary field_id=-1 text (String);
  optional binary field_id=-1 gender (String);
  optional binary field_id=-1 age-group (String);
  optional binary field_id=-1 primary_language (String);
  optional binary field_id=-1 native_place_state (String);
  optional binary field_id=-1 native_place_district (String);
  optional binary field_id=-1 highest_qualification (String);
  optional binary field_id=-1 job_categor

In [None]:
!pip install soundfile pyarrow pandas

import os, json, hashlib
import pandas as pd
import numpy as np
import soundfile as sf
import io
import pyarrow.parquet as pq

# Paths
PARQUET_DIR = "/content/drive/MyDrive/problem_statement_6/training_data/english"
OUT_DIR = "/content/drive/MyDrive/problem_statement_6/training_data/english_wav"
os.makedirs(OUT_DIR, exist_ok=True)

manifest_rows = []

# Iterate over all parquet files
parquet_files = [os.path.join(PARQUET_DIR, f) for f in os.listdir(PARQUET_DIR) if f.endswith(".parquet")]
for fidx, pq_path in enumerate(parquet_files):
    print(f"\nProcessing {fidx+1}/{len(parquet_files)}: {os.path.basename(pq_path)}")
    df = pd.read_parquet(pq_path)

    for i, row in df.iterrows():
        try:
            # audio_filepath is a dict with "bytes" (raw WAV)
            audio_dict = row["audio_filepath"]
            if not isinstance(audio_dict, dict) or "bytes" not in audio_dict:
                continue

            wav_bytes = audio_dict["bytes"]
            # Decode WAV bytes
            audio_data, sr = sf.read(io.BytesIO(wav_bytes), dtype="float32", always_2d=False)

            # Flatten stereo to mono if needed
            if audio_data.ndim == 2:
                audio_data = audio_data.mean(axis=1)

            # Build a deterministic filename
            uid = hashlib.md5(f"{os.path.basename(pq_path)}::{i}".encode("utf-8")).hexdigest()[:12]
            out_wav = os.path.join(OUT_DIR, f"iitm_{uid}.wav")

            # Save .wav
            sf.write(out_wav, audio_data, sr)

            # Collect metadata for manifest
            manifest_rows.append({
                "audio_file": out_wav,
                "duration": row.get("duration", ""),
                "transcript": row.get("text", ""),
                "gender": row.get("gender", ""),
                "age_group": row.get("age-group", ""),
                "primary_language": row.get("primary_language", ""),
                "native_place_state": row.get("native_place_state", ""),
                "native_place_district": row.get("native_place_district", ""),
                "highest_qualification": row.get("highest_qualification", ""),
                "job_category": row.get("job_category", ""),
                "occupation_domain": row.get("occupation_domain", "")
            })

        except Exception as e:
            print(f"  ! Error row {i}: {e}")

# Save manifest CSV
manifest_path = os.path.join(OUT_DIR, "manifest.csv")
pd.DataFrame(manifest_rows).to_csv(manifest_path, index=False, encoding="utf-8")
print("\n✅ Done. Wrote", len(manifest_rows), "rows to", manifest_path)



Processing 1/3: test-00001-of-00003.parquet

Processing 2/3: test-00002-of-00003.parquet

Processing 3/3: test-00000-of-00003.parquet

✅ Done. Wrote 6656 rows to /content/drive/MyDrive/problem_statement_6/training_data/english_wav/manifest.csv


In [None]:
import os

pa_dir = "/content/drive/MyDrive/problem_statement_6/training_data/commonvoice/commonvoice/cv-corpus-11.0-2022-09-21/pa-IN"
hi_dir = "/content/drive/MyDrive/problem_statement_6/training_data/commonvoice/commonvoice/cv-corpus-11.0-2022-09-21/hi"

print("Punjabi folder contents:")
print(os.listdir(pa_dir))

print("\nHindi folder contents:")
print(os.listdir(hi_dir))


Punjabi folder contents:
['test_manifest.json', 'clips_16k']

Hindi folder contents:
['test_manifest.json', 'clips_16k']


In [None]:
# --- Config: set your two folders
PA_DIR = "/content/drive/MyDrive/problem_statement_6/training_data/commonvoice/commonvoice/cv-corpus-11.0-2022-09-21/pa-IN"
HI_DIR = "/content/drive/MyDrive/problem_statement_6/training_data/commonvoice/commonvoice/cv-corpus-11.0-2022-09-21/hi"

import os, json, pandas as pd

def peek_manifest(manifest_path, n=5):
    print(f"\nPreview: {manifest_path}")
    with open(manifest_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= n: break
            print(line.strip())

def load_manifest_to_rows(lang_dir, lang_label, manifest_basename="test_manifest.json"):
    """
    Reads JSON-lines manifest and returns a list of rows with absolute audio paths.
    Expected keys (common in CV prepared exports):
      - audio_filepath (relative path like 'clips_16k/xxx.wav')
      - duration (seconds)
      - text (transcript)
      - optional: client_id, gender, age, accent, locale
    """
    mpath = os.path.join(lang_dir, manifest_basename)
    assert os.path.exists(mpath), f"Manifest not found: {mpath}"

    rows, missing = [], 0
    with open(mpath, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            rec = json.loads(line)
            rel = rec.get("audio_filepath") or rec.get("path")  # fallback key
            if not rel:
                continue
            # Build absolute path
            wav_abs = os.path.join(lang_dir, rel)
            if not os.path.isabs(wav_abs):
                wav_abs = os.path.abspath(wav_abs)
            exists = os.path.exists(wav_abs)
            if not exists:
                missing += 1

            rows.append({
                "audio_file": wav_abs,
                "exists": exists,
                "duration": rec.get("duration", ""),
                "transcript": rec.get("text", ""),
                "language": lang_label,
                "client_id": rec.get("client_id", ""),
                "gender": rec.get("gender", ""),
                "age": rec.get("age", ""),
                "accent": rec.get("accent", ""),
                "locale": rec.get("locale", ""),
                "source": f"common_voice_11:{lang_label}"
            })
    print(f"{lang_label}: parsed {len(rows)} rows | missing files: {missing}")
    return rows

# 1) Peek the first few lines to confirm schema
peek_manifest(os.path.join(PA_DIR, "test_manifest.json"))
peek_manifest(os.path.join(HI_DIR, "test_manifest.json"))

# 2) Convert to CSV manifests
pa_rows = load_manifest_to_rows(PA_DIR, lang_label="punjabi", manifest_basename="test_manifest.json")
hi_rows = load_manifest_to_rows(HI_DIR, lang_label="hindi",   manifest_basename="test_manifest.json")

pa_df = pd.DataFrame(pa_rows)
hi_df = pd.DataFrame(hi_rows)

pa_csv = os.path.join(PA_DIR, "manifest_test.csv")
hi_csv = os.path.join(HI_DIR, "manifest_test.csv")

# Keep only rows whose audio exists
pa_df = pa_df[pa_df["exists"]].drop(columns=["exists"])
hi_df = hi_df[hi_df["exists"]].drop(columns=["exists"])

pa_df.to_csv(pa_csv, index=False, encoding="utf-8")
hi_df.to_csv(hi_csv, index=False, encoding="utf-8")

print("\n✅ Wrote:")
print(" -", pa_csv, f"({len(pa_df)} rows)")
print(" -", hi_csv, f"({len(hi_df)} rows)")



Preview: /content/drive/MyDrive/problem_statement_6/training_data/commonvoice/commonvoice/cv-corpus-11.0-2022-09-21/pa-IN/test_manifest.json
{"audio_filepath": "/nlsasfs/home/ai4bharat/ai4bharat-pr/speechteam/asr_datasets/commonvoice/cv-corpus-11.0-2022-09-21/pa-IN/clips_16k/common_voice_pa-IN_25183731.wav", "duration": 3.924, "text": "\u0a26\u0a47\u0a16\u0a26\u0a47 \u0a26\u0a47\u0a16\u0a26\u0a47 \u0a35\u0a3f\u0a39\u0a5c\u0a3e \u0a32\u0a4b\u0a15\u0a3e\u0a02 \u0a28\u0a3e\u0a32 \u0a2d\u0a30 \u0a17\u0a3f\u0a06"}
{"audio_filepath": "/nlsasfs/home/ai4bharat/ai4bharat-pr/speechteam/asr_datasets/commonvoice/cv-corpus-11.0-2022-09-21/pa-IN/clips_16k/common_voice_pa-IN_24666519.wav", "duration": 3.96, "text": "\u0a2a\u0a3e\u0a23\u0a40 \u0a09\u0a2c\u0a3e\u0a32 \u0a15\u0a47 \u0a39\u0a40 \u0a2a\u0a40\u0a23\u0a3e \u0a1a\u0a3e\u0a39\u0a40\u0a26\u0a3e \u0a39\u0a48"}
{"audio_filepath": "/nlsasfs/home/ai4bharat/ai4bharat-pr/speechteam/asr_datasets/commonvoice/cv-corpus-11.0-2022-09-21/pa-IN/clips_16k/

In [None]:
import os, json, pandas as pd

# Folders you showed
PA_DIR = "/content/drive/MyDrive/problem_statement_6/training_data/commonvoice/commonvoice/cv-corpus-11.0-2022-09-21/pa-IN"
HI_DIR = "/content/drive/MyDrive/problem_statement_6/training_data/commonvoice/commonvoice/cv-corpus-11.0-2022-09-21/hi"

def fix_manifest(lang_dir, lang_label, in_name="test_manifest.json", out_name="manifest_test.csv"):
    manifest_path = os.path.join(lang_dir, in_name)
    clips_dir = os.path.join(lang_dir, "clips_16k")

    # Index local files by basename for fast lookup
    local_files = {}
    for fn in os.listdir(clips_dir):
        if fn.lower().endswith((".wav", ".mp3", ".flac", ".ogg")):
            local_files[fn] = os.path.join(clips_dir, fn)

    rows, missing = [], []
    with open(manifest_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            rec = json.loads(line)
            # Get the remote path and collapse to basename
            remote = rec.get("audio_filepath") or rec.get("path")
            if not remote:
                continue
            base = os.path.basename(remote)  # e.g., common_voice_hi_31389203.wav

            # Map to local path
            wav_abs = local_files.get(base)
            if not wav_abs:
                missing.append(base)
                continue

            rows.append({
                "audio_file": wav_abs,
                "duration": rec.get("duration", ""),
                "transcript": rec.get("text", "").strip(),
                "language": lang_label,
                "client_id": rec.get("client_id", ""),
                "gender": rec.get("gender", ""),
                "age": rec.get("age", ""),
                "accent": rec.get("accent", ""),
                "locale": rec.get("locale", ""),
                "source": f"common_voice_11:{lang_label}"
            })

    df = pd.DataFrame(rows)
    out_csv = os.path.join(lang_dir, out_name)
    df.to_csv(out_csv, index=False, encoding="utf-8")

    print(f"{lang_label}: kept {len(df)} rows; missing matches: {len(missing)}")
    if missing[:5]:
        print("  sample missing basenames:", missing[:5])
    print("  wrote:", out_csv)
    return df

pa_df = fix_manifest(PA_DIR, "punjabi")
hi_df = fix_manifest(HI_DIR, "hindi")


punjabi: kept 171 rows; missing matches: 0
  wrote: /content/drive/MyDrive/problem_statement_6/training_data/commonvoice/commonvoice/cv-corpus-11.0-2022-09-21/pa-IN/manifest_test.csv
hindi: kept 1727 rows; missing matches: 0
  wrote: /content/drive/MyDrive/problem_statement_6/training_data/commonvoice/commonvoice/cv-corpus-11.0-2022-09-21/hi/manifest_test.csv


In [None]:
import os, pandas as pd

# Paths (adjust english_manifest if your IIT-M export lives elsewhere)
pa_csv = "/content/drive/MyDrive/problem_statement_6/training_data/commonvoice/commonvoice/cv-corpus-11.0-2022-09-21/pa-IN/manifest_test.csv"
hi_csv = "/content/drive/MyDrive/problem_statement_6/training_data/commonvoice/commonvoice/cv-corpus-11.0-2022-09-21/hi/manifest_test.csv"
english_manifest = "/content/drive/MyDrive/problem_statement_6/training_data/english_wav/manifest.csv"  # from your IIT-M parquet conversion step
out_merged = "/content/drive/MyDrive/problem_statement_6/training_data/foundational_manifest.csv"

dfs = []
for p in [english_manifest, hi_csv, pa_csv]:
    if not os.path.exists(p):
        print("⚠️ Missing:", p)
        continue
    df = pd.read_csv(p)
    # Harmonize columns
    rename_map = {
        "text":"transcript",
        "age-group":"speaker_age_group",
        "age":"speaker_age_group",
        "gender":"speaker_gender"
    }
    df = df.rename(columns={k:v for k,v in rename_map.items() if k in df.columns})
    # Ensure required fields
    for c in ["audio_file","transcript","language"]:
        if c not in df.columns:
            df[c] = ""
    # Backfill language if absent
    if (df["language"] == "").any():
        # infer from path when possible
        df.loc[df["language"]=="", "language"] = df["audio_file"].str.extract(r"/(hi|pa-IN)/", expand=False).map({"hi":"hindi","pa-IN":"punjabi"})
        # default English for IIT-M if still empty
        df.loc[(df["language"]==""), "language"] = "english"
    # Strip whitespace
    df["transcript"] = df["transcript"].fillna("").astype(str).str.strip()
    df["audio_file"] = df["audio_file"].astype(str)
    # Keep only rows with real files and non-empty transcript
    df = df[df["audio_file"].apply(os.path.exists)]
    df = df[df["transcript"].str.len() > 0]
    dfs.append(df)

assert dfs, "No input manifests found. Check the paths."

full = pd.concat(dfs, ignore_index=True)

# Optional: drop dupes by identical (audio_file, transcript)
full = full.drop_duplicates(subset=["audio_file","transcript"])

# Tidy language labels
full["language"] = full["language"].str.lower().replace({
    "en":"english","eng":"english",
    "hi":"hindi","hin":"hindi",
    "pa":"punjabi","pa-in":"punjabi","pan":"punjabi"
})

# Save
full.to_csv(out_merged, index=False, encoding="utf-8")
print(f"✅ Saved unified manifest: {out_merged}")
print("Rows:", len(full))

# Quick stats
print("\n— Language counts —")
print(full["language"].value_counts())

if "speaker_gender" in full.columns:
    print("\n— Gender (non-empty) —")
    print(full["speaker_gender"].replace({"":None}).dropna().value_counts().head())

if "speaker_age_group" in full.columns:
    print("\n— Age group (non-empty) —")
    print(full["speaker_age_group"].replace({"":None}).dropna().value_counts().head())

# Peek a few rows
print("\n— Sample rows —")
print(full[["audio_file","language","transcript"]].head(5).to_string(index=False))


✅ Saved unified manifest: /content/drive/MyDrive/problem_statement_6/training_data/foundational_manifest.csv
Rows: 8554

— Language counts —
language
hindi      1727
punjabi     171
Name: count, dtype: int64

— Gender (non-empty) —
speaker_gender
Female    3579
Male      3077
Name: count, dtype: int64

— Age group (non-empty) —
Series([], Name: count, dtype: int64)

— Sample rows —
                                                                                audio_file language                                                                                             transcript
/content/drive/MyDrive/problem_statement_6/training_data/english_wav/iitm_9aafcf3bab2d.wav      NaN some in the starting, then again I poured and finally it happened that it became so wet that to have a
/content/drive/MyDrive/problem_statement_6/training_data/english_wav/iitm_bc3b8ad955ee.wav      NaN                                   North 24 Parganas, South 24 Parganas, Murshidabad, Birbhum, Hooghly.
/conte

In [None]:
import os

eng_dir = "/content/drive/MyDrive/problem_statement_6/training_data/english_wav"

if not os.path.exists(eng_dir):
    print("⚠️ English folder not found:", eng_dir)
else:
    files = os.listdir(eng_dir)
    print(f"English folder has {len(files)} entries")
    # Show a sample of what’s inside
    for f in files[:20]:
        print(" •", f)
    # Check if a manifest is present
    has_manifest = [f for f in files if f.lower().startswith("manifest") and f.endswith(".csv")]
    if has_manifest:
        print("\n✅ Found manifest(s):", has_manifest)
    else:
        print("\n⚠️ No manifest CSV found in english_wav/")


English folder has 6657 entries
 • iitm_69aa211ed1cb.wav
 • iitm_4fd8444d89f5.wav
 • iitm_8500b9c8f3fa.wav
 • iitm_bcffc123b4a8.wav
 • iitm_5cf0cb61e89e.wav
 • iitm_670e3c6f232d.wav
 • iitm_2a5c402e12a9.wav
 • iitm_6050c01ac604.wav
 • iitm_d2333bdb39fe.wav
 • iitm_ec1815aa399d.wav
 • iitm_39c5815f9616.wav
 • iitm_e8f2b2ba9f17.wav
 • iitm_b21e95467404.wav
 • iitm_a313cfd0fc84.wav
 • iitm_2a4a25b16a75.wav
 • iitm_70224a33a7cf.wav
 • iitm_89ef1863fc77.wav
 • iitm_dd257c8e6087.wav
 • iitm_c9a96975976d.wav
 • iitm_71662b714dcc.wav

✅ Found manifest(s): ['manifest.csv']


In [None]:
import os, pandas as pd

merged_path = "/content/drive/MyDrive/problem_statement_6/training_data/foundational_manifest.csv"
en_manifest = "/content/drive/MyDrive/problem_statement_6/training_data/english_wav/manifest.csv"

# Load
full = pd.read_csv(merged_path)
en   = pd.read_csv(en_manifest)

# Harmonize column names
en = en.rename(columns={
    "text": "transcript",
    "age-group": "speaker_age_group",
    "gender": "speaker_gender"
})

# Ensure required columns exist
for c in ["audio_file","transcript","language"]:
    if c not in en.columns:
        en[c] = ""
# Backfill language (English set)
en.loc[(en["language"]=="") | (en["language"].isna()), "language"] = "english"

# Keep valid rows only
en["transcript"] = en["transcript"].fillna("").astype(str).str.strip()
en = en[(en["audio_file"].apply(os.path.exists)) & (en["transcript"].str.len() > 0)]

# Merge & tidy
out = pd.concat([full, en], ignore_index=True).drop_duplicates(subset=["audio_file","transcript"])
out["language"] = out["language"].str.lower().replace({
    "en":"english","eng":"english",
    "hi":"hindi","hin":"hindi",
    "pa":"punjabi","pa-in":"punjabi","pan":"punjabi"
})

# Save
out.to_csv(merged_path, index=False, encoding="utf-8")
print("✅ Updated unified manifest:", merged_path)
print("Rows:", len(out))
print("\n— Language counts —")
print(out["language"].value_counts())


✅ Updated unified manifest: /content/drive/MyDrive/problem_statement_6/training_data/foundational_manifest.csv
Rows: 8554

— Language counts —
language
hindi      1727
punjabi     171
Name: count, dtype: int64


In [None]:
import pandas as pd

manifest_path = "/content/drive/MyDrive/problem_statement_6/training_data/foundational_manifest.csv"
df = pd.read_csv(manifest_path)

print("Columns:", list(df.columns))
print("Total rows:", len(df))

# Unique values in language column
print("\nUnique values in 'language' column (first 20):")
print(df["language"].dropna().unique()[:20])

# Counts including NaN/blank
print("\nValue counts (raw):")
print(df["language"].value_counts(dropna=False))

# Peek a few rows that are missing language
print("\nSample rows where language is blank or NaN:")
print(df[df["language"].isna() | (df["language"].astype(str).str.strip()=="")].head(10)[["audio_file","language","transcript"]])


Columns: ['audio_file', 'duration', 'transcript', 'speaker_gender', 'age_group', 'primary_language', 'native_place_state', 'native_place_district', 'highest_qualification', 'job_category', 'occupation_domain', 'language', 'client_id', 'speaker_age_group', 'accent', 'locale', 'source']
Total rows: 8554

Unique values in 'language' column (first 20):
['hindi' 'punjabi']

Value counts (raw):
language
NaN        6656
hindi      1727
punjabi     171
Name: count, dtype: int64

Sample rows where language is blank or NaN:
                                          audio_file language  \
0  /content/drive/MyDrive/problem_statement_6/tra...      NaN   
1  /content/drive/MyDrive/problem_statement_6/tra...      NaN   
2  /content/drive/MyDrive/problem_statement_6/tra...      NaN   
3  /content/drive/MyDrive/problem_statement_6/tra...      NaN   
4  /content/drive/MyDrive/problem_statement_6/tra...      NaN   
5  /content/drive/MyDrive/problem_statement_6/tra...      NaN   
6  /content/drive/MyDrive

In [None]:
import pandas as pd

manifest_path = "/content/drive/MyDrive/problem_statement_6/training_data/foundational_manifest.csv"
df = pd.read_csv(manifest_path)

print("Before fix:", df["language"].value_counts(dropna=False))

# Fill NaN or blank with "english"
df["language"] = df["language"].fillna("").astype(str).str.strip().replace({"": "english"})

# Normalize to lowercase
df["language"] = df["language"].str.lower()

df.to_csv(manifest_path, index=False, encoding="utf-8")

print("\nAfter fix:", df["language"].value_counts())


Before fix: language
NaN        6656
hindi      1727
punjabi     171
Name: count, dtype: int64

After fix: language
english    6656
hindi      1727
punjabi     171
Name: count, dtype: int64


In [None]:
# Duration stats for foundational_manifest.csv

!pip -q install soundfile
import os, pandas as pd, numpy as np, soundfile as sf
from tqdm import tqdm

MANIFEST = "/content/drive/MyDrive/problem_statement_6/training_data/foundational_manifest.csv"

df = pd.read_csv(MANIFEST)
assert {"audio_file","language","transcript"}.issubset(df.columns), "manifest is missing required columns"

# --- Ensure a numeric duration column ---
if "duration" not in df.columns:
    df["duration"] = np.nan

# Only compute for rows where duration is missing/invalid
mask_need = df["duration"].isna() | (df["duration"] <= 0)
print(f"Rows needing duration compute: {mask_need.sum()} / {len(df)}")

def get_duration_sec(path):
    try:
        info = sf.info(path)
        return float(info.frames) / float(info.samplerate)
    except Exception:
        return np.nan  # keep as NaN if unreadable (bad file, missing, etc.)

if mask_need.any():
    for idx in tqdm(df[mask_need].index, desc="Computing durations"):
        p = df.at[idx, "audio_file"]
        df.at[idx, "duration"] = get_duration_sec(p)

# Drop rows with missing files or zero/NaN duration/transcript
before = len(df)
df = df[df["audio_file"].apply(os.path.exists)]
df = df[df["transcript"].fillna("").astype(str).str.strip().ne("")]
df = df[df["duration"].notna() & (df["duration"] > 0)]
after = len(df)
print(f"Filtered invalid rows: {before - after} removed, {after} remain")

# --- Aggregate stats ---
df["language"] = df["language"].astype(str).str.strip().str.lower()
counts = df["language"].value_counts().sort_index()
hours  = (df.groupby("language")["duration"].sum() / 3600.0).sort_index().round(2)
mean_s = df.groupby("language")["duration"].mean().round(2)
med_s  = df.groupby("language")["duration"].median().round(2)
p95_s  = df.groupby("language")["duration"].quantile(0.95).round(2)

print("\n=== Clips per language ===")
print(counts)
print("\n=== Hours per language ===")
print(hours)
print("\n=== Duration stats (seconds) ===")
stats = pd.DataFrame({"mean": mean_s, "median": med_s, "p95": p95_s})
print(stats)

# Totals
print("\n=== Totals ===")
print(f"Total clips: {int(counts.sum())}")
print(f"Total hours: {hours.sum():.2f}")

# (Optional) Save the manifest back with filled durations
df.to_csv(MANIFEST, index=False, encoding="utf-8")
print(f"\n✅ Updated manifest (durations filled where missing): {MANIFEST}")


Rows needing duration compute: 0 / 8554
Filtered invalid rows: 0 removed, 8554 remain

=== Clips per language ===
language
english    6656
hindi      1727
punjabi     171
Name: count, dtype: int64

=== Hours per language ===
language
english    9.61
hindi      2.46
punjabi    0.20
Name: duration, dtype: float64

=== Duration stats (seconds) ===
          mean  median    p95
language                     
english   5.20    4.21  13.86
hindi     5.13    5.08   7.72
punjabi   4.25    4.10   5.78

=== Totals ===
Total clips: 8554
Total hours: 12.27

✅ Updated manifest (durations filled where missing): /content/drive/MyDrive/problem_statement_6/training_data/foundational_manifest.csv


In [None]:
# === Simple 10-min session mixer with quotas & light overlaps ===
!pip -q install soundfile numpy pandas

import os, json, math, random
import numpy as np
import pandas as pd
import soundfile as sf

MANIFEST = "/content/drive/MyDrive/problem_statement_6/training_data/foundational_manifest.csv"
OUT_DIR  = "/content/drive/MyDrive/problem_statement_6/sessions_smooth"
os.makedirs(OUT_DIR, exist_ok=True)

# --- Mixer config ---
SESSION_SEC        = 600            # target ~10 minutes
GAP_RANGE_SEC      = (0.2, 0.8)     # gaps between turns
OVERLAP_PROB       = 0.12           # ~12% turns overlap
OVERLAP_SEC_RANGE  = (0.5, 1.5)
SPEAKERS_PER_SESS  = (3, 4)
SR_TARGET          = 16000          # assume files are at 16 kHz (CV already is; normalize EN earlier if needed)

# Language quotas per session (fractions; will be normalized)
LANG_QUOTAS = {
    "english": 0.34,
    "hindi":   0.33,
    "punjabi": 0.33,  # oversample PA despite low hours
}

# Sessions to create
N_SESSIONS = 20

# --- Load & pre-bucket by language ---
df = pd.read_csv(MANIFEST)
df = df[df["language"].isin(["english","hindi","punjabi"])]
df = df[df["transcript"].fillna("").astype(str).str.strip().ne("")]

# Optional: trim extreme durations
if "duration" in df.columns:
    df = df[(df["duration"] > 0.5) & (df["duration"] < 30)]

buckets = {lang: df[df["language"]==lang].copy() for lang in ["english","hindi","punjabi"]}

def load_wav(path, target_sr=SR_TARGET):
    data, sr = sf.read(path, always_2d=False)
    if data.ndim == 2:
        data = data.mean(axis=1)
    if sr != target_sr:
        # Simple resample via linear interpolation to avoid heavy deps
        x = np.arange(len(data))
        new_len = int(len(data) * (target_sr / sr))
        xi = np.linspace(0, len(data)-1, new_len)
        data = np.interp(xi, x, data).astype(np.float32)
        sr = target_sr
    return data.astype(np.float32), sr

def pick_speakers(rows, k):
    # Use file path stem as provisional speaker; ensures multiple turns per file may map to same spk
    # For simplicity, pick k *unique stems* across languages
    stems = rows["audio_file"].apply(lambda p: os.path.splitext(os.path.basename(p))[0])
    uniq = rows.assign(stem=stems).drop_duplicates(subset=["stem"])
    return uniq.sample(min(k, len(uniq)), random_state=random.randint(0,1_000_000))["stem"].tolist()

def build_session(session_idx):
    # Decide speakers per session
    k = random.randint(*SPEAKERS_PER_SESS)

    # Compute target seconds per language
    total_weight = sum(LANG_QUOTAS.values())
    quotas = {l: (v/total_weight) * SESSION_SEC for l,v in LANG_QUOTAS.items()}

    # Candidate pool for each language
    pools = {l: buckets[l].sample(frac=1.0, random_state=random.randint(0,1_000_000)).reset_index(drop=True) for l in buckets}
    # Choose provisional speakers from the whole df for variety
    spk_list = pick_speakers(df, k)
    spk_map = {f"spk{i+1:02d}": s for i, s in enumerate(spk_list)}

    timeline = []  # list of segments with audio arrays and metadata
    cur_t = 0.0

    # Greedy fill by language quotas
    while cur_t < SESSION_SEC - 1.0:
        # Choose language weighted by remaining quota
        rem = {l: max(0.0, quotas[l]) for l in quotas}
        lang = random.choices(list(rem.keys()), weights=list(rem.values()), k=1)[0]

        # Draw a clip from that language
        pool = pools[lang]
        if pool.empty:
            # fallback to any language with remaining quota
            lang = random.choice(["english","hindi","punjabi"])
            pool = pools[lang]
            if pool.empty:
                break

        row = pool.iloc[0]
        pools[lang] = pool.iloc[1:]  # pop

        # Assign a session speaker randomly
        sess_spk = random.choice(list(spk_map.keys()))
        # Load audio
        wav, sr = load_wav(row["audio_file"], SR_TARGET)
        dur = len(wav) / SR_TARGET

        # Decide overlap or gap
        if timeline and random.random() < OVERLAP_PROB:
            ov = random.uniform(*OVERLAP_SEC_RANGE)
            start_ts = max(0.0, cur_t - ov)
        else:
            gap = random.uniform(*GAP_RANGE_SEC)
            start_ts = cur_t + gap

        end_ts = start_ts + dur
        cur_t = max(cur_t, end_ts)

        timeline.append({
            "audio": wav,
            "start_ts": start_ts,
            "end_ts": end_ts,
            "session_speaker": sess_spk,
            "speaker_proxy": spk_map[sess_spk],
            "language": lang,
            "transcript": str(row["transcript"]),
            "source_file": row["audio_file"]
        })

        # Reduce language quota
        quotas[lang] -= dur
        if cur_t >= SESSION_SEC:
            break

    # Render mixed audio to buffer
    if not timeline:
        return None

    # Determine final length
    T = math.ceil(max(seg["end_ts"] for seg in timeline) * SR_TARGET)
    mix = np.zeros(T, dtype=np.float32)

    for seg in timeline:
        a = seg["audio"]
        s = int(round(seg["start_ts"] * SR_TARGET))
        e = s + len(a)
        if e > len(mix):
            mix = np.pad(mix, (0, e - len(mix)))
        mix[s:e] += a  # naive sum; could clamp later to prevent clipping

    # Light peak limiting to avoid clipping
    peak = np.max(np.abs(mix)) if mix.size else 1.0
    if peak > 1.0:
        mix = mix / peak

    # Write files
    out_wav = os.path.join(OUT_DIR, f"session_{session_idx:03d}.wav")
    sf.write(out_wav, mix, SR_TARGET)

    # Segment manifest line-per-line
    out_manifest = os.path.join(OUT_DIR, f"session_{session_idx:03d}.jsonl")
    with open(out_manifest, "w", encoding="utf-8") as f:
        for seg in timeline:
            rec = {
                "audio_file": os.path.basename(out_wav),
                "start_ts": round(seg["start_ts"], 3),
                "end_ts": round(seg["end_ts"], 3),
                "speaker": seg["session_speaker"],      # for SD
                "language": seg["language"],             # for LID
                "asr_text": seg["transcript"],           # for ASR
                "translation_en": "",                    # fill later via NMT if needed
                "source_path": seg["source_file"],
                "speaker_proxy": seg["speaker_proxy"]
            }
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")

    return out_wav, out_manifest, len(timeline)

# Build sessions
made = 0
for i in range(N_SESSIONS):
    res = build_session(i+1)
    if res:
        print(f"✅ Session {i+1:03d}: wrote {res[0]} & {res[1]} with {res[2]} segments")
        made += 1
    else:
        print(f"⚠️ Session {i+1:03d}: no content")

print(f"\nDone. Sessions created: {made}/{N_SESSIONS}")
print("Output dir:", OUT_DIR)

✅ Session 001: wrote /content/drive/MyDrive/problem_statement_6/sessions_smooth/session_001.wav & /content/drive/MyDrive/problem_statement_6/sessions_smooth/session_001.jsonl with 113 segments
✅ Session 002: wrote /content/drive/MyDrive/problem_statement_6/sessions_smooth/session_002.wav & /content/drive/MyDrive/problem_statement_6/sessions_smooth/session_002.jsonl with 113 segments
✅ Session 003: wrote /content/drive/MyDrive/problem_statement_6/sessions_smooth/session_003.wav & /content/drive/MyDrive/problem_statement_6/sessions_smooth/session_003.jsonl with 113 segments
✅ Session 004: wrote /content/drive/MyDrive/problem_statement_6/sessions_smooth/session_004.wav & /content/drive/MyDrive/problem_statement_6/sessions_smooth/session_004.jsonl with 124 segments
✅ Session 005: wrote /content/drive/MyDrive/problem_statement_6/sessions_smooth/session_005.wav & /content/drive/MyDrive/problem_statement_6/sessions_smooth/session_005.jsonl with 112 segments
✅ Session 006: wrote /content/drive

In [None]:
# === Simple 10-min session mixer with quotas & light overlaps ===
!pip -q install soundfile numpy pandas

import os, json, math, random
import numpy as np
import pandas as pd
import soundfile as sf

MANIFEST = "/content/drive/MyDrive/problem_statement_6/training_data/foundational_manifest.csv"
OUT_DIR  = "/content/drive/MyDrive/problem_statement_6/sessions_balanced"
os.makedirs(OUT_DIR, exist_ok=True)

# --- Mixer config ---
SESSION_SEC        = 600            # target ~10 minutes
GAP_RANGE_SEC      = (0.2, 0.8)     # gaps between turns
OVERLAP_PROB       = 0.12           # ~12% turns overlap
OVERLAP_SEC_RANGE  = (0.5, 1.5)
SPEAKERS_PER_SESS  = (3, 4)
SR_TARGET          = 16000          # assume files are at 16 kHz (CV already is; normalize EN earlier if needed)

# Language quotas per session (fractions; will be normalized)
LANG_QUOTAS = {
    "english": 0.60,
    "hindi":   0.30,
    "punjabi": 0.10,  # oversample PA despite low hours
}

# Sessions to create
N_SESSIONS = 20

# --- Load & pre-bucket by language ---
df = pd.read_csv(MANIFEST)
df = df[df["language"].isin(["english","hindi","punjabi"])]
df = df[df["transcript"].fillna("").astype(str).str.strip().ne("")]

# Optional: trim extreme durations
if "duration" in df.columns:
    df = df[(df["duration"] > 0.5) & (df["duration"] < 30)]

buckets = {lang: df[df["language"]==lang].copy() for lang in ["english","hindi","punjabi"]}

def load_wav(path, target_sr=SR_TARGET):
    data, sr = sf.read(path, always_2d=False)
    if data.ndim == 2:
        data = data.mean(axis=1)
    if sr != target_sr:
        # Simple resample via linear interpolation to avoid heavy deps
        x = np.arange(len(data))
        new_len = int(len(data) * (target_sr / sr))
        xi = np.linspace(0, len(data)-1, new_len)
        data = np.interp(xi, x, data).astype(np.float32)
        sr = target_sr
    return data.astype(np.float32), sr

def pick_speakers(rows, k):
    # Use file path stem as provisional speaker; ensures multiple turns per file may map to same spk
    # For simplicity, pick k *unique stems* across languages
    stems = rows["audio_file"].apply(lambda p: os.path.splitext(os.path.basename(p))[0])
    uniq = rows.assign(stem=stems).drop_duplicates(subset=["stem"])
    return uniq.sample(min(k, len(uniq)), random_state=random.randint(0,1_000_000))["stem"].tolist()

def build_session(session_idx):
    # Decide speakers per session
    k = random.randint(*SPEAKERS_PER_SESS)

    # Compute target seconds per language
    total_weight = sum(LANG_QUOTAS.values())
    quotas = {l: (v/total_weight) * SESSION_SEC for l,v in LANG_QUOTAS.items()}

    # Candidate pool for each language
    pools = {l: buckets[l].sample(frac=1.0, random_state=random.randint(0,1_000_000)).reset_index(drop=True) for l in buckets}
    # Choose provisional speakers from the whole df for variety
    spk_list = pick_speakers(df, k)
    spk_map = {f"spk{i+1:02d}": s for i, s in enumerate(spk_list)}

    timeline = []  # list of segments with audio arrays and metadata
    cur_t = 0.0

    # Greedy fill by language quotas
    while cur_t < SESSION_SEC - 1.0:
        # Choose language weighted by remaining quota
        rem = {l: max(0.0, quotas[l]) for l in quotas}
        lang = random.choices(list(rem.keys()), weights=list(rem.values()), k=1)[0]

        # Draw a clip from that language
        pool = pools[lang]
        if pool.empty:
            # fallback to any language with remaining quota
            lang = random.choice(["english","hindi","punjabi"])
            pool = pools[lang]
            if pool.empty:
                break

        row = pool.iloc[0]
        pools[lang] = pool.iloc[1:]  # pop

        # Assign a session speaker randomly
        sess_spk = random.choice(list(spk_map.keys()))
        # Load audio
        wav, sr = load_wav(row["audio_file"], SR_TARGET)
        dur = len(wav) / SR_TARGET

        # Decide overlap or gap
        if timeline and random.random() < OVERLAP_PROB:
            ov = random.uniform(*OVERLAP_SEC_RANGE)
            start_ts = max(0.0, cur_t - ov)
        else:
            gap = random.uniform(*GAP_RANGE_SEC)
            start_ts = cur_t + gap

        end_ts = start_ts + dur
        cur_t = max(cur_t, end_ts)

        timeline.append({
            "audio": wav,
            "start_ts": start_ts,
            "end_ts": end_ts,
            "session_speaker": sess_spk,
            "speaker_proxy": spk_map[sess_spk],
            "language": lang,
            "transcript": str(row["transcript"]),
            "source_file": row["audio_file"]
        })

        # Reduce language quota
        quotas[lang] -= dur
        if cur_t >= SESSION_SEC:
            break

    # Render mixed audio to buffer
    if not timeline:
        return None

    # Determine final length
    T = math.ceil(max(seg["end_ts"] for seg in timeline) * SR_TARGET)
    mix = np.zeros(T, dtype=np.float32)

    for seg in timeline:
        a = seg["audio"]
        s = int(round(seg["start_ts"] * SR_TARGET))
        e = s + len(a)
        if e > len(mix):
            mix = np.pad(mix, (0, e - len(mix)))
        mix[s:e] += a  # naive sum; could clamp later to prevent clipping

    # Light peak limiting to avoid clipping
    peak = np.max(np.abs(mix)) if mix.size else 1.0
    if peak > 1.0:
        mix = mix / peak

    # Write files
    out_wav = os.path.join(OUT_DIR, f"session_{session_idx:03d}.wav")
    sf.write(out_wav, mix, SR_TARGET)

    # Segment manifest line-per-line
    out_manifest = os.path.join(OUT_DIR, f"session_{session_idx:03d}.jsonl")
    with open(out_manifest, "w", encoding="utf-8") as f:
        for seg in timeline:
            rec = {
                "audio_file": os.path.basename(out_wav),
                "start_ts": round(seg["start_ts"], 3),
                "end_ts": round(seg["end_ts"], 3),
                "speaker": seg["session_speaker"],      # for SD
                "language": seg["language"],             # for LID
                "asr_text": seg["transcript"],           # for ASR
                "translation_en": "",                    # fill later via NMT if needed
                "source_path": seg["source_file"],
                "speaker_proxy": seg["speaker_proxy"]
            }
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")

    return out_wav, out_manifest, len(timeline)

# Build sessions
made = 0
for i in range(N_SESSIONS):
    res = build_session(i+1)
    if res:
        print(f"✅ Session {i+1:03d}: wrote {res[0]} & {res[1]} with {res[2]} segments")
        made += 1
    else:
        print(f"⚠️ Session {i+1:03d}: no content")

print(f"\nDone. Sessions created: {made}/{N_SESSIONS}")
print("Output dir:", OUT_DIR)


✅ Session 001: wrote /content/drive/MyDrive/problem_statement_6/sessions_balanced/session_001.wav & /content/drive/MyDrive/problem_statement_6/sessions_balanced/session_001.jsonl with 118 segments
✅ Session 002: wrote /content/drive/MyDrive/problem_statement_6/sessions_balanced/session_002.wav & /content/drive/MyDrive/problem_statement_6/sessions_balanced/session_002.jsonl with 101 segments
✅ Session 003: wrote /content/drive/MyDrive/problem_statement_6/sessions_balanced/session_003.wav & /content/drive/MyDrive/problem_statement_6/sessions_balanced/session_003.jsonl with 107 segments
✅ Session 004: wrote /content/drive/MyDrive/problem_statement_6/sessions_balanced/session_004.wav & /content/drive/MyDrive/problem_statement_6/sessions_balanced/session_004.jsonl with 109 segments
✅ Session 005: wrote /content/drive/MyDrive/problem_statement_6/sessions_balanced/session_005.wav & /content/drive/MyDrive/problem_statement_6/sessions_balanced/session_005.jsonl with 103 segments
✅ Session 006: 

In [None]:
# --- Session QA: format, timing, language mix, overlap %, speaker consistency ---
!pip -q install soundfile
import os, json, math, collections
import soundfile as sf

SESS_DIRS = [
    "/content/drive/MyDrive/problem_statement_6/sessions_smooth",
    "/content/drive/MyDrive/problem_statement_6/sessions_balanced",
]

def merge_intervals(ivls):
    # ivls: list of (start, end)
    if not ivls: return 0.0
    ivls = sorted(ivls, key=lambda x: x[0])
    merged = []
    cs, ce = ivls[0]
    for s,e in ivls[1:]:
        if s <= ce:
            ce = max(ce, e)
        else:
            merged.append((cs, ce))
            cs, ce = s, e
    merged.append((cs, ce))
    return sum(e - s for s, e in merged)

def qa_folder(folder):
    print(f"\n====== QA: {folder} ======")
    sessions = sorted([f for f in os.listdir(folder) if f.endswith(".jsonl")])
    if not sessions:
        print("No .jsonl found.")
        return
    summary = []
    bad_format_examples = []
    spk_mix_warnings = 0

    for jfile in sessions:
        base = jfile[:-6]
        wav = os.path.join(folder, base + ".wav")
        jpath = os.path.join(folder, jfile)

        # Audio info
        if not os.path.exists(wav):
            print(f"  ! Missing WAV for {jfile}")
            continue
        try:
            info = sf.info(wav)
        except Exception as e:
            print(f"  ! Unreadable WAV {wav}: {e}")
            continue

        fmt_ok = (info.samplerate == 16000 and info.channels == 1 and "PCM_16" in info.subtype)
        if not fmt_ok and len(bad_format_examples) < 3:
            bad_format_examples.append((os.path.basename(wav), info.samplerate, info.channels, info.subtype))

        # Manifest parse
        segs, langs, spk2proxies = [], collections.Counter(), {}
        with open(jpath, "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip(): continue
                rec = json.loads(line)
                s, e = float(rec["start_ts"]), float(rec["end_ts"])
                if e <= s:  # skip bogus
                    continue
                segs.append((s, e, rec.get("language","").lower(), rec.get("speaker",""), rec.get("speaker_proxy","")))
                langs[rec.get("language","").lower()] += (e - s)
                spk = rec.get("speaker","")
                spk2proxies.setdefault(spk, set()).add(rec.get("speaker_proxy",""))

        if not segs:
            print(f"  ! No valid segments in {jfile}")
            continue

        # Duration checks
        seg_total = sum(e - s for s, e, *_ in segs)
        union_dur = merge_intervals([(s,e) for s, e, *_ in segs])
        overlap_dur = max(0.0, seg_total - union_dur)
        overlap_pct = 100.0 * overlap_dur / union_dur if union_dur > 0 else 0.0

        # Speaker consistency: any spkXX mapped to multiple proxies?
        mixed = {spk: len(proxies) for spk, proxies in spk2proxies.items() if len(proxies) > 1}
        if mixed:
            spk_mix_warnings += 1

        # Compare union duration to actual audio length
        audio_len = info.frames / info.samplerate if info.samplerate else 0.0
        dur_gap_sec = abs(audio_len - union_dur)

        summary.append({
            "session": base,
            "segments": len(segs),
            "audio_sec": round(audio_len, 1),
            "union_sec": round(union_dur, 1),
            "dur_gap_s": round(dur_gap_sec, 2),
            "overlap_pct": round(overlap_pct, 1),
            "fmt_ok": fmt_ok,
            "lang_share": {k: round(100*v/union_dur,1) for k,v in langs.items() if union_dur>0},
            "spk_mixed": bool(mixed)
        })

    # Print a compact report
    ok = sum(1 for r in summary if r["fmt_ok"])
    print(f"Sessions scanned: {len(summary)}")
    print(f" PCM_16/16k/mono OK: {ok}/{len(summary)}")
    if bad_format_examples:
        print(" Example non-PCM16 files (name, sr, ch, subtype):", bad_format_examples)

    avg_overlap = round(sum(r["overlap_pct"] for r in summary)/max(1,len(summary)), 1)
    avg_segs = round(sum(r["segments"] for r in summary)/max(1,len(summary)), 1)
    avg_dur_gap = round(sum(r["dur_gap_s"] for r in summary)/max(1,len(summary)), 2)
    print(f" Avg segments/session: {avg_segs}")
    print(f" Avg overlap %: {avg_overlap}")
    print(f" Avg |audio_len - union_len| (s): {avg_dur_gap}")
    print(f" Sessions with speaker-mix warning (spkXX→multiple proxies): {spk_mix_warnings}/{len(summary)}")

    # Show first 5 sessions with key stats
    print("\nSample sessions:")
    for r in summary[:5]:
        print(f"  {r['session']}: segs={r['segments']}, dur={r['audio_sec']}s, overlap={r['overlap_pct']}%, fmt_ok={r['fmt_ok']}, spk_mixed={r['spk_mixed']}, langs={r['lang_share']}")

for d in SESS_DIRS:
    qa_folder(d)



Sessions scanned: 20
 PCM_16/16k/mono OK: 20/20
 Avg segments/session: 114.5
 Avg overlap %: 2.3
 Avg |audio_len - union_len| (s): 50.68

Sample sessions:
  session_001: segs=113, dur=603.4s, overlap=2.0%, fmt_ok=True, spk_mixed=False, langs={'english': 36.4, 'hindi': 32.0, 'punjabi': 33.6}
  session_002: segs=113, dur=601.2s, overlap=1.8%, fmt_ok=True, spk_mixed=False, langs={'punjabi': 31.8, 'english': 36.2, 'hindi': 33.8}
  session_003: segs=113, dur=599.5s, overlap=2.4%, fmt_ok=True, spk_mixed=False, langs={'english': 35.5, 'punjabi': 32.2, 'hindi': 34.7}
  session_004: segs=124, dur=603.1s, overlap=3.6%, fmt_ok=True, spk_mixed=False, langs={'punjabi': 34.2, 'english': 35.0, 'hindi': 34.4}
  session_005: segs=112, dur=606.6s, overlap=2.4%, fmt_ok=True, spk_mixed=False, langs={'punjabi': 32.2, 'hindi': 35.5, 'english': 34.6}

Sessions scanned: 20
 PCM_16/16k/mono OK: 20/20
 Avg segments/session: 106.6
 Avg overlap %: 2.1
 Avg |audio_len - union_len| (s): 46.91

Sample sessions:
  s

In [None]:
# Parameterized 10-min session generator
!pip -q install soundfile numpy pandas

import os, json, math, random, hashlib, re
from dataclasses import dataclass, asdict
from typing import Dict, List, Tuple, Optional

import numpy as np
import pandas as pd
import soundfile as sf


# --------- Small DSP helpers (no heavy deps) ---------

def linear_resample(x: np.ndarray, sr_in: int, sr_out: int) -> np.ndarray:
    if sr_in == sr_out or x.size == 0:
        return x.astype(np.float32)
    N = x.shape[-1]
    new_N = max(1, int(round(N * sr_out / sr_in)))
    xp = np.linspace(0, N - 1, num=N, dtype=np.float64)
    xq = np.linspace(0, N - 1, num=new_N, dtype=np.float64)
    y = np.interp(xq, xp, x.astype(np.float32)).astype(np.float32)
    return y

def to_mono(x: np.ndarray) -> np.ndarray:
    if x.ndim == 2:
        return x.mean(axis=1).astype(np.float32)
    return x.astype(np.float32)

def add_awgn(x: np.ndarray, snr_db: float) -> np.ndarray:
    # Adds white noise at target SNR (approximate)
    if snr_db is None:
        return x
    p_signal = np.mean(x**2) + 1e-9
    p_noise = p_signal / (10 ** (snr_db / 10.0))
    noise = np.random.normal(0.0, np.sqrt(p_noise), size=x.shape).astype(np.float32)
    y = x + noise
    # light peak guard
    peak = np.max(np.abs(y)) + 1e-9
    if peak > 1.0:
        y = y / peak
    return y

def bandlimit_8k_then_back_16k(x: np.ndarray, sr: int) -> np.ndarray:
    # crude telephone channel: downsample to 8k, then back to 16k
    if sr != 16000:
        x = linear_resample(x, sr, 16000)
        sr = 16000
    y8 = linear_resample(x, 16000, 8000)
    y16 = linear_resample(y8, 8000, 16000)
    return y16

# --------- Config ---------

@dataclass
class GenConfig:
    # Core
    session_seconds: int = 600
    sr_target: int = 16000
    speakers_per_session: Tuple[int, int] = (3, 4)

    # Language mix (weights; will be normalized)
    lang_weights: Dict[str, float] = None  # e.g., {"english":0.6,"hindi":0.3,"punjabi":0.1}

    # Timing / structure
    gap_sec_range: Tuple[float, float] = (0.2, 0.8)
    overlap_prob: float = 0.12
    overlap_sec_range: Tuple[float, float] = (0.5, 1.5)

    # Clip duration constraints
    min_clip_sec: float = 0.5
    max_clip_sec: float = 30.0

    # Speaker / turn shaping
    min_turns_per_speaker: int = 8
    max_turns_per_speaker: int = 99999
    bind_key_by_lang: Dict[str, str] = None  # e.g., {"hindi":"client_id","punjabi":"client_id","english":"speaker_proxy"}

    # Code-switching control
    force_code_switch: bool = False         # if True, encourages alternating languages per speaker where possible
    code_switch_prob: float = 0.5           # probability a speaker switches language for next turn (if available)

    # Augmentation
    snr_db_range: Optional[Tuple[float, float]] = None  # e.g., (15, 30) for mild AWGN
    apply_tel_bandlimit_prob: float = 0.0               # e.g., 0.2 to simulate phone channel sometimes
    global_gain_db: float = 0.0                         # e.g., -3.0 to reduce overall loudness

    # Output
    out_dir: str = ""
    scenario_name: str = "sessions_custom"
    seed: int = 1337


class SessionGenerator:
    def __init__(self, manifest_csv: str, config: GenConfig):
        self.manifest_csv = manifest_csv
        self.cfg = config
        self.rng = random.Random(self.cfg.seed)
        np.random.seed(self.cfg.seed)

        assert os.path.exists(self.manifest_csv), f"Manifest not found: {self.manifest_csv}"
        self.df = pd.read_csv(self.manifest_csv)
        req_cols = {"audio_file","transcript","language"}
        assert req_cols.issubset(self.df.columns), f"Manifest must have columns: {req_cols}"

        # Clean & filter
        self.df["language"] = self.df["language"].astype(str).str.strip().str.lower()
        self.df = self.df[self.df["transcript"].fillna("").astype(str).str.strip().ne("")]
        if "duration" in self.df.columns:
            self.df = self.df[(self.df["duration"] > self.cfg.min_clip_sec) & (self.df["duration"] < self.cfg.max_clip_sec)]

        # Bind columns (speaker identity per language)
        self.bind_key_by_lang = self.cfg.bind_key_by_lang or {}
        # Create a robust fallback speaker proxy:
        def default_proxy(row):
            # stem as last resort
            stem = os.path.splitext(os.path.basename(str(row["audio_file"])))[0]
            return stem
        # Build a unified column "spk_proxy"
        proxies = []
        for _, r in self.df.iterrows():
            lang = r["language"]
            key = self.bind_key_by_lang.get(lang, None)
            if key and key in self.df.columns:
                proxies.append(str(r[key]) if pd.notna(r[key]) else default_proxy(r))
            else:
                proxies.append(default_proxy(r))
        self.df["spk_proxy"] = proxies

        # Pre-buckets by language; also index by speaker proxy per language
        self.langs = sorted(self.df["language"].unique())
        self.lang_buckets = {l: self.df[self.df["language"]==l].copy() for l in self.langs}
        self.lang_spk_to_rows = {l: {} for l in self.langs}
        for l in self.langs:
            g = self.lang_buckets[l].groupby("spk_proxy")
            self.lang_spk_to_rows[l] = {spk: grp.sample(frac=1.0, random_state=self.rng.randint(0, 1_000_000)).reset_index(drop=True)
                                        for spk, grp in g}

        # Normalize language weights
        if self.cfg.lang_weights is None:
            # default: equal weights across present languages
            self.cfg.lang_weights = {l: 1.0 for l in self.langs}
        total_w = sum(max(0.0, w) for w in self.cfg.lang_weights.values())
        self.lang_targets = {l: (self.cfg.lang_weights.get(l, 0.0) / total_w) * self.cfg.session_seconds for l in self.langs}

        # Output dir
        self.out_root = self.cfg.out_dir or os.path.join(os.path.dirname(self.manifest_csv), self.cfg.scenario_name)
        os.makedirs(self.out_root, exist_ok=True)

    # --------------- Audio IO & Aug ---------------

    def load_mono_16k(self, path: str) -> np.ndarray:
        x, sr = sf.read(path, always_2d=False)
        x = to_mono(np.asarray(x))
        if sr != self.cfg.sr_target:
            x = linear_resample(x, sr, self.cfg.sr_target)
        # Augment: telephone bandlimit?
        if self.rng.random() < self.cfg.apply_tel_bandlimit_prob:
            x = bandlimit_8k_then_back_16k(x, self.cfg.sr_target)
        # Augment: AWGN?
        if self.cfg.snr_db_range:
            snr = self.rng.uniform(*self.cfg.snr_db_range)
            x = add_awgn(x, snr)
        # Gain
        if self.cfg.global_gain_db != 0.0:
            g = 10 ** (self.cfg.global_gain_db / 20.0)
            x = x * g
        # Soft peak guard
        peak = np.max(np.abs(x)) + 1e-9
        if peak > 1.0:
            x = x / peak
        return x.astype(np.float32)

    # --------------- Speaker roster & language plan ---------------

    def pick_session_speakers(self, k: int) -> Dict[str, Tuple[str, str]]:
        """
        Returns mapping: spk01 -> (lang, spk_proxy), each tied to a *single* real proxy for that lang.
        If code-switching is on, we only bind an initial language; speakers may switch later.
        """
        # Start from languages with more target seconds first (to ensure coverage)
        langs_sorted = sorted(self.langs, key=lambda l: self.lang_targets.get(l, 0.0), reverse=True)
        roster = {}
        tried = set()
        for i in range(k):
            # choose a language weighted by target seconds
            weights = [max(1e-6, self.lang_targets.get(l, 0.0)) for l in langs_sorted]
            lang = self.rng.choices(langs_sorted, weights=weights, k=1)[0]
            # pick a speaker proxy for that language with enough remaining rows
            spk_pool = list(self.lang_spk_to_rows[lang].keys())
            self.rng.shuffle(spk_pool)
            chosen = None
            for spk in spk_pool:
                if (lang, spk) in tried:  # avoid repeats in the same session
                    continue
                if len(self.lang_spk_to_rows[lang][spk]) >= self.cfg.min_turns_per_speaker:
                    chosen = spk
                    break
            if not chosen:
                # fallback to any speaker with at least 1 row
                for spk in spk_pool:
                    if (lang, spk) not in tried and len(self.lang_spk_to_rows[lang][spk]) > 0:
                        chosen = spk
                        break
            if chosen is None:
                # fallback to any language
                for L in self.langs:
                    spk_pool = list(self.lang_spk_to_rows[L].keys())
                    if spk_pool:
                        chosen = spk_pool[0]
                        lang = L
                        break
            roster[f"spk{i+1:02d}"] = (lang, chosen)
            tried.add((lang, chosen))
        return roster

    def pick_clip_for(self, lang: str, spk_proxy: str) -> Optional[pd.Series]:
        rows = self.lang_spk_to_rows.get(lang, {}).get(spk_proxy, None)
        if rows is None or len(rows) == 0:
            return None
        row = rows.iloc[0]
        # pop
        self.lang_spk_to_rows[lang][spk_proxy] = rows.iloc[1:]
        return row

    # --------------- Build one session ---------------

    def build_session(self, session_idx: int) -> Optional[Tuple[str, str, int]]:
        k = self.rng.randint(*self.cfg.speakers_per_session)
        roster = self.pick_session_speakers(k)  # spkXX -> (lang, spk_proxy)

        # Make a mutable language target seconds copy
        lang_remain = dict(self.lang_targets)
        # Create per-speaker language state (for code-switching)
        spk_lang_state = {spk: roster[spk][0] for spk in roster}

        timeline = []
        cur_t = 0.0

        while cur_t < self.cfg.session_seconds - 1.0:
            # Choose next speaker to keep min/max turns roughly satisfied
            spk_order = list(roster.keys())
            self.rng.shuffle(spk_order)
            chosen_spk = spk_order[0]
            # Decide language for this turn
            lang = spk_lang_state[chosen_spk]
            if self.cfg.force_code_switch and self.rng.random() < self.cfg.code_switch_prob:
                # attempt to switch language for this speaker (choose language with remaining quota)
                lang_cands = sorted(self.langs, key=lambda l: lang_remain.get(l, 0.0), reverse=True)
                for L in lang_cands:
                    if L != lang:
                        lang = L
                        break
                spk_lang_state[chosen_spk] = lang

            # Pick a clip for (lang, speaker_proxy); if empty, relax to any speaker in lang; else any lang
            _, spk_proxy = roster[chosen_spk]
            row = self.pick_clip_for(lang, spk_proxy)
            if row is None:
                # try any speaker in this language
                for other_spk in self.lang_spk_to_rows.get(lang, {}):
                    row = self.pick_clip_for(lang, other_spk)
                    if row is not None:
                        # rebind chosen speaker to this proxy for session consistency
                        roster[chosen_spk] = (lang, other_spk)
                        break
            if row is None:
                # try any language weighted by remaining seconds
                avail_langs = [L for L in self.langs if any(len(v) > 0 for v in self.lang_spk_to_rows[L].values())]
                if not avail_langs:
                    break
                lang = self.rng.choices(avail_langs, weights=[max(1e-6, lang_remain.get(L, 0.0)) for L in avail_langs], k=1)[0]
                # choose any speaker in that language
                for other_spk in self.lang_spk_to_rows.get(lang, {}):
                    row = self.pick_clip_for(lang, other_spk)
                    if row is not None:
                        roster[chosen_spk] = (lang, other_spk)
                        spk_lang_state[chosen_spk] = lang
                        break
            if row is None:
                break  # nothing left

            # Load audio
            wav = self.load_mono_16k(row["audio_file"])
            dur = len(wav) / self.cfg.sr_target

            # Decide overlap/gap
            if timeline and self.rng.random() < self.cfg.overlap_prob:
                ov = self.rng.uniform(*self.cfg.overlap_sec_range)
                start_ts = max(0.0, cur_t - ov)
            else:
                gap = self.rng.uniform(*self.cfg.gap_sec_range)
                start_ts = cur_t + gap

            end_ts = start_ts + dur
            cur_t = max(cur_t, end_ts)

            timeline.append({
                "audio": wav,
                "start_ts": start_ts,
                "end_ts": end_ts,
                "session_speaker": chosen_spk,
                "speaker_proxy": roster[chosen_spk][1],
                "language": lang,
                "transcript": str(row["transcript"]),
                "source_file": row["audio_file"]
            })

            # reduce remaining language target
            lang_remain[lang] = max(0.0, lang_remain.get(lang, 0.0) - dur)
            if cur_t >= self.cfg.session_seconds:
                break

        if not timeline:
            return None

        # Mix
        T = int(math.ceil(max(seg["end_ts"] for seg in timeline) * self.cfg.sr_target))
        mix = np.zeros(T, dtype=np.float32)
        for seg in timeline:
            a = seg["audio"]
            s = int(round(seg["start_ts"] * self.cfg.sr_target))
            e = s + len(a)
            if e > len(mix):
                mix = np.pad(mix, (0, e - len(mix)))
            mix[s:e] += a

        # Peak limit
        peak = np.max(np.abs(mix)) + 1e-9
        if peak > 1.0:
            mix = mix / peak

        # Write outputs (PCM16)
        out_wav = os.path.join(self.out_root, f"session_{session_idx:03d}.wav")
        sf.write(out_wav, (mix * 32767.0).astype(np.int16), self.cfg.sr_target, subtype='PCM_16')

        out_manifest = os.path.join(self.out_root, f"session_{session_idx:03d}.jsonl")
        with open(out_manifest, "w", encoding="utf-8") as f:
            for seg in timeline:
                rec = {
                    "audio_file": os.path.basename(out_wav),
                    "start_ts": round(seg["start_ts"], 3),
                    "end_ts": round(seg["end_ts"], 3),
                    "speaker": seg["session_speaker"],
                    "language": seg["language"],
                    "asr_text": seg["transcript"],
                    "translation_en": "",
                    "source_path": seg["source_file"],
                    "speaker_proxy": seg["speaker_proxy"]
                }
                f.write(json.dumps(rec, ensure_ascii=False) + "\n")

        # Write a roster sidecar for audit
        roster_map = {spk: {"initial_lang": lang, "speaker_proxy": proxy} for spk,(lang,proxy) in roster.items()}
        with open(os.path.join(self.out_root, f"session_{session_idx:03d}_roster.json"), "w", encoding="utf-8") as f:
            json.dump(roster_map, f, ensure_ascii=False, indent=2)

        return out_wav, out_manifest, len(timeline)

    # --------------- Batch API ---------------

    def generate(self, n_sessions: int) -> None:
        made = 0
        for i in range(1, n_sessions + 1):
            res = self.build_session(i)
            if res:
                print(f"✅ {self.cfg.scenario_name} {i:03d}: wrote {res[0]} & {res[1]} with {res[2]} segments")
                made += 1
            else:
                print(f"⚠️ {self.cfg.scenario_name} {i:03d}: no content")
        print(f"\nDone. Sessions created: {made}/{n_sessions} → {self.out_root}")


# ------------------- EXAMPLES (create scenarios) -------------------

BASE = "/content/drive/MyDrive/problem_statement_6"
MANIFEST = f"{BASE}/training_data/foundational_manifest.csv"

# 1) Balanced 33/33/33 with low overlap (baseline)
cfg_bal = GenConfig(
    session_seconds=600,
    lang_weights={"english":1,"hindi":1,"punjabi":1},
    gap_sec_range=(0.2, 0.8),
    overlap_prob=0.08,
    overlap_sec_range=(0.4, 1.2),
    speakers_per_session=(3,4),
    min_clip_sec=0.5, max_clip_sec=20.0,
    bind_key_by_lang={"hindi":"client_id","punjabi":"client_id","english":"spk_proxy"},
    force_code_switch=False,
    snr_db_range=None,
    apply_tel_bandlimit_prob=0.0,
    global_gain_db=0.0,
    out_dir=f"{BASE}/sessions_balanced",
    scenario_name="sessions_balanced",
    seed=123
)

# 2) Smoothed 60/30/10 with modest overlap
cfg_smooth = GenConfig(
    session_seconds=600,
    lang_weights={"english":0.60,"hindi":0.30,"punjabi":0.10},
    gap_sec_range=(0.2, 0.8),
    overlap_prob=0.12,
    overlap_sec_range=(0.5, 1.5),
    speakers_per_session=(3,4),
    min_clip_sec=0.5, max_clip_sec=20.0,
    bind_key_by_lang={"hindi":"client_id","punjabi":"client_id","english":"spk_proxy"},
    force_code_switch=False,
    snr_db_range=None,
    apply_tel_bandlimit_prob=0.0,
    global_gain_db=0.0,
    out_dir=f"{BASE}/sessions_smooth",
    scenario_name="sessions_smooth",
    seed=124
)

# 3) Proportional ~78/20/2 (raw prior)
cfg_prop = GenConfig(
    session_seconds=600,
    lang_weights={"english":0.78,"hindi":0.20,"punjabi":0.02},
    gap_sec_range=(0.2, 0.8),
    overlap_prob=0.10,
    overlap_sec_range=(0.5, 1.5),
    speakers_per_session=(3,4),
    min_clip_sec=0.5, max_clip_sec=20.0,
    bind_key_by_lang={"hindi":"client_id","punjabi":"client_id","english":"spk_proxy"},
    out_dir=f"{BASE}/sessions_prop",
    scenario_name="sessions_prop",
    seed=125
)

# 4) High-overlap stress (~20%)
cfg_overlap = GenConfig(
    session_seconds=600,
    lang_weights={"english":1,"hindi":1,"punjabi":1},
    gap_sec_range=(0.0, 0.3),             # less gap
    overlap_prob=0.40,                     # more overlaps
    overlap_sec_range=(0.7, 2.0),
    speakers_per_session=(3,4),
    min_clip_sec=0.5, max_clip_sec=15.0,   # slightly shorter to pack more
    bind_key_by_lang={"hindi":"client_id","punjabi":"client_id","english":"spk_proxy"},
    out_dir=f"{BASE}/sessions_high_overlap",
    scenario_name="sessions_high_overlap",
    seed=126
)

# 5) Code-switch heavy (alternating languages)
cfg_codesw = GenConfig(
    session_seconds=600,
    lang_weights={"english":1,"hindi":1,"punjabi":1},  # equal total per session
    gap_sec_range=(0.2, 0.8),
    overlap_prob=0.10,
    overlap_sec_range=(0.5, 1.5),
    speakers_per_session=(3,4),
    min_clip_sec=0.5, max_clip_sec=15.0,
    bind_key_by_lang={"hindi":"client_id","punjabi":"client_id","english":"spk_proxy"},
    force_code_switch=True,
    code_switch_prob=0.6,
    out_dir=f"{BASE}/sessions_codeswitch",
    scenario_name="sessions_codeswitch",
    seed=127
)

# ---- Choose which to generate (uncomment to run) ----
# SessionGenerator(MANIFEST, cfg_bal).generate(n_sessions=20)
# SessionGenerator(MANIFEST, cfg_smooth).generate(n_sessions=20)
# SessionGenerator(MANIFEST, cfg_prop).generate(n_sessions=20)
SessionGenerator(MANIFEST, cfg_overlap).generate(n_sessions=20)
SessionGenerator(MANIFEST, cfg_codesw).generate(n_sessions=20)


✅ sessions_high_overlap 001: wrote /content/drive/MyDrive/problem_statement_6/sessions_high_overlap/session_001.wav & /content/drive/MyDrive/problem_statement_6/sessions_high_overlap/session_001.jsonl with 134 segments
✅ sessions_high_overlap 002: wrote /content/drive/MyDrive/problem_statement_6/sessions_high_overlap/session_002.wav & /content/drive/MyDrive/problem_statement_6/sessions_high_overlap/session_002.jsonl with 161 segments
✅ sessions_high_overlap 003: wrote /content/drive/MyDrive/problem_statement_6/sessions_high_overlap/session_003.wav & /content/drive/MyDrive/problem_statement_6/sessions_high_overlap/session_003.jsonl with 140 segments
✅ sessions_high_overlap 004: wrote /content/drive/MyDrive/problem_statement_6/sessions_high_overlap/session_004.wav & /content/drive/MyDrive/problem_statement_6/sessions_high_overlap/session_004.jsonl with 139 segments
✅ sessions_high_overlap 005: wrote /content/drive/MyDrive/problem_statement_6/sessions_high_overlap/session_005.wav & /conte

In [None]:
import os, json, collections

BASE = "/content/drive/MyDrive/problem_statement_6"
SCENARIOS = {
    "balanced":   f"{BASE}/sessions_balanced",
    "smooth":     f"{BASE}/sessions_smooth",
    "high_overlap": f"{BASE}/sessions_high_overlap",
    "codeswitch": f"{BASE}/sessions_codeswitch"
}

for name, folder in SCENARIOS.items():
    if not os.path.isdir(folder):
        print(f"⚠️ Missing folder {folder}, skipping…")
        continue

    print(f"\n=== Language mix for {name} ===")
    lang_totals = collections.Counter()
    n_sessions = 0

    for f in sorted(os.listdir(folder)):
        if not f.endswith(".jsonl"):
            continue
        n_sessions += 1
        lang_counts = collections.Counter()
        total = 0.0
        with open(os.path.join(folder, f), "r", encoding="utf-8") as fh:
            for line in fh:
                rec = json.loads(line)
                lang = str(rec.get("language","")).lower()
                dur = float(rec.get("end_ts",0)) - float(rec.get("start_ts",0))
                if dur <= 0:
                    continue
                lang_counts[lang] += dur
                lang_totals[lang] += dur
                total += dur
        if total > 0:
            mix = {k: round(100*v/total,1) for k,v in lang_counts.items()}
            print(f" {f}: {mix}")

    total_all = sum(lang_totals.values())
    if total_all > 0:
        mix_all = {k: round(100*v/total_all,1) for k,v in lang_totals.items()}
        print(f"\n → Overall across {n_sessions} sessions: {mix_all}")



=== Language mix for balanced ===
 session_001.jsonl: {'hindi': 30.1, 'english': 60.2, 'punjabi': 9.7}
 session_002.jsonl: {'english': 59.4, 'hindi': 31.1, 'punjabi': 9.5}
 session_003.jsonl: {'hindi': 29.3, 'english': 60.8, 'punjabi': 9.9}
 session_004.jsonl: {'hindi': 29.1, 'english': 61.2, 'punjabi': 9.7}
 session_005.jsonl: {'english': 60.4, 'hindi': 29.5, 'punjabi': 10.1}
 session_006.jsonl: {'hindi': 30.0, 'punjabi': 8.3, 'english': 61.7}
 session_007.jsonl: {'punjabi': 9.5, 'hindi': 31.0, 'english': 59.5}
 session_008.jsonl: {'english': 60.8, 'punjabi': 9.7, 'hindi': 29.5}
 session_009.jsonl: {'hindi': 28.1, 'english': 61.4, 'punjabi': 10.4}
 session_010.jsonl: {'english': 59.9, 'hindi': 30.9, 'punjabi': 9.2}
 session_011.jsonl: {'english': 58.1, 'hindi': 31.4, 'punjabi': 10.5}
 session_012.jsonl: {'hindi': 29.3, 'punjabi': 9.6, 'english': 61.0}
 session_013.jsonl: {'english': 60.9, 'hindi': 30.2, 'punjabi': 8.9}
 session_014.jsonl: {'english': 61.6, 'hindi': 28.8, 'punjabi': 9