<a href="https://colab.research.google.com/github/Domaakshithareddy/NeuroSpeech/blob/main/Major_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# cell: detect_librispeech_and_preview.py
import os
from pathlib import Path

# candidate roots to search for LibriSpeech
candidates = [
    Path(location)
    for location in (
        "/kaggle/input",
        "/kaggle/working",
        "/kaggle/working/data",
        "/mnt/data",
        ".",
    )
]

# Pass 1: probe each existing candidate for either <cand>/LibriSpeech/train-clean-100
# or a train-clean-100 folder placed directly under the candidate.
found = None
for base in candidates:
    if not base.exists():
        continue
    nested = base / "LibriSpeech"
    if nested.exists() and (nested / "train-clean-100").exists():
        found = nested
        break
    if (base / "train-clean-100").exists():
        found = base
        break

# Pass 2 (only when pass 1 found nothing): walk /kaggle, then /mnt, and take the
# first directory that directly contains a 'train-clean-100' sub-directory.
if found is None:
    for walk_root in ("/kaggle", "/mnt"):
        found = next(
            (
                Path(dirpath)
                for dirpath, subdirs, _ in os.walk(walk_root, topdown=True)
                if "train-clean-100" in subdirs
            ),
            None,
        )
        if found is not None:
            break

if not found:
    print("Could not auto-detect LibriSpeech. Please paste the full path to the folder containing 'train-clean-100'.")
else:
    def _first_subdirs(parent, limit):
        """Return the first `limit` sub-directories of `parent` in sorted order."""
        return sorted(child for child in parent.iterdir() if child.is_dir())[:limit]

    print("Detected dataset root:", found)
    train_dir = found / "train-clean-100"
    has_train_dir = train_dir.exists()
    print("train-clean-100 exists:", has_train_dir)

    # Everything directly under the detected root, tagged as dir or file.
    print("\nTop-level entries in dataset root:")
    for entry in sorted(found.iterdir()):
        kind = "(dir)" if entry.is_dir() else "(file)"
        print(" -", entry.name, kind)

    if has_train_dir:
        # Up to 8 speaker folders, then drill into the first speaker / first chapter.
        speakers = _first_subdirs(train_dir, 8)
        print("\nSample speaker folders (first 8):", [spk.name for spk in speakers])
        if speakers:
            first_speaker = speakers[0]
            chapters = _first_subdirs(first_speaker, 4)
            print(f"\nFor speaker {first_speaker.name}, show up to 4 chapters:", [chap.name for chap in chapters])
            if chapters:
                first_chapter = chapters[0]
                print(f"\nExample files in {first_speaker.name}/{first_chapter.name}:")
                # Only names containing a dot are matched; show at most 10 with their size.
                for sample in sorted(first_chapter.glob("*.*"))[:10]:
                    print("   ", sample.name, "-", sample.stat().st_size, "bytes")


In [None]:
# Build manifest directly from .trans.txt files (robust, no torchaudio)
from pathlib import Path
import csv
import sys

split = "train-clean-100"
dataset_root = Path("/kaggle/input/librispeech-100/LibriSpeech")
split_dir = dataset_root / split
if not split_dir.exists():
    raise RuntimeError(f"Expected {split_dir} to exist. Adjust dataset_root if needed.")

# 1) gather every *.trans.txt below the split and map utterance id -> transcript
trans_files = list(split_dir.rglob("*.trans.txt"))
print(f"Found {len(trans_files)} .trans.txt files (chapter-level transcript files).")

trans_map = {}
for trans_path in trans_files:
    try:
        with trans_path.open("r", encoding="utf-8") as handle:
            for raw_line in handle:
                # each non-empty line reads "<utterance-id> <transcript...>";
                # lines without a space separator are ignored
                utt_id, separator, sentence = raw_line.strip().partition(" ")
                if separator:
                    trans_map[utt_id] = sentence
    except Exception as err:
        # best effort: report the unreadable file and keep going
        print("Warning: failed reading", trans_path, ":", err)

print("Total utterances in transcripts mapping:", len(trans_map))

# 2) find all .flac files and match to trans_map
flac_files = list(split_dir.rglob("*.flac"))
print("Found", len(flac_files), "flac files under", split_dir)

manifest_out = Path("manifests")
manifest_out.mkdir(exist_ok=True)
out_file = manifest_out / f"{split}.tsv"

# The utterance id is the file name without extension (e.g. 103-1240-0000).
# Audio files with no transcript are left out of the manifest and only counted.
matched_rows = [
    (str(audio), trans_map[audio.stem])
    for audio in sorted(flac_files)
    if audio.stem in trans_map
]
write_count = len(matched_rows)
missing_count = len(flac_files) - write_count

with out_file.open("w", encoding="utf-8", newline="") as sink:
    tsv = csv.writer(sink, delimiter="\t")
    tsv.writerow(["audio_path", "transcript"])
    tsv.writerows(matched_rows)

if missing_count:
    print(f"Warning: {missing_count} flac files had no matching transcript (they were skipped).")
print(f"Wrote {write_count} entries to {out_file.resolve()}")

# 3) echo the header plus the first five data rows
print("\nSample manifest lines:")
with out_file.open("r", encoding="utf-8") as source:
    for line_no, manifest_line in enumerate(source):
        if line_no > 5:
            break
        print(manifest_line.strip())

In [None]:
# Cell 1: env & imports
import os
from pathlib import Path
import csv
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
from transformers import Wav2Vec2Processor
import nltk
nltk.download("cmudict", quiet=True)
from nltk.corpus import cmudict

# Paths (edit if your locations differ)
DATA_ROOT = Path("/kaggle/input/librispeech-100/LibriSpeech")   # folder holding the audio splits
MANIFEST_PATH = Path("/kaggle/working/manifests/train-clean-100.tsv")          # TSV manifest of audio paths + transcripts

# Report whether each configured location is present.
for _label, _location in (("DATA_ROOT", DATA_ROOT), ("MANIFEST_PATH", MANIFEST_PATH)):
    print(f"{_label} exists:", _location.exists(), "  path:", _location)

# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)

In [None]:
from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor

LOCAL = "/kaggle/input/wav2vec2-large-robust-local/wav2vec2-large-robust-local"  # or wherever you unzipped it

# Both objects are read from the same directory with local_files_only=True.
_local_only = {"local_files_only": True}
model = Wav2Vec2Model.from_pretrained(LOCAL, **_local_only)
fe = Wav2Vec2FeatureExtractor.from_pretrained(LOCAL, **_local_only)

print("Model loaded:", model.config.hidden_size)
print("Feature extractor sr:", fe.sampling_rate)

In [None]:
# Single cell: small end-to-end sanity check (one prompt)
import os
from pathlib import Path
import csv
import torchaudio
import torch
import torch.nn as nn
import numpy as np
import nltk
nltk.download("cmudict", quiet=True)
from nltk.corpus import cmudict

# --- CONFIG: adjust if needed ---
# manifest may be in /kaggle/working/manifests or /mnt/data if you uploaded it here
manifest_candidates = [
    Path(location)
    for location in (
        "/kaggle/working/manifests/train-clean-100.tsv",
        "/mnt/data/train-clean-100.tsv",
        "manifests/train-clean-100.tsv",
    )
]

# Take the first candidate that exists on disk; fail loudly when none does.
MANIFEST = None
for candidate in manifest_candidates:
    if candidate.exists():
        MANIFEST = candidate
        break
if MANIFEST is None:
    expected = ", ".join(map(str, manifest_candidates))
    raise FileNotFoundError("Manifest not found. Expected at one of: " + expected)
print("Using manifest:", MANIFEST)

# small utilities: CMUdict phonemizer + minimal phoneme->features (covers common ARPAbet)
cmu = cmudict.dict()


def word_to_phonemes(word):
    """Look up `word` (lower-cased) in CMUdict and return its first pronunciation.

    Trailing stress digits 0/1/2 are removed from every phone. A word that is
    not in the dictionary yields an empty list.
    """
    try:
        first_pronunciation = cmu[word.lower()][0]
    except KeyError:
        return []
    return [phone.rstrip("012") for phone in first_pronunciation]

def sentence_to_phonemes(sentence):
    """Return the flat phoneme list for a whitespace-separated sentence.

    Each token has surrounding punctuation stripped and is lower-cased before
    being passed to `word_to_phonemes`; tokens it cannot resolve add nothing.
    """
    cleaned_tokens = [token.strip(".,!?;:()[]'\"").lower() for token in sentence.split()]
    return [phone for token in cleaned_tokens for phone in word_to_phonemes(token)]

from collections import defaultdict

# Fixed attribute inventory; the position of a name in this list is its feature index.
FEATURE_LIST = [
    'consonant', 'sonorant', 'fricative', 'nasal', 'stop', 'approximant',
    'affricate', 'liquid', 'vowel', 'semivowel', 'continuant',
    'alveolar', 'palatal', 'dental', 'glottal', 'labial', 'velar', 'mid',
    'high', 'low', 'front', 'back', 'central', 'anterior', 'posterior',
    'retroflex', 'bilabial', 'coronal', 'dorsal',
    'long', 'short', 'monophthong', 'diphthong', 'round', 'voiced',
]
N_FEATURES = len(FEATURE_LIST)

# phoneme -> set of attributes that are present (+att) for it
PHONEME_TO_FEATURES = defaultdict(set)


def add(ph, *feats):
    """Merge `feats` into the positive-attribute set stored for phoneme `ph`."""
    PHONEME_TO_FEATURES[ph] |= set(feats)


# Minimal ARPAbet table: (phoneme, space-separated positive attributes).
_ARPABET_FEATURE_TABLE = (
    # vowels
    ('AA', 'vowel low back monophthong'),
    ('AE', 'vowel low front monophthong'),
    ('AH', 'vowel mid central short monophthong'),
    ('AO', 'vowel low back monophthong'),
    ('AW', 'vowel diphthong back'),
    ('AY', 'vowel diphthong front'),
    ('EH', 'vowel mid front monophthong'),
    ('ER', 'vowel mid central monophthong round'),
    ('EY', 'vowel diphthong front'),
    ('IH', 'vowel high front short'),
    ('IY', 'vowel high front long monophthong'),
    ('OW', 'vowel diphthong back'),
    ('OY', 'vowel diphthong back'),
    ('UH', 'vowel high back short'),
    ('UW', 'vowel high back long round'),
    # stops
    ('P', 'consonant stop bilabial'),
    ('B', 'consonant stop bilabial voiced'),
    ('T', 'consonant stop alveolar'),
    ('D', 'consonant stop alveolar voiced'),
    ('K', 'consonant stop velar'),
    ('G', 'consonant stop velar voiced'),
    # fricatives
    ('F', 'consonant fricative labial'),
    ('V', 'consonant fricative labial voiced'),
    ('TH', 'consonant fricative dental'),
    ('DH', 'consonant fricative dental voiced'),
    ('S', 'consonant fricative alveolar'),
    ('Z', 'consonant fricative alveolar voiced'),
    ('SH', 'consonant fricative palatal'),
    ('ZH', 'consonant fricative palatal voiced'),
    ('HH', 'consonant fricative glottal'),
    # affricates / nasals / approximants
    ('CH', 'consonant affricate palatal'),
    ('JH', 'consonant affricate palatal voiced'),
    ('M', 'consonant nasal bilabial voiced'),
    ('N', 'consonant nasal alveolar voiced'),
    ('NG', 'consonant nasal velar voiced'),
    ('L', 'consonant liquid alveolar voiced'),
    ('R', 'consonant liquid coronal voiced'),
    ('W', 'consonant semivowel labial voiced'),
    ('Y', 'consonant semivowel palatal voiced'),
)
for _symbol, _spec in _ARPABET_FEATURE_TABLE:
    add(_symbol, *_spec.split())

def phonemes_to_feature_sequences(phonemes):
    """Encode a phoneme list as one token sequence per entry of FEATURE_LIST.

    For feature index i, each phoneme contributes token 2*i when the feature
    is in its PHONEME_TO_FEATURES set (+att) and 2*i+1 otherwise (-att).
    Phonemes missing from the table are treated as having no features.
    """
    positive_sets = [PHONEME_TO_FEATURES.get(ph, set()) for ph in phonemes]
    return [
        [2 * idx if name in present else 2 * idx + 1 for present in positive_sets]
        for idx, name in enumerate(FEATURE_LIST)
    ]

# --- Dataset (reads manifest, returns waveform numpy and phonemes) ---
class SmallLibriDataset(torch.utils.data.Dataset):
    """Manifest-backed dataset yielding (waveform, phonemes, feature sequences, path).

    The manifest is a tab-separated file with 'audio_path' and 'transcript'
    columns. When `max_items` is truthy, only that many leading rows are read.
    """

    def __init__(self, manifest_tsv, max_items=None):
        self.rows = []
        with open(manifest_tsv, "r", encoding="utf-8") as handle:
            records = csv.DictReader(handle, delimiter="\t")
            for count, record in enumerate(records, start=1):
                self.rows.append((record['audio_path'], record['transcript']))
                if max_items and count >= max_items:
                    break

    def __len__(self):
        return len(self.rows)

    def __getitem__(self, idx):
        audio_path, text = self.rows[idx]
        signal, native_sr = torchaudio.load(audio_path)
        # bring the audio to the sampling rate of the notebook-level feature extractor `fe`
        if native_sr != fe.sampling_rate:
            signal = torchaudio.functional.resample(signal, native_sr, fe.sampling_rate)
        # average over the leading axis so a 1-D signal is handed on
        if signal.dim() > 1:
            signal = signal.mean(dim=0)
        phonemes = sentence_to_phonemes(text)
        return (
            signal.numpy().astype(np.float32),
            phonemes,
            phonemes_to_feature_sequences(phonemes),
            audio_path,
        )

# collate: use feature-extractor `fe` to get input_values & attention_mask (pt tensors)
def collate_batch(batch):
    """Collate dataset items into model inputs plus per-sample metadata.

    Each item is (waveform, phonemes, feature sequences, path). The waveforms
    are passed to the notebook-level feature extractor `fe` with padding
    enabled and PyTorch tensors requested. Returns
    (inputs, feature sequences, phonemes, paths) -- note that the second and
    third entries are swapped relative to the item layout.
    """
    wavs, phs, feat_seqs, paths = [
        [sample[column] for sample in batch] for column in range(4)
    ]
    inputs = fe(wavs, sampling_rate=fe.sampling_rate, return_tensors="pt", padding=True)
    return inputs, feat_seqs, phs, paths

# --- create small dataloader (2 examples) ---
# Read at most 200 manifest rows; batches of 2 are drawn in shuffled order.
ds = SmallLibriDataset(MANIFEST, max_items=200)
print("Dataset size (using max_items=200):", len(ds))
dl = torch.utils.data.DataLoader(
    ds,
    batch_size=2,
    shuffle=True,
    collate_fn=collate_batch,
)

# --- device, frozen feature encoder and linear head ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)
# switch off gradients for the backbone's `feature_extractor` sub-module
for param in model.feature_extractor.parameters():
    param.requires_grad_(False)
hidden = model.config.hidden_size
out_dim = 2 * N_FEATURES + 1
linear_head = nn.Linear(hidden, out_dim).to(device)
model.to(device).eval()

# --- pull a single batch and push it through backbone + head ---
batch_inputs, batch_feat_seqs, batch_phs, batch_paths = next(iter(dl))
input_values = batch_inputs["input_values"].to(device)   # shape (B, L)
attention_mask = batch_inputs.get("attention_mask", None)
if attention_mask is not None:
    # the mask is only moved when the feature extractor produced one
    attention_mask = attention_mask.to(device)

with torch.no_grad():
    wav2_out = model(input_values, attention_mask=attention_mask)
    hs = wav2_out.last_hidden_state     # (B, T, H)
    logits = linear_head(hs)            # (B, T, C)

print("Batch audio paths:", batch_paths)
print("Example phonemes (sample 0):", batch_phs[0][:60])
print("Example feature-seq lengths for sample 0 (per-feature):", [len(seq) for seq in batch_feat_seqs[0][:8]], " ...")
print("Hidden states shape (B, T, H):", hs.shape)
print("Logits shape (B, T, C):", logits.shape, " expected C:", out_dim)


In [None]:
# === Kaggle notebook cell: create features_mapping.py and load mapping ===
import os
from pathlib import Path

# Ordered attribute inventory; the position of a name is its feature index.
FEATURE_ORDER = [
    "consonantal", "sonorant", "approximant", "nasal", "voice",
    "labial", "coronal", "anterior", "distributed", "dorsal",
    "high", "low", "front", "back", "round",
    "continuant", "delayed_release", "lateral", "strident",
    "tense", "long", "stress", "syllabic",
    "spread_glottis", "constricted_glottis",
    "burst", "aspirated", "glide", "tap", "trill",
    "stop", "fricative", "affricate", "vowel", "silence"
]

# PHONEME_TO_FEATURES: map ARPAbet phonemes (no stress digits) -> set of +att features
# NOTE(review): no entry below assigns "sonorant", "long", "stress",
# "constricted_glottis", "burst", "aspirated" or "trill", so those attributes
# are -att for every phoneme -- confirm that this is intended.
PHONEME_TO_FEATURES = {
    # Vowels (approximate feature sets mapped into the 35 features above)
    "AA": {"vowel","low","back","tense","syllabic"},
    "AE": {"vowel","low","front","tense","syllabic"},
    "AH": {"vowel","low","syllabic"},            # central-ish low vowel
    "AO": {"vowel","low","back","round","syllabic"},
    "AW": {"vowel","low","back","round","syllabic"},
    "AY": {"vowel","low","front","syllabic"},
    "EH": {"vowel","front","syllabic"},
    "ER": {"vowel","syllabic"},                  # rhotic handled as vowel-like here
    "EY": {"vowel","front","syllabic"},
    "IH": {"vowel","high","front","syllabic"},
    "IY": {"vowel","high","front","tense","syllabic"},
    "OW": {"vowel","back","round","syllabic"},
    "OY": {"vowel","back","round","syllabic"},
    "UH": {"vowel","high","back","round","syllabic"},
    "UW": {"vowel","high","back","round","tense","syllabic"},

    # Stops
    "P": {"consonantal","labial","anterior","stop"},
    "B": {"consonantal","labial","anterior","stop","voice"},
    "T": {"consonantal","coronal","anterior","stop"},
    "D": {"consonantal","coronal","anterior","stop","voice"},
    "K": {"consonantal","dorsal","stop"},
    "G": {"consonantal","dorsal","stop","voice"},

    # Affricates
    "CH": {"consonantal","coronal","anterior","affricate","delayed_release"},
    "JH": {"consonantal","coronal","anterior","affricate","delayed_release","voice"},

    # Fricatives
    "F": {"consonantal","labial","anterior","fricative","continuant"},
    "V": {"consonantal","labial","anterior","fricative","continuant","voice"},
    "TH": {"consonantal","coronal","anterior","fricative","distributed"},
    "DH": {"consonantal","coronal","anterior","fricative","distributed","voice"},
    "S": {"consonantal","coronal","anterior","fricative","strident","continuant"},
    "Z": {"consonantal","coronal","anterior","fricative","strident","continuant","voice"},
    "SH": {"consonantal","coronal","distributed","fricative","strident","continuant"},
    "ZH": {"consonantal","coronal","distributed","fricative","strident","continuant","voice"},
    "HH": {"consonantal","fricative","spread_glottis"},

    # Nasals
    "M": {"consonantal","labial","anterior","nasal","voice"},
    "N": {"consonantal","coronal","anterior","nasal","voice"},
    "NG": {"consonantal","dorsal","nasal","voice"},

    # Liquids / approximants
    "L": {"consonantal","coronal","anterior","lateral","approximant","voice"},
    "R": {"consonantal","coronal","approximant","voice"},

    # Glides
    "Y": {"consonantal","dorsal","high","front","glide","approximant","voice"},
    "W": {"consonantal","labial","dorsal","high","back","round","glide","approximant","voice"},

    # Taps/trills/other
    "DX": {"consonantal","coronal","tap","voice"},
    "Q": {"consonantal","stop"},   # glottal stop marker if present
    "SIL": {"silence"}
}


def _format_feature_mapping(mapping):
    """Render `mapping` (phoneme -> set of names) as a Python dict literal.

    Set members are emitted in sorted order so the generated text is the same
    on every run; plain repr() of a set follows hash order, which varies
    between interpreter sessions. An empty set is written as ``set()``.
    """
    entries = []
    for phoneme, feats in mapping.items():
        members = ", ".join(repr(name) for name in sorted(feats))
        literal = "{" + members + "}" if members else "set()"
        entries.append(f"    {phoneme!r}: {literal},")
    return "{\n" + "\n".join(entries) + "\n}"


# Ensure every phoneme's features are a subset of FEATURE_ORDER (sanity)
_known_features = set(FEATURE_ORDER)
for ph, feats in list(PHONEME_TO_FEATURES.items()):
    unseen = [f for f in feats if f not in _known_features]
    if unseen:
        raise ValueError(f"Feature(s) {unseen} for phoneme {ph} are not in FEATURE_ORDER")

# Write to features_mapping.py file so other scripts can import it
out = Path("features_mapping.py")
out.write_text(
    "# Auto-generated by Kaggle notebook cell\n"
    "FEATURE_ORDER = " + repr(FEATURE_ORDER) + "\n\n"
    "PHONEME_TO_FEATURES = " + _format_feature_mapping(PHONEME_TO_FEATURES) + "\n",
    encoding="utf-8",
)

# The mapping also stays available in the current notebook namespace
print("Wrote features_mapping.py to", out.resolve())
print("FEATURE_ORDER length:", len(FEATURE_ORDER))
print("Example mapping for 'P':", PHONEME_TO_FEATURES.get("P"))


In [None]:
# Diagnostic + Fix cell: fix transcripts and load local model
import os, glob, json
from pathlib import Path

# ---------- Edit these if your dataset names differ ----------
# Where your downloaded local wav2vec2 files live (from your screenshot)
LOCAL_MODEL_DIR = "/kaggle/input/wav2vec2-large-robust-local/wav2vec2-large-robust-local"
# Where the Kaggle LibriSpeech input folder is
LIBRISPEECH_ROOT = "/kaggle/input/librispeech-100/LibriSpeech/train-clean-100"
# Where your manifest is (the TSV you generated earlier)
MANIFEST = "/kaggle/working/manifests/train-clean-100.tsv"
# -------------------------------------------------------------

# Report which of the three configured locations exist.
for _label, _location in (
    ("LOCAL_MODEL_DIR", LOCAL_MODEL_DIR),
    ("LIBRISPEECH_ROOT", LIBRISPEECH_ROOT),
    ("MANIFEST", MANIFEST),
):
    print(f"Exists {_label}:", os.path.exists(_location))
print()

# 1) list up to 20 entries of the model dir and check for the expected files
if os.path.exists(LOCAL_MODEL_DIR):
    print("Local model directory listing (first 20):")
    for entry in sorted(os.listdir(LOCAL_MODEL_DIR))[:20]:
        print(" ", entry)
    for expected in ("pytorch_model.bin", "config.json", "preprocessor_config.json"):
        expected_path = os.path.join(LOCAL_MODEL_DIR, expected)
        print("  has", expected, "=>", os.path.exists(expected_path))
print()

# 2) find transcript files in LibriSpeech folder
def _print_head(path, limit=5):
    """Print the first `limit` lines of `path`, stripped and indented."""
    with open(path, "r", encoding="utf-8", errors="ignore") as fh:
        for line_no, text_line in enumerate(fh):
            if line_no >= limit:
                break
            print("  ", text_line.strip())


trans_files = glob.glob(os.path.join(LIBRISPEECH_ROOT, "**/*.trans.txt"), recursive=True)
print("Found trans files (count):", len(trans_files))
if trans_files:
    print("Example trans file:", trans_files[0])
    _print_head(trans_files[0])
else:
    # no *.trans.txt at all: look for upper-case .TXT files instead
    trans_files2 = glob.glob(os.path.join(LIBRISPEECH_ROOT, "**/*.TXT"), recursive=True)
    print("Found uppercase .TXT files (count):", len(trans_files2))
    if trans_files2:
        print("Example:", trans_files2[0])
        _print_head(trans_files2[0])

# 3) If manifest root is a placeholder, replace it with LIBRISPEECH_ROOT
# 3) If the manifest's first cell is a header/placeholder rather than a root
#    path, write a "fixed" manifest whose first line is LIBRISPEECH_ROOT.
if os.path.exists(MANIFEST):
    import pandas as pd
    # keep_default_na=False keeps empty cells as "" instead of NaN; otherwise an
    # empty first cell breaks .strip() below and an empty second column would
    # be written out as the literal text "nan".
    df = pd.read_csv(MANIFEST, sep="\t", header=None, dtype=str, keep_default_na=False)
    root_print = df.iloc[0,0]
    print("\nManifest first-line root:", root_print)
    if root_print.strip().lower() in ("audio_path", "path", ""):
        print("Manifest root looks like a placeholder. Rewriting manifest paths to use actual LIBRISPEECH_ROOT.")
        # Drop the header row. The column names follow the "root + relpath +
        # nframes" layout the downstream parser expects; for the TSV built
        # earlier in this notebook the two columns actually hold the audio
        # path and the transcript.
        df = df.iloc[1:].reset_index(drop=True)
        df.columns = ["relpath", "nframes"]
        df["path"] = df["relpath"].apply(lambda p: os.path.join(LIBRISPEECH_ROOT, p))
        # save a fixed manifest for training
        fixed_manifest = "/kaggle/working/manifests/train-clean-100.fixed.tsv"
        # first line = actual root, then one "<first column>\t<second column>" line per row;
        # written as UTF-8 because the parsing cells open this file as UTF-8
        with open(fixed_manifest, "w", encoding="utf-8") as manifest_fh:
            manifest_fh.write(LIBRISPEECH_ROOT + "\n")
            manifest_fh.writelines(
                f"{rel}\t{rest}\n" for rel, rest in zip(df["relpath"], df["nframes"])
            )
        print("Wrote fixed manifest to:", fixed_manifest)
    else:
        print("Manifest root appears valid:", root_print)
else:
    print("Manifest file not found at", MANIFEST)

# 4) Build transcripts dict from trans files (robust to .trans.txt or .TXT)
# 4) Build the utterance-id -> lower-cased transcript dict from every
#    *.trans.txt found earlier plus any upper-case *.TXT files.
from itertools import islice

transcripts = {}
upper_txt_files = glob.glob(os.path.join(LIBRISPEECH_ROOT, "**/*.TXT"), recursive=True)
for tf in trans_files + upper_txt_files:
    try:
        with open(tf, "r", encoding="utf-8", errors="ignore") as fh:
            for line in fh:
                # "<utterance-id> <transcript...>"; lines without a space are skipped
                parts = line.strip().split(" ", 1)
                if len(parts) == 2:
                    utt, txt = parts
                    transcripts[utt] = txt.lower()
    except Exception as e:
        # best effort: report the unreadable file and continue with the rest
        print("skip", tf, "->", e)

print("\nTotal transcripts loaded:", len(transcripts))
if not transcripts:
    print(">> No transcripts found. Double-check LIBRISPEECH_ROOT path or dataset contents.")
else:
    # show the first five mappings without copying the whole key list
    for k, v in islice(transcripts.items(), 5):
        print(" ", k, "->", v[:80])

# 5) Load local model & feature extractor from LOCAL_MODEL_DIR (if exists)
print()
if not os.path.exists(LOCAL_MODEL_DIR):
    print("LOCAL_MODEL_DIR not found - cannot load local model. Make sure path is correct.")
else:
    from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor
    _local_only = dict(local_files_only=True)
    try:
        print("Loading model from local dir (this may print many messages)...")
        backbone = Wav2Vec2Model.from_pretrained(LOCAL_MODEL_DIR, **_local_only)
        feat = Wav2Vec2FeatureExtractor.from_pretrained(LOCAL_MODEL_DIR, **_local_only)
        print("Loaded local model and feature extractor OK.")
        print("Model hidden_size:", backbone.config.hidden_size)
    except Exception as load_error:
        # diagnostic cell: report the failure instead of aborting
        print("Error loading local model:", load_error)
        print("Check that LOCAL_MODEL_DIR path points to directory containing 'pytorch_model.bin' and config.json")

print("\nDone. If transcripts=0 or model didn't load, adjust LIBRISPEECH_ROOT and LOCAL_MODEL_DIR variables above and re-run this cell.")


In [None]:
# Robust manifest parser (replace the failing pd.read_csv)
import os

import pandas as pd

# Prefer the rewritten manifest when it exists, otherwise use the original TSV.
FIXED_MANIFEST='/kaggle/working/manifests/train-clean-100.fixed.tsv'
FALLBACK_MANIFEST='/kaggle/working/manifests/train-clean-100.tsv'
manifest_path = FIXED_MANIFEST if os.path.exists(FIXED_MANIFEST) else FALLBACK_MANIFEST
print("Parsing manifest:", manifest_path)

root = None
rows = []
with open(manifest_path, "r", encoding="utf-8", errors="ignore") as fh:
    for raw in fh:
        line = raw.strip()
        if not line:
            continue
        if root is None:
            # The first NON-EMPTY line is the root path. Testing the line index
            # instead would lose the root whenever the file starts with a blank
            # line, because that blank line is skipped above.
            # NOTE(review): when FALLBACK_MANIFEST is parsed, this consumes its
            # header row as the "root"; that only works if the remaining paths
            # are absolute, since os.path.join then ignores the root.
            root = line
            continue
        # robust split: relpath and the rest (nframes or anything else),
        # on any whitespace, at most one split
        parts = line.split(None, 1)
        rel = parts[0]
        nframes = parts[1] if len(parts) > 1 else ""
        rows.append((rel, nframes))

if root is None:
    raise RuntimeError("Manifest root could not be read. Check the manifest file.")

# Build dataframe
df = pd.DataFrame(rows, columns=["relpath", "nframes"])
df["path"] = df["relpath"].apply(lambda p: os.path.join(root, p))
print("Parsed entries:", len(df))
print("Example rows:")
print(df.head(6))

# quick sanity checks
missing_paths = (~df["path"].apply(os.path.exists)).sum()
print(f"Files not found on disk (approx): {missing_paths} (expected if manifest used relative paths that we rewrote)")

# attach transcripts (existing code expects df after this), so leave df in scope

In [None]:
# Cell 1: create features_mapping.py and expose FEATURE_ORDER and PHONEME_TO_FEATURES
from pathlib import Path

FEATURE_ORDER = [
    "consonantal", "sonorant", "approximant", "nasal", "voice",
    "labial", "coronal", "anterior", "distributed", "dorsal",
    "high", "low", "front", "back", "round",
    "continuant", "delayed_release", "lateral", "strident",
    "tense", "long", "stress", "syllabic",
    "spread_glottis", "constricted_glottis",
    "burst", "aspirated", "glide", "tap", "trill",
    "stop", "fricative", "affricate", "vowel", "silence"
]

# Minimal comprehensive mapping for CMU ARPAbet symbols commonly in LibriSpeech
PHONEME_TO_FEATURES = {
    # Vowels
    "AA": {"vowel","low","back","tense","syllabic"},
    "AE": {"vowel","low","front","tense","syllabic"},
    "AH": {"vowel","low","syllabic"},
    "AO": {"vowel","low","back","round","syllabic"},
    "AW": {"vowel","low","back","round","syllabic"},
    "AY": {"vowel","low","front","syllabic"},
    "EH": {"vowel","front","syllabic"},
    "ER": {"vowel","syllabic"},
    "EY": {"vowel","front","syllabic"},
    "IH": {"vowel","high","front","syllabic"},
    "IY": {"vowel","high","front","tense","syllabic"},
    "OW": {"vowel","back","round","syllabic"},
    "OY": {"vowel","back","round","syllabic"},
    "UH": {"vowel","high","back","round","syllabic"},
    "UW": {"vowel","high","back","round","tense","syllabic"},

    # Stops
    "P": {"consonantal","labial","anterior","stop"},
    "B": {"consonantal","labial","anterior","stop","voice"},
    "T": {"consonantal","coronal","anterior","stop"},
    "D": {"consonantal","coronal","anterior","stop","voice"},
    "K": {"consonantal","dorsal","stop"},
    "G": {"consonantal","dorsal","stop","voice"},

    # Affricates
    "CH": {"consonantal","coronal","anterior","affricate","delayed_release"},
    "JH": {"consonantal","coronal","anterior","affricate","delayed_release","voice"},

    # Fricatives
    "F": {"consonantal","labial","anterior","fricative","continuant"},
    "V": {"consonantal","labial","anterior","fricative","continuant","voice"},
    "TH": {"consonantal","coronal","anterior","fricative","distributed"},
    "DH": {"consonantal","coronal","anterior","fricative","distributed","voice"},
    "S": {"consonantal","coronal","anterior","fricative","strident","continuant"},
    "Z": {"consonantal","coronal","anterior","fricative","strident","continuant","voice"},
    "SH": {"consonantal","coronal","distributed","fricative","strident","continuant"},
    "ZH": {"consonantal","coronal","distributed","fricative","strident","continuant","voice"},
    "HH": {"consonantal","fricative","spread_glottis"},


    # Nasals
    "M": {"consonantal","labial","anterior","nasal","voice"},
    "N": {"consonantal","coronal","anterior","nasal","voice"},
    "NG": {"consonantal","dorsal","nasal","voice"},

    # Liquids / approximants
    "L": {"consonantal","coronal","anterior","lateral","approximant","voice"},
    "R": {"consonantal","coronal","approximant","voice"},

    # Glides
    "Y": {"consonantal","dorsal","high","front","glide","approximant","voice"},
    "W": {"consonantal","labial","dorsal","high","back","round","glide","approximant","voice"},

    # Others / special / tokens
    "DX": {"consonantal","coronal","tap","voice"},
    "Q": {"consonantal","stop"},   # glottal stop marker if present
    "SIL": {"silence"}
}

# Sanity: check all features listed are in FEATURE_ORDER
_unseen = []
for ph, feats in PHONEME_TO_FEATURES.items():
    for f in feats:
        if f not in FEATURE_ORDER:
            _unseen.append((ph,f))
if _unseen:
    raise RuntimeError(f"Found features not in FEATURE_ORDER: {_unseen[:10]}")

# Write file for import by other cells
out = Path("features_mapping.py")
out.write_text(
    "# Auto-generated by notebook cell\n"
    "FEATURE_ORDER = " + repr(FEATURE_ORDER) + "\n\n"
    "PHONEME_TO_FEATURES = " + repr(PHONEME_TO_FEATURES) + "\n"
)
print("Wrote features_mapping.py (FEATURE_ORDER len = {})".format(len(FEATURE_ORDER)))


In [None]:
# ================= Single self-contained training cell =================
# Copy-paste and run. Adjust SMALL config at top if needed.

# `os` is used by os.makedirs just below, before this cell's own import block
# runs; importing it here keeps the cell runnable in a fresh kernel.
import os

RUN_SMOKE_TEST = False   # True = run 1 training step and exit (useful to verify). Set False to run full training.
LOCAL_MODEL_DIR = "/kaggle/input/wav2vec2-large-robust-local/wav2vec2-large-robust-local"
FIXED_MANIFEST = "/kaggle/working/manifests/train-clean-100.fixed.tsv"
LIBRISPEECH_ROOT = "/kaggle/input/librispeech-100/LibriSpeech/train-clean-100"
SAVE_DIR = "/kaggle/working/checkpoints"
os.makedirs(SAVE_DIR, exist_ok=True)

# hyperparams
SAMPLE_RATE = 16000
BATCH_SIZE = 2       # safe default for smoke + GPU
EPOCHS = 3
ACCUM_STEPS = 1
LR = 1e-4
WEIGHT_DECAY = 0.005
NUM_WORKERS = 0      # set 0 in Kaggle to avoid worker spawn issues

# ---------- imports ----------
try:
    import pronouncing
except Exception:
    !pip install pronouncing
    import pronouncing

import os, glob, time, gc, random
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

import torchaudio
from transformers import Wav2Vec2Config, Wav2Vec2Model, Wav2Vec2FeatureExtractor, get_linear_schedule_with_warmup

# ---- G2P (grapheme-to-phoneme) using cmudict ----
import cmudict

# Load CMU dictionary (about 134k words)
cmu = cmudict.dict()
print("Using CMUdict for G2P. Entries:", len(cmu))

# NOTE: word_to_phones and text_to_phonemes are defined again further down in
# this cell (pronouncing-based versions); those later definitions are the ones
# in effect once the whole cell has run.


def word_to_phones(word):
    """Return the first CMUdict pronunciation of `word` with stress digits removed.

    The lookup key is the lower-cased word; an unknown word gives an empty list.
    """
    try:
        first_pronunciation = cmu[word.lower()][0]
    except KeyError:
        return []  # unknown word
    return [phone.rstrip("0123456789") for phone in first_pronunciation]


def text_to_phonemes(text):
    """Concatenate the phonemes of every whitespace-separated word in `text`."""
    return [phone for token in text.split() for phone in word_to_phones(token)]

if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("Device:", DEVICE)

# ---------- feature mapping (cell1 must have created features_mapping.py) ----------
from features_mapping import FEATURE_ORDER, PHONEME_TO_FEATURES

N_FEATURES = len(FEATURE_ORDER)
assert N_FEATURES == 35, f"Expected 35 features, got {N_FEATURES}"
# two outputs per feature plus one extra
OUTPUT_DIM = 2 * N_FEATURES + 1
# presumably the blank class index used with the 0/1 per-feature targets built
# below in this cell -- its use is outside this view; confirm against the loss code
CTC_BLANK_IDX = 2

# ---------- robust manifest parsing ----------
manifest_path = FIXED_MANIFEST
print("Parsing manifest:", manifest_path)
rows = []
with open(manifest_path, "r", encoding="utf-8", errors="ignore") as fh:
    # blank lines are dropped; the first remaining line is the root,
    # every later one is "<relpath> <rest>" split on the first whitespace run
    nonblank_lines = (stripped for stripped in (raw.strip() for raw in fh) if stripped != "")
    root = next(nonblank_lines, None)
    for entry in nonblank_lines:
        fields = entry.split(None, 1)
        rows.append((fields[0], fields[1] if len(fields) > 1 else ""))
if root is None:
    raise RuntimeError("Manifest root not found.")


def _under_root(relpath):
    """Join a manifest-relative path onto the manifest root."""
    return os.path.join(root, relpath)


df = pd.DataFrame(rows, columns=["relpath", "nframes"])
df["path"] = df["relpath"].apply(_under_root)
print("Parsed entries:", len(df))
on_disk = df["path"].apply(os.path.exists)
missing_files = (~on_disk).sum()
print("Missing audio files (should be 0):", missing_files)

# ---------- transcripts ----------
# Map utterance id -> lower-cased transcript from every *.trans.txt
# (and any upper-case *.TXT) below LIBRISPEECH_ROOT.
transcripts = {}
transcript_files = (
    glob.glob(os.path.join(LIBRISPEECH_ROOT, "**/*.trans.txt"), recursive=True)
    + glob.glob(os.path.join(LIBRISPEECH_ROOT, "**/*.TXT"), recursive=True)
)
for tf in transcript_files:
    try:
        with open(tf, "r", encoding="utf-8", errors="ignore") as fh:
            for line in fh:
                # "<utterance-id> <transcript...>"; lines without a space are skipped
                parts = line.strip().split(" ", 1)
                if len(parts) == 2:
                    utt, txt = parts
                    transcripts[utt] = txt.lower()
    except OSError as err:
        # Still best effort, but no longer silent: a dropped file otherwise only
        # shows up later as utterances with an empty transcript. Decoding
        # problems cannot reach here because errors="ignore" is set, so OSError
        # covers the remaining failure modes of open/read.
        print("Warning: skipping unreadable transcript file", tf, "->", err)
print("Transcripts loaded:", len(transcripts))

def lookup_text(path):
    """Return the transcript for the audio file at *path*, or "" if unknown.

    The utterance id is the file name up to (not including) its first dot;
    it is looked up in the module-level ``transcripts`` dict.
    """
    filename = os.path.basename(path)
    utt_id, _, _ = filename.partition(".")
    return transcripts.get(utt_id, "")

# Attach each utterance's transcript ("" when none was found) and report how
# many rows ended up without one.
df["text"] = df["path"].map(lookup_text)
empty_count = df["text"].eq("").sum()
print("Utterances missing transcripts:", empty_count)

# ---------- G2P via pronouncing ----------
def word_to_phones(word):
    """Return the phones of *word* with trailing digits removed.

    Only the first pronunciation returned by ``pronouncing.phones_for_word``
    is used; a word with no pronunciation yields an empty list. The stripped
    digits are presumably stress markers -- confirm against the dictionary.
    """
    pronunciations = pronouncing.phones_for_word(word.lower())
    first = pronunciations[0] if pronunciations else ""
    return [phone.rstrip("0123456789") for phone in first.split()]

def text_to_phonemes(text):
    """Flatten *text* into one phone list, word by word.

    Words are the whitespace-separated tokens of *text*; each is converted
    with ``word_to_phones`` and the results are concatenated in order. Words
    that produce no phones simply contribute nothing.
    """
    return [
        phone
        for word in text.strip().split()
        for phone in word_to_phones(word)
    ]

def phonemes_to_feature_sequences(phonemes):
    """Turn a phoneme list into one label sequence per feature.

    Returns a list of ``N_FEATURES`` ``torch.LongTensor``s ordered like
    ``FEATURE_ORDER``. Position ``k`` of sequence ``i`` is 0 when phoneme ``k``
    has feature ``i`` and 1 when it does not; phonemes absent from
    ``PHONEME_TO_FEATURES`` are labelled 1 everywhere. With no phonemes at
    all, every sequence is the single label ``[1]`` so no target is empty.
    """
    per_phone_feats = [PHONEME_TO_FEATURES.get(ph, set()) for ph in phonemes]
    if not per_phone_feats:
        return [torch.LongTensor([1]) for _ in range(N_FEATURES)]
    return [
        torch.LongTensor([0 if fname in feats else 1 for feats in per_phone_feats])
        for fname in FEATURE_ORDER
    ]

# ---------- dataset and loaders ----------
class LibriPhonoDataset(Dataset):
    """Map-style dataset yielding a waveform plus per-feature label sequences.

    The backing frame must provide a ``path`` column (audio file) and a
    ``text`` column (transcript) for every row.
    """

    def __init__(self, df):
        # Re-index so positional access in __getitem__ matches 0..len-1.
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _to_mono(wav):
        """Collapse a (channels, samples) waveform to a 1-D (samples,) tensor.

        A single channel is returned as-is; several channels are averaged, so
        multi-channel files no longer reach the collate step as 2-D arrays.
        """
        if wav.dim() == 1:
            return wav
        if wav.size(0) == 1:
            return wav.squeeze(0)
        return wav.mean(dim=0)

    def __getitem__(self, idx):
        """Load item *idx* and return ``{"audio": 1-D tensor, "features": list}``."""
        r = self.df.iloc[idx]
        wav, sr = torchaudio.load(r["path"])
        if sr != SAMPLE_RATE:
            wav = torchaudio.transforms.Resample(sr, SAMPLE_RATE)(wav)
        audio = self._to_mono(wav)
        phones = text_to_phonemes(r["text"])
        feats = phonemes_to_feature_sequences(phones)
        return {"audio": audio, "features": feats}

# Hold out the last 2% of the manifest (in file order, no shuffling) for
# validation; keeping it small makes the per-epoch validation pass cheap.
split_idx = int(len(df) * 0.98)
train_df, val_df = (
    part.reset_index(drop=True)
    for part in (df.iloc[:split_idx], df.iloc[split_idx:])
)
print("Train/Val sizes:", len(train_df), len(val_df))

train_ds, val_ds = LibriPhonoDataset(train_df), LibriPhonoDataset(val_df)

# Loaded strictly from the local model directory; no hub access is attempted.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(LOCAL_MODEL_DIR, local_files_only=True)

def collate_fn(batch):
    """Merge dataset items into one batch dict.

    The waveforms are converted to NumPy and passed through the module-level
    ``feature_extractor`` with padding enabled, giving ``"input_values"`` as a
    PyTorch tensor. The per-item feature label lists are passed through
    untouched under ``"features"`` (one entry per item, in batch order).
    """
    waveforms, label_lists = [], []
    for sample in batch:
        waveforms.append(sample["audio"].numpy())
        label_lists.append(sample["features"])
    encoded = feature_extractor(
        waveforms,
        sampling_rate=SAMPLE_RATE,
        return_tensors="pt",
        padding=True,
    )
    return {"input_values": encoded.input_values, "features": label_lists}

# Both loaders are configured identically except for shuffling: only the
# training batches are drawn in random order.
_loader_opts = dict(batch_size=BATCH_SIZE, collate_fn=collate_fn, num_workers=NUM_WORKERS)
train_loader = DataLoader(train_ds, shuffle=True, **_loader_opts)
val_loader = DataLoader(val_ds, shuffle=False, **_loader_opts)

# ---------- safe model load (explicit CPU load, then move to GPU) ----------
print("Safe-load: reading config from", LOCAL_MODEL_DIR)
cfg = Wav2Vec2Config.from_pretrained(LOCAL_MODEL_DIR, local_files_only=True)
print("Config loaded. hidden_size =", cfg.hidden_size)

# Build the (randomly initialised) architecture first, weights come next.
print("Instantiating model skeleton on CPU")
backbone = Wav2Vec2Model(cfg)
backbone.cpu()

state_path = os.path.join(LOCAL_MODEL_DIR, "pytorch_model.bin")
if not os.path.exists(state_path):
    raise FileNotFoundError(state_path)

t0 = time.time()
print("Loading state dict from disk (map_location='cpu') ...")
# NOTE(review): torch.load unpickles the file, so only point this at a
# checkpoint you trust; consider weights_only=True if the installed torch
# version supports it.
state_dict = torch.load(state_path, map_location="cpu")
print("Loaded state dict keys:", len(state_dict), " time:", time.time()-t0)

# Checkpoints saved from a task wrapper (pre-training / CTC head) prefix the
# encoder weights with "<base_model_prefix>.". A bare Wav2Vec2Model has no such
# prefix, and with strict=False the mismatch would be ignored silently, leaving
# the backbone randomly initialised. Strip the prefix so both layouts load.
base_prefix = getattr(backbone, "base_model_prefix", "wav2vec2") + "."
state_dict = {
    (key[len(base_prefix):] if key.startswith(base_prefix) else key): value
    for key, value in state_dict.items()
}
matched_keys = set(backbone.state_dict()).intersection(state_dict)
if not matched_keys:
    raise RuntimeError(
        f"No tensor in {state_path} matches the Wav2Vec2Model parameters; "
        "refusing to continue with a randomly initialised backbone."
    )

print("Loading state dict into model (strict=False) ...")
res = backbone.load_state_dict(state_dict, strict=False)
# Always surface the outcome: strict=False never raises on its own.
print("Matched tensors:", len(matched_keys),
      " missing:", len(res.missing_keys),
      " unexpected:", len(res.unexpected_keys))

# Drop the CPU copy of the weights before the model is moved to the device.
del state_dict
gc.collect()

# freeze CNN feature extractor
for p in backbone.feature_extractor.parameters():
    p.requires_grad = False
print("Feature extractor frozen.")

print("Moving model to device:", DEVICE)
backbone.to(DEVICE)

# attach linear head
class PhonoModel(nn.Module):
    """Encoder followed by a single frame-wise linear projection.

    ``base`` must expose ``config.hidden_size`` and return an object with a
    ``last_hidden_state`` attribute when called on the input.
    """

    def __init__(self, base, output_dim):
        super().__init__()
        # Attribute names become state_dict key prefixes ("base.", "linear."),
        # so they are part of the checkpoint format.
        self.base = base
        hidden_size = base.config.hidden_size
        self.linear = nn.Linear(hidden_size, output_dim)

    def forward(self, x):
        """Return ``(logits, hidden_states)`` for input ``x``."""
        encoder_out = self.base(x)
        hidden_states = encoder_out.last_hidden_state
        return self.linear(hidden_states), hidden_states

model = PhonoModel(backbone, OUTPUT_DIM).to(DEVICE)
print("Model ready. OUTPUT_DIM:", OUTPUT_DIM)

# ---------- optimizer / scheduler / loss ----------
# Only parameters that still require gradients are handed to the optimizer.
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = AdamW(trainable_params, lr=LR, weight_decay=WEIGHT_DECAY)

# One scheduler step per optimizer update, i.e. per ACCUM_STEPS batches;
# the first 10% of those updates are warm-up. Both are clamped to at least 1.
total_steps = max(1, (len(train_loader) * EPOCHS) // ACCUM_STEPS)
warmup = max(1, int(total_steps * 0.10))
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup,
    num_training_steps=total_steps,
)

# zero_infinity=True turns infinite losses (and their gradients) into zeros.
ctc_loss = nn.CTCLoss(blank=CTC_BLANK_IDX, zero_infinity=True)
# Gradient scaling is active on CUDA only; elsewhere the scaler is a no-op.
scaler = torch.cuda.amp.GradScaler(enabled=(DEVICE == "cuda"))

# ---------- SCTC-SB loss ----------
def sctc_sb_loss(logits, targets):
    """Sum of per-feature CTC losses that all share one blank logit.

    Args:
        logits: tensor indexed as (batch, time, channel). For feature ``i``,
            channel ``2*i`` is its positive score and ``2*i + 1`` its negative
            score; the last channel is the blank score reused by every feature.
        targets: ``targets[b][i]`` is the 1-D integer label tensor of feature
            ``i`` for batch item ``b`` (labels index the 3-way group above).

    Returns:
        A scalar tensor on ``logits.device``: the module-level ``ctc_loss``
        summed over all ``N_FEATURES`` features.
    """
    B, T, _ = logits.shape
    device = logits.device
    blank_logits = logits[:, :, -1]
    # Loop-invariant: every item is scored over the full (padded) time axis.
    # NOTE(review): true per-item lengths are not passed in, so padded frames
    # count as input -- confirm this is intended.
    in_lens = torch.full((B,), T, dtype=torch.long, device=device)
    total = torch.tensor(0.0, device=device)
    for i in range(N_FEATURES):
        # Per-feature 3-way problem: [positive, negative, shared blank].
        cat_logits = torch.stack(
            [logits[:, :, 2 * i], logits[:, :, 2 * i + 1], blank_logits], dim=-1
        )
        # CTCLoss expects (time, batch, classes) log-probabilities.
        logp = F.log_softmax(cat_logits.permute(1, 0, 2), dim=2)
        feature_targets = [targets[b][i] for b in range(B)]
        tgt_lens = torch.LongTensor([len(t) for t in feature_targets]).to(device)
        tgt_concat = torch.cat(feature_targets).to(device)
        total = total + ctc_loss(logp, tgt_concat, in_lens, tgt_lens)
    return total

# ---------- training (or smoke test) ----------
if RUN_SMOKE_TEST:
    # Forward pass and loss on one batch only: no backward pass, no optimizer
    # step, nothing is saved.
    print("Running single-step smoke test...")
    model.train()
    batch = next(iter(train_loader))
    x = batch["input_values"].to(DEVICE)
    targets = batch["features"]
    with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
        logits, _ = model(x)
        loss = sctc_sb_loss(logits, targets)
    print("Smoke test loss:", loss.item())
    print("Smoke test completed — set RUN_SMOKE_TEST=False to run full training.")
else:
    print("Starting full training...")
    # Make sure the checkpoint directory exists up front, so a missing folder
    # cannot cost a whole epoch of training before torch.save fails.
    os.makedirs(SAVE_DIR, exist_ok=True)
    model.train()
    global_step = 0
    best_val = float("inf")
    n_train_batches = len(train_loader)
    for epoch in range(1, EPOCHS+1):
        pbar = tqdm(train_loader, desc=f"Train Epoch {epoch}/{EPOCHS}")
        running_loss = 0.0
        for step, batch in enumerate(pbar):
            inputs = batch["input_values"].to(DEVICE)
            targets = batch["features"]
            with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
                logits, _ = model(inputs)
                loss = sctc_sb_loss(logits, targets)
            loss_value = loss.item()
            # Divide so that gradients accumulated over ACCUM_STEPS batches
            # average rather than add up.
            scaler.scale(loss / ACCUM_STEPS).backward()
            window_full = (step + 1) % ACCUM_STEPS == 0
            # Also step on the epoch's last batch: otherwise a trailing partial
            # window leaks its gradients into the next epoch (and is thrown
            # away entirely after the final one).
            # NOTE(review): these extra updates also advance the scheduler;
            # confirm its num_training_steps accounts for them.
            last_batch = (step + 1) == n_train_batches
            if window_full or last_batch:
                # Unscale before clipping so the threshold applies to the
                # real gradient magnitudes.
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()
                global_step += 1
            running_loss += loss_value
            pbar.set_postfix({"loss": f"{running_loss / (step+1):.4f}"})
        # validation after epoch (capped at 100 batches to bound its cost)
        model.eval()
        val_loss = 0.0
        val_steps = 0
        with torch.no_grad():
            for vbatch in tqdm(val_loader, desc="Val", leave=False):
                vinputs = vbatch["input_values"].to(DEVICE)
                vtargets = vbatch["features"]
                v_logits, _ = model(vinputs)
                vl = sctc_sb_loss(v_logits, vtargets)
                val_loss += vl.item()
                val_steps += 1
                if val_steps >= 100:
                    break
        # max(1, ...) guards against an empty validation loader.
        val_loss_avg = val_loss / max(1, val_steps)
        print(f"Epoch {epoch} avg val loss: {val_loss_avg:.4f}")
        # One checkpoint per epoch, plus a separate copy of the best so far.
        ckpt = os.path.join(SAVE_DIR, f"phono_sctc_epoch{epoch}.pt")
        torch.save(model.state_dict(), ckpt)
        print("Saved:", ckpt)
        if val_loss_avg < best_val:
            best_val = val_loss_avg
            best_ckpt = os.path.join(SAVE_DIR, "phono_sctc_best.pt")
            torch.save(model.state_dict(), best_ckpt)
            print("Saved new best:", best_ckpt)
        model.train()
    print("Training finished. Best val loss:", best_val)
