In [1]:
from pathlib import Path
from collections import Counter

root = Path("./data")  # your --output-dir
def read_uris(name):
    return [u.strip() for u in (root/"lists"/f"{name}.lst").read_text().splitlines() if u.strip()]

splits = {k: read_uris(k) for k in ("train","dev","test")}

def call_id(uri):
    # URIs look like room__call__chunk_0001 → group by room__call
    parts = uri.split("__")
    return "__".join(parts[:-1])  # room__call

# 1) Ensure no call is in more than one split
calls = {k: set(call_id(u) for u in v) for k,v in splits.items()}
assert calls["train"].isdisjoint(calls["dev"])
assert calls["train"].isdisjoint(calls["test"])
assert calls["dev"].isdisjoint(calls["test"])
print("OK: no call leakage across splits.")

# 2) Show how many calls/chunks per split
for k in ("train","dev","test"):
    print(f"{k}: {len(calls[k])} calls, {len(splits[k])} chunks")

# 3) Inspect imbalance (top calls by #chunks) to see if one big call dominated a split
for k in ("train","dev","test"):
    cnt = Counter(call_id(u) for u in splits[k])
    print(f"{k} top calls:", cnt.most_common(5))


OK: no call leakage across splits.
train: 15 calls, 218 chunks
dev: 2 calls, 26 chunks
test: 2 calls, 22 chunks
train top calls: [('139__2025_04_10_08_26_51_139_77', 28), ('103__2025_04_19_15_17_55_103_77', 27), ('139__2025_04_14_19_44_40_139_77', 27), ('139__2025_04_16_17_18_09_139_77', 25), ('Jezan__2025_07_28_09_31_00_8410_77', 15)]
dev top calls: [('102__2025_03_10_21_21_46_102_77', 14), ('Jezan__2025_07_28_09_31_28_8405_77', 12)]
test top calls: [('102__2025_03_10_22_54_42_102_77', 12), ('Jezan__2025_07_28_09_45_30_8403_77', 10)]


In [3]:
from pyannote.database import registry, get_protocol
registry.load_database("/home/naimah/Documents/github/pyannote_train/New_train/data/database.yml")
p = get_protocol("MyDatabase.SpeakerDiarization.MyProtocol")
print("dev files:", len(list(p.development())))
print("test files:", len(list(p.test())))
print("train files:", len(list(p.train())))

dev files: 26
test files: 22
train files: 218




In [8]:
from pyannote.database import registry, get_protocol
cfg = "/home/naimah/Documents/github/pyannote_train/New_train/data/database.yml"
registry.reset()  # ensure a clean reload
registry.load_database(cfg)
p = get_protocol("MyDatabase.SpeakerDiarization.MyProtocol")
f = next(iter(p.development()))
print("database:", f.get("database"))
print("uri:", f.get("uri"))
print("audio:", f.get("audio"))
print("keys:", sorted(f.keys()))

AttributeError: 'Registry' object has no attribute 'reset'

In [9]:
# imports
from pyannote.database import registry, FileFinder

# when creating the protocol
cfg_path = Path(os.environ["PYANNOTE_DATABASE_CONFIG"])
audio_root = cfg_path.parent / "audio"
db_name = args.protocol.split(".", 1)[0]  # "MyDatabase"

pre = {"audio": FileFinder(paths={db_name: str(audio_root / "{uri}.wav")})}
registry.load_database(str(cfg_path))
proto = registry.get_protocol(args.protocol, preprocessors=pre)


KeyError: 'PYANNOTE_DATABASE_CONFIG'

In [None]:
from pathlib import Path
import os, json, glob, sys
import torch
from pyannote.audio import Model
from pyannote.audio.pipelines import SpeakerDiarization

# Optional perf on Ampere+ GPUs
try:
    torch.set_float32_matmul_precision("high")
except Exception:
    pass

# ---- avoid 401: force anonymous HF access for public models
os.environ.pop("HF_TOKEN", None)

root = Path("./exp")

# 1) load tuned params
opt_path = root / "01_tune_pipeline" / "optimizer.json"
if not opt_path.exists():
    sys.exit(f"Missing optimizer.json at: {opt_path}")
opt = json.loads(opt_path.read_text())
best_params = opt.get("best_params") or opt.get("best") or opt

# auto-nest legacy "segmentation>threshold" keys
if isinstance(best_params, dict) and any(">" in k for k in best_params):
    nested = {}
    for k, v in best_params.items():
        if ">" in k:
            a, b = k.split(">", 1)
            nested.setdefault(a, {})[b] = v
        else:
            nested[k] = v
    best_params = nested

print("Best params:", best_params)

# 2) choose checkpoint from 02_finetune_seg (or 01_finetune_seg fallback)
ckpts = sorted(glob.glob(str(root / "02_finetune_seg" / "*.ckpt")))
if not ckpts:
    ckpts = sorted(glob.glob(str(root / "01_finetune_seg" / "*.ckpt")))
assert ckpts, "No checkpoint found in 02_finetune_seg/ or 01_finetune_seg/"
best_ckpt = ckpts[-1]
print("Using checkpoint:", best_ckpt)

# 3) segmentation model (local, so no HF needed)
seg_model = Model.from_pretrained(best_ckpt)
seg_model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# 4) pipeline: force anonymous hub access for the embedding model
pipe = SpeakerDiarization(segmentation=seg_model, use_auth_token=False)
try:
    pipe = pipe.instantiate(best_params)
except Exception:
    pipe.set_params(**best_params)

# 5) run
wav = "data/audio/102__2025_03_10_20_01_18_102_77__2025_03_10_20_01_18_102_77_chunk0.wav"
assert Path(wav).exists(), f"WAV not found: {wav}"
diar = pipe({"audio": wav})
print(diar)

# 6) save RTTM
out_rttm = Path("inference_out.rttm")
with out_rttm.open("w") as f:
    diar.write_rttm(f)
print("Saved:", out_rttm.resolve())


usage: ipykernel_launcher.py [-h] [--exp-dir EXP_DIR] --wav WAV
                             [--rttm-out RTTM_OUT] [--hf-token HF_TOKEN]
                             [--override-threshold OVERRIDE_THRESHOLD]
                             [--enable-tf32]
ipykernel_launcher.py: error: the following arguments are required: --wav


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


: 