In [3]:
import json
import os
from pydub import AudioSegment

# Annotation export (JSON) that lists the tasks to process.
ANNOTATION_FILE = "annotations.json"

# Output folders for the extracted segments: regions labelled "health" are
# written to OUT_HC, regions labelled "parkinson" to OUT_PD.
OUT_HC = "HC_segments"
OUT_PD = "PD_segments"

# exist_ok=True lets the cell be re-run without failing on existing folders.
os.makedirs(OUT_HC, exist_ok=True)
os.makedirs(OUT_PD, exist_ok=True)

# Open in binary mode so json.load() detects the encoding (UTF-8/16/32, with
# or without BOM) from the data itself. Text mode would decode with the
# platform locale encoding, which on Windows is usually not UTF-8 and can
# corrupt or reject non-ASCII content.
with open(ANNOTATION_FILE, "rb") as f:
    tasks = json.load(f)

print(f"Loaded {len(tasks)} tasks.\n")

# Cut every labelled region out of its source recording and save it as a WAV
# file in the output folder matching the region's label.
for task in tasks:
    ls_path = task["data"]["audio"]               # e.g. /data/upload/2/73cc2f0e-ID00_hc_0_0_0.wav
    ls_filename = os.path.basename(ls_path)       # 73cc2f0e-ID00_hc_0_0_0.wav

    # Drop everything up to and including the first dash (the upload prefix).
    # partition() never raises, so a name that contains no dash is used
    # unchanged instead of crashing with IndexError.
    _prefix, dash, remainder = ls_filename.partition("-")
    real_filename = remainder if dash else ls_filename   # ID00_hc_0_0_0.wav

    # The class marker in the file name selects the folder holding the audio.
    lowered_name = real_filename.lower()
    if "_hc" in lowered_name:
        real_path = os.path.join("HC", real_filename)
    elif "_pd" in lowered_name:
        real_path = os.path.join("PD", real_filename)
    else:
        print("Could not detect class for file:", real_filename)
        continue

    if not os.path.exists(real_path):
        print("Missing audio file:", real_path)
        continue

    # A task with no annotations has nothing to extract; report it and move
    # on before paying for the audio decode.
    annotations = task.get("annotations") or []
    if not annotations:
        print("No annotations for file:", real_filename)
        continue

    # Load the real audio
    audio = AudioSegment.from_file(real_path)

    # Only the first annotation of each task is used.
    for i, ann in enumerate(annotations[0].get("result") or []):
        value = ann.get("value", {})
        labels = value.get("labels")
        # Skip result entries that are not labelled time regions.
        if not labels or "start" not in value or "end" not in value:
            continue

        label = labels[0].lower()
        if label == "parkinson":
            out_dir = OUT_PD
        elif label == "health":
            out_dir = OUT_HC
        else:
            continue

        # start/end are multiplied by 1000 to get the millisecond offsets
        # used to slice the audio. Slicing happens only after the label
        # filter so no work is done for regions that are discarded.
        start_ms = int(value["start"] * 1000)
        end_ms = int(value["end"] * 1000)
        segment = audio[start_ms:end_ms]

        # NOTE: the name keeps the source extension in the middle
        # ("x.wav_seg0.wav"); it is left unchanged so existing outputs and
        # any code that looks them up by name keep working. The index i
        # counts all result entries, including skipped ones.
        save_name = f"{real_filename}_seg{i}.wav"
        save_path = os.path.join(out_dir, save_name)

        segment.export(save_path, format="wav")
        print("Saved:", save_path)

print("\n✔ DONE — All segments extracted")


Loaded 32 tasks.

Saved: HC_segments\ID00_hc_0_0_0.wav_seg0.wav
Saved: HC_segments\ID01_hc_0_0_0.wav_seg0.wav
Saved: HC_segments\ID03_hc_0_0_0.wav_seg0.wav
Saved: HC_segments\ID05_hc_0_0_0.wav_seg0.wav
Saved: HC_segments\ID05_hc_0_0_0.wav_seg1.wav
Saved: HC_segments\ID08_hc_0_0_0.wav_seg0.wav
Saved: HC_segments\ID09_hc_0_0_0.wav_seg0.wav
Saved: HC_segments\ID09_hc_0_0_0.wav_seg1.wav
Saved: HC_segments\ID10_hc_0_0_0.wav_seg0.wav
Saved: HC_segments\ID10_hc_0_0_0.wav_seg1.wav
Saved: HC_segments\ID11_hc_0_0_0.wav_seg0.wav
Saved: HC_segments\ID11_hc_0_0_0.wav_seg1.wav
Saved: HC_segments\ID12_hc_0_0_0.wav_seg0.wav
Saved: HC_segments\ID12_hc_0_0_0.wav_seg1.wav
Saved: HC_segments\ID14_hc_0_0_0.wav_seg0.wav
Saved: HC_segments\ID15_hc_0_0_0.wav_seg0.wav
Saved: HC_segments\ID15_hc_0_0_0.wav_seg1.wav
Saved: HC_segments\ID19_hc_0_0_0.wav_seg0.wav
Saved: HC_segments\ID19_hc_0_0_0.wav_seg1.wav
Saved: HC_segments\ID21_hc_0_0_0.wav_seg0.wav
Saved: HC_segments\ID21_hc_0_0_0.wav_seg1.wav
Saved: HC_segmen