In [1]:
import os
import torchaudio
import pandas as pd
from tqdm import tqdm
import sounddevice as sd

In [2]:
def play_audio_segment(audio_tensor, sample_rate):
    """Play an audio tensor through sounddevice and wait for playback to end.

    Args:
        audio_tensor: Tensor laid out as [channels, samples].
        sample_rate: Sampling rate passed to ``sd.play``.
    """
    channels_first = audio_tensor.numpy()
    # sounddevice is given (samples, channels), so swap the two axes.
    samples_first = channels_first.T
    sd.play(samples_first, sample_rate)
    # Do not return until the clip has finished playing.
    sd.wait()

def label_speakers(audio_path, transcript_path):
    """Interactively relabel the speakers of one session transcript.

    For every distinct (non-missing) value in the transcript's ``speaker``
    column, the longest segment attributed to that speaker is played back and
    the user is asked to label it as Ellie, Participant or other. The
    transcript file is then overwritten in place with the chosen labels.

    A transcript whose ``speaker`` column already contains "ellie",
    "participant" or "ignore" (case-insensitive) is considered done and is
    left untouched; in that case the audio file is never opened.

    Args:
        audio_path: Path to the session audio, loaded with ``torchaudio.load``.
        transcript_path: Path to a tab-separated transcript with ``speaker``,
            ``start_time`` and ``stop_time`` columns. Times are multiplied by
            the sample rate to obtain sample indices, i.e. they are in seconds.

    Returns:
        None. Progress is reported via ``print``.
    """
    df = pd.read_csv(transcript_path, sep="\t")

    # Rows without a speaker cannot be matched to a segment, so they are left
    # out of both the "already done" check and the labeling loop.
    known_speakers = df['speaker'].dropna()

    # astype(str) keeps the check working even if the column is not string-typed.
    if known_speakers.astype(str).str.lower().isin({'ellie', 'participant', 'ignore'}).any():
        print(f"{transcript_path} Labeling already done, skipping.")
        return

    # Decode the audio only once we know this session actually needs labeling.
    waveform, sample_rate = torchaudio.load(audio_path)

    label_by_key = {'e': "Ellie", 'p': "Participant"}
    speaker_labels = {}

    for spk in known_speakers.unique():
        # Find the longest segment for this speaker.
        spk_segments = df[df['speaker'] == spk]
        durations = spk_segments["stop_time"] - spk_segments["start_time"]
        longest_idx = durations.idxmax()
        longest_seg = spk_segments.loc[longest_idx]
        longest_duration = durations.loc[longest_idx]

        start_sample = int(longest_seg['start_time'] * sample_rate)
        end_sample = int(longest_seg['stop_time'] * sample_rate)
        segment_audio = waveform[:, start_sample:end_sample]

        print(f"\nSpeaker: {spk} - playing **longest** segment from {start_sample/sample_rate:.2f}s to {end_sample/sample_rate:.2f}s "
              f"({longest_duration:.2f}s long)")
        play_audio_segment(segment_audio, sample_rate)
        choice = input("Label this speaker as (E)llie, (P)articipant, (O)ther: ").strip().lower()
        # Any answer other than 'e' or 'p' marks the speaker as "ignore".
        speaker_labels[spk] = label_by_key.get(choice, "ignore")

    # Rows whose speaker is missing stay missing after the mapping.
    df['speaker'] = df['speaker'].map(speaker_labels)
    df.to_csv(transcript_path, sep="\t", index=False)
    print(f"Updated transcript saved to {transcript_path}")

In [3]:
edaic_dir = "../datasets/EDAIC-WOZ"

# Every sub-directory of the dataset root is treated as one session;
# sessions are visited in sorted order.
sessions = sorted(
    filter(
        lambda name: os.path.isdir(os.path.join(edaic_dir, name)),
        os.listdir(edaic_dir),
    )
)

for session in tqdm(sessions, desc="Labeling speakers"):
    # File names are built from the part of the folder name before the first "_".
    base_name = session.partition("_")[0]
    session_path = os.path.join(edaic_dir, session)
    audio_path = os.path.join(session_path, f"{base_name}_AUDIO.wav")
    transcript_path = os.path.join(session_path, f"{base_name}_TRANSCRIPT.csv")

    # Guard: both the audio and the transcript must be present.
    if not (os.path.exists(audio_path) and os.path.exists(transcript_path)):
        print(f"Skipping {session}: missing audio or transcript file")
        continue

    print(f"\n=== Processing session {session} ===")
    try:
        label_speakers(audio_path, transcript_path)
    except Exception as err:
        # Report the failure and move on to the next session.
        print(f"Error processing {session}: {err}")

Labeling speakers:   0%|          | 0/30 [00:00<?, ?it/s]


=== Processing session 602_P ===
../datasets/EDAIC-WOZ\602_P\602_TRANSCRIPT.csv Labeling already done, skipping.

=== Processing session 604_P ===
../datasets/EDAIC-WOZ\604_P\604_TRANSCRIPT.csv Labeling already done, skipping.

=== Processing session 617_P ===
../datasets/EDAIC-WOZ\617_P\617_TRANSCRIPT.csv Labeling already done, skipping.

=== Processing session 624_P ===
../datasets/EDAIC-WOZ\624_P\624_TRANSCRIPT.csv Labeling already done, skipping.

=== Processing session 633_P ===


Labeling speakers:  17%|█▋        | 5/30 [00:00<00:01, 18.57it/s]

../datasets/EDAIC-WOZ\633_P\633_TRANSCRIPT.csv Labeling already done, skipping.

=== Processing session 636_P ===
../datasets/EDAIC-WOZ\636_P\636_TRANSCRIPT.csv Labeling already done, skipping.

=== Processing session 637_P ===


Labeling speakers:  30%|███       | 9/30 [00:00<00:01, 14.41it/s]

../datasets/EDAIC-WOZ\637_P\637_TRANSCRIPT.csv Labeling already done, skipping.

=== Processing session 638_P ===
../datasets/EDAIC-WOZ\638_P\638_TRANSCRIPT.csv Labeling already done, skipping.

=== Processing session 640_P ===
../datasets/EDAIC-WOZ\640_P\640_TRANSCRIPT.csv Labeling already done, skipping.

=== Processing session 641_P ===
../datasets/EDAIC-WOZ\641_P\641_TRANSCRIPT.csv Labeling already done, skipping.

=== Processing session 649_P ===


Labeling speakers:  37%|███▋      | 11/30 [00:00<00:01, 14.37it/s]

../datasets/EDAIC-WOZ\649_P\649_TRANSCRIPT.csv Labeling already done, skipping.

=== Processing session 655_P ===
../datasets/EDAIC-WOZ\655_P\655_TRANSCRIPT.csv Labeling already done, skipping.

=== Processing session 658_P ===

Speaker: SPEAKER_02 - playing **longest** segment from 35.92s to 45.95s (10.02s long)


Labeling speakers:  37%|███▋      | 11/30 [00:20<00:01, 14.37it/s]


Speaker: SPEAKER_01 - playing **longest** segment from 143.25s to 179.81s (36.56s long)

Speaker: SPEAKER_00 - playing **longest** segment from 386.50s to 394.65s (8.15s long)


Labeling speakers:  43%|████▎     | 13/30 [01:45<04:44, 16.76s/it]

Updated transcript saved to ../datasets/EDAIC-WOZ\658_P\658_TRANSCRIPT.csv

=== Processing session 659_P ===

Speaker: SPEAKER_00 - playing **longest** segment from 1262.45s to 1276.76s (14.31s long)

Speaker: SPEAKER_01 - playing **longest** segment from 493.11s to 515.04s (21.93s long)


Labeling speakers:  47%|████▋     | 14/30 [02:48<06:46, 25.44s/it]

Updated transcript saved to ../datasets/EDAIC-WOZ\659_P\659_TRANSCRIPT.csv

=== Processing session 661_P ===

Speaker: SPEAKER_01 - playing **longest** segment from 495.21s to 499.39s (4.18s long)

Speaker: SPEAKER_00 - playing **longest** segment from 558.66s to 597.49s (38.83s long)

Speaker: SPEAKER_02 - playing **longest** segment from 36.18s to 44.43s (8.25s long)


Labeling speakers:  50%|█████     | 15/30 [04:49<11:21, 45.40s/it]

Updated transcript saved to ../datasets/EDAIC-WOZ\661_P\661_TRANSCRIPT.csv

=== Processing session 673_P ===

Speaker: SPEAKER_00 - playing **longest** segment from 0.74s to 21.13s (20.39s long)

Speaker: SPEAKER_02 - playing **longest** segment from 399.09s to 410.20s (11.11s long)

Speaker: SPEAKER_01 - playing **longest** segment from 78.28s to 97.90s (19.62s long)


Labeling speakers:  53%|█████▎    | 16/30 [06:29<13:32, 58.06s/it]

Updated transcript saved to ../datasets/EDAIC-WOZ\673_P\673_TRANSCRIPT.csv

=== Processing session 677_P ===

Speaker: SPEAKER_02 - playing **longest** segment from 34.35s to 60.44s (26.08s long)

Speaker: SPEAKER_00 - playing **longest** segment from 474.06s to 491.38s (17.33s long)

Speaker: SPEAKER_01 - playing **longest** segment from 829.93s to 830.03s (0.10s long)


Labeling speakers:  57%|█████▋    | 17/30 [07:42<13:20, 61.56s/it]

Updated transcript saved to ../datasets/EDAIC-WOZ\677_P\677_TRANSCRIPT.csv

=== Processing session 680_P ===

Speaker: SPEAKER_02 - playing **longest** segment from 98.92s to 121.36s (22.44s long)

Speaker: SPEAKER_00 - playing **longest** segment from 1364.15s to 1380.47s (16.31s long)

Speaker: SPEAKER_01 - playing **longest** segment from 11.58s to 23.88s (12.30s long)


Labeling speakers:  60%|██████    | 18/30 [09:33<14:53, 74.45s/it]

Updated transcript saved to ../datasets/EDAIC-WOZ\680_P\680_TRANSCRIPT.csv

=== Processing session 682_P ===

Speaker: SPEAKER_00 - playing **longest** segment from 88.81s to 110.11s (21.30s long)

Speaker: SPEAKER_02 - playing **longest** segment from 52.28s to 72.39s (20.11s long)

Speaker: SPEAKER_01 - playing **longest** segment from 482.18s to 489.93s (7.75s long)


Labeling speakers:  63%|██████▎   | 19/30 [11:08<14:42, 80.27s/it]

Updated transcript saved to ../datasets/EDAIC-WOZ\682_P\682_TRANSCRIPT.csv

=== Processing session 684_P ===

Speaker: SPEAKER_02 - playing **longest** segment from 60.16s to 117.58s (57.42s long)

Speaker: SPEAKER_01 - playing **longest** segment from 722.11s to 776.18s (54.07s long)

Speaker: SPEAKER_00 - playing **longest** segment from 684.62s to 686.91s (2.28s long)


Labeling speakers:  67%|██████▋   | 20/30 [13:31<16:15, 97.57s/it]

Updated transcript saved to ../datasets/EDAIC-WOZ\684_P\684_TRANSCRIPT.csv

=== Processing session 688_P ===

Speaker: SPEAKER_00 - playing **longest** segment from 3.35s to 28.94s (25.58s long)

Speaker: SPEAKER_01 - playing **longest** segment from 563.04s to 581.61s (18.57s long)


Labeling speakers:  70%|███████   | 21/30 [14:25<12:47, 85.28s/it]

Updated transcript saved to ../datasets/EDAIC-WOZ\688_P\688_TRANSCRIPT.csv

=== Processing session 689_P ===

Speaker: SPEAKER_02 - playing **longest** segment from 49.27s to 55.68s (6.41s long)

Speaker: SPEAKER_01 - playing **longest** segment from 304.57s to 311.62s (7.05s long)

Speaker: SPEAKER_00 - playing **longest** segment from 160.76s to 177.93s (17.16s long)


Labeling speakers:  73%|███████▎  | 22/30 [15:26<10:25, 78.24s/it]

Updated transcript saved to ../datasets/EDAIC-WOZ\689_P\689_TRANSCRIPT.csv

=== Processing session 691_P ===

Speaker: SPEAKER_02 - playing **longest** segment from 283.11s to 297.96s (14.86s long)

Speaker: SPEAKER_00 - playing **longest** segment from 627.14s to 652.15s (25.01s long)


Labeling speakers:  77%|███████▋  | 23/30 [16:23<08:24, 72.03s/it]

Updated transcript saved to ../datasets/EDAIC-WOZ\691_P\691_TRANSCRIPT.csv

=== Processing session 696_P ===

Speaker: SPEAKER_01 - playing **longest** segment from 288.49s to 322.80s (34.31s long)

Speaker: SPEAKER_02 - playing **longest** segment from 261.85s to 271.69s (9.84s long)

Speaker: SPEAKER_00 - playing **longest** segment from 369.02s to 369.43s (0.40s long)


Labeling speakers:  80%|████████  | 24/30 [17:48<07:35, 75.92s/it]

Updated transcript saved to ../datasets/EDAIC-WOZ\696_P\696_TRANSCRIPT.csv

=== Processing session 698_P ===

Speaker: SPEAKER_01 - playing **longest** segment from 0.03s to 41.34s (41.31s long)

Speaker: SPEAKER_00 - playing **longest** segment from 557.50s to 614.45s (56.96s long)


Labeling speakers:  83%|████████▎ | 25/30 [19:34<07:03, 84.79s/it]

Updated transcript saved to ../datasets/EDAIC-WOZ\698_P\698_TRANSCRIPT.csv

=== Processing session 699_P ===

Speaker: SPEAKER_00 - playing **longest** segment from 8.57s to 24.79s (16.22s long)

Speaker: SPEAKER_01 - playing **longest** segment from 103.53s to 125.31s (21.78s long)

Speaker: SPEAKER_02 - playing **longest** segment from 280.22s to 291.09s (10.88s long)


Labeling speakers:  87%|████████▋ | 26/30 [20:50<05:28, 82.22s/it]

Updated transcript saved to ../datasets/EDAIC-WOZ\699_P\699_TRANSCRIPT.csv

=== Processing session 705_P ===

Speaker: SPEAKER_01 - playing **longest** segment from 44.75s to 64.11s (19.36s long)

Speaker: SPEAKER_00 - playing **longest** segment from 660.45s to 671.14s (10.69s long)


Labeling speakers:  90%|█████████ | 27/30 [21:51<03:47, 75.77s/it]

Updated transcript saved to ../datasets/EDAIC-WOZ\705_P\705_TRANSCRIPT.csv

=== Processing session 709_P ===

Speaker: SPEAKER_02 - playing **longest** segment from 0.03s to 58.55s (58.52s long)

Speaker: SPEAKER_00 - playing **longest** segment from 93.50s to 123.07s (29.57s long)

Speaker: SPEAKER_01 - playing **longest** segment from 449.44s to 452.78s (3.34s long)


Labeling speakers:  93%|█████████▎| 28/30 [23:40<02:51, 85.68s/it]

Updated transcript saved to ../datasets/EDAIC-WOZ\709_P\709_TRANSCRIPT.csv

=== Processing session 716_P ===

Speaker: SPEAKER_01 - playing **longest** segment from 30.39s to 50.09s (19.70s long)

Speaker: SPEAKER_02 - playing **longest** segment from 60.90s to 86.86s (25.97s long)

Speaker: SPEAKER_00 - playing **longest** segment from 180.58s to 198.87s (18.29s long)


Labeling speakers: 100%|██████████| 30/30 [24:52<00:00, 49.74s/it]

Updated transcript saved to ../datasets/EDAIC-WOZ\716_P\716_TRANSCRIPT.csv
Skipping temp_results: missing audio or transcript file





In [4]:
daic_dir = "../datasets/DAIC-WOZ"

# Explicit list of sessions to label; reference:
# https://github.com/adbailey1/daic_woz_process/tree/master
sessions = ["318_P", "321_P", "341_P", "362_P"]

for session in tqdm(sessions, desc="Labeling speakers"):
    # File names are built from the part of the folder name before the first "_".
    base_name = session.partition("_")[0]
    session_path = os.path.join(daic_dir, session)
    audio_path = os.path.join(session_path, f"{base_name}_AUDIO.wav")
    transcript_path = os.path.join(session_path, f"{base_name}_TRANSCRIPT.csv")

    # Guard: both the audio and the transcript must be present.
    if not (os.path.exists(audio_path) and os.path.exists(transcript_path)):
        print(f"Skipping {session}: missing audio or transcript file")
        continue

    print(f"\n=== Processing session {session} ===")
    try:
        label_speakers(audio_path, transcript_path)
    except Exception as err:
        # Report the failure and move on to the next session.
        print(f"Error processing {session}: {err}")

Labeling speakers:   0%|          | 0/4 [00:00<?, ?it/s]


=== Processing session 318_P ===

Speaker: SPEAKER_00 - playing **longest** segment from 9.61s to 17.49s (7.89s long)

Speaker: SPEAKER_01 - playing **longest** segment from 391.36s to 412.69s (21.33s long)


Labeling speakers:  25%|██▌       | 1/4 [01:17<03:52, 77.64s/it]

Updated transcript saved to ../datasets/DAIC-WOZ\318_P\318_TRANSCRIPT.csv

=== Processing session 321_P ===

Speaker: SPEAKER_01 - playing **longest** segment from 11.08s to 17.87s (6.79s long)

Speaker: SPEAKER_00 - playing **longest** segment from 326.49s to 357.14s (30.65s long)

Speaker: SPEAKER_02 - playing **longest** segment from 37.85s to 65.64s (27.79s long)


Labeling speakers:  50%|█████     | 2/4 [04:02<04:17, 128.94s/it]

Updated transcript saved to ../datasets/DAIC-WOZ\321_P\321_TRANSCRIPT.csv

=== Processing session 341_P ===

Speaker: SPEAKER_01 - playing **longest** segment from 9.12s to 14.30s (5.19s long)

Speaker: SPEAKER_00 - playing **longest** segment from 385.29s to 405.64s (20.35s long)

Speaker: SPEAKER_02 - playing **longest** segment from 68.31s to 73.19s (4.89s long)


Labeling speakers:  75%|███████▌  | 3/4 [13:39<05:33, 333.52s/it]

Updated transcript saved to ../datasets/DAIC-WOZ\341_P\341_TRANSCRIPT.csv

=== Processing session 362_P ===

Speaker: SPEAKER_01 - playing **longest** segment from 479.98s to 491.53s (11.56s long)

Speaker: SPEAKER_00 - playing **longest** segment from 552.00s to 560.80s (8.79s long)


Labeling speakers: 100%|██████████| 4/4 [14:43<00:00, 220.82s/it]

Updated transcript saved to ../datasets/DAIC-WOZ\362_P\362_TRANSCRIPT.csv



