In [None]:
import os
import torchaudio
import pandas as pd
from tqdm import tqdm
import sounddevice as sd

In [None]:
def play_audio_segment(audio_tensor, sample_rate):
    """Play an audio tensor with sounddevice and wait for playback to end.

    Args:
        audio_tensor: Tensor laid out as [channels, samples]; it must support
            ``.numpy()``.
        sample_rate: Sampling rate handed to ``sd.play``.
    """
    # Transpose so the array is laid out as (samples, channels) for playback.
    samples_by_channel = audio_tensor.numpy().transpose()
    sd.play(samples_by_channel, sample_rate)
    # Block here so the caller only continues after the clip has been heard.
    sd.wait()

def label_speakers(audio_path, transcript_path):
    """Interactively relabel the speakers of one session transcript.

    For each distinct speaker in the transcript, the longest segment attributed
    to that speaker is played back and the user is prompted to label it as
    Ellie, Participant, or other. Any answer other than 'e' or 'p' maps the
    speaker to "ignore". The transcript file is then overwritten in place with
    the 'speaker' column replaced by the chosen labels.

    Args:
        audio_path: Path to the session audio, loaded with ``torchaudio.load``.
        transcript_path: Path to a tab-separated transcript containing the
            columns 'speaker', 'start_time' and 'stop_time'. The times are
            multiplied by the sample rate to obtain sample offsets.
    """
    df = pd.read_csv(transcript_path, sep="\t")
    waveform, sample_rate = torchaudio.load(audio_path)

    speaker_labels = {}
    # unique() reports missing speakers as NaN. NaN never compares equal to
    # anything, so filtering on it selects no rows and .iloc[0] below would
    # raise IndexError and abort the whole session. Skip missing speakers;
    # their rows stay NaN after the final map().
    speakers = df['speaker'].dropna().unique()

    for spk in speakers:
        # Find the longest segment for this speaker
        spk_segments = df[df['speaker'] == spk].copy()
        spk_segments["duration"] = spk_segments["stop_time"] - spk_segments["start_time"]
        longest_seg = spk_segments.sort_values("duration", ascending=False).iloc[0]

        # Convert the segment boundaries to sample indices.
        start_sample = int(longest_seg['start_time'] * sample_rate)
        end_sample = int(longest_seg['stop_time'] * sample_rate)
        segment_audio = waveform[:, start_sample:end_sample]

        print(f"\nSpeaker: {spk} - playing **longest** segment from {start_sample/sample_rate:.2f}s to {end_sample/sample_rate:.2f}s "
              f"({longest_seg['duration']:.2f}s long)")
        play_audio_segment(segment_audio, sample_rate)

        choice = input("Label this speaker as (E)llie, (P)articipant, (O)ther: ").strip().lower()
        if choice == 'e':
            speaker_labels[spk] = "Ellie"
        elif choice == 'p':
            speaker_labels[spk] = "Participant"
        else:
            # Anything that is not 'e' or 'p' (including 'o') is ignored.
            speaker_labels[spk] = "ignore"

    # Replace the original speaker ids with the chosen labels and overwrite
    # the transcript on disk.
    df['speaker'] = df['speaker'].map(speaker_labels)
    df.to_csv(transcript_path, sep="\t", index=False)
    print(f"Updated transcript saved to {transcript_path}")

In [None]:
# Root folder of the E-DAIC data; every sub-directory is treated as a session.
edaic_dir = "../datasets/EDAIC-WOZ"

# Gather all session directories (sorted by name) for speaker labeling.
sessions = sorted(
    entry
    for entry in os.listdir(edaic_dir)
    if os.path.isdir(os.path.join(edaic_dir, entry))
)

for session in tqdm(sessions, desc="Labeling speakers"):
    session_path = os.path.join(edaic_dir, session)
    # File names are prefixed with the part of the folder name before the first "_".
    base_name = session.partition("_")[0]
    audio_path = os.path.join(session_path, f"{base_name}_AUDIO.wav")
    transcript_path = os.path.join(session_path, f"{base_name}_TRANSCRIPT.csv")

    # Both the audio and the transcript are required; otherwise skip the session.
    if not (os.path.exists(audio_path) and os.path.exists(transcript_path)):
        print(f"Skipping {session}: missing audio or transcript file")
        continue

    print(f"\n=== Processing session {session} ===")
    try:
        label_speakers(audio_path, transcript_path)
    except Exception as exc:
        # Report the failure and move on to the next session.
        print(f"Error processing {session}: {exc}")

In [None]:
# Root folder of the DAIC-WOZ data.
daic_dir = "../datasets/DAIC-WOZ"

# Sessions to relabel (reference: https://github.com/adbailey1/daic_woz_process/tree/master)
sessions = ["318", "321", "341", "362"]

for session in tqdm(sessions, desc="Labeling speakers"):
    session_path = os.path.join(daic_dir, session)
    # File names are prefixed with the part of the session name before the first "_".
    base_name = session.partition("_")[0]
    audio_path = os.path.join(session_path, f"{base_name}_AUDIO.wav")
    transcript_path = os.path.join(session_path, f"{base_name}_TRANSCRIPT.csv")

    # Both the audio and the transcript are required; otherwise skip the session.
    if not (os.path.exists(audio_path) and os.path.exists(transcript_path)):
        print(f"Skipping {session}: missing audio or transcript file")
        continue

    print(f"\n=== Processing session {session} ===")
    try:
        label_speakers(audio_path, transcript_path)
    except Exception as exc:
        # Report the failure and move on to the next session.
        print(f"Error processing {session}: {exc}")