In [None]:
import os
import numpy as np
import pandas as pd
import soundfile as sf
from tqdm import tqdm
import librosa
import random

SEED = 42
np.random.seed(SEED)
random.seed(SEED)

In [10]:
def clean_audio_and_transcript(input_audio_path, transcript_path, 
                               output_audio_path, output_transcript_path,
                               buffer_sec=0.1, min_gap_for_pause=2.0, 
                               pause_duration=0.5):
    """
    Clean audio and transcript by extracting only participant segments
    
    """
    # ------------------------------------------------------------------ I/O
    audio, sr = sf.read(input_audio_path)
    if audio.ndim > 1:
        audio = librosa.to_mono(audio)               
    print(audio.shape)
    transcript = pd.read_csv(transcript_path, sep='\t')

    # ---------- 1) porta l'asse dei tempi a 0 prima di qualunque elaborazione
    ### MOD: calcolo offset
    offset = transcript['start_time'].min()
    transcript['start_time'] -= offset
    transcript['stop_time']  -= offset
    transcript['start_time']= transcript['start_time'].clip(lower=0)             
    transcript['stop_time']= transcript['stop_time'].clip(lower=0) 
    # ---------- 2) filtra solo i segmenti del Participant (stessa tua logica)
    participant_segments = transcript[
        (transcript['speaker'] == 'Participant') &
        (~transcript['value'].str.contains('scrubbed_entry|<synch>|<sync>', 
                                           na=False))
    ].copy()

    # ---------- 3) indici in campioni, buffer
    buffer_samples = int(buffer_sec * sr)
    starts = ((participant_segments['start_time'] * sr).astype(np.int32)
              - buffer_samples).clip(lower=0)
    ends   = ((participant_segments['stop_time']  * sr).astype(np.int32)
              + buffer_samples).clip(upper=len(audio))

    # ---------- 4) pause tra segmenti (tua logica invariata)
    insert_pause = []
    for i in range(len(participant_segments) - 1):
        current_end = participant_segments.iloc[i]['stop_time']
        next_start  = participant_segments.iloc[i + 1]['start_time']
        gap_duration = next_start - current_end
        insert_pause.append(gap_duration >= min_gap_for_pause)

    # ---------- 5) estrai audio + eventuali pause
    audio_segments = []
    pause_samples = np.zeros(int(pause_duration * sr), dtype=np.float32)

    for i, (s, e) in enumerate(zip(starts, ends)):
        audio_segments.append(audio[s:e])

        if i < len(insert_pause) and insert_pause[i]:
            audio_segments.append(pause_samples)

    # gestisci caso senza segmenti validi
    if not audio_segments:
        raise ValueError("Nessun segmento 'Participant' nel transcript")

    cleaned_audio = np.concatenate(audio_segments)

    # ---------- 6) ricostruisci timestamp output
    current_time = 0.0
    new_starts, new_stops = [], []

    for i, (s, e) in enumerate(zip(starts, ends)):
        segment_duration = (e - s) / sr
        new_starts.append(current_time)
        current_time += segment_duration
        new_stops.append(current_time)

        if i < len(insert_pause) and insert_pause[i]:
            current_time += pause_duration

    participant_segments['start_time'] = new_starts
    participant_segments['stop_time']  = new_stops
    participant_segments = participant_segments.drop('speaker', axis=1)

    # ---------- 7) salva e ritorna
    sf.write(output_audio_path, cleaned_audio, sr, subtype='PCM_16')
    participant_segments.to_csv(output_transcript_path, sep='\t', index=False)

    original_duration = len(audio) / sr
    cleaned_duration  = len(cleaned_audio) / sr
    return original_duration, cleaned_duration

In [11]:
# Elaborazione dataset - salva in directory separate
dataset_configs = [
    {"input_dir": "../datasets/DAIC-WOZ", "output_dir": "../datasets/DAIC-WOZ-preprocessed"},
    {"input_dir": "../datasets/EDAIC-WOZ", "output_dir": "../datasets/EDAIC-WOZ-preprocessed"}
]

for config in dataset_configs:
    dataset_dir = config["input_dir"]
    output_dir = config["output_dir"]
    os.makedirs(output_dir, exist_ok=True)
    
    dataset_sessions = sorted([d for d in os.listdir(dataset_dir) \
                   if os.path.isdir(os.path.join(dataset_dir, d)) and d.endswith('_P')])

    print(f"Processing {dataset_dir} -> {output_dir}")
    for session in tqdm(dataset_sessions, desc=f"Processing {os.path.basename(dataset_dir)} sessions"):
        session_path = os.path.join(dataset_dir, session)
        session_id = session.replace("_P", "")

        # Percorsi file input
        audio_path = os.path.join(session_path, f"{session_id}_AUDIO.wav")
        transcript_path = os.path.join(session_path, f"{session_id}_TRANSCRIPT.csv")
        
        # Percorsi file output
        output_session_path = os.path.join(output_dir, session)
        os.makedirs(output_session_path, exist_ok=True)
        output_audio_path = os.path.join(output_session_path, f"{session_id}_AUDIO.wav")
        output_transcript_path = os.path.join(output_session_path, f"{session_id}_TRANSCRIPT.csv")
        
        # Processa sessione
        original_duration, cleaned_duration = clean_audio_and_transcript(
            audio_path, transcript_path, output_audio_path, output_transcript_path, 
            buffer_sec=0.1, min_gap_for_pause=2.0, pause_duration=0.5
        )

Processing ../datasets/DAIC-WOZ -> ../datasets/DAIC-WOZ-preprocessed


Processing DAIC-WOZ sessions:   0%|          | 0/189 [00:00<?, ?it/s]

(10376000,)
(13182400,)


Processing DAIC-WOZ sessions:   1%|          | 2/189 [00:00<00:22,  8.37it/s]

(12140800,)
(15764800,)


Processing DAIC-WOZ sessions:   3%|▎         | 5/189 [00:00<00:24,  7.40it/s]

(12681600,)


Processing DAIC-WOZ sessions:   3%|▎         | 6/189 [00:00<00:34,  5.31it/s]

(27264000,)


Processing DAIC-WOZ sessions:   4%|▎         | 7/189 [00:01<00:31,  5.83it/s]

(13729600,)
(19820800,)


Processing DAIC-WOZ sessions:   5%|▍         | 9/189 [00:01<00:28,  6.31it/s]

(13881600,)
(11292800,)
(13518400,)


Processing DAIC-WOZ sessions:   7%|▋         | 13/189 [00:01<00:19,  9.19it/s]

(12569600,)
(12640000,)
(12060800,)


Processing DAIC-WOZ sessions:   8%|▊         | 15/189 [00:02<00:22,  7.86it/s]

(24747200,)


Processing DAIC-WOZ sessions:   8%|▊         | 16/189 [00:02<00:22,  7.79it/s]

(15606400,)
(13904000,)


Processing DAIC-WOZ sessions:  11%|█         | 20/189 [00:02<00:16, 10.06it/s]

(12878400,)
(9414400,)
(10875200,)


Processing DAIC-WOZ sessions:  12%|█▏        | 22/189 [00:02<00:16, 10.17it/s]

(13451200,)
(13150400,)


Processing DAIC-WOZ sessions:  13%|█▎        | 24/189 [00:02<00:17,  9.57it/s]

(16753600,)
(13392000,)


Processing DAIC-WOZ sessions:  14%|█▍        | 26/189 [00:03<00:17,  9.43it/s]

(11540800,)
(14000000,)


Processing DAIC-WOZ sessions:  15%|█▍        | 28/189 [00:03<00:15, 10.45it/s]

(11187200,)
(10892800,)
(17043200,)


Processing DAIC-WOZ sessions:  16%|█▌        | 30/189 [00:03<00:16,  9.72it/s]

(11300800,)
(12344000,)
(13611200,)


Processing DAIC-WOZ sessions:  18%|█▊        | 34/189 [00:03<00:15, 10.05it/s]

(13985600,)
(15508800,)


Processing DAIC-WOZ sessions:  19%|█▉        | 36/189 [00:04<00:16,  9.55it/s]

(15680000,)
(13281600,)


Processing DAIC-WOZ sessions:  20%|█▉        | 37/189 [00:04<00:16,  9.41it/s]

(15112000,)
(29939200,)


Processing DAIC-WOZ sessions:  21%|██        | 40/189 [00:04<00:17,  8.36it/s]

(9548800,)
(13806400,)
(9588800,)


Processing DAIC-WOZ sessions:  22%|██▏       | 42/189 [00:04<00:16,  9.04it/s]

(13880000,)
(14836800,)


Processing DAIC-WOZ sessions:  24%|██▍       | 45/189 [00:05<00:16,  8.99it/s]

(17448000,)
(12702400,)


Processing DAIC-WOZ sessions:  24%|██▍       | 46/189 [00:05<00:17,  8.16it/s]

(19548800,)
(9785600,)
(11510400,)


Processing DAIC-WOZ sessions:  26%|██▋       | 50/189 [00:05<00:15,  8.98it/s]

(19456000,)
(14108800,)


Processing DAIC-WOZ sessions:  28%|██▊       | 52/189 [00:06<00:14,  9.18it/s]

(12313600,)
(12161600,)


Processing DAIC-WOZ sessions:  29%|██▉       | 55/189 [00:06<00:12, 10.86it/s]

(12576000,)
(9228800,)
(10795200,)


Processing DAIC-WOZ sessions:  30%|███       | 57/189 [00:06<00:11, 11.14it/s]

(15264000,)
(6636800,)
(10366400,)


Processing DAIC-WOZ sessions:  31%|███       | 59/189 [00:06<00:12, 10.36it/s]

(16537600,)
(7046400,)
(10270400,)


Processing DAIC-WOZ sessions:  32%|███▏      | 61/189 [00:06<00:11, 11.40it/s]

(9443200,)
(19633600,)


Processing DAIC-WOZ sessions:  33%|███▎      | 63/189 [00:07<00:12, 10.06it/s]

(28504000,)


Processing DAIC-WOZ sessions:  34%|███▍      | 65/189 [00:07<00:16,  7.34it/s]

(22177600,)
(21003200,)


Processing DAIC-WOZ sessions:  35%|███▌      | 67/189 [00:07<00:19,  6.23it/s]

(26192000,)


Processing DAIC-WOZ sessions:  36%|███▌      | 68/189 [00:08<00:20,  5.96it/s]

(21676800,)
(16657600,)


Processing DAIC-WOZ sessions:  37%|███▋      | 70/189 [00:08<00:20,  5.93it/s]

(19337600,)
(14587200,)


Processing DAIC-WOZ sessions:  38%|███▊      | 72/189 [00:08<00:18,  6.31it/s]

(24219200,)
(20241600,)


Processing DAIC-WOZ sessions:  39%|███▉      | 74/189 [00:09<00:19,  5.89it/s]

(20603200,)
(9939200,)


Processing DAIC-WOZ sessions:  40%|████      | 76/189 [00:09<00:16,  6.97it/s]

(16411200,)
(21252800,)


Processing DAIC-WOZ sessions:  41%|████▏     | 78/189 [00:09<00:15,  6.98it/s]

(13974400,)
(16033600,)


Processing DAIC-WOZ sessions:  42%|████▏     | 80/189 [00:09<00:19,  5.61it/s]

(31459200,)


Processing DAIC-WOZ sessions:  43%|████▎     | 81/189 [00:10<00:17,  6.05it/s]

(17428800,)
(13187200,)


Processing DAIC-WOZ sessions:  44%|████▍     | 83/189 [00:10<00:16,  6.51it/s]

(21905600,)
(16908800,)


Processing DAIC-WOZ sessions:  46%|████▌     | 86/189 [00:10<00:13,  7.86it/s]

(8580800,)
(16630400,)
(9644800,)


Processing DAIC-WOZ sessions:  47%|████▋     | 88/189 [00:10<00:10,  9.48it/s]

(13299200,)
(15251200,)
(21707200,)


Processing DAIC-WOZ sessions:  49%|████▊     | 92/189 [00:11<00:09,  9.94it/s]

(10881600,)
(10491200,)
(10080000,)


Processing DAIC-WOZ sessions:  50%|████▉     | 94/189 [00:11<00:09, 10.22it/s]

(14227200,)
(12486400,)


Processing DAIC-WOZ sessions:  51%|█████     | 96/189 [00:11<00:09,  9.83it/s]

(15252800,)
(11744000,)


Processing DAIC-WOZ sessions:  52%|█████▏    | 99/189 [00:12<00:09,  9.21it/s]

(14987200,)
(14947200,)


Processing DAIC-WOZ sessions:  53%|█████▎    | 101/189 [00:12<00:09,  8.85it/s]

(15287980,)
(13800000,)


Processing DAIC-WOZ sessions:  54%|█████▍    | 102/189 [00:12<00:10,  8.19it/s]

(18089600,)
(25123200,)


Processing DAIC-WOZ sessions:  54%|█████▍    | 103/189 [00:12<00:12,  6.65it/s]

(11548800,)
(20038400,)


Processing DAIC-WOZ sessions:  56%|█████▌    | 105/189 [00:12<00:12,  6.97it/s]

(11454400,)
(16518400,)


Processing DAIC-WOZ sessions:  57%|█████▋    | 108/189 [00:13<00:11,  7.21it/s]

(17040000,)
(22126400,)


Processing DAIC-WOZ sessions:  59%|█████▊    | 111/189 [00:13<00:10,  7.21it/s]

(13710400,)
(15948800,)


Processing DAIC-WOZ sessions:  59%|█████▉    | 112/189 [00:13<00:10,  7.25it/s]

(15748800,)
(12491200,)


Processing DAIC-WOZ sessions:  61%|██████    | 115/189 [00:14<00:09,  8.00it/s]

(13852800,)
(14180800,)


Processing DAIC-WOZ sessions:  61%|██████▏   | 116/189 [00:14<00:09,  7.62it/s]

(17427200,)
(14500800,)


Processing DAIC-WOZ sessions:  63%|██████▎   | 119/189 [00:14<00:08,  8.09it/s]

(16782400,)
(13392000,)


Processing DAIC-WOZ sessions:  63%|██████▎   | 120/189 [00:14<00:09,  7.20it/s]

(21353600,)
(15934400,)


Processing DAIC-WOZ sessions:  65%|██████▍   | 122/189 [00:15<00:09,  6.96it/s]

(17003200,)
(18476800,)


Processing DAIC-WOZ sessions:  66%|██████▌   | 125/189 [00:15<00:08,  7.93it/s]

(13708800,)
(13948800,)


Processing DAIC-WOZ sessions:  67%|██████▋   | 127/189 [00:15<00:07,  8.84it/s]

(11265600,)
(15320000,)


Processing DAIC-WOZ sessions:  68%|██████▊   | 129/189 [00:15<00:06,  8.88it/s]

(14540800,)
(13340800,)


Processing DAIC-WOZ sessions:  69%|██████▉   | 130/189 [00:16<00:06,  8.72it/s]

(14974400,)
(12830400,)


Processing DAIC-WOZ sessions:  70%|██████▉   | 132/189 [00:16<00:07,  7.95it/s]

(21118400,)
(18960000,)


Processing DAIC-WOZ sessions:  71%|███████▏  | 135/189 [00:16<00:06,  8.36it/s]

(11006400,)
(13593600,)


Processing DAIC-WOZ sessions:  72%|███████▏  | 136/189 [00:16<00:07,  7.11it/s]

(22878400,)


Processing DAIC-WOZ sessions:  72%|███████▏  | 137/189 [00:17<00:08,  6.48it/s]

(20553600,)


Processing DAIC-WOZ sessions:  73%|███████▎  | 138/189 [00:17<00:08,  6.25it/s]

(22625600,)
(15540800,)


Processing DAIC-WOZ sessions:  74%|███████▍  | 140/189 [00:17<00:06,  7.04it/s]

(15200000,)
(11220800,)


Processing DAIC-WOZ sessions:  75%|███████▌  | 142/189 [00:17<00:06,  7.14it/s]

(20968000,)
(11353600,)


Processing DAIC-WOZ sessions:  76%|███████▌  | 144/189 [00:18<00:05,  7.63it/s]

(15974400,)
(13124800,)


Processing DAIC-WOZ sessions:  77%|███████▋  | 146/189 [00:18<00:05,  7.23it/s]

(19408000,)
(16475200,)


Processing DAIC-WOZ sessions:  78%|███████▊  | 148/189 [00:18<00:06,  6.66it/s]

(20403200,)
(19014400,)


Processing DAIC-WOZ sessions:  80%|███████▉  | 151/189 [00:19<00:05,  7.49it/s]

(14225600,)
(16515200,)


Processing DAIC-WOZ sessions:  81%|████████  | 153/189 [00:19<00:04,  8.44it/s]

(12865600,)
(12006400,)
(14123200,)


Processing DAIC-WOZ sessions:  82%|████████▏ | 155/189 [00:19<00:04,  8.11it/s]

(15396800,)
(15195200,)


Processing DAIC-WOZ sessions:  84%|████████▎ | 158/189 [00:19<00:03,  8.60it/s]

(15601600,)
(15704000,)


Processing DAIC-WOZ sessions:  84%|████████▍ | 159/189 [00:20<00:03,  8.15it/s]

(15441600,)
(13355200,)


Processing DAIC-WOZ sessions:  85%|████████▌ | 161/189 [00:20<00:03,  8.29it/s]

(15684800,)
(18614400,)


Processing DAIC-WOZ sessions:  86%|████████▌ | 163/189 [00:20<00:04,  6.49it/s]

(25064000,)
(16038400,)


Processing DAIC-WOZ sessions:  87%|████████▋ | 165/189 [00:20<00:03,  7.06it/s]

(15036800,)
(18025600,)


Processing DAIC-WOZ sessions:  88%|████████▊ | 167/189 [00:21<00:03,  7.05it/s]

(15286400,)
(15924800,)


Processing DAIC-WOZ sessions:  89%|████████▉ | 169/189 [00:21<00:02,  7.64it/s]

(14448000,)
(8532800,)
(14838400,)


Processing DAIC-WOZ sessions:  92%|█████████▏| 173/189 [00:21<00:01, 10.85it/s]

(9395200,)
(9740800,)
(19915200,)


Processing DAIC-WOZ sessions:  93%|█████████▎| 175/189 [00:22<00:01,  9.02it/s]

(14979200,)
(14566400,)
(13851200,)


Processing DAIC-WOZ sessions:  94%|█████████▍| 178/189 [00:22<00:01,  8.67it/s]

(17862400,)
(16304000,)


Processing DAIC-WOZ sessions:  95%|█████████▌| 180/189 [00:22<00:01,  7.08it/s]

(25232000,)
(15928000,)


Processing DAIC-WOZ sessions:  97%|█████████▋| 183/189 [00:23<00:00,  8.59it/s]

(9643200,)
(10916800,)
(16376000,)


Processing DAIC-WOZ sessions:  98%|█████████▊| 185/189 [00:23<00:00,  8.40it/s]

(14158400,)
(11275200,)
(11060800,)


Processing DAIC-WOZ sessions: 100%|██████████| 189/189 [00:23<00:00,  8.00it/s]


(14107200,)
(14600000,)
Processing ../datasets/EDAIC-WOZ -> ../datasets/EDAIC-WOZ-preprocessed


Processing EDAIC-WOZ sessions:   3%|▎         | 1/29 [00:00<00:02,  9.88it/s]

(12446240,)
(10028960,)


Processing EDAIC-WOZ sessions:  10%|█         | 3/29 [00:00<00:03,  8.16it/s]

(21824480,)
(13589760,)


Processing EDAIC-WOZ sessions:  17%|█▋        | 5/29 [00:01<00:10,  2.19it/s]

(2,)
(26989280,)


Processing EDAIC-WOZ sessions:  24%|██▍       | 7/29 [00:03<00:15,  1.40it/s]

(2,)
(8401280,)


Processing EDAIC-WOZ sessions:  31%|███       | 9/29 [00:04<00:11,  1.74it/s]

(2,)
(12079360,)


Processing EDAIC-WOZ sessions:  38%|███▊      | 11/29 [00:05<00:09,  1.84it/s]

(2,)
(13681440,)
(15811040,)


Processing EDAIC-WOZ sessions:  48%|████▊     | 14/29 [00:07<00:08,  1.69it/s]

(2,)


Processing EDAIC-WOZ sessions:  52%|█████▏    | 15/29 [00:08<00:10,  1.38it/s]

(2,)
(9630560,)


Processing EDAIC-WOZ sessions:  59%|█████▊    | 17/29 [00:09<00:07,  1.62it/s]

(2,)


Processing EDAIC-WOZ sessions:  62%|██████▏   | 18/29 [00:10<00:08,  1.32it/s]

(2,)
(11759360,)


Processing EDAIC-WOZ sessions:  72%|███████▏  | 21/29 [00:11<00:04,  1.73it/s]

(2,)
(18032800,)


Processing EDAIC-WOZ sessions:  76%|███████▌  | 22/29 [00:12<00:04,  1.56it/s]

(2,)


Processing EDAIC-WOZ sessions:  83%|████████▎ | 24/29 [00:13<00:03,  1.65it/s]

(2,)
(12217280,)


Processing EDAIC-WOZ sessions:  93%|█████████▎| 27/29 [00:15<00:00,  2.10it/s]

(2,)
(11280000,)
(12575840,)


Processing EDAIC-WOZ sessions: 100%|██████████| 29/29 [00:16<00:00,  1.78it/s]

(2,)
(15515360,)





In [12]:
def analyze_audio_lengths(dataset_dir):
    """
    Analizza le lunghezze degli audio per determinare parametri ottimali
    
    Returns:
        dict: Statistiche delle durate
    """
    print("Analizzando lunghezze audio...")
    durations = []
    session_names = []
    session_dirs = sorted([d for d in os.listdir(dataset_dir) \
                   if os.path.isdir(os.path.join(dataset_dir, d)) and d.endswith('_P')])

    for session in tqdm(session_dirs, desc="Analisi campione"):
        session_path = os.path.join(dataset_dir, session)
        session_id = session.replace("_P", "")
        transcript_path = os.path.join(session_path, f"{session_id}_TRANSCRIPT.csv")
        
        transcript = pd.read_csv(transcript_path, sep='\t')
        durations.append(transcript['stop_time'].max())
        session_names.append(session_id) 
    
    durations = np.array(durations)
    min_idx = np.argmin(durations)
    max_idx = np.argmax(durations)
    
    stats = {
        'count': len(durations),
        'mean': np.mean(durations),
        'median': np.median(durations),
        'std': np.std(durations),
        'min': np.min(durations),
        'max': np.max(durations),
        'min_audio': session_names[min_idx],  # Nome audio con durata minima
        'max_audio': session_names[max_idx],  # Nome audio con durata massima
        'q25': np.percentile(durations, 25),
        'q75': np.percentile(durations, 75)
    }
    
    return stats

In [13]:
def apply_pitch_shift(audio, sr, n_steps):
    """
    Applica pitch shift all'audio
    
    Args:
        audio: Array audio
        sr: Sample rate
        n_steps: Numero di semitoni da spostare (+/- valori)
    
    Returns:
        Audio con pitch modificato
    """
    return librosa.effects.pitch_shift(audio, sr=sr, n_steps=-n_steps)

def create_tone_change_augmentation(dataset_dirs, labels_dict, dataset_csv_path, percentage=0.2, pitch_steps=2):
    """
    Crea augmentation con tone change per i campioni di classe 1
    
    Args:
        dataset_dirs: Lista delle directory dei dataset preprocessati
        labels_dict: Dizionario con le etichette {session_id: label}
        dataset_csv_path: Path del file dataset.csv da aggiornare
        percentage: Percentuale di campioni da modificare
        pitch_steps: Numero di semitoni per il pitch shift
    
    Returns:
        Lista delle nuove entry create
    """
    print(f"Creando augmentation con tone change per il {percentage*100}% dei campioni di classe 1...")
    
    # Carica il dataset CSV esistente
    dataset_df = pd.read_csv(dataset_csv_path)
    new_entries = []
    
    for dataset_dir in dataset_dirs:
        print(f"Processando {dataset_dir}...")
        
        # Trova tutti i campioni di classe 1 nel dataset
        class1_sessions = []
        session_dirs = sorted([d for d in os.listdir(dataset_dir) 
                              if os.path.isdir(os.path.join(dataset_dir, d)) and d.endswith('_P')])
        
        for session in session_dirs:
            session_id = session.replace('_P', '')
            if labels_dict.get(session_id, 0) == 1:  # Classe 1 (depresso)
                class1_sessions.append(session)
        
        print(f"Trovati {len(class1_sessions)} campioni di classe 1 in {os.path.basename(dataset_dir)}")
        
        if len(class1_sessions) == 0:
            print(f"Nessun campione di classe 1 trovato in {dataset_dir}")
            continue
            
        # Seleziona casualmente la percentuale specificata dei campioni di classe 1
        num_to_modify = int(len(class1_sessions) * percentage)
        if num_to_modify == 0:
            print(f"Nessun campione da modificare in {dataset_dir}")
            continue
            
        sessions_to_modify = random.sample(class1_sessions, num_to_modify)
        
        print(f"Creando {num_to_modify} nuovi campioni ({percentage*100}%) in {os.path.basename(dataset_dir)}")
        
        # Crea tone change per i campioni selezionati
        for session in tqdm(sessions_to_modify, desc=f"Tone change {os.path.basename(dataset_dir)}"):
            session_path = os.path.join(dataset_dir, session)
            session_id = session.replace('_P', '')
            
            # File originali
            original_audio_path = os.path.join(session_path, f"{session_id}_AUDIO.wav")
            original_transcript_path = os.path.join(session_path, f"{session_id}_TRANSCRIPT.csv")
            
            # Nuovi nomi per i file augmentati
            new_session_id = f"{session_id}_tone"
            new_session_dir = f"{session_id}_tone_P"
            
            # Crea directory per il nuovo campione
            new_session_path = os.path.join(dataset_dir, new_session_dir)
            os.makedirs(new_session_path, exist_ok=True)
            
            # Percorsi per i nuovi file
            new_audio_path = os.path.join(new_session_path, f"{new_session_id}_AUDIO.wav")
            new_transcript_path = os.path.join(new_session_path, f"{new_session_id}_TRANSCRIPT.csv")
            
            # Leggi e modifica audio
            audio, sr = sf.read(original_audio_path)
            modified_audio = apply_pitch_shift(audio, sr, pitch_steps)
            
            # Salva nuovo audio
            sf.write(new_audio_path, modified_audio, sr, subtype='PCM_16')
            
            # Copia il transcript (uguale all'originale)
            original_transcript = pd.read_csv(original_transcript_path, sep='\t')
            original_transcript.to_csv(new_transcript_path, sep='\t', index=False)
            
            # Crea entry per il dataset.csv
            dataset_df['Participant_ID'] = dataset_df['Participant_ID'].astype(str)
            original_entry = dataset_df[dataset_df['Participant_ID'] == session_id].iloc[0]
            new_entry = {
                'session_id': new_session_id,
                'label': original_entry['PHQ_Binary'],  # Mantiene la stessa etichetta
                'augmentation': 'tone_change'  # Aggiungi colonna per indicare l'augmentation
            }
            
            # Aggiungi altre colonne se presenti nel dataset originale
            for col in dataset_df.columns:
                if col not in new_entry:
                    new_entry[col] = original_entry[col]
            
            new_entries.append(new_entry)
            
        print(f"Tone change completato per {len(sessions_to_modify)} campioni in {os.path.basename(dataset_dir)}")
    
    # Aggiungi le nuove entry al dataset
    if new_entries:
        new_df = pd.DataFrame(new_entries)
        
        # Aggiungi colonna augmentation se non esiste
        if 'augmentation' not in dataset_df.columns:
            dataset_df['augmentation'] = 'none'
            
        # Concatena i nuovi dati
        updated_dataset_df = pd.concat([dataset_df, new_df], ignore_index=True)
        
        # Salva il dataset aggiornato
        updated_dataset_df.to_csv(dataset_csv_path, index=False)
        print(f"Dataset aggiornato con {len(new_entries)} nuove entry salvato in {dataset_csv_path}")
    
    return new_entries

In [14]:
# Carica il dataset CSV per ottenere le etichette e la percentuale
dataset_csv_path = "../datasets/dataset.csv"
dataset_df = pd.read_csv(dataset_csv_path)
print(dataset_df.dtypes)
# Crea dizionario delle etichette
labels_dict = dict(zip(dataset_df['Participant_ID'].astype(str), dataset_df['PHQ_Binary']))

# Leggi la percentuale dal CSV (assumendo che ci sia una colonna 'tone_change_percentage')
# Se non esiste, usa un valore di default
if 'tone_change_percentage' in dataset_df.columns:
    # Prendi il primo valore non-null della colonna percentuale
    percentage = dataset_df['tone_change_percentage'].dropna().iloc[0] if not dataset_df['tone_change_percentage'].dropna().empty else 0.2
else:
    percentage = 0.2  # Default 20%

print(f"Percentuale per tone change: {percentage*100}%")
print(f"Numero totale di sessioni: {len(labels_dict)}")
print(f"Sessioni di classe 1: {sum(1 for v in labels_dict.values() if v == 1)}")

# Directory dei dataset preprocessati
preprocessed_dirs = ["../datasets/DAIC-WOZ-preprocessed", "../datasets/EDAIC-WOZ-preprocessed"]

# Crea augmentation con tone change
new_entries = create_tone_change_augmentation(
    preprocessed_dirs, 
    labels_dict, 
    dataset_csv_path, 
    percentage=percentage, 
    pitch_steps=1.5
)

print(f"Creati {len(new_entries)} nuovi campioni con tone change")
for entry in new_entries[:5]:  # Mostra i primi 5 per verifica
    print(f"- {entry['Participant_ID']} , label: {entry['PHQ_Binary']})")

Participant_ID    int64
PHQ_Binary        int64
dtype: object
Percentuale per tone change: 20.0%
Numero totale di sessioni: 218
Sessioni di classe 1: 86
Creando augmentation con tone change per il 20.0% dei campioni di classe 1...
Processando ../datasets/DAIC-WOZ-preprocessed...
Trovati 57 campioni di classe 1 in DAIC-WOZ-preprocessed
Creando 11 nuovi campioni (20.0%) in DAIC-WOZ-preprocessed


Tone change DAIC-WOZ-preprocessed: 100%|██████████| 11/11 [00:17<00:00,  1.59s/it]


Tone change completato per 11 campioni in DAIC-WOZ-preprocessed
Processando ../datasets/EDAIC-WOZ-preprocessed...
Trovati 29 campioni di classe 1 in EDAIC-WOZ-preprocessed
Creando 5 nuovi campioni (20.0%) in EDAIC-WOZ-preprocessed


Tone change EDAIC-WOZ-preprocessed: 100%|██████████| 5/5 [00:04<00:00,  1.07it/s]

Tone change completato per 5 campioni in EDAIC-WOZ-preprocessed
Dataset aggiornato con 16 nuove entry salvato in ../datasets/dataset.csv
Creati 16 nuovi campioni con tone change
- 308 , label: 1)
- 388 , label: 1)
- 413 , label: 1)
- 330 , label: 1)
- 352 , label: 1)



