In [1]:
import os
import numpy as np
import pandas as pd
import soundfile as sf
from tqdm import tqdm
import librosa
import random

In [2]:
def clean_audio_and_transcript(input_audio_path, transcript_path, output_audio_path, output_transcript_path,
                               buffer_sec=0.1, min_gap_for_pause=2.0, pause_duration=0.5):
    """
    Clean audio and transcript by extracting only participant segments
    
    Args:
        input_audio_path: Path to input audio file
        transcript_path: Path to input transcript CSV
        output_audio_path: Path for cleaned audio output
        output_transcript_path: Path for cleaned transcript output
        buffer_sec: Buffer time in seconds to add around segments
        min_gap_for_pause: Minimum gap between segments to insert a pause (seconds)
        pause_duration: Duration of inserted pause (seconds)
    
    Returns:
        tuple: (original_duration, cleaned_duration)
    """
    # Lettura file
    audio, sr = sf.read(input_audio_path)
    if audio.ndim > 1:
        audio = librosa.to_mono(audio)
    transcript = pd.read_csv(transcript_path, sep='\t')

    # Filtra solo segmenti partecipante
    participant_segments = transcript[
        (transcript['speaker'] == 'Participant') &
        (~transcript['value'].str.contains('scrubbed_entry|<synch>|<sync>', na=False))
    ].copy()
    
    # Calcola posizioni in samples e aggiungi buffer
    buffer_samples = int(buffer_sec * sr)
    starts = ((participant_segments['start_time'] * sr).astype(np.int32) - buffer_samples).clip(lower=0)
    ends = ((participant_segments['stop_time'] * sr).astype(np.int32) + buffer_samples).clip(upper=len(audio))

    # Controlla gap tra segmenti consecutivi del partecipante
    insert_pause = []
    for i in range(len(participant_segments) - 1):
        current_end = participant_segments.iloc[i]['stop_time']
        next_start = participant_segments.iloc[i + 1]['start_time']
        gap_duration = next_start - current_end
        insert_pause.append(gap_duration >= min_gap_for_pause)

    # Estrai segmenti audio e inserisci pause quando necessario
    audio_segments = []
    pause_samples = np.zeros(int(pause_duration * sr), dtype=np.float32)

    for i, (s, e) in enumerate(zip(starts, ends)):
        audio_segments.append(audio[s:e])
        
        # Inserisci pausa se il gap era >= 2 secondi
        if i < len(insert_pause) and insert_pause[i]:
            audio_segments.append(pause_samples)
    
    cleaned_audio = np.concatenate(audio_segments)
    
    # Aggiorna timestamp considerando le pause inserite
    current_time = 0.0
    new_starts = []
    new_stops = []
    
    for i, (s, e) in enumerate(zip(starts, ends)):
        segment_duration = (e - s) / sr
        new_starts.append(current_time)
        current_time += segment_duration
        new_stops.append(current_time)
        
        # Aggiungi tempo di pausa se inserita
        if i < len(insert_pause) and insert_pause[i]:
            current_time += pause_duration

    # Aggiorna timestamp e rimuovi colonna speaker
    participant_segments['start_time'] = new_starts
    participant_segments['stop_time'] = new_stops
    participant_segments = participant_segments.drop('speaker', axis=1)
    
    # Salva
    sf.write(output_audio_path, cleaned_audio, sr, subtype='PCM_16')
    participant_segments.to_csv(output_transcript_path, sep='\t', index=False)
    
    original_duration = len(audio) / sr
    cleaned_duration = len(cleaned_audio) / sr

    return original_duration, cleaned_duration

In [3]:
# Elaborazione dataset - salva in directory separate
dataset_configs = [
    {"input_dir": "../datasets/DAIC-WOZ", "output_dir": "../datasets/DAIC-WOZ-preprocessed"},
    {"input_dir": "../datasets/EDAIC-WOZ", "output_dir": "../datasets/EDAIC-WOZ-preprocessed"}
]

for config in dataset_configs:
    dataset_dir = config["input_dir"]
    output_dir = config["output_dir"]
    os.makedirs(output_dir, exist_ok=True)
    
    dataset_sessions = sorted([d for d in os.listdir(dataset_dir) \
                   if os.path.isdir(os.path.join(dataset_dir, d)) and d.endswith('_P')])

    print(f"Processing {dataset_dir} -> {output_dir}")
    for session in tqdm(dataset_sessions, desc=f"Processing {os.path.basename(dataset_dir)} sessions"):
        session_path = os.path.join(dataset_dir, session)
        session_id = session.replace("_P", "")

        # Percorsi file input
        audio_path = os.path.join(session_path, f"{session_id}_AUDIO.wav")
        transcript_path = os.path.join(session_path, f"{session_id}_TRANSCRIPT.csv")
        
        # Percorsi file output
        output_session_path = os.path.join(output_dir, session)
        os.makedirs(output_session_path, exist_ok=True)
        output_audio_path = os.path.join(output_session_path, f"{session_id}_AUDIO.wav")
        output_transcript_path = os.path.join(output_session_path, f"{session_id}_TRANSCRIPT.csv")
        
        # Processa sessione
        original_duration, cleaned_duration = clean_audio_and_transcript(
            audio_path, transcript_path, output_audio_path, output_transcript_path, 
            buffer_sec=0.1, min_gap_for_pause=2.0, pause_duration=0.5
        )

Processing ../datasets/DAIC-WOZ -> ../datasets/DAIC-WOZ-preprocessed


Processing DAIC-WOZ sessions: 100%|██████████| 189/189 [00:15<00:00, 12.35it/s]


Processing ../datasets/EDAIC-WOZ -> ../datasets/EDAIC-WOZ-preprocessed


Processing EDAIC-WOZ sessions: 100%|██████████| 29/29 [00:10<00:00,  2.74it/s]


In [4]:
def analyze_audio_lengths(dataset_dir):
    """
    Analizza le lunghezze degli audio per determinare parametri ottimali
    
    Returns:
        dict: Statistiche delle durate
    """
    print("Analizzando lunghezze audio...")
    durations = []
    session_names = []
    session_dirs = sorted([d for d in os.listdir(dataset_dir) \
                   if os.path.isdir(os.path.join(dataset_dir, d)) and d.endswith('_P')])

    for session in tqdm(session_dirs, desc="Analisi campione"):
        session_path = os.path.join(dataset_dir, session)
        session_id = session.replace("_P", "")
        transcript_path = os.path.join(session_path, f"{session_id}_TRANSCRIPT.csv")
        
        transcript = pd.read_csv(transcript_path, sep='\t')
        durations.append(transcript['stop_time'].max())
        session_names.append(session_id) 
    
    durations = np.array(durations)
    min_idx = np.argmin(durations)
    max_idx = np.argmax(durations)
    
    stats = {
        'count': len(durations),
        'mean': np.mean(durations),
        'median': np.median(durations),
        'std': np.std(durations),
        'min': np.min(durations),
        'max': np.max(durations),
        'min_audio': session_names[min_idx],  # Nome audio con durata minima
        'max_audio': session_names[max_idx],  # Nome audio con durata massima
        'q25': np.percentile(durations, 25),
        'q75': np.percentile(durations, 75)
    }
    
    return stats

In [5]:
def apply_pitch_shift(audio, sr, n_steps):
    """
    Applica pitch shift all'audio
    
    Args:
        audio: Array audio
        sr: Sample rate
        n_steps: Numero di semitoni da spostare (+/- valori)
    
    Returns:
        Audio con pitch modificato
    """
    return librosa.effects.pitch_shift(audio, sr=sr, n_steps=n_steps)

def create_tone_change_augmentation(dataset_dirs, labels_dict, dataset_csv_path, percentage=0.2, pitch_steps=2):
    """
    Crea augmentation con tone change per i campioni di classe 1
    
    Args:
        dataset_dirs: Lista delle directory dei dataset preprocessati
        labels_dict: Dizionario con le etichette {session_id: label}
        dataset_csv_path: Path del file dataset.csv da aggiornare
        percentage: Percentuale di campioni da modificare
        pitch_steps: Numero di semitoni per il pitch shift
    
    Returns:
        Lista delle nuove entry create
    """
    print(f"Creando augmentation con tone change per il {percentage*100}% dei campioni di classe 1...")
    
    # Carica il dataset CSV esistente
    dataset_df = pd.read_csv(dataset_csv_path)
    new_entries = []
    
    for dataset_dir in dataset_dirs:
        print(f"Processando {dataset_dir}...")
        
        # Trova tutti i campioni di classe 1 nel dataset
        class1_sessions = []
        session_dirs = sorted([d for d in os.listdir(dataset_dir) 
                              if os.path.isdir(os.path.join(dataset_dir, d)) and d.endswith('_P')])
        
        for session in session_dirs:
            session_id = session.replace('_P', '')
            if labels_dict.get(session_id, 0) == 1:  # Classe 1 (depresso)
                class1_sessions.append(session)
        
        print(f"Trovati {len(class1_sessions)} campioni di classe 1 in {os.path.basename(dataset_dir)}")
        
        if len(class1_sessions) == 0:
            print(f"Nessun campione di classe 1 trovato in {dataset_dir}")
            continue
            
        # Seleziona casualmente la percentuale specificata dei campioni di classe 1
        num_to_modify = int(len(class1_sessions) * percentage)
        if num_to_modify == 0:
            print(f"Nessun campione da modificare in {dataset_dir}")
            continue
            
        sessions_to_modify = random.sample(class1_sessions, num_to_modify)
        
        print(f"Creando {num_to_modify} nuovi campioni ({percentage*100}%) in {os.path.basename(dataset_dir)}")
        
        # Crea tone change per i campioni selezionati
        for session in tqdm(sessions_to_modify, desc=f"Tone change {os.path.basename(dataset_dir)}"):
            session_path = os.path.join(dataset_dir, session)
            session_id = session.replace('_P', '')
            
            # File originali
            original_audio_path = os.path.join(session_path, f"{session_id}_AUDIO.wav")
            original_transcript_path = os.path.join(session_path, f"{session_id}_TRANSCRIPT.csv")
            
            # Nuovi nomi per i file augmentati
            new_session_id = f"{session_id}_tone"
            new_session_dir = f"{session_id}_tone_P"
            
            # Crea directory per il nuovo campione
            new_session_path = os.path.join(dataset_dir, new_session_dir)
            os.makedirs(new_session_path, exist_ok=True)
            
            # Percorsi per i nuovi file
            new_audio_path = os.path.join(new_session_path, f"{new_session_id}_AUDIO.wav")
            new_transcript_path = os.path.join(new_session_path, f"{new_session_id}_TRANSCRIPT.csv")
            
            # Leggi e modifica audio
            audio, sr = sf.read(original_audio_path)
            modified_audio = apply_pitch_shift(audio, sr, pitch_steps)
            
            # Salva nuovo audio
            sf.write(new_audio_path, modified_audio, sr, subtype='PCM_16')
            
            # Copia il transcript (uguale all'originale)
            original_transcript = pd.read_csv(original_transcript_path, sep='\t')
            original_transcript.to_csv(new_transcript_path, sep='\t', index=False)
            
            # Crea entry per il dataset.csv
            original_entry = dataset_df[dataset_df['Participant_ID'] == session_id].iloc[0]
            new_entry = {
                'session_id': new_session_id,
                'label': original_entry['PHQ_Binary'],  # Mantiene la stessa etichetta
                'dataset': original_entry['dataset'],
                'augmentation': 'tone_change'  # Aggiungi colonna per indicare l'augmentation
            }
            
            # Aggiungi altre colonne se presenti nel dataset originale
            for col in dataset_df.columns:
                if col not in new_entry:
                    new_entry[col] = original_entry[col]
            
            new_entries.append(new_entry)
            
        print(f"Tone change completato per {len(sessions_to_modify)} campioni in {os.path.basename(dataset_dir)}")
    
    # Aggiungi le nuove entry al dataset
    if new_entries:
        new_df = pd.DataFrame(new_entries)
        
        # Aggiungi colonna augmentation se non esiste
        if 'augmentation' not in dataset_df.columns:
            dataset_df['augmentation'] = 'none'
            
        # Concatena i nuovi dati
        updated_dataset_df = pd.concat([dataset_df, new_df], ignore_index=True)
        
        # Salva il dataset aggiornato
        updated_dataset_df.to_csv(dataset_csv_path, index=False)
        print(f"Dataset aggiornato con {len(new_entries)} nuove entry salvato in {dataset_csv_path}")
    
    return new_entries

In [6]:
# Carica il dataset CSV per ottenere le etichette e la percentuale
dataset_csv_path = "../datasets/dataset.csv"
dataset_df = pd.read_csv(dataset_csv_path)

# Crea dizionario delle etichette
labels_dict = dict(zip(dataset_df['Participant_ID'].astype(str), dataset_df['PHQ_Binary']))

# Leggi la percentuale dal CSV (assumendo che ci sia una colonna 'tone_change_percentage')
# Se non esiste, usa un valore di default
if 'tone_change_percentage' in dataset_df.columns:
    # Prendi il primo valore non-null della colonna percentuale
    percentage = dataset_df['tone_change_percentage'].dropna().iloc[0] if not dataset_df['tone_change_percentage'].dropna().empty else 0.2
else:
    percentage = 0.2  # Default 20%

print(f"Percentuale per tone change: {percentage*100}%")
print(f"Numero totale di sessioni: {len(labels_dict)}")
print(f"Sessioni di classe 1: {sum(1 for v in labels_dict.values() if v == 1)}")

# Directory dei dataset preprocessati
preprocessed_dirs = ["../datasets/DAIC-WOZ-preprocessed", "../datasets/EDAIC-WOZ-preprocessed"]

# Crea augmentation con tone change
new_entries = create_tone_change_augmentation(
    preprocessed_dirs, 
    labels_dict, 
    dataset_csv_path, 
    percentage=percentage, 
    pitch_steps=2
)

print(f"Creati {len(new_entries)} nuovi campioni con tone change")
for entry in new_entries[:5]:  # Mostra i primi 5 per verifica
    print(f"- {entry['Participant_ID']} (dataset: {entry['dataset']}, label: {entry['PHQ_Binary']})")

Percentuale per tone change: 20.0%
Numero totale di sessioni: 218
Sessioni di classe 1: 86
Creando augmentation con tone change per il 20.0% dei campioni di classe 1...
Processando ../datasets/DAIC-WOZ-preprocessed...
Trovati 57 campioni di classe 1 in DAIC-WOZ-preprocessed
Creando 11 nuovi campioni (20.0%) in DAIC-WOZ-preprocessed


Tone change DAIC-WOZ-preprocessed:   0%|          | 0/11 [00:01<?, ?it/s]


IndexError: single positional indexer is out-of-bounds