In [1]:
import os
import shutil
import numpy as np
import pandas as pd
import soundfile as sf
from tqdm import tqdm

In [2]:
def clean_audio_and_transcript(input_audio_path, transcript_path, output_audio_path, output_transcript_path,
                               buffer_sec=0.1, min_gap_for_pause=2.0, pause_duration=0.5):
    """
    Keep only the participant's speech from a session and rebuild its transcript.

    Rows of the tab-separated transcript whose ``speaker`` is ``'Participant'`` are
    kept, except rows whose ``value`` contains ``scrubbed_entry``, ``<synch>`` or
    ``<sync>``. The matching audio spans (widened by ``buffer_sec`` on each side)
    are concatenated in transcript order, and a block of silence is inserted between
    two kept rows whose original gap is at least ``min_gap_for_pause`` seconds.
    The kept rows are re-timed to the concatenated audio and written without the
    ``speaker`` column.

    Args:
        input_audio_path: Path to input audio file
        transcript_path: Path to input transcript CSV (tab-separated, with
            ``start_time``, ``stop_time``, ``speaker`` and ``value`` columns)
        output_audio_path: Path for cleaned audio output (written as PCM_16)
        output_transcript_path: Path for cleaned transcript output
        buffer_sec: Buffer time in seconds to add around segments
        min_gap_for_pause: Minimum gap between segments to insert a pause (seconds)
        pause_duration: Duration of inserted pause (seconds)

    Returns:
        tuple: (original_duration, cleaned_duration), i.e. the number of audio
        frames before and after cleaning divided by the sample rate.
    """
    # Read inputs
    audio, sr = sf.read(input_audio_path)
    transcript = pd.read_csv(transcript_path, sep='\t')

    # Keep participant rows only, dropping scrubbed / sync marker entries
    keep_mask = (
        (transcript['speaker'] == 'Participant') &
        (~transcript['value'].str.contains('scrubbed_entry|<synch>|<sync>', na=False))
    )
    participant_segments = transcript[keep_mask].copy()

    # Convert times to sample indices (int64 so long/high-rate audio cannot overflow)
    # and widen each span by the buffer.
    total_samples = len(audio)
    buffer_samples = int(buffer_sec * sr)
    starts = (participant_segments['start_time'] * sr).astype(np.int64).to_numpy() - buffer_samples
    ends = (participant_segments['stop_time'] * sr).astype(np.int64).to_numpy() + buffer_samples
    # Clamp both ends into the audio and force end >= start, so a row that lies
    # beyond the end of the recording yields an empty slice with duration 0
    # instead of a negative duration.
    starts = np.clip(starts, 0, total_samples)
    ends = np.clip(ends, starts, total_samples)
    # NOTE(review): when two kept rows are closer than 2 * buffer_sec their buffered
    # spans overlap, so the shared samples appear twice in the output.

    # insert_pause[i] is True when the original gap between kept row i and kept
    # row i + 1 is long enough to deserve a pause.
    gaps = (participant_segments['start_time'].to_numpy()[1:]
            - participant_segments['stop_time'].to_numpy()[:-1])
    insert_pause = gaps >= min_gap_for_pause

    # Silence block with the same channel layout and dtype as the audio, so that it
    # can be concatenated with mono and multi-channel recordings alike.
    pause_len = int(pause_duration * sr)
    pause_samples = np.zeros((pause_len,) + audio.shape[1:], dtype=audio.dtype)

    # Extract the spans and compute the new timestamps in one pass. The cursor is
    # kept in samples so the timestamps match the written audio exactly (the pause
    # contributes the samples actually inserted, not the requested duration).
    audio_segments = []
    new_starts = []
    new_stops = []
    cursor = 0
    for i, (s, e) in enumerate(zip(starts, ends)):
        s, e = int(s), int(e)
        audio_segments.append(audio[s:e])
        new_starts.append(cursor / sr)
        cursor += e - s
        new_stops.append(cursor / sr)

        if i < len(insert_pause) and insert_pause[i]:
            audio_segments.append(pause_samples)
            cursor += pause_len

    # np.concatenate rejects an empty list; with no participant rows the cleaned
    # audio is an empty array with the input's dtype and channel layout.
    cleaned_audio = np.concatenate(audio_segments) if audio_segments else audio[:0]

    # Re-time the kept rows and drop the now-constant speaker column
    participant_segments['start_time'] = new_starts
    participant_segments['stop_time'] = new_stops
    participant_segments = participant_segments.drop('speaker', axis=1)

    # Save
    sf.write(output_audio_path, cleaned_audio, sr, subtype='PCM_16')
    participant_segments.to_csv(output_transcript_path, sep='\t', index=False)

    original_duration = len(audio) / sr
    cleaned_duration = len(cleaned_audio) / sr

    return original_duration, cleaned_duration

In [3]:
# Dataset processing: clean every session of the raw dataset into output_dir.
dataset_dir = "datasets/DAIC-WOZ"
output_dir = "datasets/DAIC-WOZ-Cleaned"
os.makedirs(output_dir, exist_ok=True)

# Totals cover only the sessions processed in this run (skipped sessions add nothing).
total_original = 0.0
total_cleaned = 0.0

# Sessions are the sub-directories named "<id>_P"; sorted for a reproducible order.
session_dirs = sorted(
    d for d in os.listdir(dataset_dir)
    if d.endswith("_P") and os.path.isdir(os.path.join(dataset_dir, d))
)

# Skip sessions that were already processed. The output directory is created
# before processing starts, so its mere existence does not prove success: a
# session counts as done only when both output files exist (the transcript is
# written last by clean_audio_and_transcript).
sessions_to_process = []
for session in session_dirs:
    session_id = session.replace("_P", "")
    output_session_path = os.path.join(output_dir, session)
    expected_outputs = [
        os.path.join(output_session_path, f"{session_id}_AUDIO.wav"),
        os.path.join(output_session_path, f"{session_id}_TRANSCRIPT.csv"),
    ]
    if not all(os.path.isfile(path) for path in expected_outputs):
        sessions_to_process.append(session)

print(f"Processando {len(sessions_to_process)} sessioni su {len(session_dirs)} totali")

for session in tqdm(sessions_to_process, desc="Processing sessions"):
    session_path = os.path.join(dataset_dir, session)
    session_id = session.replace("_P", "")

    # Input file paths
    audio_path = os.path.join(session_path, f"{session_id}_AUDIO.wav")
    transcript_path = os.path.join(session_path, f"{session_id}_TRANSCRIPT.csv")

    # Output file paths
    output_session_path = os.path.join(output_dir, session)
    os.makedirs(output_session_path, exist_ok=True)
    output_audio_path = os.path.join(output_session_path, f"{session_id}_AUDIO.wav")
    output_transcript_path = os.path.join(output_session_path, f"{session_id}_TRANSCRIPT.csv")

    # Process the session
    original_duration, cleaned_duration = clean_audio_and_transcript(
        audio_path, transcript_path, output_audio_path, output_transcript_path,
        buffer_sec=0.1, min_gap_for_pause=2.0, pause_duration=0.5
    )
    total_original += original_duration
    total_cleaned += cleaned_duration

total_removed = total_original - total_cleaned
reduction_percentage = (total_removed / total_original * 100) if total_original > 0 else 0
print(f"\nDurata totale originale: {total_original:.1f}s ({total_original/3600:.2f}h)")
print(f"Durata totale dopo pulizia: {total_cleaned:.1f}s ({total_cleaned/3600:.2f}h)")
print(f"Durata totale rimossa: {total_removed:.1f}s ({total_removed/3600:.2f}h)")
print(f"Riduzione: {reduction_percentage:.1f}%")

# Copy the dataset-level CSV files from the dataset root into the cleaned folder
for fname in os.listdir(dataset_dir):
    src_path = os.path.join(dataset_dir, fname)
    dst_path = os.path.join(output_dir, fname)

    if fname.endswith(".csv") and os.path.isfile(src_path):
        shutil.copy(src_path, dst_path)

Processando 0 sessioni su 189 totali


Processing sessions: 0it [00:00, ?it/s]


Durata totale originale: 0.0s (0.00h)
Durata totale dopo pulizia: 0.0s (0.00h)
Durata totale rimossa: 0.0s (0.00h)
Riduzione: 0.0%





In [4]:
def analyze_audio_lengths(dataset_dir):
    """
    Summarize the per-session durations of a dataset folder.

    Every sub-directory of ``dataset_dir`` whose name ends in ``_P`` is treated as
    a session. Its duration is the largest ``stop_time`` found in the
    tab-separated ``<id>_TRANSCRIPT.csv`` inside it; audio files are not opened.

    Args:
        dataset_dir: Folder containing the ``<id>_P`` session directories.

    Returns:
        dict: 'count', 'mean', 'median', 'std', 'min', 'max', 'q25' and 'q75' of
        the durations, plus 'min_audio' / 'max_audio' with the ids of the shortest
        and longest session. When no session is found, 'count' is 0, the numeric
        entries are NaN and the two ids are None.
    """
    print("Analizzando lunghezze audio...")
    # Only real directories count as sessions; sorting makes the result (including
    # which id wins a tie for min/max) independent of the os.listdir order.
    session_dirs = sorted(
        d for d in os.listdir(dataset_dir)
        if d.endswith("_P") and os.path.isdir(os.path.join(dataset_dir, d))
    )

    durations = []
    session_names = []
    for session in tqdm(session_dirs, desc="Analisi campione"):
        session_path = os.path.join(dataset_dir, session)
        session_id = session.replace("_P", "")
        transcript_path = os.path.join(session_path, f"{session_id}_TRANSCRIPT.csv")

        transcript = pd.read_csv(transcript_path, sep='\t')
        durations.append(transcript['stop_time'].max())
        session_names.append(session_id)

    # np.argmin / np.argmax raise on an empty array, so report an empty summary.
    if not durations:
        nan = float('nan')
        return {
            'count': 0,
            'mean': nan,
            'median': nan,
            'std': nan,
            'min': nan,
            'max': nan,
            'min_audio': None,
            'max_audio': None,
            'q25': nan,
            'q75': nan
        }

    durations = np.array(durations)
    min_idx = np.argmin(durations)
    max_idx = np.argmax(durations)

    stats = {
        'count': len(durations),
        'mean': np.mean(durations),
        'median': np.median(durations),
        'std': np.std(durations),
        'min': np.min(durations),
        'max': np.max(durations),
        'min_audio': session_names[min_idx],  # id of the shortest session
        'max_audio': session_names[max_idx],  # id of the longest session
        'q25': np.percentile(durations, 25),
        'q75': np.percentile(durations, 75)
    }

    return stats

In [5]:
# Duration statistics of the cleaned dataset; `output_dir` is assigned in the
# dataset-processing cell, which must have been run first.
analyze_audio_lengths(output_dir)

Analizzando lunghezze audio...


Analisi campione: 100%|██████████| 189/189 [00:00<00:00, 1128.78it/s]


{'count': 189,
 'mean': 526.5888591269842,
 'median': 479.9799375000002,
 'std': 245.74184613563557,
 'min': 97.00993750000004,
 'max': 1375.119875000002,
 'min_audio': '385',
 'max_audio': '337',
 'q25': 352.1799374999998,
 'q75': 648.8420000000001}