In [None]:
import os
import numpy as np
from disvoice.articulation import Articulation
from disvoice.phonation import Phonation
from disvoice.prosody import Prosody
import math
from joblib import Parallel, delayed, cpu_count

In [83]:
def process_participant_chunk(participant_chunk, input_dir, output_dir):
    """Extract articulation, phonation and prosody features for a chunk of participants.

    Meant to run inside a single worker process: the three extractors are
    created once and reused for every participant in the chunk.

    For each participant directory ``<name>`` the audio file
    ``<input_dir>/<name>/<id>_AUDIO.wav`` is processed, where ``<id>`` is
    ``<name>`` with ``"_P"`` removed. One ``<features_type>_features.npy`` file
    per extractor is written to ``<output_dir>/<name>/``. Feature files that
    already exist are skipped, so an interrupted run can be resumed.

    Args:
        participant_chunk: Iterable of participant directory names.
        input_dir: Directory containing the participant directories.
        output_dir: Directory under which per-participant output directories
            are created.

    Returns:
        None. Results are written to disk.
    """

    # Set numpy seed in each worker process for reproducibility
    np.random.seed(42)

    # One extractor per feature family, keyed by the name used in the output file.
    extractors = {
        "articulation": Articulation(),
        "phonation": Phonation(),
        "prosody": Prosody(),
    }

    for participant_dir in participant_chunk:
        participant_path = os.path.join(input_dir, participant_dir)
        participant_id = participant_dir.replace("_P", "")
        audio_path = os.path.join(participant_path, f"{participant_id}_AUDIO.wav")

        output_participant_path = os.path.join(output_dir, participant_dir)
        os.makedirs(output_participant_path, exist_ok=True)

        # Extract every feature family for the current participant.
        for features_type, extractor in extractors.items():
            output_file = os.path.join(output_participant_path, f"{features_type}_features.npy")

            # Skip work that a previous run already completed.
            if os.path.exists(output_file):
                continue

            features = extractor.extract_features_file(audio_path, static=True, plots=False, fmt="npy")

            # Write to a temporary file and rename it into place. The skip check
            # above trusts any existing output file, so a partially written file
            # left behind by an interrupted run must never appear under the
            # final name. A file handle is passed to np.save so that no ".npy"
            # suffix is appended to the temporary name.
            tmp_file = f"{output_file}.tmp"
            try:
                with open(tmp_file, "wb") as fh:
                    np.save(fh, features)
                os.replace(tmp_file, output_file)
            finally:
                # Only present here if saving or renaming failed.
                if os.path.exists(tmp_file):
                    os.remove(tmp_file)

In [84]:
def extract_all_features_parallel(input_dir, output_dir, n_jobs=None):
    """Extract all feature families for every pending participant, in parallel.

    Participant directories are the sub-directories of ``input_dir`` whose name
    ends with ``"_P"``. A participant is pending when at least one of its
    ``<features_type>_features.npy`` files is missing under
    ``<output_dir>/<participant_dir>/``. Pending participants are split into
    contiguous chunks, and each chunk is handed to
    ``process_participant_chunk`` through joblib.

    Args:
        input_dir: Directory containing the participant directories.
        output_dir: Directory where features are written; created if missing.
        n_jobs: Number of worker processes. ``None`` (default) uses
            ``cpu_count()``. Negative values follow the joblib convention
            (``-1`` = all CPUs, ``-2`` = all but one, ...). The effective value
            is capped at the number of pending participants.

    Returns:
        None.

    Raises:
        ValueError: If ``n_jobs`` is 0.
    """
    if n_jobs == 0:
        raise ValueError("n_jobs == 0 has no meaning")

    # Create the output directory if it does not exist.
    os.makedirs(output_dir, exist_ok=True)

    # Find every participant directory (name ends with "_P").
    all_participant_dirs = sorted(
        d for d in os.listdir(input_dir)
        if d.endswith("_P") and os.path.isdir(os.path.join(input_dir, d))
    )

    # Keep only the participants that are missing at least one feature file.
    feature_types = ["articulation", "phonation", "prosody"]
    participant_dirs = [
        participant_dir
        for participant_dir in all_participant_dirs
        if not all(
            os.path.exists(os.path.join(output_dir, participant_dir, f"{features_type}_features.npy"))
            for features_type in feature_types
        )
    ]

    # Nothing to do: leave before any worker is started.
    if not participant_dirs:
        print("No participants to process.")
        return

    # Resolve the worker count, then avoid starting more workers than there
    # are participants (the extra ones would receive no chunk).
    if n_jobs is None:
        n_jobs = cpu_count()
    elif n_jobs < 0:
        n_jobs = max(cpu_count() + 1 + n_jobs, 1)
    n_jobs = min(n_jobs, len(participant_dirs))

    # Split the participants into at most n_jobs contiguous chunks.
    chunk_size = math.ceil(len(participant_dirs) / n_jobs)
    participant_chunks = [participant_dirs[i:i + chunk_size] for i in range(0, len(participant_dirs), chunk_size)]

    print(f"Processing {len(participant_dirs)} participants in {len(participant_chunks)} chunks...")

    # Process the chunks in parallel.
    Parallel(n_jobs=n_jobs, verbose=0)(
        delayed(process_participant_chunk)(chunk, input_dir, output_dir)
        for chunk in participant_chunks
    )

In [85]:
# Extract all feature families for the whole dataset in a single call.
# Paths are relative to the notebook's working directory; participants whose
# feature files already exist in output_dir are skipped by the function.
dataset_dir = "datasets/DAIC-WOZ-Cleaned"
output_dir = "features/DAIC-WOZ-Cleaned"
extract_all_features_parallel(dataset_dir, output_dir)

Processing 189 participants in 8 chunks...


################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################

