## Installation des packages nécessaires

- s3prl : framework d'extraction de features audio
- soundfile : lecture/écriture de fichiers audio
- torchaudio : traitement audio avec PyTorch
- librosa : traitement audio
- tqdm : barre de progression

In [62]:
# %pip install --upgrade pip setuptools wheel
#
# %pip install s3prl --upgrade
# %pip install huggingface_hub -- upgrade
#
# %pip install soundfile --upgrade
# %pip install torchaudio --upgrade
# %pip install librosa --upgrade
#
# %pip install tqdm --upgrade
# %pip install scikit-learn --upgrade
# %pip install matplotlib --upgrade

## Import des packages

- tqdm : barre de progression
- pathlib : gestion des chemins de fichiers
- librosa : traitement audio
- os : gestion des fichiers
- torch : PyTorch
- numpy : calculs numériques
- soundfile : lecture/écriture de fichiers audio
- s3prl : framework d'extraction de features audio
- sklearn : outils de machine learning

In [63]:
import os
from pathlib import Path

import librosa
import soundfile as sf
import torch
from s3prl.nn import S3PRLUpstream
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

## Paramètres globaux

- random_seed : graine aléatoire pour reproductibilité
- device : choix du device (GPU si disponible, sinon CPU)
- dataset_path : chemin vers le dataset audio original
- audio_path : chemin vers le dossier de sauvegarde des audios rééchantillonnés
- embedding_path : chemin vers le dossier de sauvegarde des embeddings extraits

In [64]:
random_seed = 42
limit = 200

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

dataset_path = Path("./dataset/")
audio_path = Path("./16k_Hz_audio/")
embedding_path = Path("./16k_Hz_embedding/")
pooled_path = Path("./16k_Hz_pooled/")

# Pooled versions paths
pooled_mean_path = pooled_path / "mean"
pooled_max_path = pooled_path / "max"
pooled_mean_std_path = pooled_path / "mean_std"

train_mean_path = pooled_mean_path / "_train.pt"
val_mean_path = pooled_mean_path / "_val.pt"
test_mean_path = pooled_mean_path / "_test.pt"

train_max_path = pooled_max_path / "_train.pt"
val_max_path = pooled_max_path / "_val.pt"
test_max_path = pooled_max_path / "_test.pt"

train_mean_std_path = pooled_mean_std_path / "_train.pt"
val_mean_std_path = pooled_mean_std_path / "_val.pt"
test_mean_std_path = pooled_mean_std_path / "_test.pt"

# Not pooled versions paths
transformer_path = embedding_path / "transformer"

train_transformer_path = transformer_path / "_train.pt"
val_transformer_path = transformer_path / "_val.pt"
test_transformer_path = transformer_path / "_test.pt"

audio_path.mkdir(exist_ok=True)
embedding_path.mkdir(exist_ok=True)
pooled_path.mkdir(exist_ok=True)
pooled_mean_path.mkdir(exist_ok=True)
pooled_max_path.mkdir(exist_ok=True)
pooled_mean_std_path.mkdir(exist_ok=True)
transformer_path.mkdir(exist_ok=True)

AUDIO_EXTENSIONS = (".wav", ".mp3", ".flac", ".ogg", ".m4a")

Device: cpu


## Fonctions utilitaires pour associer les fichiers audio à leurs émotions
- find_emotion_T : fonction pour la base TESS
- emotionfix : fonction pour ajuster les indices d'émotions pour la classification

In [65]:
# Emotion kind validation function for TESS database, due to emotions written within the file names.
def find_emotion_T(name):
    if 'neutral' in name:
        return "01"
    elif 'happy' in name or 'joy' in name or 'positive' in name:
        return "03"
    elif 'sad' in name or 'sadness' in name or 'pain' in name:
        return "04"
    elif 'angry' in name or 'anger' in name:
        return "05"
    elif 'fear' in name:
        return "06"
    elif 'disgust' in name or 'negative' in name:
        return "07"
    elif 'ps' in name or 'surprise' in name:
        return "08"
    else:
        return "-1"


# 'emotions' list fix for classification purposes:
#     Classification values start from 0, Thus an 'n = n-1' operation has been executed for both RAVDESS and TESS databases:
def emotionfix(e_num):
    if e_num == "01":
        return 0  # neutral
    elif e_num == "02":
        return 1  # calm
    elif e_num == "03":
        return 2  # happy
    elif e_num == "04":
        return 3  # sad
    elif e_num == "05":
        return 4  # angry
    elif e_num == "06":
        return 5  # fear
    elif e_num == "07":
        return 6  # disgust
    else:
        return 7  # suprised

labels = [0,1,2,3,4,5,6,7]

## Rééchantillonnage des fichiers audio à 16kHz et sauvegarde
- On parcourt le dataset original
- Si l'échantillonnage n'est pas à 16kHz
  - On rééchantillonne chaque fichier à 16kHz si nécessaire
  - On sauvegarde les fichiers rééchantillonnés dans un nouveau dossier
- Sinon
  - On sauvegarde le fichier

In [66]:
sample_rate = 16000
nb_treated_file = 0
fnb = 0

for subdir, dirs, files in tqdm(os.walk(dataset_path), desc="Re-sampling audio files"):
    if not "emotions" in str(subdir):

        for file in files:
            if file.lower().endswith(AUDIO_EXTENSIONS):

                # Ecriture du nouveau nom de fichier
                save_path = audio_path / file

                if not save_path.exists():

                    nb_treated_file += 1

                    y, sr = librosa.load(Path(subdir) / file, sr=None)  # sr=None garde la fréquence originale

                    if sr != sample_rate:
                        # Re-échantillonner
                        y_resampled = librosa.resample(y, orig_sr=sr, target_sr=sample_rate)

                        # Sauvegarder
                        sf.write(save_path, y_resampled, sample_rate)
                    else:
                        sf.write(save_path, y, sample_rate)


            fnb += 1
print(f"Number of files re-sampled and saved: {nb_treated_file}")
print(f"Total number of files : {fnb}")

Re-sampling audio files: 51it [00:00, 1004.23it/s]

Number of files re-sampled and saved: 0
Total number of files : 4240





## Load du modèle upstream WavLM via S3PRL et extraction des embeddings
- On liste les modèles disponibles via torch.hub
- On choisit le modèle WavLM (large de préférence)
- On charge le modèle upstream (téléchargement si nécessaire)

In [67]:
# 1) lister les modèles exposés via torch.hub (optionnel mais utile)
print("Listing available torch.hub upstreal models from s3prl (may take 1-2s)...")
try:
    hub_list = torch.hub.list('s3prl/s3prl')
    print("Some available models (first 30):\n", hub_list[:30])
except Exception as e:
    print("Impossible de lister via torch.hub:", e)
    hub_list = []

# 2) choisir un upstream (essaye wavlm_large, sinon wavlm_base_plus, sinon 'wavlm' puis tu pourras adapter)
chosen = "wavlm_large"
print(f"\nWill try to load upstream name: {chosen}")

# 3) load the upstream model (this will download checkpoint if needed)
upstream_model = S3PRLUpstream(chosen)
upstream_model.to(device)
upstream_model.eval()
print("Loaded upstream:", chosen)

Listing available torch.hub upstreal models from s3prl (may take 1-2s)...


Using cache found in /Users/alves/.cache/torch/hub/s3prl_s3prl_main


Some available models (first 30):
 ['FileLock', 'Path', 'apc', 'apc_360hr', 'apc_960hr', 'apc_local', 'apc_url', 'ast', 'audio_albert', 'audio_albert_960hr', 'audio_albert_local', 'audio_albert_logMelBase_T_share_AdamW_b32_1m_960hr_drop1', 'audio_albert_url', 'baseline', 'baseline_local', 'byol_a_1024', 'byol_a_2048', 'byol_a_512', 'byol_s_cvt', 'byol_s_default', 'byol_s_resnetish34', 'contentvec', 'contentvec_km100', 'contentvec_km500', 'cpc_local', 'cpc_url', 'customized_upstream', 'cvhubert', 'data2vec', 'data2vec_base_960']

Will try to load upstream name: wavlm_large
Loaded upstream: wavlm_large


## Extraction des embeddings et sauvegarde
- On parcourt les fichiers audio rééchantillonnés
- On charge chaque fichier audio
- On extrait les embeddings via le modèle upstream
- On sauvegarde les embeddings extraits

In [68]:
nb_treated_file = 0
efnb = 0

for file in tqdm(os.listdir(audio_path), desc="Extracting embeddings"):
    # 5) charge un wav mono 16kHz
    wav_path = audio_path / file
    if not wav_path.exists():
        raise FileNotFoundError("Place a 16kHz WAV file in the current folder")

    if (efnb * 100.0 / fnb) % 10 == 0:
        print(f"Processed {efnb} files")

    # Sauvegarde path
    save_path = embedding_path / (wav_path.stem + ".pt")
    if not save_path.exists():

        nb_treated_file += 1

        wav, sr = sf.read(str(wav_path))
        if sr != 16000:
            raise ValueError(f"Le fichier doit être 16kHz. Fichier a {sr} Hz")

        # s3prl expects shape (batch, time)
        wav_tensor = torch.from_numpy(wav).float().unsqueeze(0).to(device)

        # length tensor (in samples)
        wav_len = torch.LongTensor([wav_tensor.shape[1]]).to(device)

        with torch.no_grad():
            # all_hs is a list of hidden states from different extraction points (depends on upstream)
            all_hs, all_hs_len = upstream_model(wav_tensor, wav_len)

        # Dernière couche (tu peux aussi prendre une moyenne, ex: torch.stack(all_hs[-4:]).mean(0))
        emb = all_hs[-1].cpu().squeeze(0)  # (seq_len, hidden_dim)

        torch.save(emb, save_path)
    efnb += 1

print(f"Number of files processed: {nb_treated_file}")


Extracting embeddings: 100%|██████████| 4240/4240 [00:00<00:00, 98232.14it/s]

Processed 0 files
Processed 424 files
Processed 848 files
Processed 1272 files
Processed 1696 files
Processed 2120 files
Processed 2544 files
Processed 2968 files
Processed 3392 files
Processed 3816 files
Number of files processed: 0





## On pool les embeddings extraits pour classification
- On definit les types de pooling à appliquer
- On crée un petit modèle transformer pour l'agrégation séquentielle
- On parcourt les fichiers d'embeddings extraits
- On applique tous les types de pooling pour chaque embedding
- On sauvegarde les embeddings formatés
- Types de pooling :
    - mean : moyenne (1024)
    - max : maximum (1024)
    - mean_std : concaténation de la moyenne et de l'écart-type (2048)
    - transformer : agrégation via un petit modèle transformer (1024)

In [69]:
# === Choix du type de pooling ===
# Options possibles : "mean", "max", "mean_std"
pooling_types = ["mean", "max", "mean_std"]

# === Boucle sur les embeddings ===
for file in tqdm(list(embedding_path.glob("*.pt"))):

    emb = torch.load(file)  # (seq_len, 1024)

    for pooling_type in pooling_types:

        save_path = pooled_path / pooling_type / f"{file.stem}.pt"

        if not save_path.exists():

            # === Pooling dynamique selon le choix utilisateur ===
            if pooling_type == "mean":
                emb_pooled = emb.mean(dim=0)  # (1024,)

            elif pooling_type == "max":
                emb_pooled, _ = emb.max(dim=0)  # (1024,)

            elif pooling_type == "mean_std":
                emb_mean = emb.mean(dim=0)
                emb_std = emb.std(dim=0)
                emb_pooled = torch.cat([emb_mean, emb_std])  # (2048,)

            else:
                raise ValueError(f"Type de pooling inconnu : {pooling_type}")

            # === Sauvegarde ===
            torch.save(emb_pooled, save_path)


100%|██████████| 4240/4240 [00:04<00:00, 933.49it/s] 


## Split des embeddings en sets d'entrainement, validation et test
- On liste les chemins des fichiers d'embeddings
- On split les fichiers en set d'entrainement, validation et test
- On sauvegarde les splits

In [70]:
# On va split les embedding en set pour les entrainements/validations/tests

pooling_paths = [pooled_mean_path, pooled_max_path, pooled_mean_std_path]

for path in pooling_paths:
    files = os.listdir(path)

    train_files, test_files = train_test_split(files, test_size=0.3, random_state=random_seed)  # 70% train, 30% test
    print("Tain split is composed of ", len(train_files), " files.")
    val_files, test_files = train_test_split(test_files, test_size=0.5,
                                             random_state=random_seed)  # 50% val, 50% test donc 70% train 15% val 15% test
    print("Validation split is composed of ", len(val_files), " files.")
    print("Test split is composed of ", len(test_files), " files.\n")

    torch.save(train_files, path / "_train.pt")
    torch.save(val_files, path / "_val.pt")
    torch.save(test_files, path / "_test.pt")

# Même chose pour les embeddings non poolés (transformer)

files = os.listdir(embedding_path)

train_files, test_files = train_test_split(files, test_size=0.3, random_state=random_seed)  # 70% train, 30% test
print("Tain split is composed of ", len(train_files), " files.")
val_files, test_files = train_test_split(test_files, test_size=0.5,
                                         random_state=random_seed)  # 50% val, 50% test donc 70% train 15% val 15% test
print("Validation split is composed of ", len(val_files), " files.")
print("Test split is composed of ", len(test_files), " files.\n")

torch.save(train_files, transformer_path / "_train.pt")
torch.save(val_files, transformer_path / "_val.pt")
torch.save(test_files, transformer_path / "_test.pt")



Tain split is composed of  2970  files.
Validation split is composed of  636  files.
Test split is composed of  637  files.

Tain split is composed of  2970  files.
Validation split is composed of  636  files.
Test split is composed of  637  files.

Tain split is composed of  2970  files.
Validation split is composed of  636  files.
Test split is composed of  637  files.

Tain split is composed of  2968  files.
Validation split is composed of  636  files.
Test split is composed of  637  files.



## Vérification des fichiers sauvegardés
On load un fichier du train set de la moyenne pour vérifier si on a bien sauvegardé correctement
- On load le training set de la moyenne
- On load un embedding de ce train set
- On affiche la forme de l'embedding

### Expected output
```
Number of training files (mean pooling): 72
Loading embedding file: xxxxxxx.pt
Embedding shape: torch.Size([1024])
```

In [71]:
# On load un fichier du train set pour vérifier

# On load le training sed de la moyenne
train_files = torch.load(train_mean_path)
transformer_train_files = torch.load(train_transformer_path)

print("Number of training files (mean pooling):", len(train_files))
print("Number of training files (transformer):", len(transformer_train_files))

# On load un embedding de ce train set
embedding_file = train_files[0]
transformer_embedding_file = transformer_train_files[0]
print("Loading embedding file:", embedding_file)
print("Loading transformer embedding file:", transformer_embedding_file)

embedding = torch.load(pooled_mean_path / embedding_file)
transformer_embedding = torch.load(embedding_path / transformer_embedding_file)
print("Embedding shape:", embedding.shape)
print("Transformer Embedding shape:", transformer_embedding.shape)

Number of training files (mean pooling): 2970
Number of training files (transformer): 2968
Loading embedding file: OAF_peg_ps.pt
Loading transformer embedding file: YAF_mill_angry.pt
Embedding shape: torch.Size([1024])
Transformer Embedding shape: torch.Size([105, 1024])


## Chargement des differents sets pour la methode de pooling renseignée et association des enregistrements à leur label

In [72]:
def load_split(file_list, base_path, is_transformer=False):
    embeddings, labels = [], []
    nb_par_classes = [0 for _ in range(8)]

    for file_name in file_list:

        if not file_name.startswith("_") and file_name.endswith(".pt"):

            # 1) Charge l'embedding
            emb = torch.load(base_path / file_name)  # tensor

            # 2) Détecte l'émotion
            emo_code = find_emotion_T(file_name)
            if emo_code == "-1":
                emo_code = file_name[6:8]  # RAVDESS
            label = emotionfix(emo_code)
            nb_par_classes[label] += 1

            embeddings.append(emb)
            labels.append(label)
    if is_transformer:
        X = pad_sequence(embeddings, batch_first=True)
    else:
        X = torch.stack(embeddings)
    y = torch.tensor(labels)
    print("\t - ",nb_par_classes)

    return X, y

paths = {
    "mean": [train_mean_path, val_mean_path, test_mean_path],
    "max": [train_max_path, val_max_path, test_max_path],
    "mean_std": [train_mean_std_path, val_mean_std_path, test_mean_std_path],
    "transformer": [train_transformer_path, val_transformer_path, test_transformer_path]
}

pooling_type = "mean"  # "max", "mean_std", "transformer"

train_path, val_path, test_path = paths[pooling_type]

train_files = torch.load(train_path)
val_files = torch.load(val_path)
test_files = torch.load(test_path)

print("Number of example for each class\nTrain :")
X_train, y_train = load_split(train_files, pooled_mean_path)
print("Val  :")
X_val, y_val = load_split(val_files, pooled_mean_path)
print("Test  :")
X_test, y_test = load_split(test_files, pooled_mean_path)

Number of example for each class
Train :
	 -  [351, 130, 415, 436, 403, 434, 400, 398]
Val  :
	 -  [80, 23, 101, 95, 89, 71, 93, 84]
Test  :
	 -  [65, 39, 76, 69, 98, 85, 97, 108]


## Vérification des dimensions des sets

In [73]:
print(f"[{pooling_type.upper()}] Loaded splits:")
print("Train :", X_train.shape, y_train.shape, y_train.unique())
print("Val   :", X_val.shape, y_val.shape, y_val.unique())
print("Test  :", X_test.shape, y_test.shape, y_val.unique())

[MEAN] Loaded splits:
Train : torch.Size([2967, 1024]) torch.Size([2967]) tensor([0, 1, 2, 3, 4, 5, 6, 7])
Val   : torch.Size([636, 1024]) torch.Size([636]) tensor([0, 1, 2, 3, 4, 5, 6, 7])
Test  : torch.Size([637, 1024]) torch.Size([637]) tensor([0, 1, 2, 3, 4, 5, 6, 7])
