In [1]:
import os
import pandas as pd

data_dir = "../data/RADVESS"


In [2]:
# Helper functions
emotion_map = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised"
}

# We only want these emotions
selected_emotions = {"neutral", "happy", "sad", "angry"}

def extract_labels(filepath):
    filename = os.path.basename(filepath)
    parts = filename.split("-")

    emotion_id = parts[2]
    emotion = emotion_map.get(emotion_id, None)

    actor_id = int(parts[-1].split(".")[0])
    gender = "male" if actor_id % 2 == 1 else "female"

    return emotion, gender


In [3]:
# scan files and extract labels
audio_files = []

for root, _, files in os.walk(data_dir):
    for f in files:
        if f.endswith(".wav"):
            audio_files.append(os.path.join(root, f))

len(audio_files)


1440

In [4]:
# build dataframe

In [5]:
rows = []

for f in audio_files:
    emotion, gender = extract_labels(f)
    if emotion in selected_emotions:
        rows.append([f, emotion, gender])

df = pd.DataFrame(rows, columns=["filepath", "emotion", "gender"])
df.head()


Unnamed: 0,filepath,emotion,gender
0,../data/RADVESS/Actor_16/03-01-05-01-02-01-16.wav,angry,female
1,../data/RADVESS/Actor_16/03-01-05-02-01-01-16.wav,angry,female
2,../data/RADVESS/Actor_16/03-01-04-01-01-02-16.wav,sad,female
3,../data/RADVESS/Actor_16/03-01-04-02-02-02-16.wav,sad,female
4,../data/RADVESS/Actor_16/03-01-03-02-02-02-16.wav,happy,female


In [6]:
# save metdata
output_path = "../data/metadata.csv"
df.to_csv(output_path, index=False)
output_path


'../data/metadata.csv'

In [8]:
# summary
print("Total samples:", len(df))
print("\nEmotion distribution:")
print(df["emotion"].value_counts())

print("\nGender distribution:")
print(df["gender"].value_counts())


Total samples: 672

Emotion distribution:
emotion
angry      192
sad        192
happy      192
neutral     96
Name: count, dtype: int64

Gender distribution:
gender
female    336
male      336
Name: count, dtype: int64
