In [None]:
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

In [None]:
# RAVDESS emotion lookup: the third dash-separated field of a filename is a
# two-digit code ('01'..'08') that indexes the labels below in order.
emotion_map = {
    f"{code:02d}": label
    for code, label in enumerate(
        (
            'neutral',
            'calm',
            'happy',
            'sad',
            'angry',
            'fearful',
            'disgust',
            'surprised',
        ),
        start=1,
    )
}
# Alias: accept 'N' in place of '01' for neutral.
emotion_map['N'] = 'neutral'

In [None]:
def create_mel_spectrogram(audio_path, output_path, n_mels=128,
                           crop_box=(90, 37, 805, 342), dpi=100):
    """
    Render the log-scaled mel-spectrogram of an audio file to a cropped PNG.

    The audio is loaded at its native sampling rate, converted to a
    mel-spectrogram in dB relative to its own peak, drawn on a 10x4 inch
    figure (with title, axes and colorbar), saved to ``output_path`` and
    finally cropped in place to ``crop_box``.

    Parameters
    ----------
    audio_path : str
        Path of the audio file to read.
    output_path : str
        Path of the PNG to write. The full render is written first and then
        overwritten by the cropped image.
    n_mels : int, optional
        Number of mel bands (default 128).
    crop_box : tuple of int, optional
        ``(left, top, right, bottom)`` pixel box handed to ``Image.crop``.
        The default starts at (90, 37) and is 715 px wide and 305 px tall.
        NOTE(review): presumably tuned to keep only the plot area of the
        default 1000x400 px render; re-check it if the figure size, DPI or
        layout changes.
    dpi : int, optional
        Resolution used when saving the figure (default 100, i.e. a
        1000x400 px image). It is pinned explicitly because ``crop_box`` is
        expressed in pixels and would otherwise depend on matplotlib rcParams.
    """
    # Load the audio file at its native sampling rate
    y, sr = librosa.load(audio_path, sr=None)
    # Compute the mel-spectrogram and convert it to a log (dB) scale
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    S_dB = librosa.power_to_db(S, ref=np.max)

    # Use an explicit figure/axes pair instead of pyplot's "current figure"
    # so other open figures in the notebook cannot interfere.
    fig, ax = plt.subplots(figsize=(10, 4))
    try:
        mesh = librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel', ax=ax)
        ax.set_title(f"Mel-Spectrogram for {os.path.basename(audio_path)}")
        fig.colorbar(mesh, ax=ax, format='%+2.0f dB')
        fig.tight_layout()
        # Save the figure as a PNG file at a fixed resolution
        fig.savefig(output_path, dpi=dpi)
    finally:
        # Always release the figure, even if plotting or saving failed;
        # this function is called once per file over a whole dataset.
        plt.close(fig)

    # Crop the saved image to the bounding box and overwrite it in place.
    with Image.open(output_path) as img:
        cropped_img = img.crop(tuple(crop_box))
        cropped_img.save(output_path)

In [None]:
def process_dataset(dataset_path, output_base):
    """
    Convert every WAV file under ``dataset_path`` into a cropped
    mel-spectrogram PNG, grouped into one output folder per emotion.

    For each ``.wav`` file found (recursively, case-insensitive extension):
      - the emotion code is the third dash-separated field of the filename
        (without extension) and is looked up in ``emotion_map``;
      - files with fewer than three fields or an unknown code are reported
        and skipped;
      - the image is written to ``<output_base>/<emotion>/<basename>.png``
        via ``create_mel_spectrogram``.

    Parameters
    ----------
    dataset_path : str
        Root folder of the dataset; subfolders are searched as well.
    output_base : str
        Root folder for the generated images. It is created if missing.

    Notes
    -----
    Folders and files are visited in sorted order so that repeated runs
    process (and log) files in the same sequence. A missing ``dataset_path``
    is reported with a warning instead of silently producing nothing.
    Errors raised while converting a file are not caught here.
    """
    # exist_ok avoids the check-then-create race and keeps re-runs idempotent
    os.makedirs(output_base, exist_ok=True)

    if not os.path.isdir(dataset_path):
        print(f"Warning: dataset folder '{dataset_path}' does not exist or is not a directory; nothing to process.")
        return

    # Walk through the dataset folder and its subdirectories
    for root, dirs, files in os.walk(dataset_path):
        # Sorting in place makes os.walk descend in a deterministic order
        dirs.sort()
        for filename in sorted(files):
            if not filename.lower().endswith('.wav'):
                continue

            file_path = os.path.join(root, filename)
            base_name = os.path.splitext(filename)[0]
            parts = base_name.split('-')

            # The emotion code is the third field, so at least three are needed
            if len(parts) < 3:
                print(f"Skipping {filename}: filename does not have enough parts to extract emotion code.")
                continue

            emotion_code = parts[2]
            emotion = emotion_map.get(emotion_code)
            if emotion is None:
                print(f"Skipping {filename}: unrecognized emotion code '{emotion_code}'.")
                continue

            # Create an output directory for this emotion if needed
            emotion_dir = os.path.join(output_base, emotion)
            os.makedirs(emotion_dir, exist_ok=True)

            output_filename = f"{base_name}.png"
            output_filepath = os.path.join(emotion_dir, output_filename)

            print(f"Processing {filename} -> {output_filename} (Emotion: {emotion})")
            create_mel_spectrogram(file_path, output_filepath)

In [None]:
if __name__ == "__main__":
    # Input: root of the RAVDESS dataset; WAV files are looked up in its subfolders too.
    dataset_folder = "RAVDESS"
    # Output: root for the generated mel-spectrogram images, one subfolder per emotion.
    output_folder = "RAVDESS_mel_spectrograms"

    process_dataset(dataset_path=dataset_folder, output_base=output_folder)