# Generating Mel Spectrograms from Audio Files 🎵

This notebook converts audio files into Mel spectrogram images, which visually represent the frequency content of sounds over time. These spectrograms can be used for machine learning tasks such as birdsong classification. We will use this images for training other deep learning architectures as bird vocalization classifiers.

The script processes audio files, ensuring they are at least 3 seconds long by padding shorter clips and splitting longer ones into 3-second segments. Each segment is then transformed into a Mel spectrogram and saved as an image.

In [1]:
import os
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt

In [2]:
ROOT_PATH = "../../Data/"

In [3]:
def create_spectrogram(audio_file):
    fmin = 0  # Minimum frequency (0 Hz)
    fmax = 16000  # Maximum frequency (16 kHz)
    
    # Load audio
    y, sr = librosa.load(audio_file, sr=None)
    
    # Make lowercase uppercase
    if audio_file.endswith(".wav"):
        audio_file = audio_file.replace(".wav", ".WAV")

    # Define output directory
    output_image_dir = audio_file.replace('Audios', 'images').rsplit('/', 1)[0]
    os.makedirs(output_image_dir, exist_ok=True)  # Crear solo una vez

    # define SFTF
    D = librosa.amplitude_to_db(librosa.stft(y), ref=np.max)  # Solo calcular STFT una vez

    # If audio os shorter than 3 seconds, pad it
    if len(y) < 3 * sr:
        y = np.pad(y, (0, 3 * sr - len(y)), mode='constant')  # Padding al final
        output_image_path = audio_file.replace('Audios', 'images').replace(".WAV", ".PNG")
        save_spectrogram(D, sr, fmin, fmax, output_image_path)

    # If audio is greater than 3 seconds, iterate in 3 seconds
    else:
        for i in range(0, len(y), 3 * sr):  # Iterate in 3 seconds
            y_clip = y[i:i + 3 * sr]
            if len(y_clip) < 3 * sr:
                y_clip = np.pad(y_clip, (0, 3 * sr - len(y_clip)), mode='constant')  # Padding

            output_image_path = audio_file.replace('Audios', 'images').replace(".WAV", f"_{i//(3*sr)}.PNG")
            save_spectrogram(D[:, i//(3*sr):(i//(3*sr)+1)], sr, fmin, fmax, output_image_path)

def save_spectrogram(D, sr, fmin, fmax, output_path):
    """Generates and saves a spectrogram"""
    fig, ax = plt.subplots(figsize=(12, 6))
    librosa.display.specshow(D, sr=sr, x_axis="time", y_axis="log", fmin=fmin, fmax=fmax, ax=ax)
    ax.axis('off')  # Eliminate axis
    fig.savefig(output_path, bbox_inches='tight', pad_inches=0, transparent=True)
    plt.close(fig)  # Close figure - save memory


In [4]:
import os

DATASET_FOLDER = ROOT_PATH + "Dataset/Audios/For Classifier/train"

# Recorrer todos los subdirectorios y archivos de audio
for root, _, files in os.walk(DATASET_FOLDER):
    for file in files:
        if file.lower().endswith('.wav'):  # Asegura que detecte '.WAV' y '.wav'
            audio_file = os.path.join(root, file)
            output_image_path = audio_file.replace('Audios', 'images').replace(".WAV", ".PNG").replace(".wav", ".PNG")

            if not os.path.exists(output_image_path):  # Omitir si la imagen ya existe
                try:
                    create_spectrogram(audio_file)
                    # print(f"✅ Generated Spectrogram: {output_image_path}")
                except Exception as e:
                    print(f"❌ Error processing {audio_file}: {e}")

print("🎉 Spectrograms generated succesfully!")

  D = librosa.amplitude_to_db(librosa.stft(y), ref=np.max)  # Solo calcular STFT una vez


🎉 Spectrograms generated succesfully!


In [5]:
DATASET_FOLDER = ROOT_PATH + "Dataset/Audios/For Classifier/"

# Count number of .WAV files in Dataset Folder and Count number of .PNG files in Images Folder
audio_files = sum([len(files) for _, _, files in os.walk(DATASET_FOLDER)])
image_files = sum([len(files) for _, _, files in os.walk(DATASET_FOLDER.replace('Audios', 'images'))])

print(f"Number of audio files: {audio_files}")
print(f"Number of image files: {image_files}")

Number of audio files: 3111
Number of image files: 7469


There are more images than audio files because there are segments that are longer than 3 seconds and images have to be 3 seconds long.