# Transformación de audio a espectrograma

Este notebook tiene como objetivo transformar los archivos de audio del dataset GTZAN Genre Collection en espectrogramas Mel de 3 segundos.

## Librerias necesarias

In [2]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import os
from glob import glob
from PIL import Image
import random

## Parametros

In [3]:
# Constantes
SAMPLE_RATE = 22050
SEGMENT_DURATION = 3
NUMBER_OF_SEGMENTS = 10
TIME_STRETCH_RATES = [0.85, 1.15] 

### Configuración de carpetas

In [4]:
AUDIO_INPUT_FOLDER = "Data/genresWav"
SPECTROGRAM_OUTPUT_FOLDER = "Data/AugmentedSpectrograms"

Funcion para crear espectrogramas, parametros:

- Sample rate: 22050
- n_mels: 128
- Duration: 3


In [5]:
def generate_mel_spectrogram(audio_segment, sr):
    mel = librosa.feature.melspectrogram(y=audio_segment, sr=sr, n_mels=128)
    return librosa.power_to_db(mel, ref=np.max)

In [6]:
# def process_audio_files():
#     try:
#         # Se obtiene la lista de archivos de audio
#         segment_length = int(SEGMENT_DURATION * SAMPLE_RATE)
#         total_images = 0

#         genres = [d for d in os.listdir(AUDIO_INPUT_FOLDER)
#                   if os.path.isdir(os.path.join(AUDIO_INPUT_FOLDER, d))]

#         for genre in genres:

#             # Se obtiene la lista de archivos de audio por género
#             genre_path = os.path.join(AUDIO_INPUT_FOLDER, genre)
#             audio_files = glob(os.path.join(genre_path, '*.wav'))

#             if not audio_files:
#                 continue

#             print(f"\nGenre '{genre}': {len(audio_files)} files found")

#             # Se mezclan los archivos de audio y se dividen en conjuntos de entrenamiento, validación y prueba
#             random.shuffle(audio_files)
#             train_split = int(0.6 * len(audio_files))
#             val_split = int(0.8 * len(audio_files))

#             sets = {
#                 'train': audio_files[:train_split],
#                 'val': audio_files[train_split:val_split],
#                 'test': audio_files[val_split:]
#             }

#             for split, files in sets.items():
#                 print(f"  {split}: {len(files)} files")

#                 for path in files:
#                     try:
#                         # Se carga el archivo de audio y se obtiene la carpeta de salida
#                         audio, sr = librosa.load(path, sr=SAMPLE_RATE)
#                         file_stem = os.path.splitext(os.path.basename(path))[0]
#                         output_dir = os.path.join(SPECTROGRAM_OUTPUT_FOLDER, split, genre)
#                         os.makedirs(output_dir, exist_ok=True)

#                         # Se crea un espectrograma para cada segmento de audio
#                         for i in range(NUMBER_OF_SEGMENTS):
#                             start = i * segment_length
#                             end = start + segment_length
#                             segment = audio[start:end]

#                             if len(segment) != segment_length:
#                                 continue

#                             spec = generate_mel_spectrogram(segment, sr)


#                             # Se guarda el espectrograma como imagen
#                             img_path = os.path.join(output_dir, f"{file_stem}_seg_{i+1}.png")
#                             plt.imsave(img_path, np.flipud(spec), cmap='viridis', origin='lower')
#                             total_images += 1

#                     except Exception as e:
#                         print(f"Error in {path}: {e}")

#         print(f"\nDone! Total spectrograms saved: {total_images}")

#     except Exception as e:
#         print(f"Error: {e}")

# process_audio_files()

In [None]:
def process_audio_files():
    """
    Processes audio files, creates segments, generates spectrograms,
    and applies time stretching augmentation to the training set
    after saving the original spectrograms.
    """
    try:
        segment_length = int(SEGMENT_DURATION * SAMPLE_RATE)
        total_images = 0

        genres = [d for d in os.listdir(AUDIO_INPUT_FOLDER)
                  if os.path.isdir(os.path.join(AUDIO_INPUT_FOLDER, d))]

        for genre in genres:
            genre_path = os.path.join(AUDIO_INPUT_FOLDER, genre)
            audio_files = glob(os.path.join(genre_path, '*.wav'))

            if not audio_files:
                print(f"\nGenre '{genre}': No .wav files found. Skipping.")
                continue

            print(f"\nGenre '{genre}': {len(audio_files)} files found")

            random.shuffle(audio_files)
            train_split_idx = int(0.6 * len(audio_files))
            val_split_idx = int(0.8 * len(audio_files))

            sets_files = {
                'train': audio_files[:train_split_idx],
                'val': audio_files[train_split_idx:val_split_idx],
                'test': audio_files[val_split_idx:]
            }

            for split, files_in_split in sets_files.items():
                print(f"  Processing split '{split}': {len(files_in_split)} files")

                for path in files_in_split:
                    try:
                        original_audio, sr = librosa.load(path, sr=SAMPLE_RATE)
                        file_stem = os.path.splitext(os.path.basename(path))[0]
                        output_dir_genre_split = os.path.join(SPECTROGRAM_OUTPUT_FOLDER, split, genre)
                        os.makedirs(output_dir_genre_split, exist_ok=True)

                        versions_to_process = []

                        # Add original audio for processing
                        versions_to_process.append({
                            "data": original_audio,
                            "label": "original", 
                            "stem": file_stem 
                        })

                        # If it's training data, prepare augmented versions
                        # These will be processed *after* the original 
                        if split == 'train':
                            for rate in TIME_STRETCH_RATES:
                                try:
                                    stretched_audio = librosa.effects.time_stretch(y=original_audio, rate=rate)
                                    aug_label_suffix = f'ts_{str(rate).replace(".", "p")}'
                                    versions_to_process.append({
                                        "data": stretched_audio,
                                        "label": aug_label_suffix,
                                        "stem": f"{file_stem}_{aug_label_suffix}" 
                                    })
                                except Exception as e_stretch:
                                    print(f"      Error stretching {file_stem} with rate {rate}: {e_stretch}")


                        # Process each version (original, then augmented if applicable)
                        for audio_version_info in versions_to_process:
                            current_audio_data = audio_version_info["data"]
                            current_file_stem_for_segments = audio_version_info["stem"]
                            
                            # Calculate number of possible segments for this specific audio data
                            num_possible_segments = len(current_audio_data) // segment_length

                            if num_possible_segments == 0:
                                print(f"Skipping {current_file_stem_for_segments} due to insufficient length for segmentation.")
                                continue

                            else:
                                segments_data = []
                                for i in range(num_possible_segments):
                                    start = i * segment_length
                                    end = start + segment_length
                                    segment = current_audio_data[start:end]
                                    segments_data.append((segment, i))

                            # Process and save segments for the current audio version
                            for segment_audio, seg_idx in segments_data:

                                spec = generate_mel_spectrogram(segment_audio, sr)

                                img_filename = f"{current_file_stem_for_segments}_seg_{seg_idx+1}.png"
                                img_path = os.path.join(output_dir_genre_split, img_filename)
                                
                                plt.imsave(img_path, np.ascontiguousarray(np.flipud(spec)), cmap='viridis', origin='lower')
                                total_images += 1

                    except Exception as e_file:
                        print(f"Error processing file {path}: {e_file}")
        print(f"\nDone! Total spectrograms saved: {total_images}")
    except Exception as e:
        print(f"Error in process_audio_files: {e}")

process_audio_files()


Genre 'pop': 100 files found
  Processing split 'train': 60 files
  Processing split 'val': 20 files
  Processing split 'test': 20 files

Genre 'rock': 100 files found
  Processing split 'train': 60 files
  Processing split 'val': 20 files
  Processing split 'test': 20 files

Genre 'disco': 100 files found
  Processing split 'train': 60 files
  Processing split 'val': 20 files
  Processing split 'test': 20 files

Genre 'blues': 100 files found
  Processing split 'train': 60 files
  Processing split 'val': 20 files
  Processing split 'test': 20 files

Genre 'country': 100 files found
  Processing split 'train': 60 files
  Processing split 'val': 20 files
  Processing split 'test': 20 files

Genre 'hiphop': 100 files found
  Processing split 'train': 60 files
  Processing split 'val': 20 files
  Processing split 'test': 20 files

Genre 'metal': 100 files found
  Processing split 'train': 60 files
  Processing split 'val': 20 files
  Processing split 'test': 20 files

Genre 'reggae': 100