# Dataset Generation

In [1]:
import sys
import os

# Detect Google Colab by checking whether the google.colab package has been
# loaded into this interpreter.
if "google.colab" in sys.modules:
  from google.colab import drive
  # Mount Google Drive at /content/drive; the dataset paths used later in
  # this notebook all start with that prefix.
  drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Fetch / refresh the project sources on the `dataset-generation-fix` branch.
def _run_shell(command):
    """Run `command` through os.system and report a non-zero status.

    The status is returned rather than raised, so the cell stays best-effort
    exactly as the bare os.system calls were; the only difference is that a
    failing step is now visible in the cell output.
    """
    status = os.system(command)
    if status != 0:
        print(f"WARNING: `{command}` returned status {status}")
    return status


# Detect Google Colab
if "google.colab" in sys.modules:
    print("Running in Google Colab...")
    repo_dir = "/content/audioautoencoder/"
    # Clone only when no checkout exists yet, and clone into an explicit
    # target: on a re-run the working directory is already the repository,
    # so a relative clone would be attempted inside it.
    if not os.path.isdir(os.path.join(repo_dir, ".git")):
        _run_shell(f"git clone https://github.com/CiaranMaloy/audioautoencoder {repo_dir}")
    os.chdir(repo_dir)
    _run_shell("git pull")
    _run_shell("git checkout dataset-generation-fix")
    _run_shell("git pull origin dataset-generation-fix")
    #os.system("pip install --upgrade torchmetrics")
else:
    print("Running locally...")
    # Runs in whatever directory the kernel was started from.
    _run_shell("git pull origin dataset-generation-fix")
    #os.system("pip install --upgrade torchmetrics")


Running in Google Colab...


In [3]:
import sys
# Append the cloned repository and its inner package directory to the import path.
for _repo_path in (
    '/content/audioautoencoder',
    '/content/audioautoencoder/audioautoencoder',
):
    sys.path.append(_repo_path)

## Data Generation

In [4]:
from audioautoencoder.data_management import *
# Switch for the dataset-generation cells that follow: their `if GENERATE:`
# bodies are skipped while this is False.
GENERATE=False

In [5]:
def get_folder_size_gb(folder_path):
    """Return the combined size of every file under `folder_path` in gigabytes.

    The folder is walked recursively with os.walk. Sizes are summed in bytes
    and converted once at the end, using 1024 ** 3 bytes per gigabyte.

    Args:
        folder_path: Directory to measure. A path that does not exist yields
            0.0 because os.walk produces nothing for it.

    Returns:
        Total size as a float number of gigabytes.
    """
    total_bytes = 0
    for dirpath, _dirnames, filenames in os.walk(folder_path):
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            try:
                total_bytes += os.path.getsize(file_path)
            except OSError:
                # The file disappeared, is a dangling symlink, or could not be
                # read (e.g. a transient error on a network mount): skip it
                # instead of aborting the whole scan.
                continue
    return total_bytes / (1024 ** 3)  # Convert bytes to gigabytes

In [6]:
# Noise dataset: source folders and the directory used for its splits.
dataset_dirs = ["/content/drive/MyDrive/Datasets/Noise/All_Noise"]
output_dir = "/content/drive/MyDrive/Datasets/Noise/splits_v2"

if GENERATE:
    # Build the splits, pass them to save_splits_to_directories, then print
    # the length of each split.
    splits = create_datasets(dataset_dirs, output_dir)
    save_splits_to_directories(splits, output_dir, max_workers=8)
    for _heading, _split_key in (
        ("Training Set:", "train"),
        ("Validation Set:", "val"),
        ("Testing Set:", "test"),
    ):
        print(_heading, len(splits[_split_key]))


In [7]:
from audioautoencoder.data import *

In [8]:
noise_test = f"{output_dir}/test"
noise_train = f"{output_dir}/train"

# Print the size of the test and train folders of the noise splits.
for _split_label, _split_dir in (("Test", noise_test), ("Train", noise_train)):
    size_gb = get_folder_size_gb(_split_dir)
    print(f"{_split_label} Folder total size: {size_gb:.2f} GB")

Test Folder total size: 0.98 GB
Train Folder total size: 2.95 GB


In [9]:
# Output folders for the 2 s audio files generated from the noise splits.
noise_test_output = f"{noise_test}-2s-44100"
noise_train_output = f"{noise_train}-2s-44100"

if GENERATE:
    _noise_sources = (noise_test, noise_train)
    _noise_targets = (noise_test_output, noise_train_output)
    for input_path, output_path in zip(_noise_sources, _noise_targets):
        print(input_path, output_path)
        generate_audio_files(input_path, output_path, t=2, min_size=0.005)

In [10]:
# Print the size of the generated noise output folders.
for _split_label, _output_dir in (
    ("Test", noise_test_output),
    ("Train", noise_train_output),
):
    size_gb = get_folder_size_gb(_output_dir)
    print(f"Noise {_split_label} Folder size: {size_gb:.2f} GB")

Noise Test Folder size: 0.49 GB
Noise Train Folder size: 1.24 GB


## Generic Music Dataset

In [11]:
# Personal music collection: source folder and the directory used for its splits.
GENERATE = False
dataset_dirs = ["/content/drive/Othercomputers/My Mac/PersonalMusic"]
output_dir = "/content/drive/MyDrive/Datasets/Music/Additional_Music/splits_v2"

if GENERATE:
    # Build the splits with WAV=False / MP3=True and print the length of each.
    splits = create_datasets(dataset_dirs, output_dir, WAV=False, MP3=True)
    for _heading, _split_key in (
        ("Training Set:", "train"),
        ("Validation Set:", "val"),
        ("Testing Set:", "test"),
    ):
        print(_heading, len(splits[_split_key]))


In [12]:
# Pass the music splits to save_splits_to_directories only when generation is
# enabled; `splits` is assigned by the previous cell under the same guard.
if GENERATE:
  save_splits_to_directories(splits, output_dir, max_workers=8)

In [13]:
music_test = f"{output_dir}/test"
music_train = f"{output_dir}/train"

# Output folders for the 2 s audio files generated from the music splits.
music_test_output = f"{music_test}-2s-44100"
music_train_output = f"{music_train}-2s-44100"

if GENERATE:
    _music_sources = (music_test, music_train)
    _music_targets = (music_test_output, music_train_output)
    for input_path, output_path in zip(_music_sources, _music_targets):
        print(input_path, output_path)
        generate_audio_files(input_path, output_path, t=2, min_size=0.005)

## MUSDB18 Dataset


In [14]:
# MUSDB18 split folders; this rebinds the music_* path variables to MUSDB18.
music_test, music_train = (
    "/content/drive/MyDrive/Datasets/Music/MUSDB18/test",
    "/content/drive/MyDrive/Datasets/Music/MUSDB18/train",
)

# Matching output folders: each split path with the "-2s-44100" suffix.
music_test_output, music_train_output = (
    f"{_split}-2s-44100" for _split in (music_test, music_train)
)

In [15]:
# Disabled: size report for the MUSDB18 output folders.
if False:
    for _split_label, _output_dir in (
        ("Test", music_test_output),
        ("Train", music_train_output),
    ):
        size_gb = get_folder_size_gb(_output_dir)
        print(f"Music {_split_label} Folder size: {size_gb:.2f} GB")

In [16]:
# Disabled: generate the 2 s audio files for the MUSDB18 test and train folders.
if False:
    _musdb_sources = (music_test, music_train)
    _musdb_targets = (music_test_output, music_train_output)
    for input_path, output_path in zip(_musdb_sources, _musdb_targets):
        print(input_path, output_path)
        generate_audio_files(input_path, output_path, t=2)

## Process files to H5

In [17]:
from audioautoencoder.generate_dataset import *

In [None]:
GENERATE_H5_FILES = True
checkpoint_file_size = 100000

# Keyword arguments common to every MUSDB18 + noise configuration in this cell.
_shared_settings = dict(
    train_music_dir='/content/drive/MyDrive/Datasets/Music/MUSDB18/train-2s-44100',
    train_noise_dir='/content/drive/MyDrive/Datasets/Noise/splits_v2/train-2s-44100',
    test_music_dir='/content/drive/MyDrive/Datasets/Music/MUSDB18/test-2s-44100',
    test_noise_dir='/content/drive/MyDrive/Datasets/Noise/splits_v2/test-2s-44100',
    process_train=True,
    process_test=True,
    checkpoint_file_size=checkpoint_file_size,
)

# Active configuration: SNRdB=[-10, 10] with batches of 500.
if GENERATE_H5_FILES:
    processor = DatasetProcessor(
        output_dir='/content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level',
        SNRdB=[-10, 10],
        batch_size=500,
        **_shared_settings
    )
    processor.process()

# Disabled alternative configurations, processed one after another when enabled.
if False:
    for _variant in (
        dict(
            output_dir='/content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level',
            SNRdB=[0, 20],
            batch_size=100,
        ),
        dict(
            output_dir='/content/drive/MyDrive/Datasets/Music-Noise/SNRdB_all-noise_features',
            SNRdB=[10, 30],
        ),
        dict(
            output_dir='/content/drive/MyDrive/Datasets/Music-Noise/SNRdB_mix_features',
            SNRdB=[-10, 30],
            mix_only=True,
        ),
    ):
        processor = DatasetProcessor(**_shared_settings, **_variant)
        processor.process()

Output Dir: /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level
Train Checkpoint File: /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train-SNRdB_-10-10-checkpoint.txt
Train Output File: /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10.h5
Test Checkpoint File: /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/test-SNRdB_-10-10-checkpoint.txt
Test Output File: /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/test/test-SNRdB_-10-10.h5
Directory already exists: /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train
Directory already exists: /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/test
Processing training dataset....
Gathering wav files....
Error accessing directory: [Errno 5] Input/output error: '/content/drive/MyDrive/Dat

Processing batches:   0%|          | 0/200 [00:00<?, ?batch/s]

Creating HDF5 file....


Processing batches:   0%|          | 1/200 [08:32<28:19:52, 512.52s/batch]

Current file size: 1.7358970046043396
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_072409.h5
Free disk space: 185.45 GB
Error processing /content/drive/MyDrive/Datasets/Music/MUSDB18/train-2s-44100/Music Delta - Rockabilly_other_sec26.wav: operands could not be broadcast together with shapes (88200,) (85958,) 
Creating HDF5 file....


Processing batches:   1%|          | 2/200 [10:44<15:52:44, 288.71s/batch]

Current file size: 1.7324254661798477
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_072624.h5
Free disk space: 183.63 GB
Creating HDF5 file....


Processing batches:   2%|▏         | 3/200 [13:25<12:36:45, 230.48s/batch]

Current file size: 1.7358970046043396
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_072836.h5
Free disk space: 181.81 GB
Creating HDF5 file....


Processing batches:   2%|▏         | 4/200 [16:11<11:09:41, 205.01s/batch]

Current file size: 1.7358970046043396
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_073117.h5
Free disk space: 179.99 GB
Creating HDF5 file....


Processing batches:   2%|▎         | 5/200 [18:45<10:06:30, 186.62s/batch]

Current file size: 1.7358970046043396
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_073404.h5
Free disk space: 178.17 GB
Creating HDF5 file....


Processing batches:   3%|▎         | 6/200 [21:23<9:31:19, 176.70s/batch] 

Current file size: 1.7358970046043396
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_073639.h5
Free disk space: 176.35 GB
Creating HDF5 file....


Processing batches:   4%|▎         | 7/200 [24:06<9:14:45, 172.47s/batch]

Current file size: 1.7358970046043396
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_073915.h5
Free disk space: 174.54 GB
Error processing /content/drive/MyDrive/Datasets/Music/MUSDB18/train-2s-44100/Music Delta - Reggae_drums_sec18.wav: operands could not be broadcast together with shapes (88200,) (64504,) 
Creating HDF5 file....


Processing batches:   4%|▍         | 8/200 [26:42<8:55:01, 167.19s/batch]

Current file size: 1.7324254661798477
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_074158.h5
Free disk space: 172.72 GB
Error processing /content/drive/MyDrive/Datasets/Music/MUSDB18/train-2s-44100/Music Delta - Reggae_vocals_sec18.wav: operands could not be broadcast together with shapes (88200,) (64504,) 
Creating HDF5 file....


Processing batches:   4%|▍         | 9/200 [29:16<8:39:10, 163.09s/batch]

Current file size: 1.7324254661798477
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_074430.h5
Free disk space: 170.91 GB
Creating HDF5 file....


Processing batches:   5%|▌         | 10/200 [31:47<8:24:04, 159.18s/batch]

Current file size: 1.7358970046043396
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_074707.h5
Free disk space: 169.10 GB
Error processing /content/drive/MyDrive/Datasets/Music/MUSDB18/train-2s-44100/Music Delta - Rock_bass_sec14.wav: operands could not be broadcast together with shapes (88200,) (48120,) 
Creating HDF5 file....


Processing batches:   6%|▌         | 11/200 [34:26<8:20:58, 159.04s/batch]

Current file size: 1.7324254661798477
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_074934.h5
Free disk space: 167.29 GB
Error processing /content/drive/MyDrive/Datasets/Music/MUSDB18/train-2s-44100/Faces On Film - Waiting For Ga_other_sec258.wav: operands could not be broadcast together with shapes (88200,) (71985,) 
Creating HDF5 file....


Processing batches:   6%|▌         | 12/200 [37:06<8:19:14, 159.33s/batch]

Current file size: 1.7324254661798477
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_075215.h5
Free disk space: 165.48 GB
Creating HDF5 file....


Processing batches:   6%|▋         | 13/200 [39:39<8:11:15, 157.62s/batch]

Current file size: 1.7358970046043396
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_075453.h5
Free disk space: 163.67 GB
Creating HDF5 file....


Processing batches:   7%|▋         | 14/200 [42:15<8:06:47, 157.03s/batch]

Current file size: 1.7358970046043396
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_075728.h5
Free disk space: 161.86 GB
Creating HDF5 file....


Processing batches:   8%|▊         | 15/200 [45:02<8:13:52, 160.18s/batch]

Current file size: 1.7358970046043396
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_080004.h5
Free disk space: 160.05 GB
Creating HDF5 file....


Processing batches:   8%|▊         | 16/200 [47:40<8:09:09, 159.51s/batch]

Current file size: 1.7358970046043396
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_080250.h5
Free disk space: 158.25 GB
Creating HDF5 file....


Processing batches:   8%|▊         | 17/200 [50:10<7:57:51, 156.67s/batch]

Current file size: 1.7358970046043396
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_080528.h5
Free disk space: 156.44 GB
Creating HDF5 file....


Processing batches:   9%|▉         | 18/200 [52:43<7:51:07, 155.31s/batch]

Current file size: 1.7358970046043396
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_080757.h5
Free disk space: 154.63 GB
Creating HDF5 file....


Processing batches:  10%|▉         | 19/200 [55:26<7:55:52, 157.75s/batch]

Current file size: 1.7358970046043396
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_081029.h5
Free disk space: 152.99 GB
Creating HDF5 file....


Processing batches:  10%|█         | 20/200 [58:24<8:11:19, 163.77s/batch]

Current file size: 1.7358970046043396
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_081320.h5
Free disk space: 153.23 GB
Creating HDF5 file....


Processing batches:  10%|█         | 21/200 [1:01:29<8:27:55, 170.25s/batch]

Current file size: 1.7358970046043396
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_081626.h5
Free disk space: 153.30 GB
Error processing /content/drive/MyDrive/Datasets/Music/MUSDB18/train-2s-44100/Sweet Lights - You Let Me Down_other_sec392.wav: operands could not be broadcast together with shapes (88200,) (87400,) 
Creating HDF5 file....


Processing batches:  11%|█         | 22/200 [1:04:19<8:24:59, 170.22s/batch]

Current file size: 1.7324254661798477
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_081932.h5
Free disk space: 153.23 GB
Creating HDF5 file....


Processing batches:  12%|█▏        | 23/200 [1:07:15<8:27:12, 171.94s/batch]

Current file size: 1.7358970046043396
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_082220.h5
Free disk space: 153.21 GB
Error processing /content/drive/MyDrive/Datasets/Music/MUSDB18/train-2s-44100/Music Delta - Reggae_other_sec18.wav: operands could not be broadcast together with shapes (88200,) (64504,) 
Creating HDF5 file....


Processing batches:  12%|█▏        | 24/200 [1:10:15<8:31:24, 174.35s/batch]

Current file size: 1.7324254661798477
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_082514.h5
Free disk space: 153.15 GB
Creating HDF5 file....


Processing batches:  12%|█▎        | 25/200 [1:13:21<8:38:30, 177.78s/batch]

Current file size: 1.7358970046043396
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_082814.h5
Free disk space: 153.15 GB
Error processing /content/drive/MyDrive/Datasets/Music/MUSDB18/train-2s-44100/Music Delta - Rock_vocals_sec14.wav: operands could not be broadcast together with shapes (88200,) (48120,) 
Creating HDF5 file....


Processing batches:  13%|█▎        | 26/200 [1:16:14<8:31:29, 176.37s/batch]

Current file size: 1.7324254661798477
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_083122.h5
Free disk space: 153.10 GB
Creating HDF5 file....


Processing batches:  14%|█▎        | 27/200 [1:19:22<8:38:36, 179.86s/batch]

Current file size: 1.7358970046043396
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_083412.h5
Free disk space: 153.16 GB
Creating HDF5 file....


Processing batches:  14%|█▍        | 28/200 [1:22:12<8:26:54, 176.83s/batch]

Current file size: 1.7358970046043396
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_083718.h5
Free disk space: 153.20 GB
Error processing /content/drive/MyDrive/Datasets/Music/MUSDB18/train-2s-44100/Music Delta - Rock_vocals_sec14.wav: operands could not be broadcast together with shapes (88200,) (48120,) 
Error processing /content/drive/MyDrive/Datasets/Music/MUSDB18/train-2s-44100/Hezekiah Jones - Borrowed Heart_bass_sec242.wav: operands could not be broadcast together with shapes (88200,) (69609,) 
Creating HDF5 file....


Processing batches:  14%|█▍        | 29/200 [1:25:04<8:20:07, 175.48s/batch]

Current file size: 1.7289539277553558
Done /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10_20250408_084013.h5
Free disk space: 153.11 GB


In [None]:
# Additional-music + noise configuration with SNRdB=[-10, 20] and batches of 200.
_additional_music_settings = dict(
    train_music_dir='/content/drive/MyDrive/Datasets/Music/Additional_Music/splits_v2/train-2s-44100',
    train_noise_dir='/content/drive/MyDrive/Datasets/Noise/splits_v2/train-2s-44100',
    test_music_dir='/content/drive/MyDrive/Datasets/Music/Additional_Music/splits_v2/test-2s-44100',
    test_noise_dir='/content/drive/MyDrive/Datasets/Noise/splits_v2/test-2s-44100',
    output_dir='/content/drive/MyDrive/Datasets/Music-Noise/SNRdB_all-noise_features_2',
    SNRdB=[-10, 20],
    process_train=True,
    process_test=True,
    checkpoint_file_size=100000,
    batch_size=200,
)
processor = DatasetProcessor(**_additional_music_settings)
processor.process()

---