# Dataset Generation

In [1]:
import sys
import os

# Mount Google Drive, but only when the kernel is a Google Colab runtime
# (detected by the `google.colab` package already being loaded).
running_in_colab = "google.colab" in sys.modules
if running_in_colab:
    from google.colab import drive

    drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Fetch the project sources: on Colab clone the repository and switch to the
# `bandchannels` branch; on a local checkout just pull that branch.
if "google.colab" not in sys.modules:
    print("Running locally...")
    os.system("git pull origin bandchannels")
    #os.system("pip install --upgrade torchmetrics")
else:
    print("Running in Google Colab...")
    os.system("git clone https://github.com/CiaranMaloy/audioautoencoder")
    # Every following git command must run from inside the cloned repository.
    os.chdir("/content/audioautoencoder/")
    for git_command in (
        "git pull",
        "git checkout bandchannels",
        "git pull origin bandchannels",
    ):
        os.system(git_command)
    #os.system("pip install --upgrade torchmetrics")


Running in Google Colab...


In [3]:
import sys
# Make both the cloned repository root and its inner package directory
# importable from this notebook.
sys.path.extend([
    '/content/audioautoencoder',
    '/content/audioautoencoder/audioautoencoder',
])

## Data Generation

In [4]:
from audioautoencoder.data_management import *
# Master switch for the `if GENERATE:` cells below; set to False to skip them.
GENERATE = True

In [16]:
def get_folder_size_gb(folder_path):
    """Return the combined size of all files below ``folder_path`` in gigabytes.

    The tree is traversed recursively with ``os.walk``. Files whose size
    cannot be read (for example broken symlinks, or files that disappear
    while the walk is in progress) are skipped instead of aborting the scan.

    Args:
        folder_path: Root directory to measure.

    Returns:
        float: Total size expressed as bytes / 1024**3. ``0.0`` when no
        readable file is found under ``folder_path``.
    """
    total_bytes = 0
    for dirpath, _dirnames, filenames in os.walk(folder_path):
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            try:
                # Ask for the size directly: a separate existence check could
                # still be invalidated before getsize() runs.
                total_bytes += os.path.getsize(file_path)
            except OSError:
                continue
    # Convert bytes to gigabytes once, after summing.
    return total_bytes / (1024 ** 3)

In [6]:
# Split the raw noise recordings into train / val / test sets.
# The path definitions sit outside the GENERATE guard because later cells
# build on `output_dir` even when the splits are not regenerated; keeping
# them inside the guard raised a NameError there when GENERATE was False.
dataset_dirs = ["/content/drive/MyDrive/Datasets/Noise/All_Noise"]
output_dir = "/content/drive/MyDrive/Datasets/Noise/splits_v2"

if GENERATE:
  # create_datasets is a project helper; its result is indexed by
  # "train", "val" and "test" below.
  splits = create_datasets(dataset_dirs, output_dir)
  print("Training Set:", len(splits["train"]))
  print("Validation Set:", len(splits["val"]))
  print("Testing Set:", len(splits["test"]))


Splits saved to /content/drive/MyDrive/Datasets/Noise/splits_v2/splits.pkl. Resume state saved to /content/drive/MyDrive/Datasets/Noise/splits_v2/split_state.pkl.
Training Set: 1318
Validation Set: 219
Testing Set: 661


In [7]:
# Write the computed splits out under `output_dir` (project helper),
# using 8 workers.
if GENERATE:
    save_splits_to_directories(
        splits,
        output_dir,
        max_workers=8,
    )

/content/drive/MyDrive/Datasets/Noise/splits_v2/train/5-260433-A-39_hteCGh.wav
/content/drive/MyDrive/Datasets/Noise/splits_v2/train/5-213836-C-9_ZIKViF.wav
/content/drive/MyDrive/Datasets/Noise/splits_v2/train/5-170338-A-41_1ZGsk5.wav
/content/drive/MyDrive/Datasets/Noise/splits_v2/train/4-185619-A-21_n6Grem.wav
/content/drive/MyDrive/Datasets/Noise/splits_v2/train/3-188726-A-35_7rMMfN.wav
/content/drive/MyDrive/Datasets/Noise/splits_v2/train/2-119161-A-8_KKkM3u.wav
/content/drive/MyDrive/Datasets/Noise/splits_v2/train/booo0008_5c1f6V.wav
/content/drive/MyDrive/Datasets/Noise/splits_v2/train/1-72229-B-6_qnJMb3.wav
/content/drive/MyDrive/Datasets/Noise/splits_v2/train/2-37806-D-40_HNspYz.wav/content/drive/MyDrive/Datasets/Noise/splits_v2/train/booh0002_CQjzCv.wav

/content/drive/MyDrive/Datasets/Noise/splits_v2/train/5-215658-B-12_i1t9vy.wav
/content/drive/MyDrive/Datasets/Noise/splits_v2/train/5-253085-B-3_VFgohB.wav
/content/drive/MyDrive/Datasets/Noise/splits_v2/train/5-233787-A-7_i

In [8]:
from audioautoencoder.data import *

In [9]:
# Locations of the test / train noise splits below the split output folder.
noise_test, noise_train = (output_dir + sub for sub in ("/test", "/train"))

# Report the on-disk footprint of each split.
size_gb = get_folder_size_gb(folder_path=noise_test)
print(f"Test Folder total size: {size_gb:.2f} GB")

size_gb = get_folder_size_gb(folder_path=noise_train)
print(f"Train Folder total size: {size_gb:.2f} GB")

Test Folder total size: 1.27 GB
Train Folder total size: 2.66 GB


In [10]:
# Cut the noise splits into fixed-length clips (t=2, i.e. the "2s" in the
# output folder names).
# The output paths are defined outside the GENERATE guard because the
# following size-report cell reads them even when slicing is skipped;
# defining them inside the guard raised a NameError there when GENERATE
# was False.
noise_test_output = noise_test + "-2s-44100"
noise_train_output = noise_train + "-2s-44100"

if GENERATE:
  for input_path, output_path in [(noise_test, noise_test_output), (noise_train, noise_train_output)]:
    print(input_path, output_path)
    # NOTE(review): min_size=0.005 is passed for noise only (the music cell
    # omits it) — its unit/meaning is defined by generate_audio_files; confirm.
    generate_audio_files(input_path, output_path, t=2, min_size=0.005)

/content/drive/MyDrive/Datasets/Noise/splits_v2/test /content/drive/MyDrive/Datasets/Noise/splits_v2/test-2s-44100


Processing files: 100%|██████████| 661/661 [01:41<00:00,  6.53it/s]


Audio files have been split and saved.
/content/drive/MyDrive/Datasets/Noise/splits_v2/train /content/drive/MyDrive/Datasets/Noise/splits_v2/train-2s-44100


Processing files: 100%|██████████| 1318/1318 [04:07<00:00,  5.32it/s]

Audio files have been split and saved.





In [11]:
# Report the on-disk footprint of the sliced noise clips, per split.
size_gb = get_folder_size_gb(folder_path=noise_test_output)
print(f"Test Folder size: {size_gb:.2f} GB")

size_gb = get_folder_size_gb(folder_path=noise_train_output)
print(f"Train Folder size: {size_gb:.2f} GB")

Test Folder size: 0.50 GB
Train Folder size: 1.32 GB


In [17]:
# Source folders of the MUSDB18 music splits on Google Drive.
music_test, music_train = (
    "/content/drive/MyDrive/Datasets/Music/MUSDB18/test",
    "/content/drive/MyDrive/Datasets/Music/MUSDB18/train",
)

# Matching folders for the sliced clips: same path plus a "-2s-44100" suffix.
music_test_output, music_train_output = (
    source_dir + "-2s-44100" for source_dir in (music_test, music_train)
)

In [18]:
# Report the on-disk footprint of the sliced music clips, per split.
size_gb = get_folder_size_gb(folder_path=music_test_output)
print(f"Music Test Folder size: {size_gb:.2f} GB")

size_gb = get_folder_size_gb(folder_path=music_train_output)
print(f"Music Train Folder size: {size_gb:.2f} GB")

Music Test Folder size: 4.21 GB
Music Train Folder size: 7.68 GB


In [12]:
# Slice the music splits into clips (t=2). Currently disabled: the guard is
# a constant False, so nothing in this cell runs.
if False:
  music_jobs = (
      (music_test, music_test_output),
      (music_train, music_train_output),
  )
  for input_path, output_path in music_jobs:
    print(input_path, output_path)
    generate_audio_files(input_path, output_path, t=2)

## Process files to H5

In [13]:
# Switch for the (expensive) H5 feature generation below.
GENERATE_H5_FILES = False

if GENERATE_H5_FILES:
  checkpoint_file_size=50000

  # Arguments shared by every DatasetProcessor run.
  # NOTE(review): the noise directories point at
  # ".../Noise/All_Noise/splits_v2/...", whereas the split and slicing cells
  # earlier in this notebook write to ".../Noise/splits_v2/...". Paths are
  # kept as they were; confirm which location is intended before enabling.
  common_kwargs = dict(
      train_music_dir='/content/drive/MyDrive/Datasets/Music/MUSDB18/train-2s-44100',
      train_noise_dir='/content/drive/MyDrive/Datasets/Noise/All_Noise/splits_v2/train-2s-44100',
      test_music_dir='/content/drive/MyDrive/Datasets/Music/MUSDB18/test-2s-44100',
      test_noise_dir='/content/drive/MyDrive/Datasets/Noise/All_Noise/splits_v2/test-2s-44100',
      process_train=True,
      process_test=True,
      checkpoint_file_size=checkpoint_file_size,
  )

  sep_features_dir = '/content/drive/MyDrive/Datasets/Music-Noise/SNRdB_sep_features'
  mix_features_dir = '/content/drive/MyDrive/Datasets/Music-Noise/SNRdB_mix_features'

  # One entry per run, in execution order. Only the last run passes
  # mix_only; the others leave it at DatasetProcessor's default.
  h5_runs = [
      dict(output_dir=sep_features_dir, SNRdB=[-10, 10]),
      dict(output_dir=sep_features_dir, SNRdB=[0, 20]),
      dict(output_dir=sep_features_dir, SNRdB=[10, 30]),
      dict(output_dir=mix_features_dir, SNRdB=[-10, 30], mix_only=True),
  ]

  for run_kwargs in h5_runs:
    processor = DatasetProcessor(**common_kwargs, **run_kwargs)
    processor.process()

---