# Dataset Generation

## Notes:
1. Right now the data generation creates and saves every feature. This is not always necessary, and it will probably increase the overall run time.
2. A better approach is a dictionary that selects which features to generate, so that the features I need exist when I need them.

In [1]:
import sys
import os

# Detect Google Colab by checking whether the google.colab module has been loaded.
# When it has, mount Google Drive at /content/drive; the dataset paths used later
# in this notebook all live under that mount point.
if "google.colab" in sys.modules:
  from google.colab import drive
  drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Bring the audioautoencoder checkout onto the dataset-generation-fix branch.
def _run(command):
    """Run a shell command with os.system and warn when it exits non-zero.

    Returns the raw status from os.system. Failures are reported rather than
    raised so the cell stays best-effort, as it was before.
    """
    status = os.system(command)
    if status != 0:
        print(f"Warning: '{command}' exited with status {status}")
    return status


if "google.colab" in sys.modules:
    print("Running in Google Colab...")
    # Clone to an explicit location, and only when no checkout exists yet.
    # A bare `git clone <url>` targets the current working directory, which is
    # the repository itself once the chdir below has run (e.g. on a re-run).
    if not os.path.isdir("/content/audioautoencoder/.git"):
        _run("git clone https://github.com/CiaranMaloy/audioautoencoder /content/audioautoencoder")
    os.chdir("/content/audioautoencoder/")
    _run("git pull")
    _run("git checkout dataset-generation-fix")
    _run("git pull origin dataset-generation-fix")
    #os.system("pip install --upgrade torchmetrics")
else:
    print("Running locally...")
    # Assumes the notebook was started from inside a checkout of the repository.
    _run("git pull origin dataset-generation-fix")
    #os.system("pip install --upgrade torchmetrics")


Running in Google Colab...


In [3]:
import sys
# Make the cloned repository and its inner package importable. Only missing entries
# are added, so re-running this cell does not pile up duplicates in sys.path.
for repo_path in ('/content/audioautoencoder', '/content/audioautoencoder/audioautoencoder'):
    if repo_path not in sys.path:
        sys.path.append(repo_path)

## Data Generation

In [4]:
from audioautoencoder.data_management import *
# Switch for the generation cells below, which are guarded by `if GENERATE:`;
# while it is False those cells only define paths and skip the generation calls.
GENERATE=False

In [5]:
def get_folder_size_gb(folder_path):
    """Return the total size of all files under ``folder_path`` in gigabytes.

    The directory tree is walked recursively with ``os.walk``. Sizes are summed
    as whole bytes and converted once at the end (1 GB = 1024 ** 3 bytes), so
    no rounding error accumulates per file.

    Args:
        folder_path: Root directory to measure. A path that does not exist
            yields 0, because ``os.walk`` produces nothing for it.

    Returns:
        Total size in gigabytes as a float.
    """
    total_bytes = 0
    for dirpath, _dirnames, filenames in os.walk(folder_path):
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            try:
                total_bytes += os.path.getsize(file_path)
            except OSError:
                # The file vanished between listing and stat, or is a broken
                # symlink: skip it, as the previous exists() check did.
                continue
    return total_bytes / (1024 ** 3)

In [6]:
# Noise corpus: source folder list and the root handed to the split helpers.
output_dir = "/content/drive/MyDrive/Datasets/Noise/splits_v2"
dataset_dirs = ["/content/drive/MyDrive/Datasets/Noise/All_Noise"]

if GENERATE:
  splits = create_datasets(dataset_dirs, output_dir)
  save_splits_to_directories(splits, output_dir, max_workers=8)
  # Report the length of each split.
  for heading, split_key in (("Training Set:", "train"), ("Validation Set:", "val"), ("Testing Set:", "test")):
    print(heading, len(splits[split_key]))


In [7]:
from audioautoencoder.data import *

In [8]:
# Noise split folders sit directly under the current output_dir.
# NOTE(review): output_dir is reassigned in the "Generic Music Dataset" section,
# so re-running this cell after that section would point these names at music folders.
noise_test, noise_train = (output_dir + split_suffix for split_suffix in ("/test", "/train"))

# Print the on-disk size of each split folder; size_gb is reused for both.
size_gb = get_folder_size_gb(noise_test)
print(f"Test Folder total size: {size_gb:.2f} GB")
size_gb = get_folder_size_gb(noise_train)
print(f"Train Folder total size: {size_gb:.2f} GB")

Test Folder total size: 0.98 GB
Train Folder total size: 2.95 GB


In [9]:
# Output folders for the noise clips: the split folder name plus a "-2s-44100" suffix.
noise_test_output, noise_train_output = (
    split_dir + "-2s-44100" for split_dir in (noise_test, noise_train)
)

if GENERATE:
  # Pair each split folder with its output folder and generate the clips (t=2).
  for input_path, output_path in zip((noise_test, noise_train), (noise_test_output, noise_train_output)):
    print(input_path, output_path)
    generate_audio_files(input_path, output_path, t=2, min_size=0.005)

In [10]:
# Print the on-disk size of the generated noise output folders.
# size_gb is overwritten by each measurement, so it ends up holding the train value.
size_gb = get_folder_size_gb(noise_test_output)
print(f"Noise Test Folder size: {size_gb:.2f} GB")

size_gb = get_folder_size_gb(noise_train_output)
print(f"Noise Train Folder size: {size_gb:.2f} GB")

Noise Test Folder size: 0.49 GB
Noise Train Folder size: 1.24 GB


## Generic Music Dataset

In [11]:
# Personal music corpus (MP3 sources). This re-sets GENERATE and replaces the
# noise values of dataset_dirs / output_dir defined earlier in the notebook.
GENERATE = False
output_dir = "/content/drive/MyDrive/Datasets/Music/Additional_Music/splits_v2"
dataset_dirs = ["/content/drive/Othercomputers/My Mac/PersonalMusic"]

if GENERATE:
  splits = create_datasets(dataset_dirs, output_dir, WAV=False, MP3=True)
  # Report the length of each split.
  for heading, split_key in (("Training Set:", "train"), ("Validation Set:", "val"), ("Testing Set:", "test")):
    print(heading, len(splits[split_key]))


In [12]:
# Write the splits out under output_dir. Uses `splits` from the create_datasets call
# in the preceding cell, which is only assigned when GENERATE is True.
if GENERATE:
  save_splits_to_directories(splits, output_dir, max_workers=8)

In [13]:
# Music split folders under the current output_dir ...
music_test, music_train = (output_dir + split_suffix for split_suffix in ("/test", "/train"))

# ... and their output folders, named with a "-2s-44100" suffix.
music_test_output, music_train_output = (
    split_dir + "-2s-44100" for split_dir in (music_test, music_train)
)

if GENERATE:
  # Pair each split folder with its output folder and generate the clips (t=2).
  for input_path, output_path in zip((music_test, music_train), (music_test_output, music_train_output)):
    print(input_path, output_path)
    generate_audio_files(input_path, output_path, t=2, min_size=0.005)

## Stuff for MUSDB18


In [14]:
# Re-point the music_* names at the MUSDB18 folders; this replaces the
# Additional_Music paths assigned in the previous section.
music_train = "/content/drive/MyDrive/Datasets/Music/MUSDB18/train"
music_test = "/content/drive/MyDrive/Datasets/Music/MUSDB18/test"

# Output folders reuse the source folder name plus a "-2s-44100" suffix.
music_test_output, music_train_output = (
    source_dir + "-2s-44100" for source_dir in (music_test, music_train)
)

In [15]:
# Disabled (`if False`): when enabled, prints the on-disk size of the folders named by
# music_test_output / music_train_output (the MUSDB18 paths if the cell above ran last).
if False:
  size_gb = get_folder_size_gb(music_test_output)
  print(f"Music Test Folder size: {size_gb:.2f} GB")

  size_gb = get_folder_size_gb(music_train_output)
  print(f"Music Train Folder size: {size_gb:.2f} GB")

In [16]:
# Disabled (`if False`): when enabled, generates audio files (t=2) from the music
# test/train folders into their "-2s-44100" output folders.
# NOTE(review): unlike the earlier generate_audio_files calls in this notebook,
# no min_size argument is passed here -- confirm that is intentional.
if False:
  for input_path, output_path in [(music_test, music_test_output), (music_train, music_train_output)]:
    print(input_path, output_path)
    generate_audio_files(input_path, output_path, t=2)

## Process files to H5

In [17]:
from audioautoencoder.generate_dataset import *

In [None]:
# Master switch for the enabled runs below, and the base checkpoint_file_size they share.
GENERATE_H5_FILES = True
checkpoint_file_size = 100000

# Input folders shared by every MUSDB18 + noise run in this cell.
musdb_noise_dirs = dict(
    train_music_dir='/content/drive/MyDrive/Datasets/Music/MUSDB18/train-2s-44100',
    train_noise_dir='/content/drive/MyDrive/Datasets/Noise/splits_v2/train-2s-44100',
    test_music_dir='/content/drive/MyDrive/Datasets/Music/MUSDB18/test-2s-44100',
    test_noise_dir='/content/drive/MyDrive/Datasets/Noise/splits_v2/test-2s-44100',
)

# One entry per DatasetProcessor run: (enabled, run-specific keyword arguments).
# Disabled entries are kept so they can be switched back on without retyping them.
h5_runs = [
    # SNRdB [-10, 10]
    (GENERATE_H5_FILES, dict(
        output_dir='/content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level',
        SNRdB=[-10, 10],
        checkpoint_file_size=checkpoint_file_size,
        batch_size=500,
    )),
    # Same SNRdB with mix_only=True, using a quarter of the base checkpoint_file_size.
    (GENERATE_H5_FILES, dict(
        output_dir='/content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level_mix-only',
        SNRdB=[-10, 10],
        mix_only=True,
        checkpoint_file_size=checkpoint_file_size//4,
        batch_size=500,
    )),
    # Disabled: SNRdB [0, 20] into the same output root as the first run.
    (False, dict(
        output_dir='/content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level',
        SNRdB=[0, 20],
        checkpoint_file_size=checkpoint_file_size,
        batch_size=100,
    )),
    # Disabled: SNRdB [10, 30]; no batch_size given, so DatasetProcessor's default applies.
    (False, dict(
        output_dir='/content/drive/MyDrive/Datasets/Music-Noise/SNRdB_all-noise_features',
        SNRdB=[10, 30],
        checkpoint_file_size=checkpoint_file_size,
    )),
]

# Run the enabled configurations in order; `processor` ends up bound to the last one run.
for run_enabled, run_kwargs in h5_runs:
    if not run_enabled:
        continue
    processor = DatasetProcessor(
        process_train=True,
        process_test=True,
        **musdb_noise_dirs,
        **run_kwargs,
    )
    processor.process()

Output Dir: /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level
Train Checkpoint File: /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train-SNRdB_-10-10-checkpoint.txt
Train Output File: /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train/train-SNRdB_-10-10.h5
Test Checkpoint File: /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/test-SNRdB_-10-10-checkpoint.txt
Test Output File: /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/test/test-SNRdB_-10-10.h5
Directory already exists: /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/train
Directory already exists: /content/drive/MyDrive/Datasets/Music-Noise/SNRdB_remapped-signal-level/SNRdB_-10-10/test
Processing training dataset....
Gathering wav files....
Files processed:  46733
Gathering noise files....
Files processed:  7550
Checkpoint 

Processing batches:   0%|          | 0/106 [00:00<?, ?batch/s]

In [None]:
# Additional_Music run: Additional_Music clips with the noise splits_v2 clips, SNRdB=[-10, 20].
additional_music_run = {
    "train_music_dir": '/content/drive/MyDrive/Datasets/Music/Additional_Music/splits_v2/train-2s-44100',
    "train_noise_dir": '/content/drive/MyDrive/Datasets/Noise/splits_v2/train-2s-44100',
    "test_music_dir": '/content/drive/MyDrive/Datasets/Music/Additional_Music/splits_v2/test-2s-44100',
    "test_noise_dir": '/content/drive/MyDrive/Datasets/Noise/splits_v2/test-2s-44100',
    "output_dir": '/content/drive/MyDrive/Datasets/Music-Noise/SNRdB_all-noise_features_2',
    "SNRdB": [-10, 20],
    "process_train": True,
    "process_test": True,
    "checkpoint_file_size": 100000,
    "batch_size": 200,
}

# Build the processor from the settings above and run it.
processor = DatasetProcessor(**additional_music_run)
processor.process()

---