# Creating Synthetic Overlapping Audio

## Jessica Stinson
## s224576666

The following notebook provides a basic script for creating a synthetic audio dataset. When run, the script writes a copy of every original audio file to the output directory (resampled to the configured sample rate and padded or trimmed to the configured clip duration), then creates a set of new files, each containing either two or three overlapping vocalisations. The 'SYNTHETIC_SIZE' variable can be altered to change the number of synthetic audio files added to the dataset. The ratio of original to synthetic audio files used in this example was approximately 2:3 (268 original files to 400 synthetic files). 

In [19]:
# Suppress warnings 
import warnings
warnings.filterwarnings("ignore")

# Set TensorFlow environment
import os
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

import soundfile as sf 
from pathlib import Path
import librosa
import numpy as np
import random 
import tensorflow as tf
import json
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer

# Notebook configuration: every value a reader might tune lives in this dict.
# The two directories default to the original author's local paths but can be
# overridden without editing the notebook by setting the environment variables
# SYNTH_AUDIO_DATA_DIRECTORY and SYNTH_OUTPUT_DIR before starting the kernel.
SC = {
    # Root of the original dataset (one sub-directory per class).
    'AUDIO_DATA_DIRECTORY': os.environ.get(
        'SYNTH_AUDIO_DATA_DIRECTORY',
        r"C:\Project-Echo\src\Prototypes\engine\Working with overlapping audio\Synthetic Dataset tests\originals",
    ),
    'AUDIO_SAMPLE_RATE': 48000,  # target sample rate passed to librosa / soundfile
    'AUDIO_CLIP_DURATION': 5, # seconds
    'SYNTHETIC_SIZE': 400,  # number of synthetic mixes to generate
    # Destination for both the re-written originals and the synthetic mixes.
    'OUTPUT_DIR': os.environ.get(
        'SYNTH_OUTPUT_DIR',
        r"C:\Project-Echo\src\Prototypes\engine\Working with overlapping audio\Synthetic Dataset",
    ),
    # Seed for the `random` module so a Restart & Run All reproduces the same mixes.
    'RANDOM_SEED': 42,
}

# Seed early so every later use of `random` in the notebook is reproducible.
random.seed(SC['RANDOM_SEED'])

# Short aliases used by the cells below.
audio_dir = Path(SC['AUDIO_DATA_DIRECTORY'])
output_dir = Path(SC['OUTPUT_DIR'])
synthetic_size = SC['SYNTHETIC_SIZE']
SR = SC['AUDIO_SAMPLE_RATE']
duration = SC['AUDIO_CLIP_DURATION']

In [20]:
def index_directory(directory, file_types=('.ogg', '.mp3', '.wav', '.flac')):
    """
    Index the audio files of a dataset laid out as one sub-directory per class.

    Args:
        directory (str or Path): The root directory containing one subdirectory per class.
        file_types (tuple, optional): Allowed audio file extensions, including the
            leading dot. Matching is case-insensitive, so '.wav' also accepts '.WAV'.
            Defaults to ('.ogg', '.mp3', '.wav', '.flac').

    Returns:
        tuple:
            - file_paths (list of str): Full paths of the audio files found, grouped by
              class (in class_names order) and sorted by path within each class.
            - labels (numpy.ndarray): Multi-hot label matrix produced by
              MultiLabelBinarizer, one row per entry of file_paths and one column
              per entry of class_names.
            - class_names (list of str): Sorted list of class names.

    Notes:
        Each immediate subdirectory of 'directory' is treated as a class, and column
        indices follow the alphabetical order of the class names. Class directories
        are searched recursively, so files in nested folders belong to the enclosing
        top-level class.
    """
    root = Path(directory)
    class_names = sorted(entry.name for entry in root.glob('*') if entry.is_dir())

    # Accept a bare string for convenience and compare extensions case-insensitively.
    if isinstance(file_types, str):
        file_types = (file_types,)
    allowed_suffixes = {suffix.lower() for suffix in file_types}

    # Map each file path to the list of class indices it belongs to.
    file_label_map = {}
    for label_idx, class_name in enumerate(class_names):
        # Sort the glob results: raw glob order depends on the filesystem, which
        # would make downstream file numbering differ between machines.
        for file_path in sorted((root / class_name).glob('**/*')):
            # is_file() skips directories that merely happen to end in an audio suffix.
            if file_path.is_file() and file_path.suffix.lower() in allowed_suffixes:
                file_label_map.setdefault(str(file_path), []).append(label_idx)

    # Convert the map to parallel lists for dataset creation
    file_paths = list(file_label_map)
    labels_list = list(file_label_map.values())

    # Use MultiLabelBinarizer to create multi-hot encoded labels; passing `classes`
    # fixes the column order to the class indices even if a class has no files.
    mlb = MultiLabelBinarizer(classes=range(len(class_names)))
    labels = mlb.fit_transform(labels_list)

    # Drop any file whose label row is all zeros.
    valid_indices = [i for i, label in enumerate(labels) if any(label)]
    file_paths = [file_paths[i] for i in valid_indices]
    labels = labels[valid_indices]

    return file_paths, labels, class_names

In [21]:
# Load original audio dataset 
file_paths, labels, class_names = index_directory(audio_dir)

# Fail fast with a readable message: an empty index would otherwise only surface
# later as an obscure error when files are sampled for mixing.
if len(file_paths) == 0:
    raise FileNotFoundError(
        f"No audio files found under {audio_dir}; expected one sub-directory per class."
    )

In [22]:
# Copy original audio to output directory
# Each original is re-written as a WAV at the configured sample rate, padded or
# trimmed to the configured clip length, and recorded (path + label) as the first
# part of the combined dataset.

# soundfile does not create missing folders, so make sure the destination exists.
output_dir.mkdir(parents=True, exist_ok=True)

clip_samples = int(duration * SR)  # samples per output clip

combined_file_paths = []
synthetic_labels = []

for i, (src_path, label) in enumerate(zip(file_paths, labels)):
    # Take the species from the label rather than the parent folder name:
    # index_directory searches class folders recursively, so a file's immediate
    # parent may be a nested sub-folder instead of the class directory.
    species_name = class_names[int(np.argmax(label))].replace(" ", "_").lower()
    dst_path = output_dir / f"{species_name}_{i + 1}.wav"

    audio, _ = librosa.load(src_path, sr=SR)
    audio = librosa.util.fix_length(audio, size=clip_samples)
    sf.write(dst_path, audio, SR)

    combined_file_paths.append(str(dst_path))
    synthetic_labels.append(label)

print(f"Copied and normalized {len(file_paths)} original files into: {output_dir}")

Copied and normalized 268 original files into: C:\Project-Echo\src\Prototypes\engine\Working with overlapping audio\Synthetic Dataset


In [23]:
# Generate the synthetic mixes: each one sums two or three randomly chosen original
# clips and carries the union of their labels.

n_originals = len(file_paths)
if n_originals < 2:
    raise ValueError(
        f"Need at least 2 original audio files to create a mix, found {n_originals}."
    )
max_mix = min(3, n_originals)  # cannot mix more distinct files than exist
clip_samples = int(duration * SR)

# soundfile does not create missing folders, so make sure the destination exists.
output_dir.mkdir(parents=True, exist_ok=True)

# A dedicated, seeded generator makes this cell reproducible and re-runnable
# without depending on (or disturbing) the global `random` state.
rng = random.Random(SC.get('RANDOM_SEED', 42))

# Collect results in fresh lists so re-running this cell never appends to the
# output of a previous run.
new_file_paths = []
new_label_rows = []

for i in range(synthetic_size):
    num_to_mix = rng.randint(2, max_mix)
    indices = rng.sample(range(n_originals), num_to_mix)

    synthetic_audio = np.zeros(clip_samples, dtype = np.float32)
    label_vector = np.zeros(len(class_names), dtype = np.float32)

    for idx in indices:
        audio, _ = librosa.load(file_paths[idx], sr = SR)
        synthetic_audio += librosa.util.fix_length(audio, size = clip_samples)
        label_vector += labels[idx]

    # Two sources of the same class must still yield 1, not 2.
    np.clip(label_vector, 0, 1, out = label_vector)

    # Normalize to prevent clipping
    max_val = np.max(np.abs(synthetic_audio))
    if max_val > 1.0:
        synthetic_audio = synthetic_audio / max_val

    dst_path = output_dir / f"synthetic_{i + 1}.wav"
    sf.write(dst_path, synthetic_audio, SR)

    new_file_paths.append(str(dst_path))
    new_label_rows.append(label_vector)

    if i == 0 or (i + 1) % 100 == 0 or i == synthetic_size - 1:
        print(f"Generated {i + 1}/{synthetic_size} synthetic samples")

# Combined dataset = originals (the first n_originals entries written by the copy
# cell) followed by the new mixes. Slicing keeps this correct on a re-run, when
# both variables already contain synthetic entries from the previous run.
combined_file_paths = list(combined_file_paths[:n_originals]) + new_file_paths
synthetic_labels = np.array(list(synthetic_labels[:n_originals]) + new_label_rows)

# Save labels
np.save(output_dir / "synthetic_labels.npy", synthetic_labels)

# Save class names and file paths
with open(output_dir / "class_names.json", "w") as f:
    json.dump(class_names, f)

with open(output_dir / "file_paths.json", "w") as f:
    json.dump(combined_file_paths, f)

print("Synthetic dataset generation complete.")
print(f"\nCombined dataset created in {output_dir}")
print(f"Total files: {len(combined_file_paths)}")

Generated 1/400 synthetic samples
Generated 100/400 synthetic samples
Generated 200/400 synthetic samples
Generated 300/400 synthetic samples
Generated 400/400 synthetic samples
Synthetic dataset generation complete.

Combined dataset created in C:\Project-Echo\src\Prototypes\engine\Working with overlapping audio\Synthetic Dataset
Total files: 668
