# Loader for Space Divers Mini sound lib

In [1]:
# Name of this sound library; also used as its directory name under ./data
SOURCE_NAME = "space_divers_mini"
# Where the untouched download lives, and the sub-folder holding the audio samples
ORIGINAL_PATH = f"./data/original/{SOURCE_NAME}/"
SAMPLES_PATH = f"{ORIGINAL_PATH}Samples/"
# Where the hashed copies and the metadata file are written
TARGET_PATH = f"./data/intermediate/{SOURCE_NAME}/"

## Get the filenames to parse

In [2]:
import os

In [3]:
# Lower-case suffixes of the audio formats this loader picks up
audio_extensions = (".wav", ".mp3", ".flac", ".aac", ".m4a", ".ogg", ".opus")

def load_filenames(path: str) -> list[str]:
    """Load a list of audio filenames in the directory.

    Only the direct entries of ``path`` are considered, in the order that
    ``os.listdir`` returns them. The returned names keep their original
    spelling and do not include ``path``.

    The extension is matched case-insensitively, so ``"Kick.WAV"`` is kept
    just like ``"kick.wav"``.
    """
    # audio_extensions is all lower-case, so compare against the lower-cased name
    return [filename for filename in os.listdir(path)
            if filename.lower().endswith(audio_extensions)]

In [4]:
# List the audio files found in the library's Samples directory
filenames = load_filenames(SAMPLES_PATH)
# Preview the first five names
filenames[:5]

['99S LT Ambience Underwater - Coral Sea.wav',
 '99S LT Manipulate - Distant.wav',
 '99S LT Impact - Crash Distant A.wav',
 '99S LT Underwater - Move D.wav',
 '99S LT Vehicle - Resonation Slow.wav']

In [5]:
# Extension of every file: the text after the last dot, without the dot itself
extensions = [name.rsplit(".", maxsplit=1)[-1] for name in filenames]
# One extension per filename, so the two lists can be zipped later
assert len(extensions) == len(filenames)
extensions[:5]

['wav', 'wav', 'wav', 'wav', 'wav']

## Parse the files into labels

All files are added to the `Sci_Fi` category.

The titles follow one of two patterns:  
`<space separated categories> - <specific label>.wav`  
`<specific label>.wav`  

The categories are separated by spaces, but may also be separated by `-`.  
The specific labels can have `A B C...` variants which get ignored.

In [6]:
import string
import stringcase

In [18]:
def normalise_label(label: str) -> str:
    """Normalise a raw label into the form used in the metadata.

    The label is lower-cased and stripped of surrounding whitespace, then
    converted with ``stringcase.snakecase`` followed by
    ``stringcase.capitalcase`` (the notebook output shows ``"Coral Sea"``
    becoming ``"Coral_sea"``).
    """
    # We have to lower() first otherwise stringcase.snakecase will prepend multiple underscores
    cleaned = label.lower().strip()
    snake_cased = stringcase.snakecase(cleaned)
    return stringcase.capitalcase(snake_cased)

In [7]:
def get_labels(filename: str) -> list[str]:
    """Parse the filename to get a list of normalised labels.

    Titles look like ``<categories> - <specific label>.wav`` or just
    ``<specific label>.wav``, optionally prefixed with ``"99S LT "``.
    Everything before the last ``-`` is split into one label per word; the
    part after it becomes a single label. Trailing single-letter variant
    markers (``A``, ``B``, ...) on the specific label are dropped.

    For example ``"99S LT Impact - Crash Distant A.wav"`` gives
    ``['Impact', 'Crash_distant']``.

    Empty labels (e.g. when nothing is left after removing the variant
    marker) are omitted from the result.
    """
    # Remove the prefix and extension
    title = filename.removeprefix("99S LT ").rsplit(".", maxsplit=1)[0]
    *category_parts, specific_label = title.split("-")
    # Categories are separated by whitespace and/or "-": one label per word
    categories = [word for part in category_parts for word in part.split()]
    # Remove the variant markers at the end. Only standalone single capital
    # letters count as variants: str.rstrip() with a character set would also
    # eat the capitals ending a real word ("Radio FM" -> "Radio", "SOS" -> "").
    words = specific_label.split()
    while words and len(words[-1]) == 1 and words[-1] in string.ascii_uppercase:
        words.pop()
    specific_label = " ".join(words)
    return [normalise_label(label)
            for label in [*categories, specific_label] if label]

In [8]:
# One list of labels per file, in the same order as `filenames`
labels = list(map(get_labels, filenames))
labels[:5]

[['Ambience', 'Underwater', 'Coral_sea'],
 ['Manipulate', 'Distant'],
 ['Impact', 'Crash_distant'],
 ['Underwater', 'Move'],
 ['Vehicle', 'Resonation_slow']]

## Copy the files to the intermediate folder

In [9]:
import xxhash
import shutil

In [10]:
def get_hash(filename: str) -> str:
    """Get the xxHash64 hex digest of the file's contents.

    This is a fast non-cryptographic hash (not SHA-2); it is used to derive
    a content-based name for the file. The path is normalised with
    ``os.path.normpath`` before opening.
    """
    hasher = xxhash.xxh64()
    with open(os.path.normpath(filename), "rb") as f:
        # Feed the file in 1 MiB chunks so large audio files are never held
        # in memory all at once; the digest equals hashing the whole content.
        while chunk := f.read(1 << 20):
            hasher.update(chunk)
    return hasher.hexdigest()

In [11]:
# Name each file after the hash of its contents, keeping the original extension
hashed_filename = [
    f"{get_hash(SAMPLES_PATH + name)}.{ext}"
    for name, ext in zip(filenames, extensions)
]
hashed_filename[:5]

['df329b2f45088143.wav',
 'beda557bed5629a2.wav',
 '56b3dae6f6efd75f.wav',
 'baaddb69127f8753.wav',
 '6c110836db6654e5.wav']

In [12]:
# Create the target directory if it is missing; exist_ok avoids the
# check-then-create race of testing os.path.exists() first
os.makedirs(TARGET_PATH, exist_ok=True)

# Copy the new files to the target directory.
# The loop variable is deliberately not called `hash`: at module level that
# would shadow the builtin for every later cell.
for filename, hashed_name in zip(filenames, hashed_filename):
    target_file_path = os.path.join(TARGET_PATH, hashed_name)
    # Targets are named after the content hash, so an existing file is
    # treated as already copied and is left untouched
    if not os.path.exists(target_file_path):
        shutil.copy2(SAMPLES_PATH + filename, target_file_path)

## Export the CSV rows

In [13]:
import csv

In [14]:
# The "version" file in the working directory holds a single integer
with open("version", "r") as version_file:
    version = int(version_file.read())
print("Version:", version)

Version: 5


In [19]:
# Create a CSV file with the labels: one row per copied file
category_rows = [normalise_label("sci-fi")] * len(labels)
# All labels of a file share one CSV field, joined with commas
label_rows = [",".join(file_labels) for file_labels in labels]
# This source has no extra metadata; the empty column keeps every row aligned
# with the six-column header (without it "source" landed under "extra" and
# "version" under "source")
extra_rows = [""] * len(labels)
source_rows = [SOURCE_NAME] * len(labels)
version_rows = [version] * len(labels)

# newline="" lets the csv module control line endings (otherwise blank lines
# appear between rows on Windows)
with open(TARGET_PATH + "metadata.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["filename", "category", "label", "extra", "source", "version"])
    writer.writerows(zip(hashed_filename, category_rows, label_rows,
                         extra_rows, source_rows, version_rows))