# Loader for Footsteps PPSFX sound lib

In [17]:
# Identity of this sound library.
CATEGORY = "footsteps"
SOURCE_NAME = "footsteps_two_ppsfx_008"

# Input locations inside the original library folder.
ORIGINAL_PATH = f"./data/original/{SOURCE_NAME}/"
SAMPLES_PATH = f"{ORIGINAL_PATH}Audio/"
METADATA_PATH = f"{ORIGINAL_PATH}Documents/PPSFX008 - Footsteps Two Metadata.xls"

# Output folder for the intermediate (processed) data.
TARGET_PATH = f"./data/intermediate/{SOURCE_NAME}/"

## Get the filenames to parse

In [2]:
import os

In [3]:
# Extensions are listed in lowercase; filenames are lowercased before matching.
audio_extensions = (".wav", ".mp3", ".flac", ".aac", ".m4a", ".ogg", ".opus")

def load_filenames(path: str) -> list[str]:
    """Load a list of audio filenames in the directory.

    A name counts as audio when it ends with one of `audio_extensions`.
    The comparison is case-insensitive, so e.g. `Take.WAV` is not skipped.
    Names are returned exactly as `os.listdir` reports them (original case,
    no directory prefix, no particular order).
    """
    return [filename for filename in os.listdir(path)
            if filename.lower().endswith(audio_extensions)]

In [6]:
# A sorted list makes it easier to line the files up with the extra metadata later.
filenames = load_filenames(SAMPLES_PATH)
filenames.sort()
filenames

['Footsteps Dirt Scuffs.wav',
 'Footsteps Dirt Stomps.wav',
 'Footsteps Dirt Walk.wav',
 'Footsteps Dry Sand Scuffs.wav',
 'Footsteps Dry Sand Stomps.wav',
 'Footsteps Dry Sand Walk.wav',
 'Footsteps Dry Swamp Scuffs.wav',
 'Footsteps Dry Swamp Stomps.wav',
 'Footsteps Dry Swamp Walk.wav',
 'Footsteps Forest Plain II Scuffs.wav',
 'Footsteps Forest Plain II Stomps.wav',
 'Footsteps Forest Plain II Walk.wav',
 'Footsteps Grass Long Scuffs.wav',
 'Footsteps Grass Long Stomps.wav',
 'Footsteps Grass Long Walk.wav',
 'Footsteps Grass Short Scuffs.wav',
 'Footsteps Grass Short Stomps.wav',
 'Footsteps Grass Short Walk.wav',
 'Footsteps Water Deep Scuffs.wav',
 'Footsteps Water Deep Wade.wav',
 'Footsteps Water Shallow Scuffs.wav',
 'Footsteps Water Shallow Stomps.wav',
 'Footsteps Water Shallow Walk.wav',
 'Footsteps Wet Swamp Scuffs.wav',
 'Footsteps Wet Swamp Squishes.wav',
 'Footsteps Wet Swamp Stomps.wav',
 'Footsteps Wet Swamp Walk.wav',
 'Footsteps Wooden Stairs Scuffs.wav',
 'Footsteps Wooden Stairs Stomps.wav',
 'Footsteps Wooden Stairs Walk.wav']

In [22]:
# The extension is whatever follows the last dot of each filename.
extensions = [name.rsplit(".", 1)[-1] for name in filenames]
assert len(extensions) == len(filenames)
extensions[:5]

['wav', 'wav', 'wav', 'wav', 'wav']

## Parse the files into labels

* All file names start with `Footsteps`, which we use as the category.  
* Next comes the type of material/ground the steps were recorded on, which we extract as a label.  
* Some names then include the step type (e.g. `Walk` / `Stomps`), which we extract as a separate label.  
* One material includes the variation `Plain II`, which we replace with `Plain`.    

In [9]:
import string
import stringcase

In [10]:
def normalise_label(label: str) -> str:
    "Lower-case and trim the label, then snake-case it and capitalise its first letter."
    # Lower-casing has to happen before snakecase(); otherwise
    # stringcase.snakecase will prepend multiple underscores.
    cleaned = label.lower().strip()
    snake = stringcase.snakecase(cleaned)
    return stringcase.capitalcase(snake)

In [11]:
def get_labels(filename: str) -> list[str]:
    """Parse the filename to get a list of labels.

    The `Footsteps` prefix and the file extension are dropped, and the
    `Plain II` variation is folded into `Plain`. If one of the known step
    types appears in the name it becomes the first label; the remaining
    words (the material/ground) become the last label.

    Step types are matched against whole words rather than substrings, so a
    material name that merely contains a step type (e.g. `Walkway`) is not
    mistaken for one and is not cut apart.
    """
    # Remove the prefix and extension
    stem = filename.removeprefix("Footsteps").rsplit(".", maxsplit=1)[0]

    # Replace Plain II with Plain
    stem = stem.replace("Plain II", "Plain")

    words = stem.split()
    labels = []

    # If several step types are present, the one listed last here wins.
    step_types = ("Walk", "Scuffs", "Stomps", "Squishes", "Wade")
    step_label = next(
        (step for step in reversed(step_types) if step in words), None
    )

    if step_label is not None:
        # Take the step type out of the name and record it as its own label
        words = [word for word in words if word != step_label]
        labels.append(normalise_label(step_label))

    labels.append(normalise_label(" ".join(words)))

    return labels

In [12]:
# One label list per audio file, in the same order as `filenames`.
labels = list(map(get_labels, filenames))
assert len(labels) == len(filenames)
labels

[['Scuffs', 'Dirt'],
 ['Stomps', 'Dirt'],
 ['Walk', 'Dirt'],
 ['Scuffs', 'Dry_sand'],
 ['Stomps', 'Dry_sand'],
 ['Walk', 'Dry_sand'],
 ['Scuffs', 'Dry_swamp'],
 ['Stomps', 'Dry_swamp'],
 ['Walk', 'Dry_swamp'],
 ['Scuffs', 'Forest_plain'],
 ['Stomps', 'Forest_plain'],
 ['Walk', 'Forest_plain'],
 ['Scuffs', 'Grass_long'],
 ['Stomps', 'Grass_long'],
 ['Walk', 'Grass_long'],
 ['Scuffs', 'Grass_short'],
 ['Stomps', 'Grass_short'],
 ['Walk', 'Grass_short'],
 ['Scuffs', 'Water_deep'],
 ['Wade', 'Water_deep'],
 ['Scuffs', 'Water_shallow'],
 ['Stomps', 'Water_shallow'],
 ['Walk', 'Water_shallow'],
 ['Scuffs', 'Wet_swamp'],
 ['Squishes', 'Wet_swamp'],
 ['Stomps', 'Wet_swamp'],
 ['Walk', 'Wet_swamp'],
 ['Scuffs', 'Wooden_stairs'],
 ['Stomps', 'Wooden_stairs'],
 ['Walk', 'Wooden_stairs']]

## Get the Extra metadata

This sound bank includes an xls with extra metadata.

In [13]:
import pandas as pd

In [18]:
# Read the spreadsheet and order it by filename so it matches the sorted `filenames`.
# NOTE(review): only the row count is checked below; the pairing itself relies on
# both sides sorting into the same order -- confirm against the spreadsheet.
metadata = pd.read_excel(METADATA_PATH).sort_values(by="Filename")
descriptions = metadata["BWDescription"].values

assert len(descriptions) == len(filenames)
list(zip(filenames, descriptions))[:5]

[('Footsteps Dirt Scuffs.wav',
  'Footsteps Dirt Scuffs Sneakers Shuffle Slide Small Rocks Sony D100'),
 ('Footsteps Dirt Stomps.wav',
  'Footsteps Dirt Stomps Sneakers Small Rocks Impact Sony D100'),
 ('Footsteps Dirt Walk.wav',
  'Footsteps Dirt Walk Sneakers Rattle Small Rocks Sony D100'),
 ('Footsteps Dry Sand Scuffs.wav',
  'Footsteps Dry Sand Scuffs Sneakers Shuffle Slide Sony D100'),
 ('Footsteps Dry Sand Stomps.wav',
  'Footsteps Dry Sand Stomps Sneakers Smooth Impact Sony D100')]

## Copy the files to the intermediate folder

In [19]:
import xxhash
import shutil

In [20]:
def get_hash(filename: str) -> str:
    """Get the XXH64 (xxHash) hex digest of the file's contents.

    Note that this is xxHash, not SHA-2. The file is read in fixed-size
    chunks so that large audio files are never held in memory in full.
    """
    hasher = xxhash.xxh64()
    with open(os.path.normpath(filename), "rb") as f:
        # Feeding the hasher chunk by chunk yields the same digest as
        # hashing the whole file content in one call.
        while chunk := f.read(1 << 20):
            hasher.update(chunk)
    return hasher.hexdigest()

In [23]:
# Name each file after the hash of its contents, keeping its extension.
hashed_filename = [
    f"{get_hash(SAMPLES_PATH + filename)}.{extension}"
    for filename, extension in zip(filenames, extensions)
]
assert len(hashed_filename) == len(filenames)
hashed_filename[:5]

['672c1b72b6e7e0c6.wav',
 '2eff0b9a61c7d61d.wav',
 'f88702cfabaeb850.wav',
 '8eb3ed1bd6f74884.wav',
 'ee425ebac8251f83.wav']

In [24]:
# Create the target directory (and any missing parents). exist_ok=True makes
# this a no-op when it is already there, without a separate existence check
# that could race with another process.
os.makedirs(TARGET_PATH, exist_ok=True)

In [25]:
# Copy the files that are not yet in the target directory, under their hashed names.
# The loop variable is deliberately not named `hash`: at notebook/module level
# that would overwrite the builtin `hash()` for every later cell.
for filename, hashed_name in zip(filenames, hashed_filename):
    target_file_path = os.path.join(TARGET_PATH, hashed_name)
    if not os.path.exists(target_file_path):
        # copy2 copies the file content and also attempts to preserve file metadata
        shutil.copy2(SAMPLES_PATH + filename, target_file_path)

## Export the CSV rows

In [26]:
import csv

In [27]:
# The `version` file holds a single integer, which is read and echoed here.
with open("version", "r") as version_file:
    raw_version = version_file.read()
version = int(raw_version)
print("Version:", version)

Version: 8


In [28]:
# Create a CSV file with the labels: one row per audio file.
category_rows = [normalise_label(CATEGORY)] * len(labels)
# All labels of a file are packed into a single comma-separated field.
label_rows = [",".join(file_labels) for file_labels in labels]
source_rows = [SOURCE_NAME] * len(labels)
version_rows = [version] * len(labels)

# newline="" lets the csv module control line endings itself; without it,
# platforms that translate "\n" on write end up with an extra "\r" per row.
# The encoding is pinned so the output does not depend on the locale.
with open(TARGET_PATH + "metadata.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["filename", "category", "label", "extra", "source", "version"])
    writer.writerows(zip(hashed_filename, category_rows, label_rows, descriptions, source_rows, version_rows))