# Loader for Footsteps PPSFX sound lib

In [1]:
# Identity of this sound bank.
CATEGORY = "footsteps"
SOURCE_NAME = "footsteps_one_ppsfx_004"

# Location of the original download and the parts of it that are read.
ORIGINAL_PATH = f"./data/original/{SOURCE_NAME}/"
SAMPLES_PATH = f"{ORIGINAL_PATH}Audio/"
METADATA_PATH = f"{ORIGINAL_PATH}Documents/PPSFX 004 - Footsteps One Metadata.xls"

# Location the hashed copies and the metadata CSV are written to.
TARGET_PATH = f"./data/intermediate/{SOURCE_NAME}/"

## Get the filenames to parse

In [2]:
import os

In [3]:
# Lower-case suffixes of the audio formats we accept.
audio_extensions = (".wav", ".mp3", ".flac", ".aac", ".m4a", ".ogg", ".opus")

def load_filenames(path: str) -> list[str]:
    """Load a list of audio filenames in the directory.

    The extension check is case-insensitive, so `Take.WAV` is picked up just
    like `take.wav`. Names are returned exactly as `os.listdir` reports them:
    original case, no directory prefix, and in no particular order.
    """
    return [filename for filename in os.listdir(path)
            if filename.lower().endswith(audio_extensions)]

In [4]:
# Keep the names in sorted order so they can be matched with the extra metadata later.
filenames = load_filenames(SAMPLES_PATH)
filenames.sort()
filenames[:5]

['Footsteps Forest Plain Scuffs.wav',
 'Footsteps Forest Plain Stomps.wav',
 'Footsteps Forest Plain Walk.wav',
 'Footsteps Frozen Stairs Walk.wav',
 'Footsteps Grass Scuffs.wav']

In [5]:
# The extension is whatever follows the last dot of each name.
extensions = [name.rsplit(".", 1)[-1] for name in filenames]
assert len(filenames) == len(extensions)
extensions[:5]

['wav', 'wav', 'wav', 'wav', 'wav']

## Parse the files into labels

* All file names start with `Footsteps`, which we use as the category.  
* Next comes the type of material/ground the steps were recorded on, which we extract as a label.  
* Some names then include the step type (e.g. Walk / Stomps), which we extract as a separate label.  
* Some include a variation number (e.g. 01 / 02...), which we ignore.  
* Some include the step length (e.g. Short / Medium), which we keep as part of the material label.  

In [6]:
import string
import stringcase

In [7]:
def normalise_label(label: str) -> str:
    "Turn free text into a label such as `Forest_plain`."
    # Lower-casing has to come first, otherwise stringcase.snakecase
    # will prepend multiple underscores.
    lowered = label.lower().strip()
    snake = stringcase.snakecase(lowered)
    return stringcase.capitalcase(snake)

In [8]:
def get_step_type(filename: str) -> "str | None":
    """Return the step type mentioned in the filename, or None if there is none.

    Matching is a case-sensitive substring test. If several step types occur
    in the name, the one listed last in `step_types` wins.
    """
    step_types = ("Walk", "Scuffs", "Stomps", "Squishes", "Wade", "Scrape")
    # Scan from the end so that the first hit is the last-listed match.
    for step in reversed(step_types):
        if step in filename:
            return step
    return None

In [9]:
def get_labels(filename: str) -> list[str]:
    """Parse the filename to get a list of labels.

    The result is `[step_type, material]` when a step type is found and
    `[material]` otherwise, e.g. `['Walk', 'Snow_deep']`.
    """
    import re  # local import: keeps this cell self-contained

    # Remove the prefix and extension
    stem = filename.removeprefix("Footsteps").rsplit(".", maxsplit=1)[0]

    # Remove the variation number at the end of the filename. Only trailing
    # digits are stripped: splitting on "0" would cut the name at any zero
    # (e.g. "... 10" would leave a stray "1") and miss numbers without one.
    stem = re.sub(r"\s*\d+\s*$", "", stem)

    labels = []

    # If a step type is present, pull it out of the name as its own label
    step_label = get_step_type(stem)
    if step_label is not None:
        stem = stem.replace(" " + step_label, "")
        labels.append(normalise_label(step_label))

    # Whatever is left of the name becomes the remaining label
    labels.append(normalise_label(stem))

    return labels

In [10]:
# One label list per audio file, in the same order as `filenames`.
labels = list(map(get_labels, filenames))
assert len(filenames) == len(labels)
labels

[['Scuffs', 'Forest_plain'],
 ['Stomps', 'Forest_plain'],
 ['Walk', 'Forest_plain'],
 ['Walk', 'Frozen_stairs'],
 ['Scuffs', 'Grass'],
 ['Stomps', 'Grass'],
 ['Walk', 'Grass'],
 ['Scuffs', 'Gravel'],
 ['Stomps', 'Gravel'],
 ['Walk', 'Gravel'],
 ['Scuffs', 'Pine_needle_ground'],
 ['Stomps', 'Pine_needle_ground'],
 ['Walk', 'Pine_needle_ground'],
 ['Scuffs', 'Snow_deep'],
 ['Walk', 'Snow_deep'],
 ['Walk', 'Snow_deep'],
 ['Walk', 'Snow_deep'],
 ['Scuffs', 'Snow_packed'],
 ['Stomps', 'Snow_packed'],
 ['Walk', 'Snow_packed_crunch'],
 ['Walk', 'Snow_packed_rough'],
 ['Scrape', 'Snow_long'],
 ['Scrape', 'Snow_medium'],
 ['Scrape', 'Snow_short_hard'],
 ['Scrape', 'Snow_short'],
 ['Scuffs', 'Wood_scrap'],
 ['Stomps', 'Wood_scrap'],
 ['Walk', 'Wood_scrap']]

## Get the Extra metadata

This sound bank includes an xls with extra metadata.

In [11]:
import pandas as pd

In [12]:
# Sorted by file name to match the order of `filenames`
metadata = pd.read_excel(METADATA_PATH).sort_values(by="Filename")
descriptions = metadata["BWDescription"].values

# There must be as many descriptions as audio files.
assert len(filenames) == len(descriptions)
list(zip(filenames, descriptions))[:5]

[('Footsteps Forest Plain Scuffs.wav',
  'Footsteps Forest Plain Sneakers Scuffs Shuffle Rustle Crackle Sony D50'),
 ('Footsteps Forest Plain Stomps.wav',
  'Footsteps Forest Plain Sneakers Stomps Rustle Crackle Sony D50'),
 ('Footsteps Forest Plain Walk.wav',
  'Footsteps Forest Plain Sneakers Walk Rustle Crackle Sony D50'),
 ('Footsteps Frozen Stairs Walk.wav',
  'Footsteps Frozen Stairs Wood Boots Walk Up Down Hollow Ice Crackle Snow Sony D50'),
 ('Footsteps Grass Scuffs.wav',
  'Footsteps Grass Sneakers Scuffs Shuffle Rustle Sony D50')]

## Copy the files to the intermediate folder

In [13]:
import xxhash
import shutil

In [17]:
def get_hash(filename: str) -> str:
    """Get the xxHash64 hex digest of the file's contents.

    xxHash is a fast, non-cryptographic hash (it is not SHA-2). The file is
    fed to the hasher in chunks, so large audio files are never loaded into
    memory in one piece.
    """
    hasher = xxhash.xxh64()
    with open(os.path.normpath(filename), "rb") as f:
        # 1 MiB chunks; the digest equals that of hashing the whole file at once.
        while chunk := f.read(1 << 20):
            hasher.update(chunk)
    return hasher.hexdigest()

In [18]:
# Name every file after its content hash, keeping the original extension.
hashed_filename = [f"{get_hash(SAMPLES_PATH + name)}.{ext}"
                   for name, ext in zip(filenames, extensions)]
assert len(filenames) == len(hashed_filename)
hashed_filename[:5]

['21dcc2adb5107bc2.wav',
 'ce61e38ac2467be4.wav',
 '18133e0cbcdd080c.wav',
 '2a058003156613e8.wav',
 '381210e8584b6c1f.wav']

In [52]:
# Create the target directory (and any missing parents). `exist_ok=True`
# makes this safe to re-run and avoids the check-then-create race.
os.makedirs(TARGET_PATH, exist_ok=True)

In [20]:
# Copy the new files to the target directory. Files whose hashed name is
# already present are skipped.
# The loop variable is deliberately not called `hash`: that would shadow the
# builtin for every later cell of the notebook.
for filename, hashed_name in zip(filenames, hashed_filename):
    target_file_path = os.path.join(TARGET_PATH, hashed_name)
    if not os.path.exists(target_file_path):
        # copy2 also tries to preserve file metadata such as timestamps
        shutil.copy2(SAMPLES_PATH + filename, target_file_path)

## Export the CSV rows

In [14]:
import csv

In [15]:
# The `version` file holds the version number as a single integer.
with open("version", "r") as version_file:
    version = int(version_file.read())
print("Version:", version)

Version: 11


In [19]:
# Create a CSV file with the labels: one row per audio file.
category_rows = [normalise_label(CATEGORY)] * len(labels)
# Several labels for one file are packed into a single comma-separated cell
label_rows = [",".join(file_labels) for file_labels in labels]
source_rows = [SOURCE_NAME] * len(labels)
version_rows = [version] * len(labels)

# newline="" is what the csv module requires of the files it writes to;
# without it an extra "\r" is added to each row on Windows. The encoding is
# pinned so the output does not depend on the platform default.
with open(TARGET_PATH + "metadata.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["filename", "category", "label", "extra", "source", "version"])
    writer.writerows(zip(hashed_filename, category_rows, label_rows, descriptions, source_rows, version_rows))