In [None]:
#default_exp sources.custom_fsd

# Source / Custom FSD Downloads

> We have downloaded the data using a custom FSD scraper, available at https://github.com/DiogoNeves/freesound-scraper/tree/custom  

This scraper works by providing a list of queries and then downloading the metadata and audio it can find for those.  
The output includes a `metadata.csv` with the output audio details and a `samples` folder with all the audio.  

As you'll see below, we tweaked this logic a little bit.

In [None]:
#|export

from __future__ import annotations

from DataImporters.sources.core import *

import os
import pandas as pd

## Data Import Instructions

1. Clone https://github.com/DiogoNeves/freesound-scraper/tree/custom  
1. Follow installation instructions  
1. Run command

```bash
python freesounds_scraper.py --query <this_project>/data/original/custom_fsd/query.csv \
	--target <this_project>/data/original/custom_fsd/samples/ \
	--data-file-name <this_project>/data/original/custom_fsd/metadata \
	--download true
```

### credentials.json

```json
{
	"client_id": "<your_freesound_client_id>",
	"client_secret": "<your_freesound_client_secret>"
}
```

### Fetching the Data

> This is expected to be run manually from the notebook.

Assuming the scraper is set up as a sibling to this project, we can use it directly from here.  
First we break the `query.csv` into one CSV per category, which makes it easier to extend the categories later and to work around the API rate limiting.  

In [None]:
# Locations of the scraper inputs and outputs, resolved from the notebook's working directory.
ORIGINAL_DIR = os.path.realpath(
    os.path.join(os.getcwd(), "../", "data/original/custom_fsd/")
)
# query.csv holds one query per row; categories/ receives one CSV per query;
# samples/ is the download target passed to the scraper.
QUERY_PATH, CATEGORY_DIR, SAMPLES_DIR = (
    os.path.join(ORIGINAL_DIR, leaf)
    for leaf in ("query.csv", "categories/", "samples/")
)

ORIGINAL_DIR

'/home/diogoneves/Projects/metaphora/DataImporters/data/original/custom_fsd'

In [None]:
#exporti

def _clean_category_name(query):
    return query.replace("+", " ").split("&")[0]

In [None]:
#hide

# Each pair is (raw query, expected cleaned category name).
for raw_query, expected_name in [
    ("Test+Category", "Test Category"),
    ("test", "test"),
    ("test&category", "test"),
]:
    assert _clean_category_name(raw_query) == expected_name

In [None]:
import stringcase

def filename_from_query(query):
    "Builds the CSV file name for `query`: its cleaned category name in snake case plus `.csv`."
    category_name = _clean_category_name(query)
    return f"{stringcase.snakecase(category_name)}.csv"

Split the queries into separate files, one per category.

In [None]:
# Read each line of the query.csv file.
with open(QUERY_PATH, "r") as query_file:
    queries = query_file.readlines()

# Make sure the output folder exists; otherwise every write below fails
# (e.g. on a fresh checkout where the data folders have not been created yet).
os.makedirs(CATEGORY_DIR, exist_ok=True)

# (path, filename) of every per-category query file written successfully.
query_paths = []
for row in queries:
    query = row.strip()
    if not query:
        # Skip blank lines so no query file is written for an empty query.
        continue
    # Write a csv file for each query.
    filename = filename_from_query(query)
    path = os.path.join(CATEGORY_DIR, filename)
    try:
        with open(path, "w") as category_file:
            category_file.write(query)
        query_paths.append((path, filename))
    except OSError as e:
        # Best effort: report the failure and carry on with the remaining queries.
        print(e)
        print(f"Could not write {path}")

Get the commands we can use.

In [None]:
#hide_output

SCRAPER_DIR = os.path.realpath(os.path.join(os.getcwd(), "../../", "freesound-scraper/"))
print("Scraper at: ", SCRAPER_DIR)

# `os.path.relpath` without a start argument is relative to the current working
# directory, so we temporarily move into the scraper folder to get paths that
# are relative to it.
current_dir = os.getcwd()
os.chdir(SCRAPER_DIR)
try:
    for path, filename in query_paths:
        # The category is the query file name without its extension.
        category = filename.split(".")[0]
        samples_dir = os.path.relpath(os.path.join(SAMPLES_DIR, category))
        print(f"python freesounds_scraper.py --query {os.path.relpath(path)} --target {samples_dir} --data-file-name {samples_dir}/metadata --download true")
finally:
    # Always restore the working directory, even if building a command fails;
    # the rest of the notebook relies on relative paths.
    os.chdir(current_dir)
os.getcwd()

Scraper at:  /home/diogoneves/Projects/metaphora/freesound-scraper
python freesounds_scraper.py --query ../DataImporters/data/original/custom_fsd/categories/footsteps.csv --target ../DataImporters/data/original/custom_fsd/samples/footsteps --data-file-name ../DataImporters/data/original/custom_fsd/samples/footsteps/metadata --download true
python freesounds_scraper.py --query ../DataImporters/data/original/custom_fsd/categories/water.csv --target ../DataImporters/data/original/custom_fsd/samples/water --data-file-name ../DataImporters/data/original/custom_fsd/samples/water/metadata --download true
python freesounds_scraper.py --query ../DataImporters/data/original/custom_fsd/categories/birds.csv --target ../DataImporters/data/original/custom_fsd/samples/birds --data-file-name ../DataImporters/data/original/custom_fsd/samples/birds/metadata --download true
python freesounds_scraper.py --query ../DataImporters/data/original/custom_fsd/categories/drone.csv --target ../DataImporters/data/o

'/home/diogoneves/Projects/metaphora/DataImporters/nbs'

## Format conversion

The `metadata.csv` for each category includes the tags and no further title parsing is necessary.  

We also assume the category is the query and simply copy it over, with a small amount of processing to unescape the strings (e.g. `"test+query&s=Rating+(highest+first)" -> "test query"`).

In [None]:
#exporti

def _category_renames(category: str) -> str:
    "Applies renaming rules to category names."
    rules = [
        ("ship_horn", "horn"),
        ("robot_movement", "robot"),
        ("zombie_noises", "zombie"),
        ("sword_hit", "sword")
    ]
    for old, new in rules:
        if category.lower() == old:
            return new
    return category

In [None]:
#exports

class CustomFsd(Source):
    "Source for the audio downloaded with the custom Freesound scraper."

    def preload(self, root_dir: str):
        """Loads every `.csv` found under `<root_dir>/samples` into `self.metadata`.

        The files are read without a header row and given the columns
        `id`, `filename`, `category` and `tags`. If no file is found,
        `self.metadata` is an empty dataframe with those columns.
        """
        # Load all metadata.csv files into a single dataframe.
        columns = ["id", "filename", "category", "tags"]
        samples_dir = os.path.join(root_dir, "samples")
        frames = [pd.read_csv(os.path.join(path, f), header=None, names=columns)
                  for path, f in get_filenames(samples_dir, ".csv")]
        # `pd.concat` raises ValueError on an empty list, so fall back to an
        # empty frame when nothing has been downloaded yet.
        self.metadata = pd.concat(frames) if frames else pd.DataFrame(columns=columns)

    @property
    def name(self) -> str:
        "Identifier of this source."
        return "custom_fsd"

    def get_files(self, root_dir: str) -> list[tuple[str, str]]:
        "Returns the audio files found by `get_audio_filenames` under `root_dir`."
        return get_audio_filenames(root_dir)

    def get_category(self, path: str, filename: str) -> str:
        "Returns the name of the folder `path` points at, after applying the rename rules."
        return _category_renames(os.path.basename(path))

    def get_labels(self, path: str, filename: str) -> list[str]:
        """Returns the tags recorded for `filename` in the metadata, split on commas.

        `preload` must have been called first, as it is what sets `self.metadata`.
        Returns an empty list when the file has no metadata row or its tags
        field is empty. If several rows match, the first one is used.
        """
        matches = self.metadata.loc[self.metadata["filename"] == filename, "tags"]
        if matches.empty:
            return []
        tags = matches.iloc[0]
        # An empty tags field is read by pandas as NaN, which has no `split`.
        if pd.isna(tags):
            return []
        return str(tags).split(",")

### Sample Usage

In [None]:
# Folder holding the scraper output (expects a `samples` sub-folder).
root_dir = "../data/original/custom_fsd/"

source = CustomFsd()
# The metadata must be loaded before labels can be looked up.
source.preload(root_dir)

# Show the labels of the first audio file found.
audio_files = source.get_files(root_dir)
path, filename = audio_files[0]
source.get_labels(path, filename)

['horror']