In [1]:
import IPython.display as ipd
from tqdm.auto import tqdm
from pathlib import Path
from youtube_dl import YoutubeDL
from slugify import slugify
import pandas as pd
import requests
import treelib
import json

# Local directory that receives the downloaded metadata and audio files.
AUDIOSET_PATH = Path("../datasets/audioset")
AUDIOSET_PATH.mkdir(exist_ok=True, parents=True)

# Class ontology (JSON) and where it is stored locally.
ONTOLOGY_URL = "https://raw.githubusercontent.com/audioset/ontology/master/ontology.json"
ONTOLOGY_PATH = AUDIOSET_PATH / "ontology.json"

# Balanced training segment list (CSV). Fetched over HTTPS so the file cannot be
# tampered with in transit.
SEGMENTS_URL = "https://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/balanced_train_segments.csv"
SEGMENTS_PATH = AUDIOSET_PATH / "segments.csv"

def download_file(url: str, path: Path, timeout: float = 60.0):
    """Download ``url`` and write the response body to ``path``.

    Args:
        url: Address of the resource to fetch; redirects are followed.
        path: Destination file. It is overwritten if it already exists.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    print(f"Downloading {path.name}...", end=" ", flush=True)
    res = requests.get(url, allow_redirects=True, timeout=timeout)
    # Fail loudly rather than saving an error page in place of the real file.
    res.raise_for_status()
    with open(path, "wb") as file:
        file.write(res.content)
    print("OK")
    
# Fetch the ontology first, then the segment list.
for source_url, target_path in (
    (ONTOLOGY_URL, ONTOLOGY_PATH),
    (SEGMENTS_URL, SEGMENTS_PATH),
):
    download_file(source_url, target_path)

Downloading ontology.json... OK
Downloading segments.csv... OK


In [2]:
ontology_df = pd.read_json(str(ONTOLOGY_PATH))
good_columns = ["id", "name", "description", "child_ids"]
# Keep only the columns used below. The keyword form is required because
# pandas 2.0 no longer accepts `axis` as a positional argument to drop().
ontology_df = ontology_df.drop(columns=ontology_df.columns.difference(good_columns))
# Show every row and the full text of each cell for this one display ...
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
ipd.display(ontology_df)
# ... then limit the row count again; the column width stays unlimited.
pd.set_option("display.max_rows", 50)

Unnamed: 0,id,name,description,child_ids
0,/m/0dgw9r,Human sounds,Sounds produced by the human body through the actions of the individual.,"[/m/09l8g, /m/01w250, /m/09hlz4, /m/0bpl036, /m/0160x5, /m/0k65p, /m/01jg02, /m/04xp5v, /t/dd00012]"
1,/m/09l8g,Human voice,"The human voice consists of sound made by a human being using the vocal folds for talking, singing, laughing, crying, screaming, etc. The human voice is specifically a part of human sound production in which the vocal folds are the primary sound source.","[/m/09x0r, /m/07p6fty, /m/03qc9zr, /m/02rtxlg, /m/01j3sz, /m/0463cq4, /m/07qw_06, /m/07plz5l, /m/015lz1, /m/02fxyj, /m/07s2xch, /m/07r4k75, /m/01j423]"
2,/m/09x0r,Speech,"Speech is the vocalized form of human communication, created out of the phonetic combination of a limited set of vowel and consonant speech sound units.","[/m/05zppz, /m/02zsn, /m/0ytgt, /m/01h8n0, /m/02qldy, /m/0261r1, /m/0brhx]"
3,/m/05zppz,"Male speech, man speaking",Speech uttered by an adult male human.,[]
4,/m/02zsn,"Female speech, woman speaking",Speech uttered by an adult female human.,[]
5,/m/0ytgt,"Child speech, kid speaking","Speech uttered by a human child, i.e. a human whose voice does not yet resemble that of an adult.",[]
6,/m/01h8n0,Conversation,"Interactive, spontaneous spoken communication between two or more people.",[]
7,/m/02qldy,"Narration, monologue","Speech by a single human aimed at an audience (either present or assumed, as in a video blog).",[]
8,/m/0261r1,Babbling,Speech-like sounds uttered by a human that lack the deeper structure and meaning of conventional speech. Babbling is a stage in a child's development of language.,[]
9,/m/0brhx,Speech synthesizer,Artificially-produced human speech.,[]


In [3]:
# Build two lookups from the ontology table:
#   id_to_slug:    ontology id -> slug of the class name
#   clean_classes: slug -> {"id", "description", "children"[, "parent"]}
id_to_slug = {}
clean_classes = {}
for _, x in ontology_df.iterrows():
    slug = slugify(x["name"])
    # Two class names collapsing to the same slug would silently overwrite data.
    assert slug not in clean_classes, f"duplicate class slug: {slug!r}"
    id_to_slug[x["id"]] = slug
    clean_classes[slug] = {
        "id": x["id"],
        "description": x["description"],
        "children": x["child_ids"],
    }

# Translate child ids to slugs and store a back-reference on every child.
# A class listed under several parents keeps the last parent visited.
for k, x in clean_classes.items():
    x["children"] = [id_to_slug[c] for c in x["children"]]
    for child in x["children"]:
        clean_classes[child]["parent"] = k

# Insert classes into the tree in repeated passes: a class can only be attached
# once its parent node exists, so classes seen before their parent are retried.
tree = treelib.Tree()
tree_nodes = {"root": tree.create_node("*")}
failing = True
while failing:
    failing = False
    placed = 0
    for k, v in clean_classes.items():
        if k in tree_nodes:
            continue
        parent = v.get("parent", "root")
        if parent not in tree_nodes:
            failing = True
            continue
        tree_nodes[k] = tree.create_node(k, parent=tree_nodes[parent])
        placed += 1
    if failing and placed == 0:
        # No node could be attached in a full pass: the remaining classes
        # reference each other (or a missing parent) and would loop forever.
        unresolved = sorted(set(clean_classes) - set(tree_nodes))
        raise ValueError(f"Cannot place classes in the hierarchy: {unresolved}")

print("AUDIOSET CLASS HIERARCHY:")
tree.show()

AUDIOSET CLASS HIERARCHY:
*
├── animal
│   ├── domestic-animals-pets
│   │   ├── cat
│   │   │   ├── cat-communication
│   │   │   ├── caterwaul
│   │   │   ├── meow
│   │   │   └── purr
│   │   └── dog
│   │       ├── bark
│   │       ├── bay
│   │       ├── bow-wow
│   │       ├── whimper-dog
│   │       └── yip
│   ├── livestock-farm-animals-working-animals
│   │   ├── cattle-bovinae
│   │   │   ├── moo
│   │   │   └── yak
│   │   ├── donkey-ass
│   │   ├── fowl
│   │   │   ├── chicken-rooster
│   │   │   │   ├── cluck
│   │   │   │   └── crowing-cock-a-doodle-doo
│   │   │   ├── duck
│   │   │   │   └── quack
│   │   │   ├── goose
│   │   │   │   └── honk
│   │   │   └── turkey
│   │   │       └── gobble
│   │   ├── goat
│   │   ├── horse
│   │   │   ├── neigh-whinny
│   │   │   ├── nicker
│   │   │   └── snort-horse
│   │   ├── pig
│   │   │   └── oink
│   │   └── sheep
│   │       └── bleat
│   └── wild-animals
│       ├── bird
│       │   ├── bird-flight-flapping-wings
│       │

In [4]:
# Parse the segment list; lines starting with "#" are header comments.
segments_df = pd.read_csv(
    SEGMENTS_PATH,
    names=["id", "start", "end", "classes"],
    sep=", ",
    comment="#",
    engine="python",
)

# For every segment, turn the quoted id list into a set of slugs and close the
# set under the "parent" relation so each label also carries all its ancestors.
slug_classes = []
for raw_labels in tqdm(segments_df["classes"], total=len(segments_df)):
    labels = {id_to_slug[class_id] for class_id in raw_labels.strip('"').split(",")}
    frontier = list(labels)
    while frontier:
        ancestor = clean_classes[frontier.pop()].get("parent")
        if ancestor and ancestor not in labels:
            labels.add(ancestor)
            frontier.append(ancestor)
    slug_classes.append(labels)

segments_df["classes"] = pd.Series(slug_classes, dtype="O")
segments_df

  0%|          | 0/22160 [00:00<?, ?it/s]

Unnamed: 0,id,start,end,classes
0,--PJHxphWEs,30.0,40.0,"{human-sounds, sounds-of-things, human-voice, speech, liquid, pour, gush}"
1,--ZhevVpy1s,50.0,60.0,"{toothbrush, sounds-of-things, domestic-sounds-home-sounds}"
2,--aE2O5G5WE,0.0,10.0,"{human-sounds, animal, goat, speech, music, livestock-farm-animals-working-animals, human-voice}"
3,--aO5cdqSAg,30.0,40.0,"{human-sounds, child-singing, male-singing, singing, human-voice}"
4,--aaILOrkII,200.0,210.0,"{sounds-of-things, cap-gun, explosion, gunshot-gunfire}"
...,...,...,...,...
22155,zyqg4pYEioQ,20.0,30.0,"{human-sounds, sewing-machine, sounds-of-things, mechanisms, speech, human-voice}"
22156,zz0ddNfz0h0,30.0,40.0,"{sounds-of-things, motor-vehicle-road, ice-cream-truck-ice-cream-van, truck, vehicle, car}"
22157,zz8TGV83nkE,80.0,90.0,"{sounds-of-things, motor-vehicle-road, engine, motorcycle, vehicle}"
22158,zzlK8KDqlr0,370.0,380.0,"{acoustic-environment, domestic-sounds-home-sounds, human-sounds, inside-small-room, computer-keyboard, channel-environment-and-background, sounds-of-things, clicking, source-ambiguous-sounds, speech, typing, onomatopoeia, human-voice}"


In [5]:
# Maximum number of samples to download per group (per top-level key of GROUPS)
NUM = 2000

# Target audio sample rate, substituted into the ffmpeg "-ar" argument
SAMPLE_RATE = 16000

# AUDIO CLASSES FILTER
# Keys are group names; each group is downloaded into its own sub-directory.
# include = download if ANY tag matches
# exclude = DO NOT download if ANY tag matches
GROUPS = {
    "noise": {
        "include": [
            "channel-environment-and-background",
            "natural-sounds",
            "sounds-of-things",
        ],
        "exclude": [
            "music",
            "hubbub-speech-noise-speech-babble",
            "radio",
            "television",
            "human-voice",
        ],
    },
    "music": {
        "include": ["music"],
        "exclude": [
            "hubbub-speech-noise-speech-babble",
            "radio",
            "television",
            "human-voice",
        ],
    },
}

In [6]:
# Base options handed to YoutubeDL. download_segment() works on a copy of this
# dict, adding "outtmpl" and replacing "postprocessor_args" for each segment.
YDL_OPTIONS = {
    "format": "bestaudio/best",
    "postprocessors": [{"key": "FFmpegExtractAudio", "preferredcodec": "wav"}],
    # Template string: {start}, {end} and {sample_rate} are filled in per
    # segment, then the result is split on whitespace into an argument list.
    "postprocessor_args": "-ss {start} -to {end} -ar {sample_rate} -ac 1",
    "prefer_ffmpeg": True,
    "keepvideo": False,
    "quiet": True,
    "no_warnings": True,
}


def filter_segments(include, exclude):
    """Return the rows of ``segments_df`` that match a group filter.

    A row is kept when its ``classes`` set shares at least one tag with
    ``include`` and shares no tag with ``exclude``.
    """
    wanted = set(include)
    banned = set(exclude)

    def keep(row):
        tags = row["classes"]
        return not tags.isdisjoint(wanted) and tags.isdisjoint(banned)

    return segments_df[segments_df.apply(keep, axis=1)]


def seconds_to_ffmpeg_format(sec):
    """Format a non-negative offset in seconds as ``HH:MM:SS.mmm``.

    The value is rounded to whole milliseconds first and then split into
    fields, so a rounding carry propagates correctly (59.9996 s becomes
    ``00:01:00.000``, never a ``60`` in the seconds field) and sub-second
    offsets are kept instead of being rounded to a full second.

    Args:
        sec: Offset in seconds; anything ``float()`` accepts.

    Returns:
        The offset as a zero-padded ``HH:MM:SS.mmm`` string.
    """
    total_ms = round(float(sec) * 1000)
    total_seconds, millis = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{millis:03d}"


def download_segment(video_id, start, end, path):
    """Download one video's audio and cut it to the ``start``-``end`` window.

    The output file name is derived from the video id inside ``path.parent``
    (via the ``outtmpl`` template), not from ``path.name``; ``path`` itself is
    only used for its directory and for cleanup after a failure.

    Args:
        video_id: Video identifier passed to ``YoutubeDL.download``.
        start: Segment start in seconds.
        end: Segment end in seconds.
        path: Expected location of the resulting file.

    Returns:
        True if the download finished without raising, False otherwise.
    """
    options = YDL_OPTIONS.copy()
    options["outtmpl"] = str(path.parent / "%(id)s.%(ext)s")
    args = options["postprocessor_args"].format(
        start=seconds_to_ffmpeg_format(start),
        end=seconds_to_ffmpeg_format(end),
        sample_rate=SAMPLE_RATE,
    )
    options["postprocessor_args"] = args.split()
    try:
        with YoutubeDL(options) as ydl:
            ydl.download([video_id])
    except Exception:
        # Only ordinary errors are swallowed: KeyboardInterrupt and SystemExit
        # still propagate, so a long download run can be interrupted.
        path.unlink(missing_ok=True)
        return False
    return True

In [None]:
# Ask for confirmation first: the loop below can download thousands of files.
print("Please check following parameters are correct:")
print("Group names to download:", list(GROUPS))
print("Number of files to download (per group):", NUM)
print("Output audio sample rate:", SAMPLE_RATE)
if input("Continue? (y/n): ").strip().lower() != "y":
    print("Aborting...")
    # Explicit exception instead of `assert False`, which is removed when
    # Python runs with -O and would let the download start anyway.
    raise SystemExit("Download aborted by user")


for name, filters in GROUPS.items():
    path = AUDIOSET_PATH / name
    path.mkdir(parents=True, exist_ok=True)
    print(f"Downloading group {name!r} -> {path}")

    segments = filter_segments(filters["include"], filters["exclude"])
    # The context manager closes the bar even if a download raises.
    with tqdm(total=NUM) as progress:
        for _, row in segments.iterrows():
            if progress.n >= NUM:
                print("Target number of files reached")
                break
            file_path = path / f"{row['id']}.wav"
            if file_path.exists():
                # Already downloaded on an earlier run; count it and move on.
                progress.update()
                continue
            progress.set_description(f"{file_path.name}")
            ok = download_segment(row["id"], row["start"], row["end"], file_path)
            # Advance the bar only for successful downloads.
            progress.update(int(ok))

        print(f"Total samples in group {name!r}: {progress.n}")

Please check following parameters are correct:
Group names to download: ['noise', 'music']
Number of files to download (per group): 2000
Output audio sample rate: 16000
Continue? (y/n): y
Downloading group 'noise' -> ..\datasets\audioset\noise


  0%|          | 0/2000 [00:00<?, ?it/s]

ERROR: Video unavailable
This video is no longer available because the YouTube account associated with this video has been terminated.
ERROR: Private video
Sign in if you've been granted access to this video
ERROR: Video unavailable
ERROR: Video unavailable
ERROR: Private video
Sign in if you've been granted access to this video
ERROR: Video unavailable
This video contains content from Lasso Group, who has blocked it on copyright grounds.
ERROR: Video unavailable
ERROR: Private video
Sign in if you've been granted access to this video
ERROR: Video unavailable
ERROR: Video unavailable
ERROR: Video unavailable
ERROR: Video unavailable
ERROR: Private video
Sign in if you've been granted access to this video
ERROR: Private video
Sign in if you've been granted access to this video
ERROR: Video unavailable
ERROR: Private video
Sign in if you've been granted access to this video
ERROR: Video unavailable
ERROR: Video unavailable
This video is no longer available because the YouTube account ass