# notebook contains code and comment for downloading vggset dataset, re-download missing downloads, postprocess raw audio files

In [1]:
import os
from glob import glob
from tqdm.notebook import tqdm
import pandas as pd
import json

#import youtube_dl
import yt_dlp as youtube_dl
import librosa
from pydub import AudioSegment

import concurrent.futures 
import threading

In [2]:
AUDIOSET_DATASET_PATH = "./audioset/"
MP3_FILE_ROOT_PATH = "./audioset_tmp"
AUDIOSET_METADATA_PATH = "./audioset_meta_data.json"

audioset_data = pd.read_csv(
    "./eval_segments.csv",
    delimiter=', ',
    skiprows = [0, 1, 2],
    names = ["vid", "stt", "ett", "label"]
)

with open(AUDIOSET_METADATA_PATH, "r") as fp :
    audioset_meta_data = json.load(fp)    

audioset_data

  audioset_data = pd.read_csv(


Unnamed: 0,vid,stt,ett,label
0,--4gqARaEJE,0.0,10.0,"""/m/068hy,/m/07q6cd_,/m/0bt9lr,/m/0jbk"""
1,--BfvyPmVMo,20.0,30.0,"""/m/03l9g"""
2,--U7joUcTCo,0.0,10.0,"""/m/01b_21"""
3,--i-y1v8Hy8,0.0,9.0,"""/m/04rlf,/m/09x0r,/t/dd00004,/t/dd00005"""
4,-0BIyqJj9ZU,30.0,40.0,"""/m/07rgt08,/m/07sq110,/t/dd00001"""
...,...,...,...,...
20366,zyF8TGSRvns,150.0,160.0,"""/m/0dwsp,/m/0dwtp,/m/0f8s22,/m/0j45pbj"""
20367,zz35Va7tYmA,30.0,40.0,"""/m/012f08,/m/07q2z82,/m/07qmpdm,/m/0k4j"""
20368,zzD_oVgzKMc,30.0,40.0,"""/m/07pn_8q"""
20369,zzNdwF40ID8,70.0,80.0,"""/m/04rlf,/m/0790c"""


## set yt_dlp options

In [3]:
ydl_opts = {
    'format': 'bestaudio/best',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '320',
    }],
    'outtmpl': os.path.join(MP3_FILE_ROOT_PATH, "%(id)s.%(ext)s"),
    'quiet':True,
    'external_downloader_args': ['-loglevel', 'panic']
}
slink = "https://www.youtube.com/watch?v="

audioset_data["idx"] = range(len(audioset_data))
audioset_data["url"] = slink + audioset_data["vid"]

audioset_data

Unnamed: 0,vid,stt,ett,label,idx,url
0,--4gqARaEJE,0.0,10.0,"""/m/068hy,/m/07q6cd_,/m/0bt9lr,/m/0jbk""",0,https://www.youtube.com/watch?v=--4gqARaEJE
1,--BfvyPmVMo,20.0,30.0,"""/m/03l9g""",1,https://www.youtube.com/watch?v=--BfvyPmVMo
2,--U7joUcTCo,0.0,10.0,"""/m/01b_21""",2,https://www.youtube.com/watch?v=--U7joUcTCo
3,--i-y1v8Hy8,0.0,9.0,"""/m/04rlf,/m/09x0r,/t/dd00004,/t/dd00005""",3,https://www.youtube.com/watch?v=--i-y1v8Hy8
4,-0BIyqJj9ZU,30.0,40.0,"""/m/07rgt08,/m/07sq110,/t/dd00001""",4,https://www.youtube.com/watch?v=-0BIyqJj9ZU
...,...,...,...,...,...,...
20366,zyF8TGSRvns,150.0,160.0,"""/m/0dwsp,/m/0dwtp,/m/0f8s22,/m/0j45pbj""",20366,https://www.youtube.com/watch?v=zyF8TGSRvns
20367,zz35Va7tYmA,30.0,40.0,"""/m/012f08,/m/07q2z82,/m/07qmpdm,/m/0k4j""",20367,https://www.youtube.com/watch?v=zz35Va7tYmA
20368,zzD_oVgzKMc,30.0,40.0,"""/m/07pn_8q""",20368,https://www.youtube.com/watch?v=zzD_oVgzKMc
20369,zzNdwF40ID8,70.0,80.0,"""/m/04rlf,/m/0790c""",20369,https://www.youtube.com/watch?v=zzNdwF40ID8


## define function for download raw audio 

In [4]:
def download_video(url, opt) :
    with youtube_dl.YoutubeDL(opt) as ydl :
        try :
            ydl.download([url])
            return {"url": url, "status":True}
        except Exception as e :
            return {"url": url, "status":False, "error": str(e)}

def download_parallel(url_list, opt, max_workers = 10) :
    log_data = []
    
    with concurrent.futures.ThreadPoolExecutor(
        max_workers = max_workers
    ) as executor :
        future_to_url = {
            executor.submit(
                download_video, url, opt
            ) : url for url in url_list
        }
        for future in tqdm(
            concurrent.futures.as_completed(future_to_url),
            total = len(url_list)
        ) :
            url = future_to_url[future]
            result = future.result()
            log_data.append(result)

    return log_data

## download audios parellely and save result metadata

In [2]:
_ = '''
# do this work using python file. ipython notebook is not suitable for this task.
results = download_parallel(
    audioset_data["url"][:100], ydl_opts, max_workers = 10
)

with open(AUDIOSET_METADATA_PATH, "w") as fp :
    json.dump(
        results, fp
    )
'''

# re-download missing audio

In [6]:
with open(AUDIOSET_METADATA_PATH, "r") as fp :
    data=json.load(fp)

redownload_video_list = list(filter(
    lambda record : record["status"] == False and "errno 2" in record["error"],
    data
))


print("videos to re-download :")
print(redownload_video_list)

print("downloading..")
results = download_parallel(
    list(map(
        lambda record : record["url"],
        redownload_video_list
    )),
    ydl_opts,
    max_workers = 50
)


for r1 in results :
     for r2 in data :
          if r1["url"] == r2["url"] :
               data.remove(r2)
               data.append(r1)
               

videos to re-download :
[]
downloading..


0it [00:00, ?it/s]

## postprocess

In [4]:
mp3_name_list = glob(os.path.join(MP3_FILE_ROOT_PATH, "*"))

id_list = sorted(list(map(
    lambda x :  os.path.basename(x).split('.')[0],
    mp3_name_list
)))

id_list

['--4gqARaEJE',
 '--BfvyPmVMo',
 '--U7joUcTCo',
 '--i-y1v8Hy8',
 '-0BIyqJj9ZU',
 '-0CamVQdP_Y',
 '-0Gj8-vB1q4',
 '-0Gj8-vB1q4',
 '-0RWZT-miFs',
 '-0jeONf82dE',
 '-0nqfRcnAYE',
 '-0nqfRcnAYE',
 '-0p7hKXZ1ww',
 '-0vPFx-wRRI',
 '-0xzrMun0Rs',
 '-0yRK50zyTI',
 '-116CjQ3MAg',
 '-1EXhfqLLwQ',
 '-1Hub6Ps_cc',
 '-1LQP2wemiQ',
 '-1OlgJWehn8',
 '-1PZQg5Gi8A',
 '-1UWSisR2zo',
 '-1hDIl9Udkw',
 '-1hDIl9Udkw',
 '-1nilez17Dg',
 '-1nilez17Dg',
 '-1pRmoJIGQc',
 '-21_SXelVNo',
 '-22tna7KHzI',
 '-2EKWgTNEYU',
 '-2sE5CH8Wb8',
 '-2xiZDEuHd8',
 '-3-4qmWSJXU',
 '-36qTeAdDMI',
 '-3Kv4fdm7Uk',
 '-3YWuPXHknk',
 '-3nXxwBlX5A',
 '-3nyUrKWFEE',
 '-3rHVsIj1M8',
 '-3z5mFRgbxc',
 '-47nPCeukVc',
 '-4SYC2YgzL8',
 '-4SYC2YgzL8',
 '-4kkGS4-qVM',
 '-4kkGS4-qVM',
 '-4pmCrSdMhg',
 '-4viN_EoxOA',
 '-53zl3bPmpM',
 '-5CGQGSFGyg',
 '-5FoeegAgvU',
 '-5FoeegAgvU',
 '-5PZ_Bh-M6o',
 '-5PZ_Bh-M6o',
 '-65CfQUX9Ng',
 '-6Aq2fJwlgU',
 '-6GcdDStIwM',
 '-6HBGg1cAI0',
 '-6JnAxTXApw',
 '-6cTEqIcics',
 '-7B9tPuIP-w',
 '-7B9tPuIP-w',
 '-7Ihwe

In [5]:
label_data = pd.read_csv("./class_labels_indices.csv")
label_dict = dict(
    zip(label_data["mid"], label_data["display_name"])
)

In [6]:
audioset_data = pd.read_csv(
    "./eval_segments.csv",
    delimiter=', ',
    skiprows = [0, 1, 2],
    names = ["vid", "stt", "ett", "label"]
)
audioset_data["idx"] = list(map(
    str,
    range(len(audioset_data))
))

class_list = []
for k, row in audioset_data.iterrows() :
    #print(row["label"])
    labels = row["label"][1:-1].split(',')  
    #aprint(labels)
    classes = list(map(
        lambda l : label_dict[l],
        labels
    ))
    cls = ',,'.join(classes)
    #print(cls)

    class_list.append(cls)

audioset_data["class"] = class_list


audioset_data["result_file_name"] = audioset_data["class"] + '_' + audioset_data["idx"] + ".wav"

#audioset_data["class"] = audioset_data["label"].map(label_dict)

downloaded_data = audioset_data[audioset_data['vid'].isin(id_list)]
downloaded_data

  audioset_data = pd.read_csv(


Unnamed: 0,vid,stt,ett,label,idx,class,result_file_name
0,--4gqARaEJE,0.0,10.0,"""/m/068hy,/m/07q6cd_,/m/0bt9lr,/m/0jbk""",0,"Domestic animals, pets,,Squeak,,Dog,,Animal","Domestic animals, pets,,Squeak,,Dog,,Animal_0.wav"
1,--BfvyPmVMo,20.0,30.0,"""/m/03l9g""",1,Hammer,Hammer_1.wav
2,--U7joUcTCo,0.0,10.0,"""/m/01b_21""",2,Cough,Cough_2.wav
3,--i-y1v8Hy8,0.0,9.0,"""/m/04rlf,/m/09x0r,/t/dd00004,/t/dd00005""",3,"Music,,Speech,,Female singing,,Child singing","Music,,Speech,,Female singing,,Child singing_3..."
4,-0BIyqJj9ZU,30.0,40.0,"""/m/07rgt08,/m/07sq110,/t/dd00001""",4,"Chuckle, chortle,,Belly laugh,,Baby laughter","Chuckle, chortle,,Belly laugh,,Baby laughter_4..."
...,...,...,...,...,...,...,...
20364,zxuey_Zi0wY,30.0,40.0,"""/m/085jw,/m/0mkg""",20364,"Wind instrument, woodwind instrument,,Accordion","Wind instrument, woodwind instrument,,Accordio..."
20366,zyF8TGSRvns,150.0,160.0,"""/m/0dwsp,/m/0dwtp,/m/0f8s22,/m/0j45pbj""",20366,"Marimba, xylophone,,Glockenspiel,,Chime,,Malle...","Marimba, xylophone,,Glockenspiel,,Chime,,Malle..."
20367,zz35Va7tYmA,30.0,40.0,"""/m/012f08,/m/07q2z82,/m/07qmpdm,/m/0k4j""",20367,"Motor vehicle (road),,Accelerating, revving, v...","Motor vehicle (road),,Accelerating, revving, v..."
20369,zzNdwF40ID8,70.0,80.0,"""/m/04rlf,/m/0790c""",20369,"Music,,Sonar","Music,,Sonar_20369.wav"


In [7]:
with tqdm(total = len(downloaded_data)) as pbar :
    for i, row in downloaded_data.iterrows() :

        vid = row["vid"]
        sttime = int(row["stt"])
        idx = row["idx"]
        result_file_name = row["result_file_name"]
        AudioSegment.from_mp3(
            os.path.join(MP3_FILE_ROOT_PATH, f"{row['vid']}.mp3"),
        )[sttime*1000 : (sttime + 10)*1000].export(
            os.path.join(AUDIOSET_DATASET_PATH, result_file_name)
        )
    
        pbar.update(1)

  0%|          | 0/17049 [00:00<?, ?it/s]

In [81]:
len(glob("./audioset_tmp/*"))

17047

In [82]:
len(glob("./vggsound_tmp/*"))

13017