# notebook contains code and comment for downloading vggset dataset, re-download missing downloads, postprocess raw audio files

In [1]:
import os
from glob import glob
from tqdm.notebook import tqdm
import pandas as pd
import json

#import youtube_dl
import yt_dlp as youtube_dl
import librosa
from pydub import AudioSegment

import concurrent.futures
import threading

In [2]:
VGGSOUND_DATASET_PATH = "./vggsound/"
MP3_FILE_ROOT_PATH = "./vggsound_tmp"
VGGSOUND_METADATA_PATH = "./vggsound_meta_data.json"

os.makedirs(VGGSOUND_DATASET_PATH, exist_ok=True)

# vggsound.csv : https://www.robots.ox.ac.uk/~vgg/data/vggsound/
vggsound_data = pd.read_csv("vggsound.csv", names=["YouTube ID", "start seconds", "label", "train/test split"])

with open(VGGSOUND_METADATA_PATH, "r") as fp :
    vggsound_meta_data = json.load(fp)
    

## set yt_dlp options

In [3]:
ydl_opts = {
    'format': 'bestaudio/best',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '320',
    }],
    'outtmpl': os.path.join(MP3_FILE_ROOT_PATH, "%(id)s.%(ext)s"),
    'quiet':True,
    'external_downloader_args': ['-loglevel', 'panic']
}
slink = "https://www.youtube.com/watch?v="

vggsound_data["url"] = slink + vggsound_data["YouTube ID"]

vggsound_data

Unnamed: 0,YouTube ID,start seconds,label,train/test split,url
0,---g-f_I2yQ,1,people marching,test,https://www.youtube.com/watch?v=---g-f_I2yQ
1,--0PQM4-hqg,30,waterfall burbling,train,https://www.youtube.com/watch?v=--0PQM4-hqg
2,--56QUhyDQM,185,playing tennis,train,https://www.youtube.com/watch?v=--56QUhyDQM
3,--5OkAjCI7g,40,people belly laughing,train,https://www.youtube.com/watch?v=--5OkAjCI7g
4,--8puiAGLhs,30,car engine starting,train,https://www.youtube.com/watch?v=--8puiAGLhs
...,...,...,...,...,...
199462,zzsUhaDSqzI,145,lathe spinning,train,https://www.youtube.com/watch?v=zzsUhaDSqzI
199463,zztvx3WUBss,30,people shuffling,train,https://www.youtube.com/watch?v=zztvx3WUBss
199464,zzvCPtdNxNo,68,civil defense siren,test,https://www.youtube.com/watch?v=zzvCPtdNxNo
199465,zzvSVusPPgM,30,chicken clucking,train,https://www.youtube.com/watch?v=zzvSVusPPgM


## define function for download raw audio 

In [4]:
def download_video(url, opt) :
    with youtube_dl.YoutubeDL(opt) as ydl :
        try :
            ydl.download([url])
            return {"url": url, "status":True}
        except Exception as e :
            return {"url": url, "status":False, "error": str(e)}

def download_parallel(url_list, opt, max_workers = 10) :
    log_data = []
    
    with concurrent.futures.ThreadPoolExecutor(
        max_workers = max_workers
    ) as executor :
        future_to_url = {
            executor.submit(
                download_video, url, opt
            ) : url for url in url_list
        }
        for future in tqdm(
            concurrent.futures.as_completed(future_to_url),
            total = len(url_list)
        ) :
            url = future_to_url[future]
            result = future.result()
            log_data.append(result)

    return log_data

## download auidos parellely and save result metadata

In [5]:
_ = '''
# do this work using python file. ipython notebook is not suitable for this task.
results = download_parallel(
    vggsound_data[vggsound_data["train/test split"] == "test"]["url"] , ydl_opts, max_workers = 50
)

with open(VGGSOUND_METADATA_PATH, "w") as fp :
    json.dump(
        results, fp
    )
'''

# re-download missing audio

## filter information of videos that were failed to downloaded not due to youtube related problem

In [41]:
with open(VGGSOUND_METADATA_PATH, "r") as fp :
    data=json.load(fp)

redownload_video_list = list(filter(
    lambda record : record["status"] == False and "errno 2" in record["error"],
    data
))


print("videos to re-download :")
print(redownload_video_list)
print("total", len(redownload_video_list), "to download")

print("downloading..")
results = download_parallel(
    list(map(
        lambda record : record["url"],
        redownload_video_list
    )),
    ydl_opts,
    max_workers = 50
)
print("\n")
print("download completed")

for r1 in results :
     for r2 in data :
          if r1["url"] == r2["url"] :
               data.remove(r2)
               data.append(r1)
               

redownload_video_list = list(filter(
    lambda record : not record["status"] and "errno 2" in record["error"].lower(),
    data
))

print("videos to re-download :", len(redownload_video_list))
print(redownload_video_list)

with open(VGGSOUND_METADATA_PATH, "w") as fp :
    json.dump(
        data, fp
    )

videos to re-download :
[]
total 0 to download
downloading..


0it [00:00, ?it/s]



download completed
videos to re-download : 511
[{'url': 'https://www.youtube.com/watch?v=-7sg--aJdrc', 'status': False, 'error': "\x1b[0;31mERROR:\x1b[0m Unable to rename file: [Errno 2] No such file or directory: './vggsound_tmp/-7sg--aJdrc.webm.part' -> './vggsound_tmp/-7sg--aJdrc.webm'"}, {'url': 'https://www.youtube.com/watch?v=-GOaBCyC5Js', 'status': False, 'error': "[Errno 2] No such file or directory: './vggsound_tmp/-GOaBCyC5Js.webm' -> './vggsound_tmp/-GOaBCyC5Js.webm'"}, {'url': 'https://www.youtube.com/watch?v=-7tYmeOmsRg', 'status': False, 'error': "[Errno 2] No such file or directory: './vggsound_tmp/-7tYmeOmsRg.webm' -> './vggsound_tmp/-7tYmeOmsRg.webm'"}, {'url': 'https://www.youtube.com/watch?v=-KqXcm-I2zY', 'status': False, 'error': "[Errno 2] No such file or directory: './vggsound_tmp/-KqXcm-I2zY.webm' -> './vggsound_tmp/-KqXcm-I2zY.webm'"}, {'url': 'https://www.youtube.com/watch?v=-K-ccLMFE5M', 'status': False, 'error': "[Errno 2] No such file or directory: './vggs

## postprocess

In [3]:
mp3_name_list = sorted(glob(os.path.join(MP3_FILE_ROOT_PATH, "*")))

id_list = sorted(list(map(
    lambda x :  os.path.basename('.'.join(x.split('.')[:-1])),
    mp3_name_list
)))

id_list

['---g-f_I2yQ',
 '--U7joUcTCo',
 '--i-y1v8Hy8',
 '-0BIyqJj9ZU',
 '-0jeONf82dE',
 '-0p7hKXZ1ww',
 '-0pJqpNjft4',
 '-0vPFx-wRRI',
 '-0yRK50zyTI',
 '-1BtY81-D54',
 '-1EXhfqLLwQ',
 '-1pRmoJIGQc',
 '-2-wdcN5vOw',
 '-2Dm0VjW8oM',
 '-2sE5CH8Wb8',
 '-2sOH8XovEE',
 '-2xiZDEuHd8',
 '-3-4qmWSJXU',
 '-3Kv4fdm7Uk',
 '-3RH8_aeZkk',
 '-3YWuPXHknk',
 '-3b9gwBYXp8',
 '-3rHVsIj1M8',
 '-3z5mFRgbxc',
 '-40nDU5Ecgg',
 '-4DpBHTuc88',
 '-4bPiXbovf0',
 '-4kkGS4-qVM',
 '-4pmCrSdMhg',
 '-4viN_EoxOA',
 '-5CGQGSFGyg',
 '-615mGonUqU',
 '-79qo5MUYBk',
 '-7IhwezUrUA',
 '-7TanrCbmME',
 '-7YESdyyHVw',
 '-7sg--aJdrc',
 '-7tYmeOmsRg',
 '-8C-gydUbR8',
 '-8cgbhIR_pw',
 '-8lzdlqUtJQ',
 '-8pCMgGKZY8',
 '-A1o1Egi20c',
 '-A3zsFeU_OI',
 '-ABOfhcXwt8',
 '-AIzKe9X0hA',
 '-AMsYmKRnWE',
 '-ANxUxvGASw',
 '-AeiYb4vDK0',
 '-Am9MfJ6Z2o',
 '-BmjFZ1xu5Q',
 '-By6I234TSs',
 '-ByoSbgzr4M',
 '-CGkSDY0mWI',
 '-COelgvUEW4',
 '-CZ1LIc8aos',
 '-Cr0WQoFQQs',
 '-D64b_8YJK4',
 '-DnZ_ZY3lgA',
 '-DwwSHC2jLs',
 '-E5o64ACjm0',
 '-Ezk-jIm6kM',
 '-FKrYT

In [4]:
vggsound_data = pd.read_csv("vggsound.csv", names=["YouTube ID", "start seconds", "label", "train/test split"])
vggsound_data["idx"] = list(map(
    str,
    range(len(vggsound_data))
))
#vggsound_data["result_file_path"] = \
#    f"{vggsound_data['label']}_{vggsound_data['idx']}.wav"

vggsound_data["result_file_name"] = vggsound_data["label"]+ '_' + vggsound_data["idx"] + ".wav"
vggsound_data

downloaded_data = vggsound_data[vggsound_data["YouTube ID"].isin(id_list)]

downloaded_data

Unnamed: 0,YouTube ID,start seconds,label,train/test split,idx,result_file_name
0,---g-f_I2yQ,1,people marching,test,0,people marching_0.wav
36,--U7joUcTCo,0,people coughing,test,36,people coughing_36.wav
52,--i-y1v8Hy8,0,female singing,test,52,female singing_52.wav
78,-0BIyqJj9ZU,30,people belly laughing,test,78,people belly laughing_78.wav
117,-0jeONf82dE,21,horse neighing,test,117,horse neighing_117.wav
...,...,...,...,...,...,...
199413,zz0fdUGTHWo,45,chimpanzee pant-hooting,test,199413,chimpanzee pant-hooting_199413.wav
199425,zzFdhaiG_Hk,95,popping popcorn,test,199425,popping popcorn_199425.wav
199441,zzbTaK7CXJY,30,wind noise,test,199441,wind noise_199441.wav
199458,zzqrZOq928w,55,train horning,test,199458,train horning_199458.wav


In [5]:
'''
for vid in id_list :
    row_data = downloaded_data[downloaded_data["YouTube ID"] == vid]    
    label = ',,'.join(row_data["label"].tolist())

    class_list = []
    for _, row in downloaded_data.iterrows() :
        labels = row["label"]
        idx = row["idx"]

        result_file_name = f"{labels}_{idx}.wav" 

        row_data["result_file_name"] = result_file_name
'''

downloaded_data["result_file_name"] = downloaded_data["label"] + '_' + downloaded_data["idx"] + '.wav'
downloaded_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downloaded_data["result_file_name"] = downloaded_data["label"] + '_' + downloaded_data["idx"] + '.wav'


Unnamed: 0,YouTube ID,start seconds,label,train/test split,idx,result_file_name
0,---g-f_I2yQ,1,people marching,test,0,people marching_0.wav
36,--U7joUcTCo,0,people coughing,test,36,people coughing_36.wav
52,--i-y1v8Hy8,0,female singing,test,52,female singing_52.wav
78,-0BIyqJj9ZU,30,people belly laughing,test,78,people belly laughing_78.wav
117,-0jeONf82dE,21,horse neighing,test,117,horse neighing_117.wav
...,...,...,...,...,...,...
199413,zz0fdUGTHWo,45,chimpanzee pant-hooting,test,199413,chimpanzee pant-hooting_199413.wav
199425,zzFdhaiG_Hk,95,popping popcorn,test,199425,popping popcorn_199425.wav
199441,zzbTaK7CXJY,30,wind noise,test,199441,wind noise_199441.wav
199458,zzqrZOq928w,55,train horning,test,199458,train horning_199458.wav


In [6]:
with tqdm(total=len(downloaded_data)) as pbar :
    for i, row in downloaded_data.iterrows() :
        idx = id_list.index(row["YouTube ID"])
        sttime = int(row["start seconds"])
        endtime = sttime + 10
        result_file_name = row["result_file_name"]
        #print(sttime, result_file_name)
        try :
            sound = AudioSegment.from_mp3(
                mp3_name_list[idx]
            )[int(sttime) * 1000:int(endtime) * 1000].export(
                os.path.join(VGGSOUND_DATASET_PATH, result_file_name)
            )
        except Exception as e:
            print(e)

        pbar.update(1)


  0%|          | 0/13543 [00:00<?, ?it/s]