**Load Libraries**

In [16]:
import asyncio
import contextlib
import os
from typing import NoReturn

import librosa
import pandas as pd
import yt_dlp as yt

**Load Data To Download**

In [17]:
# Table of tracks to download; the download cell iterates over its rows
# positionally (row[0], row[1], row[2]).
df = pd.read_csv(
    filepath_or_buffer="../../data/scraped/complete_kaggle_dataset.csv",
    sep=",",
)

**Download Audio Files & Automatic Conversion To CSV**

In [18]:
class YTDLSource:
    """Thin async wrapper around a yt-dlp style downloader object."""

    @classmethod
    async def from_url(cls, url, ytdl, *, loop=None):
        """Extract ``url`` with ``ytdl`` off the event loop and return its filename.

        Args:
            url: URL or search query handed to ``ytdl.extract_info``.
            ytdl: Object exposing ``extract_info(url)`` and
                ``prepare_filename(info)`` (e.g. a ``yt_dlp.YoutubeDL``).
            loop: Event loop whose default executor runs the blocking
                extraction. Defaults to the currently running loop.

        Returns:
            Whatever ``ytdl.prepare_filename`` returns for the extracted info;
            when the result contains ``"entries"``, the first entry is used.

        Raises:
            ValueError: If the extraction yields no result (``None``, an empty
                ``"entries"`` list, or a ``None`` first entry).
        """
        if loop is None:
            # We are inside a coroutine, so a running loop always exists.
            loop = asyncio.get_running_loop()

        # extract_info blocks, so it is pushed to the loop's default executor.
        data = await loop.run_in_executor(None, ytdl.extract_info, url)

        if data is not None and "entries" in data:
            entries = data["entries"]
            data = entries[0] if entries else None

        if data is None:
            # Fail with an explicit message instead of an opaque
            # TypeError/IndexError from indexing a missing result.
            raise ValueError(f"No downloadable result found for {url!r}")

        return ytdl.prepare_filename(data)

In [19]:
def _build_ytdl_options(outtmpl):
    """Return a fresh yt-dlp options dict that writes its output to ``outtmpl``."""
    return {
        "outtmpl": outtmpl,
        "format": "bestaudio/best",
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
                "preferredquality": "192",
            }
        ],
        "noplaylist": False,
        "nocheckcertificate": True,
        "ignoreerrors": False,
        "quiet": True,
        "no_warnings": True,
        "default_search": "auto",
    }


async def download_audio_files(loop) -> list:
    """Download every track listed in ``df`` and compute its MFCCs.

    For each row of the module-level ``df``, the string
    ``"<row[0]>_<row[1]>"`` is passed to yt-dlp as the query, the audio is
    converted to WAV by the FFmpeg post-processor, loaded with librosa to
    compute MFCCs, and the temporary WAV file is deleted. A row that fails at
    any step is reported with ``print`` and skipped.

    Args:
        loop: Event loop forwarded to ``YTDLSource.from_url``.

    Returns:
        A list of ``(decade, artist, song, mfccs)`` tuples, one per row that
        was processed successfully. The same list is also bound to the
        module-level name ``data``.
    """
    # The dataset-building cell reads a module-level ``data``; binding the
    # result globally keeps that cell working once this coroutine has run.
    global data
    data = []

    for row in df.values:
        audio_file = None
        try:
            # Column meaning follows the labels used when the DataFrame is
            # built from ``data``: row[2] -> Decade, row[0] -> Artist,
            # row[1] -> Song.
            artist, song, decade = row[0], row[1], row[2]
            track = f"{artist}_{song}"
            path = f"../../data/dataset/training/{track}"
            audio_file = f"{path}.wav"
            try:
                ytdl = yt.YoutubeDL(_build_ytdl_options(path))
                await YTDLSource.from_url(track, ytdl=ytdl, loop=loop)
                y, sr = librosa.load(audio_file)
                mfccs = librosa.feature.mfcc(y=y, sr=sr)
                # Store plain values, not one-element sets.
                data.append((decade, artist, song, mfccs))
            finally:
                # Remove the temporary WAV even when feature extraction fails.
                if os.path.exists(audio_file):
                    os.remove(audio_file)
        except Exception as e:
            print(f"Error : {e}")

    return data

In [20]:
async def start():
    """Run ``download_audio_files`` with the current event loop; returns nothing."""
    current_loop = asyncio.get_event_loop()
    await download_audio_files(current_loop)

In [None]:
# Top-level ``await``: this runs the whole download and feature-extraction
# pass as a notebook cell (not valid in a plain Python script).
await start()

In [24]:
# Assemble the collected records into a labelled table and persist it as CSV.
# NOTE(review): the "MFCCs" cells presumably hold NumPy arrays; a CSV keeps
# only their string form, so confirm that this round-trips as needed.
dataset = pd.DataFrame(
    data=data,
    columns=["Decade", "Artist", "Song", "MFCCs"],
)
dataset.to_csv(path_or_buf="dataset.csv", index=False)