In [None]:
import os

import pandas as pd
import requests
from datasets import Audio, load_dataset
from tqdm import tqdm

# Export every clip of a Hugging Face audio dataset to local files and write
# a metadata CSV that maps each dataset row to the file saved for it.

DATASET_NAME = "Saads/xecanto_birds"
OUTPUT_DIR = r"D:/_3rd Year Class/1st Sem/Machine Learning/_ForLE/For DataSet"

# Derived from DATASET_NAME so the repo id is only spelled out once.
HF_BASE_URL = f"https://huggingface.co/datasets/{DATASET_NAME}/resolve/main/"

# (connect, read) timeouts in seconds. Without a timeout a stalled connection
# would block the whole export indefinitely.
REQUEST_TIMEOUT = (10, 60)

audio_dir = os.path.join(OUTPUT_DIR, "audio")
os.makedirs(audio_dir, exist_ok=True)

# streaming=True only avoids materialising the dataset up front; audio would
# still be decoded on access. Casting the column with decode=False yields a
# plain {"path": ..., "bytes": ...} dict and skips decoding entirely.
dataset = load_dataset(DATASET_NAME, split="train", streaming=True)
dataset = dataset.cast_column("audio", Audio(decode=False))

rows = []    # one metadata dict per clip that was saved successfully
failed = []  # (index, source path) of clips that could not be saved

# A single Session reuses the HTTP connection across downloads.
with requests.Session() as session:
    for i, item in enumerate(tqdm(dataset)):
        audio_info = item["audio"]

        # Expected to look like "audio/xxx.wav" (relative to the repo root).
        relative_path = audio_info.get("path") or ""
        payload = audio_info.get("bytes")

        if payload is None:
            # The clip is not embedded in the dataset shard: fetch it from
            # the repository instead.
            if not relative_path:
                tqdm.write(f"[{i}] skipped: no audio bytes and no path")
                failed.append((i, relative_path))
                continue

            download_url = HF_BASE_URL + relative_path
            try:
                resp = session.get(download_url, timeout=REQUEST_TIMEOUT)
                # Reject 4xx/5xx so an HTML error page is never saved as audio.
                resp.raise_for_status()
            except requests.RequestException as exc:
                tqdm.write(f"[{i}] download failed for {download_url}: {exc}")
                failed.append((i, relative_path))
                continue
            payload = resp.content

        # splitext returns "" when there is no extension, instead of handing
        # back the whole path the way split(".")[-1] does.
        ext = os.path.splitext(relative_path)[1]
        file_path = os.path.join(audio_dir, f"{i}{ext}")

        with open(file_path, "wb") as f:
            f.write(payload)

        # Keep every original column, but store only the source path for
        # "audio" so the raw payload is not dumped into the CSV.
        meta = dict(item)
        meta["audio"] = relative_path
        meta["file_path"] = file_path
        rows.append(meta)

# Export CSV (written even if some clips failed, so progress is not lost).
csv_path = os.path.join(OUTPUT_DIR, "metadata.csv")
pd.DataFrame(rows).to_csv(csv_path, index=False)

if failed:
    print(f"WARNING: {len(failed)} clip(s) could not be saved: {failed}")
print("DONE! Audio + CSV saved!")
