# In [None]:
from datasets import load_dataset
import soundfile as sf
import pandas as pd
from tqdm import tqdm
import os

# ----------------------------
# CHANGE THESE 2 LINES
# ----------------------------

DATASET_NAME = "Saads/xecanto_birds"     # e.g., "soundata/esc50"
OUTPUT_DIR = r"D:/_3rd Year Class/1st Sem/Machine Learning/_ForLE/For DataSet"  # any folder you want

# ----------------------------

# Export every clip of the dataset to OUTPUT_DIR/audio/<index>.<ext> and write
# the remaining columns (plus the saved file path) to OUTPUT_DIR/metadata.csv.

# Create folders
audio_dir = os.path.join(OUTPUT_DIR, "audio")
os.makedirs(audio_dir, exist_ok=True)


def _pick_extension(source_path, writable_formats, default="wav"):
    """Return the file extension (without the dot) to save a clip under.

    Args:
        source_path: Original path of the clip as reported by the dataset;
            may be None or empty.
        writable_formats: Upper-case format names the audio writer supports.
        default: Extension used when the source one is missing or unsupported.

    Returns:
        The source extension when the writer supports it, otherwise `default`.
    """
    if not source_path:
        return default
    # splitext only inspects the final path component, so a dot inside a
    # directory name is never mistaken for the start of an extension.
    ext = os.path.splitext(source_path)[1].lstrip(".")
    if ext.upper() in writable_formats:
        return ext
    return default


# Load dataset
dataset = load_dataset(DATASET_NAME, split="train")

# sf.write infers the container from the file extension and raises for
# formats the installed libsndfile cannot encode, so look the list up once.
writable_formats = set(sf.available_formats())

rows = []

for i, item in tqdm(enumerate(dataset), total=len(dataset)):
    audio = item["audio"]

    # Keep the source extension when it can be written; otherwise use WAV
    ext = _pick_extension(audio.get("path"), writable_formats)

    # Construct output audio path
    file_path = os.path.join(audio_dir, f"{i}.{ext}")

    # Save the decoded samples at their original sampling rate
    sf.write(file_path, audio["array"], audio["sampling_rate"])

    # Prepare CSV row: every column except the decoded audio, plus the path
    meta = {key: value for key, value in item.items() if key != "audio"}
    meta["file_path"] = file_path
    rows.append(meta)

# Save CSV in output folder
csv_path = os.path.join(OUTPUT_DIR, "metadata.csv")
df = pd.DataFrame(rows)
df.to_csv(csv_path, index=False)

print(f"Done!\nAudio saved to: {audio_dir}\nCSV saved to: {csv_path}")
