In [None]:
import os
import torchaudio
import json
from tqdm import tqdm
import re
import random
import string

# --- Locations ---------------------------------------------------------------
# Source directory scanned for .wav chunks, and the dataset tree that is
# written inside the AudioLDM training repository.
INPUT_DIR = "dataset/chunks"
OUTPUT_BASE = "./AudioLDM-training-finetuning/data/dataset/"
DATASET_NAME = "audioset"

# --- Processing parameters ---------------------------------------------------
SAMPLE_RATE = 16000  # target sample rate (Hz) for every written WAV
TRAIN_SPLIT = 0.90   # fraction of entries placed in the training metadata

# --- Derived paths -----------------------------------------------------------
DATASET_DIR = os.path.join(OUTPUT_BASE, DATASET_NAME)
WAV_DIR, META_DIR = (
    os.path.join(DATASET_DIR, subdir) for subdir in ("wavs", "metadata")
)
ROOT_META_DIR = os.path.join(OUTPUT_BASE, "metadata")

TRAIN_JSON = os.path.join(META_DIR, f"{DATASET_NAME}_train.json")
VAL_JSON = os.path.join(META_DIR, f"{DATASET_NAME}_val.json")
ROOT_JSON = os.path.join(ROOT_META_DIR, "dataset_root.json")

# Create the output tree up front; exist_ok makes this safe to re-run.
for _out_dir in (WAV_DIR, META_DIR, ROOT_META_DIR):
    os.makedirs(_out_dir, exist_ok=True)

def random_base62_id(length=11):
    """Return a random identifier made of ASCII letters and digits.

    Args:
        length: Number of characters to generate (default 11).

    Returns:
        A string of ``length`` characters, each drawn independently with
        ``random.choice`` from ``string.ascii_letters + string.digits``.
    """
    alphabet = string.ascii_letters + string.digits
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return "".join(picked)

def extract_title(fname):
    """Derive a caption string from a chunk filename.

    The file extension is dropped, everything from the first ``_chunk``
    onward is removed, a leading run of digits together with any following
    ``.``, ``-``, ``_``, space or whitespace separators is stripped, and
    surrounding whitespace is trimmed.

    Args:
        fname: File name (e.g. ``"01. My Song_chunk003.wav"``).

    Returns:
        The cleaned title; may be empty if nothing remains.
    """
    stem, _extension = os.path.splitext(fname)
    without_chunk_suffix = re.sub(r"_chunk.*$", "", stem)
    without_leading_index = re.sub(r"^\d+[\.\-_ ]*\s*", "", without_chunk_suffix)
    return without_leading_index.strip()

# Collect the input clips in sorted order so the processing order, and with
# it the seeded IDs and the train/val split below, is reproducible.
audio_files = sorted(
    f for f in os.listdir(INPUT_DIR)
    if f.lower().endswith(".wav")
)

print(f"Found {len(audio_files)} audio files.")

# Fail early instead of silently writing empty metadata files that would
# only surface as a confusing error once training starts.
if not audio_files:
    raise FileNotFoundError(
        f"No .wav files found in {INPUT_DIR!r}; nothing to preprocess."
    )

# Seed *before* generating the random filenames: re-running this cell on the
# same input then reproduces the same names and overwrites the previous
# output instead of leaving orphaned WAVs behind in WAV_DIR.
random.seed(0)

dataset_entries = []
used_names = set()

for fname in tqdm(audio_files, desc="Processing audio", unit="file"):

    in_path = os.path.join(INPUT_DIR, fname)

    wav, sr = torchaudio.load(in_path)

    # Down-mix to a single channel by averaging over the first dimension
    # (the channel axis in torchaudio.load's default layout).
    if wav.shape[0] > 1:
        wav = wav.mean(dim=0, keepdim=True)

    if sr != SAMPLE_RATE:
        wav = torchaudio.functional.resample(wav, sr, SAMPLE_RATE)

    # Use a YouTube-like random ID as the output filename; redraw on the
    # (unlikely) event of a collision so no clip overwrites another.
    simple_name = f"{random_base62_id()}.wav"
    while simple_name in used_names:
        simple_name = f"{random_base62_id()}.wav"
    used_names.add(simple_name)

    out_path = os.path.join(WAV_DIR, simple_name)

    torchaudio.save(out_path, wav, SAMPLE_RATE)

    title = extract_title(fname)

    dataset_entries.append({
        "wav": f"wavs/{simple_name}",
        "caption": title
    })

# Re-seed so the shuffle (and hence the split) does not depend on how many
# random draws the filename generation consumed.
random.seed(0)
random.shuffle(dataset_entries)

split_idx = int(len(dataset_entries) * TRAIN_SPLIT)
train_entries = dataset_entries[:split_idx]
val_entries = dataset_entries[split_idx:]

with open(TRAIN_JSON, "w", encoding="utf-8") as f:
    json.dump({"data": train_entries}, f, indent=2)

with open(VAL_JSON, "w", encoding="utf-8") as f:
    json.dump({"data": val_entries}, f, indent=2)

# Root index consumed by the training repo. Paths are relative to the
# repository root; the "test" split reuses the validation metadata.
root_json_content = {
    DATASET_NAME: f"./data/dataset/{DATASET_NAME}",

    "metadata": {
        "path": {
            DATASET_NAME: {
                "train": f"./data/dataset/{DATASET_NAME}/metadata/{DATASET_NAME}_train.json",
                "val":   f"./data/dataset/{DATASET_NAME}/metadata/{DATASET_NAME}_val.json",
                "test":  f"./data/dataset/{DATASET_NAME}/metadata/{DATASET_NAME}_val.json",
                "class_label_indices": None
            }
        }
    }
}

with open(ROOT_JSON, "w", encoding="utf-8") as f:
    json.dump(root_json_content, f, indent=2)

# Report an example filename that actually exists on disk rather than a
# freshly generated ID that matches no saved file.
example_name = os.path.basename(dataset_entries[0]["wav"])

print("\n✓ Dataset preprocessing complete.")
print(f"✓ Random ID WAVs saved in: {WAV_DIR}")
print(f"✓ Example filename: {example_name}")
print(f"✓ Train metadata: {TRAIN_JSON}")
print(f"✓ Val metadata:   {VAL_JSON}")
print(f"✓ dataset_root.json created: {ROOT_JSON}")


# CUDA versions must match: the one PyTorch was built with (`torch.version.cuda`) and the one reported by `nvidia-smi`

In [5]:
import torch

# CUDA version string recorded in this PyTorch build's version metadata.
torch_cuda_version = torch.version.cuda
print(torch_cuda_version)


13.0


In [6]:
# Shell escape: print the NVIDIA driver report (driver version, CUDA version,
# GPU memory usage) for comparison with the PyTorch CUDA version printed above.
! nvidia-smi

Mon Dec  8 13:29:03 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.76.07              Driver Version: 581.08         CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4060        On  |   00000000:01:00.0  On |                  N/A |
| 33%   41C    P8            N/A  /  115W |     498MiB /   8188MiB |      3%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

# To run training, use a terminal rather than this notebook

In [None]:
import os

# The training command below uses paths relative to the repository root, so
# move the kernel's working directory there. The guard makes the cell safe to
# re-run: without it a second execution tries to descend into a nested,
# non-existent copy of the directory and raises FileNotFoundError.
REPO_DIR = "./AudioLDM-training-finetuning/"
_repo_name = os.path.basename(os.path.normpath(REPO_DIR))
if os.path.basename(os.getcwd()) != _repo_name:
    os.chdir(REPO_DIR)

In [None]:
# Shell escape: run the latent-diffusion training script with the
# audioldm_original.yaml config, passing data/checkpoints/audioldm-s-full as
# the checkpoint to reload from. All paths are relative, so this presumably
# expects the working directory to be the repository root (verify before running).
! python3 audioldm_train/train/latent_diffusion.py \
    -c audioldm_train/config/2023_08_23_reproduce_audioldm/audioldm_original.yaml \
    --reload_from_ckpt data/checkpoints/audioldm-s-full