In [1]:
import os
import numpy as np
import torch
import IPython.display as ipd
import matplotlib.pyplot as plt
import librosa
import librosa.display
import torch 
import time
from audioldm import build_model
from audioldm.latent_diffusion.ddim import DDIMSampler
from torch.profiler import profile, record_function, ProfilerActivity

sr = 16000
# sr = 24000

# Testing notebook modification - this is a random comment! 🚀

In [2]:

model = build_model(model_name="audioldm-m-full")

# Clear cache and reset memory tracking for clean baseline measurement
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
torch.cuda.synchronize()

# Get baseline memory after model loading
baseline_memory = torch.cuda.max_memory_allocated() / (1024 * 1024)  # Convert to MiB
current_memory = torch.cuda.memory_allocated() / (1024 * 1024)  # Convert to MiB

print(f"MODEL LOADING MEMORY: {baseline_memory:.1f} MiB (current: {current_memory:.1f} MiB)")

Load AudioLDM: %s audioldm-m-full
DiffusionWrapper has 415.95 M params.


  WeightNorm.apply(module, name, dim)
  from .autonotebook import tqdm as notebook_tqdm
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  checkpoint = torch.load(resume_from_checkpoint, map_location=device)


MODEL LOADING MEMORY: 4436.2 MiB (current: 4436.2 MiB)


In [3]:
prompt = "A violin playing a heartfelt melody."
duration = 15.0  

In [4]:
model.cond_stage_model.embed_mode = "text"
text_list = [prompt] 

text_emb = model.get_learned_conditioning(text_list)

unconditional_text = [""] 
unconditional_embedding = model.get_learned_conditioning(unconditional_text)

In [5]:
# DIFFUSION VRAM MEASUREMENT
sampler = DDIMSampler(model)

from audioldm.pipeline import duration_to_latent_t_size
latent_size = duration_to_latent_t_size(duration)
shape = [model.channels, latent_size, model.latent_f_size]
z = torch.randn([1] + shape).to(model.device)

ddim_steps = 200
ddim_eta = 0.1
unconditional_guidance_scale = 3.0

# Get memory before diffusion and reset peak tracker
memory_before_diffusion = torch.cuda.memory_allocated() / (1024 * 1024)
torch.cuda.reset_peak_memory_stats()

# Run diffusion
start_time = time.time()
samples, _ = sampler.sample(
    S=ddim_steps,  
    batch_size=1,
    shape=shape,
    conditioning=text_emb,
    unconditional_conditioning=unconditional_embedding,
    unconditional_guidance_scale=unconditional_guidance_scale,
    eta=ddim_eta,
    verbose=False,
    x_T=z
)
torch.cuda.synchronize()

# Get time and memory
diffusion_time = time.time() - start_time
isolated_peak = torch.cuda.max_memory_allocated() / (1024 * 1024)  # Convert to MiB
current_memory = torch.cuda.memory_allocated() / (1024 * 1024)  # Convert to MiB
isolated_cost = isolated_peak - memory_before_diffusion

print(f"DIFFUSION ISOLATED PEAK: {isolated_peak:.1f} MiB (cost: +{isolated_cost:.1f} MiB)")
print(f"DIFFUSION CURRENT MEMORY: {current_memory:.1f} MiB")
print(f"DIFFUSION TIME: {diffusion_time:.1f}s ({diffusion_time/ddim_steps:.3f}s per step)")

# TEST: Force release cached memory to see if nvidia-smi goes down
print(f"BEFORE empty_cache() - Current memory: {torch.cuda.memory_allocated() / (1024 * 1024):.1f} MiB")
print("Now run 'nvidia-smi' in terminal to see memory before empty_cache()")
time.sleep(2)
torch.cuda.empty_cache()
print(f"AFTER empty_cache() - Current memory: {torch.cuda.memory_allocated() / (1024 * 1024):.1f} MiB")
print("Now run 'nvidia-smi' again to see if reserved memory decreased")

DDIM Sampler: 100%|██████████| 200/200 [00:15<00:00, 13.18it/s]


DIFFUSION ISOLATED PEAK: 4956.2 MiB (cost: +510.8 MiB)
DIFFUSION CURRENT MEMORY: 4446.5 MiB
DIFFUSION TIME: 15.2s (0.076s per step)
BEFORE empty_cache() - Current memory: 4446.5 MiB
Now run 'nvidia-smi' in terminal to see memory before empty_cache()
AFTER empty_cache() - Current memory: 4446.5 MiB
Now run 'nvidia-smi' again to see if reserved memory decreased


In [6]:
# VAE DECODER VRAM MEASUREMENT
# Get memory before VAE and reset peak tracker
memory_before_vae = torch.cuda.memory_allocated() / (1024 * 1024)
torch.cuda.reset_peak_memory_stats()

# Run VAE
start_time = time.time()
mel_spectrogram = model.decode_first_stage(samples)
torch.cuda.synchronize()
vae_time = time.time() - start_time

# Get memory usage
isolated_peak = torch.cuda.max_memory_allocated() / (1024 * 1024)  # Convert to MiB
current_memory = torch.cuda.memory_allocated() / (1024 * 1024)  # Convert to MiB
isolated_cost = isolated_peak - memory_before_vae

print(f"VAE DECODER ISOLATED PEAK: {isolated_peak:.1f} MiB (cost: +{isolated_cost:.1f} MiB)")
print(f"VAE DECODER CURRENT MEMORY: {current_memory:.1f} MiB")
print(f"VAE TIME: {vae_time:.1f}s")

# Keep mel_spectrogram as GPU tensor - no need for CPU conversion!

# TEST: Force release cached memory after VAE step
print(f"BEFORE VAE empty_cache() - Current memory: {torch.cuda.memory_allocated() / (1024 * 1024):.1f} MiB")
print("Check nvidia-smi now to see memory before VAE empty_cache()")
time.sleep(2)
torch.cuda.empty_cache()
print(f"AFTER VAE empty_cache() - Current memory: {torch.cuda.memory_allocated() / (1024 * 1024):.1f} MiB")
print("Check nvidia-smi again to see if VAE cached memory was released")


VAE DECODER ISOLATED PEAK: 4832.9 MiB (cost: +386.4 MiB)
VAE DECODER CURRENT MEMORY: 4446.9 MiB
VAE TIME: 0.1s
BEFORE VAE empty_cache() - Current memory: 4446.9 MiB
Check nvidia-smi now to see memory before VAE empty_cache()
AFTER VAE empty_cache() - Current memory: 4446.9 MiB
Check nvidia-smi again to see if VAE cached memory was released


In [None]:
# VOCODER VRAM MEASUREMENT
# Use mel_spectrogram directly - it's already a GPU tensor!

# Get memory before vocoder and reset peak tracker
memory_before_vocoder = torch.cuda.memory_allocated() / (1024 * 1024)
torch.cuda.reset_peak_memory_stats()

# Run vocoder
start_time = time.time()
waveform = model.mel_spectrogram_to_waveform(mel_spectrogram)[0]
torch.cuda.synchronize()
vocoder_time = time.time() - start_time

# Get memory usage
isolated_peak = torch.cuda.max_memory_allocated() / (1024 * 1024)  # Convert to MiB
current_memory = torch.cuda.memory_allocated() / (1024 * 1024)  # Convert to MiB
isolated_cost = isolated_peak - memory_before_vocoder

print(f"VOCODER ISOLATED PEAK: {isolated_peak:.1f} MiB (cost: +{isolated_cost:.1f} MiB)")
print(f"VOCODER CURRENT MEMORY: {current_memory:.1f} MiB")
print(f"VOCODER TIME: {vocoder_time:.1f}s")

# Final memory summary
print(f"\n=== ISOLATED MEMORY COSTS SUMMARY ===")
print(f"Model Loading Baseline: {baseline_memory:.1f} MiB")
print(f"Diffusion Cost: (see diffusion cell output)")
print(f"VAE Cost: (see VAE cell output)")
print(f"Vocoder Cost: +{isolated_cost:.1f} MiB")

# Play audio
ipd.display(ipd.Audio(waveform, rate=sr))

#lo

NameError: name 'mel_spectrogram' is not defined