# DDSP Diagnostic - Find the Bottleneck

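Run this cell to diagnose why training is so slow (264 s per epoch). It expects `model` and one batch of `f0`, `loudness`, and `mfcc` to already be defined in the notebook.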

In [None]:
import torch
import time

# Report banner.
print("=" * 60, "DIAGNOSTIC REPORT", "=" * 60, sep="\n")

# 1. Check device: pick the GPU whenever PyTorch reports CUDA as available.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print("\n1. Device Check:")
print("   Device: {}".format(device))
print("   CUDA available: {}".format(torch.cuda.is_available()))
if device != 'cuda':
    print("   ⚠️  WARNING: Running on CPU! This is why it's slow.")
else:
    # Describe GPU index 0; total memory is shown in decimal gigabytes.
    print("   GPU: {}".format(torch.cuda.get_device_name(0)))
    print("   Memory: {:.2f} GB".format(torch.cuda.get_device_properties(0).total_memory / 1e9))

# 2. Check where tensors are
# Prints the device of the `f0` batch and of the model's first parameter, and
# warns when either one is not on the device type chosen in the device check.
print("\n2. Tensor Location Check:")
print(f"   f0 device: {f0.device}")
print(f"   model device: {next(model.parameters()).device}")
# Compare device *types*, not their string form: a CUDA tensor's device prints
# with its index (e.g. "cuda:0"), which never equals the plain string "cuda",
# so a string comparison would raise a false warning on every GPU run.
if f0.device.type != device or next(model.parameters()).device.type != device:
    print("   ⚠️  WARNING: Tensors not on same device as model!")

# 3. Check batch size
# Dimension 0 of `f0` is reported as the batch size and dimension 1 as the
# number of frames per sample.
print("\n3. Batch Size Check:")
print("   Batch size: {}".format(f0.shape[0]))
print("   Frames per sample: {}".format(f0.shape[1]))
print("   Total elements: {:,}".format(f0.numel()))
# Batches of more than 12 samples are flagged as possibly too large.
if 12 < f0.shape[0]:
    print("   ⚠️  WARNING: Batch size {} might be too large!".format(f0.shape[0]))

# 4. Time a single forward pass
# Measures inference latency for one sample and for the full batch, with
# gradients disabled. Sets `single_time` and `batch_time` (seconds).
print("\n4. Forward Pass Timing:")


def _wait_for_gpu():
    """Block until queued CUDA work has finished.

    CUDA kernels are launched asynchronously, so without this the timer would
    stop when the work is *queued*, not when it is *done*. No-op without CUDA.
    """
    if torch.cuda.is_available():
        torch.cuda.synchronize()


# Remember the current train/eval mode so this diagnostic does not leave the
# model stuck in eval mode for any training that runs afterwards.
_was_training = model.training
model.eval()
try:
    with torch.no_grad():
        # Warmup: untimed first call so one-off initialization is excluded.
        _ = model(f0[:1], loudness[:1], mfcc[:1])
        _wait_for_gpu()

        # Time single sample
        start = time.perf_counter()
        _ = model(f0[:1], loudness[:1], mfcc[:1])
        _wait_for_gpu()
        single_time = time.perf_counter() - start

        # Time full batch
        start = time.perf_counter()
        _ = model(f0, loudness, mfcc)
        _wait_for_gpu()
        batch_time = time.perf_counter() - start
finally:
    # Restore the original mode even if a forward pass raised.
    model.train(_was_training)

print(f"   Single sample: {single_time:.3f}s")
print(f"   Full batch ({f0.shape[0]} samples): {batch_time:.3f}s")
print(f"   Expected per-sample cost: {batch_time / f0.shape[0]:.3f}s")

# More than one second for a single-sample forward pass is treated as critical.
if single_time > 1.0:
    print(f"   ⚠️  CRITICAL: Single forward pass is {single_time:.1f}s - should be <0.1s!")
    print("   Likely running on CPU or model not optimized.")

# 5. Check mixed precision
# Probes which AMP (automatic mixed precision) import path this PyTorch build
# provides: first `torch.amp`, then the older `torch.cuda.amp` namespace.
# NOTE(review): this rebinds the names `autocast`, `GradScaler` and `scaler` at
# notebook level; if a training cell already created a `scaler`, it is replaced
# by a fresh one here - confirm that is acceptable before running mid-training.
print("\n5. Mixed Precision Check:")
try:
    from torch.amp import autocast, GradScaler
    scaler = GradScaler('cuda')
    print("   ✅ PyTorch 2.x AMP API detected")
except Exception:
    # `except Exception` rather than a bare `except`, so Ctrl-C / SystemExit
    # are not swallowed; any ordinary failure still falls back to the old API.
    try:
        from torch.cuda.amp import autocast, GradScaler
        scaler = GradScaler()
        print("   ✅ PyTorch 1.x AMP API detected")
    except Exception as e:
        print(f"   ⚠️  WARNING: AMP not available: {e}")

# 6. Memory check (skipped entirely when not running on CUDA)
if device == 'cuda':
    print("\n6. GPU Memory:")
    # Both counters are divided by 1e9 to display decimal gigabytes.
    allocated, reserved = (
        n_bytes / 1e9
        for n_bytes in (torch.cuda.memory_allocated(), torch.cuda.memory_reserved())
    )
    print("   Allocated: {:.2f} GB".format(allocated))
    print("   Reserved: {:.2f} GB".format(reserved))
    # More than 20 GB allocated is flagged as high usage.
    if 20 < allocated:
        print("   ⚠️  WARNING: High memory usage, may cause slowdown")

# 7. Recommendation
# Prints exactly one piece of advice, chosen by the first matching condition:
# CPU execution, then a slow single forward pass, then a large batch, and
# otherwise a generic fallback.
print("\n" + "=" * 60, "RECOMMENDATIONS:", "=" * 60, sep="\n")

if device == 'cpu':
    print(
        "\n🚨 CRITICAL: You're running on CPU!",
        "   Solution: Enable GPU in Colab",
        "   Runtime → Change runtime type → Hardware accelerator → GPU",
        sep="\n",
    )
elif single_time > 1.0:
    print(
        "\n🚨 CRITICAL: Forward pass too slow",
        f"   Current: {single_time:.1f}s per forward pass",
        "   Expected: <0.1s",
        "   \nPossible causes:",
        "   1. Mixed precision not working",
        "   2. Model or tensors not on GPU",
        "   3. GPU utilization low",
        sep="\n",
    )
elif f0.shape[0] > 12:
    print(
        "\n⚠️  Batch size ({}) might be too large".format(f0.shape[0]),
        "   Solution: Reduce to 6-8 files",
        "   audio_files = audio_files[:8]",
        sep="\n",
    )
else:
    print(
        "\n✅ System looks OK, but still slow",
        "   Try reducing batch size to 4 files",
        "   Or train files sequentially (1 at a time)",
        sep="\n",
    )

# Closing rule.
print("\n" + "=" * 60)