# DDSP Timbre Grower - VECTORIZED (FAST!) 🚀

**THE FIX**: Vectorized HarmonicOscillator and FilteredNoiseGenerator

**Root cause**: Python for-loops launching hundreds of sequential GPU kernels

**Solution**: Vectorize all loops for parallel GPU execution

**Expected**: ~0.5-1s per epoch, i.e. roughly 8-17 minutes per file for 1000 epochs

In [None]:
# 1. Setup
!pip install torch librosa soundfile matplotlib scipy tqdm -q

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import librosa
import soundfile as sf
import matplotlib.pyplot as plt
from tqdm import tqdm
from IPython.display import Audio, display
import glob
from pathlib import Path
import time
import os

# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# Report the name of CUDA device 0 so the runtime type is visible in the log.
if device == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# 2. Upload files
# Colab-only cell: opens the browser upload widget and collects the result.
from google.colab import files

uploaded = files.upload()

# Move each upload into scale_tones/. The keys of the returned mapping are
# used as file names relative to the working directory.
os.makedirs('scale_tones', exist_ok=True)
for filename in uploaded:
    os.rename(filename, f'scale_tones/{filename}')

# Sorted so that later cells process the files in a stable order.
audio_files = glob.glob('scale_tones/*.wav')
audio_files.sort()
print(f"\n✅ Uploaded {len(audio_files)} files")

In [None]:
# 3. VECTORIZED DDSP Components (THE FIX!)

class HarmonicOscillatorVectorized(nn.Module):
    """Additive harmonic oscillator that evaluates every harmonic in one pass.

    Frame-rate pitch and per-harmonic amplitudes are linearly upsampled to
    audio rate, a running phase is accumulated from the pitch, and all
    harmonics are rendered with a single broadcast ``sin`` call (no Python
    loop over harmonics).

    Args:
        sample_rate: Audio sample rate in Hz.
        n_harmonics: Number of harmonics (1x, 2x, ... of f0) to synthesize.
        hop_length: Audio samples generated per input frame.
    """

    def __init__(self, sample_rate=22050, n_harmonics=64, hop_length=512):
        super().__init__()
        self.sample_rate = sample_rate
        self.n_harmonics = n_harmonics
        self.hop_length = hop_length

        # Harmonic multipliers [1, 2, ..., n_harmonics]; registered as a
        # buffer so they follow the module across devices.
        self.register_buffer(
            'harmonic_numbers',
            torch.arange(1, n_harmonics + 1, dtype=torch.float32)
        )

    def forward(self, f0_hz, harmonic_amplitudes):
        """Render audio from frame-rate controls.

        Args:
            f0_hz: Fundamental frequency in Hz, shape ``[batch, n_frames]``.
            harmonic_amplitudes: Per-harmonic amplitudes, shape
                ``[batch, n_frames, n_harmonics]``.

        Returns:
            Tensor of shape ``[batch, n_frames * hop_length]``.
        """
        _batch_size, n_frames = f0_hz.shape
        n_samples = n_frames * self.hop_length

        # Upsample the controls from frame rate to audio rate.
        f0_upsampled = F.interpolate(
            f0_hz.unsqueeze(1), size=n_samples, mode='linear', align_corners=True
        ).squeeze(1)  # [batch, samples]

        amplitudes_upsampled = F.interpolate(
            harmonic_amplitudes.transpose(1, 2), size=n_samples,
            mode='linear', align_corners=True
        ).transpose(1, 2)  # [batch, samples, n_harmonics]

        # Anti-aliasing: a harmonic at or above Nyquist cannot be represented
        # at this sample rate and would fold back as an inharmonic partial,
        # so its amplitude is forced to zero.
        harmonic_freqs = f0_upsampled.unsqueeze(-1) * self.harmonic_numbers
        below_nyquist = harmonic_freqs < (self.sample_rate / 2.0)
        amplitudes_upsampled = amplitudes_upsampled * below_nyquist.to(
            amplitudes_upsampled.dtype
        )

        # Accumulate phase in cycles and keep only the fractional part.
        # Harmonic numbers are integers, so sin(2*pi*k*x) is unchanged by
        # dropping whole cycles, and the wrapped value stays small enough
        # that multiplying by k does not add float32 rounding error.
        cycles = torch.cumsum(f0_upsampled / self.sample_rate, dim=1)
        cycles = cycles - torch.floor(cycles)
        phase = 2 * torch.pi * cycles  # [batch, samples]

        # Broadcast to [batch, samples, n_harmonics] and render all harmonics
        # with one sin() call.
        harmonic_phases = phase.unsqueeze(-1) * self.harmonic_numbers
        harmonic_signals = torch.sin(harmonic_phases)

        # Weight each harmonic and mix down to a single channel.
        audio = (harmonic_signals * amplitudes_upsampled).sum(dim=-1)

        return audio


class FilteredNoiseGeneratorVectorized(nn.Module):
    """White noise shaped in the frequency domain by a bank of log-spaced bands.

    The band magnitudes are averaged over time, so one time-invariant
    frequency response is applied to the whole clip. Each FFT bin's gain is a
    Gaussian-weighted (in log-frequency) blend of the band magnitudes, and the
    whole computation is done with broadcasting and a single matrix product.

    Args:
        sample_rate: Audio sample rate in Hz.
        n_filter_banks: Number of log-spaced bands between 20 Hz and Nyquist.
        hop_length: Audio samples generated per input frame.
    """

    def __init__(self, sample_rate=22050, n_filter_banks=64, hop_length=512):
        super().__init__()
        self.sample_rate = sample_rate
        self.n_filter_banks = n_filter_banks
        self.hop_length = hop_length

        # Band centre frequencies, log-spaced from 20 Hz up to Nyquist.
        self.register_buffer(
            'filter_freqs',
            torch.logspace(
                torch.log10(torch.tensor(20.0)),
                torch.log10(torch.tensor(sample_rate / 2.0)),
                n_filter_banks
            )
        )

    def forward(self, filter_magnitudes):
        """Generate filtered noise.

        Args:
            filter_magnitudes: Band gains, shape
                ``[batch, n_frames, n_filter_banks]``.

        Returns:
            Tensor of shape ``[batch, n_frames * hop_length]``.
        """
        batch_size, n_frames, _ = filter_magnitudes.shape
        n_samples = n_frames * self.hop_length
        target_device = filter_magnitudes.device

        # White noise and its spectrum.
        noise = torch.randn(batch_size, n_samples, device=target_device)
        noise_fft = torch.fft.rfft(noise, dim=1)
        freqs = torch.fft.rfftfreq(
            n_samples, 1 / self.sample_rate, device=target_device
        )

        # Log-frequency distance between every FFT bin and every band centre.
        # The epsilon keeps log() finite for the DC bin.
        log_freqs = torch.log(freqs + 1e-7).unsqueeze(-1)  # [n_freqs, 1]
        log_filter_freqs = torch.log(self.filter_freqs + 1e-7).unsqueeze(0)  # [1, n_filter_banks]
        distances = torch.abs(log_freqs - log_filter_freqs)  # [n_freqs, n_filter_banks]

        # Gaussian interpolation weights, normalized per bin so that a flat
        # set of band gains yields a flat response.
        weights = torch.exp(-distances**2 / 0.5)
        weights = weights / (weights.sum(dim=-1, keepdim=True) + 1e-7)

        # Average (not sum) the band gains over time. Summing over every
        # sample would make the noise level grow with the clip length
        # instead of following the predicted magnitudes.
        mean_magnitudes = filter_magnitudes.mean(dim=1)  # [batch, n_filter_banks]
        filter_response = mean_magnitudes @ weights.transpose(0, 1)  # [batch, n_freqs]

        # Shape the noise spectrum and return to the time domain.
        filtered_fft = noise_fft * filter_response
        filtered_noise = torch.fft.irfft(filtered_fft, n=n_samples, dim=1)

        return filtered_noise


class DDSPSynthesizer(nn.Module):
    """Neural network that maps features to synthesis parameters.

    A two-layer GRU reads the concatenated ``[f0, loudness, mfcc]`` frame
    features. Two MLP heads then predict non-negative harmonic amplitudes
    (Softplus) and noise band magnitudes in ``[0, 1]`` (Sigmoid).

    Args:
        n_harmonics: Size of the harmonic amplitude output.
        n_filter_banks: Size of the noise magnitude output.
        hidden_size: GRU and MLP hidden width.
        n_mfcc: Number of MFCC coefficients per frame in the input.
    """

    def __init__(self, n_harmonics=64, n_filter_banks=64, hidden_size=512, n_mfcc=30):
        super().__init__()
        # One f0 value + one loudness value + n_mfcc coefficients per frame.
        input_size = 1 + 1 + n_mfcc
        self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_size, num_layers=2, batch_first=True, dropout=0.1)

        self.harmonic_head = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_harmonics),
            nn.Softplus()
        )

        self.noise_head = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_filter_banks),
            nn.Sigmoid()
        )

    def forward(self, f0, loudness, mfcc):
        """Predict synthesis controls for every frame.

        Args:
            f0: Pitch feature, shape ``[batch, n_frames, 1]``.
            loudness: Loudness in dB, shape ``[batch, n_frames, 1]``.
            mfcc: MFCC features, shape ``[batch, n_frames, n_mfcc]``.

        Returns:
            Tuple ``(harmonic_amplitudes, filter_magnitudes)`` with shapes
            ``[batch, n_frames, n_harmonics]`` and
            ``[batch, n_frames, n_filter_banks]``.
        """
        x = torch.cat([f0, loudness, mfcc], dim=-1)
        x, _ = self.gru(x)
        harmonic_amplitudes = self.harmonic_head(x)
        filter_magnitudes = self.noise_head(x)

        # Convert dB to a linear amplitude gain: 20 dB is a factor of 10, so
        # the gain is 10 ** (dB / 20). A natural exponential would compress
        # the dynamic range and leave quiet frames far too loud.
        loudness_scale = torch.pow(10.0, loudness / 20.0)
        harmonic_amplitudes = harmonic_amplitudes * loudness_scale
        return harmonic_amplitudes, filter_magnitudes


class DDSPModelVectorized(nn.Module):
    """End-to-end DDSP model: control network plus vectorized synthesizers.

    The control network predicts harmonic amplitudes and noise band
    magnitudes. These drive the harmonic oscillator and the filtered-noise
    generator, whose outputs are blended by a learnable mix weight.
    """

    def __init__(self, sample_rate=22050, n_harmonics=64, n_filter_banks=64, hidden_size=512):
        super().__init__()
        self.sample_rate = sample_rate
        self.synthesizer = DDSPSynthesizer(
            n_harmonics=n_harmonics,
            n_filter_banks=n_filter_banks,
            hidden_size=hidden_size,
        )
        self.harmonic_osc = HarmonicOscillatorVectorized(
            sample_rate=sample_rate, n_harmonics=n_harmonics
        )
        self.noise_gen = FilteredNoiseGeneratorVectorized(
            sample_rate=sample_rate, n_filter_banks=n_filter_banks
        )
        # Stored as an unconstrained scalar; forward() maps it through a
        # sigmoid to obtain the harmonic share of the mix.
        self.harmonic_noise_ratio = nn.Parameter(torch.tensor(0.8))

    def forward(self, f0, loudness, mfcc):
        """Synthesize audio from frame-level features.

        Returns:
            Tuple ``(audio, harmonic_audio, noise_audio)``, where ``audio``
            is the weighted blend of the other two.
        """
        amplitudes, noise_filter = self.synthesizer(f0, loudness, mfcc)

        # The oscillator takes f0 without the trailing feature axis.
        harmonic_audio = self.harmonic_osc(f0.squeeze(-1), amplitudes)
        noise_audio = self.noise_gen(noise_filter)

        harmonic_weight = torch.sigmoid(self.harmonic_noise_ratio)
        mixed = harmonic_weight * harmonic_audio + (1 - harmonic_weight) * noise_audio
        return mixed, harmonic_audio, noise_audio


class MultiScaleSpectralLoss(nn.Module):
    """Mean L1 distance between log-magnitude STFTs at several resolutions.

    Args:
        fft_sizes: FFT sizes to evaluate; each STFT uses a Hann window of
            that size and a hop of ``fft_size // 4``.

    Raises:
        ValueError: If ``fft_sizes`` is empty.
    """

    def __init__(self, fft_sizes=(2048, 1024, 512, 256)):
        super().__init__()
        # Stored as a tuple so instances never share (or mutate) the default.
        fft_sizes = tuple(fft_sizes)
        if not fft_sizes:
            raise ValueError("fft_sizes must contain at least one FFT size")
        self.fft_sizes = fft_sizes

    @staticmethod
    def _log_magnitude(audio, fft_size, window):
        """Return the log-magnitude STFT of ``audio`` for one resolution."""
        stft = torch.stft(
            audio, n_fft=fft_size, hop_length=fft_size // 4,
            window=window, return_complex=True
        )
        # The epsilon keeps log() finite for silent bins.
        return torch.log(torch.abs(stft) + 1e-5)

    def forward(self, pred_audio, target_audio):
        """Compute the loss between two equally shaped audio batches.

        Args:
            pred_audio: Predicted audio, shape ``[batch, samples]``.
            target_audio: Reference audio, same shape and device.

        Returns:
            Scalar tensor: the L1 log-magnitude loss averaged over all
            FFT sizes.
        """
        total_loss = 0.0
        for fft_size in self.fft_sizes:
            # One window per resolution, shared by both signals.
            window = torch.hann_window(fft_size, device=pred_audio.device)
            pred_log_mag = self._log_magnitude(pred_audio, fft_size, window)
            target_log_mag = self._log_magnitude(target_audio, fft_size, window)
            total_loss = total_loss + F.l1_loss(pred_log_mag, target_log_mag)
        return total_loss / len(self.fft_sizes)

print("✅ VECTORIZED model defined")

In [None]:
# 4. Feature extraction

def extract_features(audio_path, sample_rate=22050):
    """Load an audio file and compute frame-level pitch, loudness and MFCCs.

    Args:
        audio_path: Path of the audio file to analyze.
        sample_rate: Rate the audio is resampled to on load.

    Returns:
        Dict with keys ``'f0'`` (YIN pitch track), ``'loudness'`` (RMS in dB),
        ``'mfcc'`` (frames x 30 coefficients), ``'audio'`` (the mono waveform)
        and ``'n_frames'``. The three frame-level arrays are truncated to a
        common length.
    """
    audio, sr = librosa.load(audio_path, sr=sample_rate, mono=True)

    # About 43 analysis frames per second; evaluates to 512 at 22050 Hz.
    hop_length = int(sample_rate / 43.066)

    # Pitch track, with NaNs replaced by 0 and negative values clamped to 0.
    pitch = librosa.yin(
        audio,
        fmin=librosa.note_to_hz('C2'),
        fmax=librosa.note_to_hz('C7'),
        sr=sr,
        hop_length=hop_length,
    )
    pitch = np.maximum(np.nan_to_num(pitch, nan=0.0), 0.0)

    # Frame-wise RMS energy expressed in dB relative to an amplitude of 1.0.
    rms = librosa.feature.rms(y=audio, frame_length=2048, hop_length=hop_length)[0]
    loudness_db = librosa.amplitude_to_db(rms, ref=1.0)

    # Transposed so that frames run along the first axis.
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=30, hop_length=hop_length).T

    # The extractors may disagree on frame count; keep the shared prefix.
    n_frames = min(len(pitch), len(loudness_db), len(mfcc))

    return {
        'f0': pitch[:n_frames],
        'loudness': loudness_db[:n_frames],
        'mfcc': mfcc[:n_frames],
        'audio': audio,
        'n_frames': n_frames
    }

print("✅ Feature extraction ready")

In [None]:
# 5. Test vectorized model speed

print("🧪 Testing vectorized model performance...\n")

# Load one file for testing and add the batch (and feature) axes the model expects.
test_features = extract_features(audio_files[0])
f0_test = torch.tensor(test_features['f0'], dtype=torch.float32).unsqueeze(0).unsqueeze(-1).to(device)
loudness_test = torch.tensor(test_features['loudness'], dtype=torch.float32).unsqueeze(0).unsqueeze(-1).to(device)
mfcc_test = torch.tensor(test_features['mfcc'], dtype=torch.float32).unsqueeze(0).to(device)

model_test = DDSPModelVectorized().to(device)

with torch.no_grad():
    # Warmup: the first call pays one-off initialization costs.
    _ = model_test(f0_test, loudness_test, mfcc_test)

    # CUDA kernels are launched asynchronously, so the clock must only be
    # read once the GPU has actually finished. Without the synchronize
    # calls the timer would mostly measure kernel launch overhead.
    if device == 'cuda':
        torch.cuda.synchronize()
    start = time.perf_counter()
    _ = model_test(f0_test, loudness_test, mfcc_test)
    if device == 'cuda':
        torch.cuda.synchronize()
    forward_time = time.perf_counter() - start

print(f"Forward pass time: {forward_time:.3f}s")

# 11.9 is the reference time the speed-up is reported against
# (presumably the forward-pass time in seconds before vectorization).
speedup = 11.9 / forward_time

if forward_time < 2.0:
    verdict = "EXCELLENT" if forward_time < 0.5 else "GOOD"
    print(f"✅ {verdict}! This is {speedup:.1f}x faster than before!")
    # The estimate assumes an epoch costs about two forward passes.
    print(f"   Expected time per epoch: ~{forward_time * 2:.1f}s")
    print(f"   Expected total time (1000 epochs): ~{forward_time * 2000 / 60:.1f} minutes")
else:
    print(f"⚠️  Still slow ({forward_time:.1f}s), but {speedup:.1f}x improvement")

print("\n" + "="*70)

In [None]:
# 6. Train files sequentially with vectorized model
#
# A fresh model is trained for each audio file, and one checkpoint per file
# is written to outputs/.

N_EPOCHS = 1000
os.makedirs('outputs', exist_ok=True)

n_files = len(audio_files)

print(f"\n🎯 Training {n_files} files with VECTORIZED model")
print(f"   Epochs per file: {N_EPOCHS}")
print(f"   Expected per file: 10-15 minutes\n")

total_start = time.time()

for file_idx, audio_file in enumerate(audio_files):
    print(f"\n{'='*70}")
    print(f"File {file_idx + 1}/{n_files}: {Path(audio_file).name}")
    print(f"{'='*70}\n")

    # Extract features and add the batch (and feature) axes the model expects.
    features = extract_features(audio_file)

    f0 = torch.tensor(features['f0'], dtype=torch.float32).unsqueeze(0).unsqueeze(-1).to(device)
    loudness = torch.tensor(features['loudness'], dtype=torch.float32).unsqueeze(0).unsqueeze(-1).to(device)
    mfcc = torch.tensor(features['mfcc'], dtype=torch.float32).unsqueeze(0).to(device)
    target_audio = torch.tensor(features['audio'], dtype=torch.float32).unsqueeze(0).to(device)

    # Create model
    model = DDSPModelVectorized().to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = MultiScaleSpectralLoss().to(device)

    # Training
    losses = []
    best_loss = float('inf')
    best_state = None  # weights that produced best_loss
    file_start = time.time()

    # Nothing in the loop switches to eval mode, so set train mode once.
    model.train()

    for epoch in tqdm(range(N_EPOCHS), desc="Training"):
        optimizer.zero_grad()

        pred_audio, _, _ = model(f0, loudness, mfcc)

        # Synthesized and recorded audio can differ in length; compare only
        # the part both cover.
        min_len = min(pred_audio.shape[1], target_audio.shape[1])
        pred_audio_trim = pred_audio[:, :min_len]
        target_audio_trim = target_audio[:, :min_len]

        spec_loss = loss_fn(pred_audio_trim, target_audio_trim)
        time_loss = F.l1_loss(pred_audio_trim, target_audio_trim)
        total_loss = 1.0 * spec_loss + 0.1 * time_loss

        total_loss.backward()

        loss_value = total_loss.item()
        losses.append(loss_value)

        if loss_value < best_loss:
            best_loss = loss_value
            # The loss belongs to the weights *before* this step, so they
            # are snapshotted here, ahead of optimizer.step().
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        if epoch == 0:
            print(f"\n   First epoch: {time.time() - file_start:.1f}s (Loss: {loss_value:.6f})\n")

        if (epoch + 1) % 200 == 0:
            elapsed = time.time() - file_start
            eta = (elapsed / (epoch + 1)) * (N_EPOCHS - epoch - 1)
            print(f"\n   Epoch {epoch+1}: Loss={loss_value:.6f}, Best={best_loss:.6f}, ETA={eta/60:.1f}min")

    file_time = time.time() - file_start

    # Save. 'model_state_dict' holds the final weights, as before.
    # 'best_model_state_dict' holds the weights matching 'best_loss'
    # (None if no epoch ran).
    torch.save({
        'model_state_dict': model.state_dict(),
        'best_model_state_dict': best_state,
        'features': features,
        'losses': losses,
        'best_loss': best_loss,
    }, f"outputs/model_{Path(audio_file).stem}.pt")

    print(f"\n✅ Complete! Time: {file_time/60:.1f}min, Loss: {best_loss:.6f}")

total_time = time.time() - total_start
# Avoid a ZeroDivisionError when no audio files were found.
avg_minutes = total_time / 60 / n_files if n_files else 0.0
print(f"\n\n🎉 ALL FILES COMPLETE!")
print(f"Total: {total_time/60:.1f}min, Avg: {avg_minutes:.1f}min per file")