# DDSP Timbre Grower with NCA Mutation - Complete Implementation

This notebook trains a lightweight DDSP model and then uses a Neural Cellular Automaton (NCA) to creatively evolve the sound's timbre.

**Runtime**: Enable GPU in Colab for 5-10x speedup!

**Workflow**:
1. Upload target audio (e.g. violin.wav — any file name works; the first uploaded file is used)
2. Install dependencies
3. Define DDSP and NCA components
4. Train DDSP model (~5-10 min on GPU)
5. Use the trained DDSP to provide a "seed" for the NCA
6. Generate new audio by letting the NCA evolve the timbre
7. Download results

## 1. Setup & Dependencies

In [None]:
!pip install torch torchvision torchaudio
!pip install librosa soundfile matplotlib numpy
!pip install tqdm

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import librosa
import soundfile as sf
from google.colab import files
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import io

## 2. Upload Audio & Extract Features

In [None]:
# Ask for a file through the Colab upload widget. The result is indexed by
# file name and each entry is passed to io.BytesIO below.
uploaded = files.upload()
if uploaded:
    # Only the first uploaded file is used.
    file_name = next(iter(uploaded))
    raw_bytes = uploaded[file_name]
    # Decode straight from memory, requesting a 22050 Hz sampling rate.
    audio, sr = librosa.load(io.BytesIO(raw_bytes), sr=22050)
    print(f"Loaded '{file_name}' at {sr} Hz")
else:
    print("No file uploaded. Please upload an audio file.")

In [None]:
# Feature Extraction
def extract_features(audio, sr):
    """Extract frame-level pitch, loudness and MFCC features from a waveform.

    Args:
        audio: Waveform as a NumPy array (expected to be mono, as produced by
            ``librosa.load``).
        sr: Sampling rate of ``audio`` in Hz.

    Returns:
        Dict of float32 tensors trimmed to a common number of frames:
        ``'f0'`` with shape (1, frames, 1), ``'loudness'`` (frame-wise RMS)
        with shape (1, frames, 1), and ``'mfcc'`` with shape (1, frames, 30).
    """
    # YIN needs the real sampling rate to convert lags into Hz. Without it
    # librosa falls back to its 22050 Hz default and mis-scales f0 for any
    # other rate.
    f0 = librosa.yin(
        audio,
        fmin=librosa.note_to_hz('C2'),
        fmax=librosa.note_to_hz('C7'),
        sr=sr,
    )
    rms = librosa.feature.rms(y=audio)[0]
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=30)

    # The three extractors may disagree by a frame; trim to the shortest.
    min_len = min(len(f0), len(rms), mfcc.shape[1])
    f0 = f0[:min_len]
    rms = rms[:min_len]
    mfcc = mfcc[:, :min_len]

    # Add a batch axis (and a trailing channel axis for the scalar features).
    f0 = torch.tensor(f0, dtype=torch.float32).unsqueeze(0).unsqueeze(-1)
    loudness = torch.tensor(rms, dtype=torch.float32).unsqueeze(0).unsqueeze(-1)
    # librosa returns (n_mfcc, frames); transpose to (frames, n_mfcc).
    mfcc = torch.tensor(mfcc.T, dtype=torch.float32).unsqueeze(0)

    return {'f0': f0, 'loudness': loudness, 'mfcc': mfcc}

# Compute the conditioning features once, and keep the waveform as a float32
# tensor with a leading batch axis for use as the training target.
features = extract_features(audio, sr)
audio_tensor = torch.tensor(audio, dtype=torch.float32)[None]

## 3. DDSP + NCA Model Definition

In [None]:
class HarmonicOscillator(nn.Module):
    """Additive synthesizer: a bank of sinusoids at integer multiples of f0.

    Args:
        n_harmonics: Number of overtones above the fundamental. The bank
            holds ``n_harmonics + 1`` sinusoids (multipliers
            1 .. n_harmonics + 1).
        sr: Sampling rate in Hz used to turn frequency into phase increments.
    """

    def __init__(self, n_harmonics, sr):
        super().__init__()
        self.n_harmonics = n_harmonics
        self.sr = sr

    def forward(self, f0, amplitudes):
        """Render the harmonic signal.

        Phase advances by one sample period (``1 / sr``) per step along the
        time axis, so the pitch is only correct when the controls are given
        at the rate ``sr`` (one value per audio sample).

        Args:
            f0: (batch, time, 1) fundamental frequency in Hz.
            amplitudes: (batch, time, n_harmonics + 1) per-partial amplitudes.

        Returns:
            (batch, time) tensor: the amplitude-weighted sum of all partials,
            with partials at or above the Nyquist frequency silenced.
        """
        harmonic_multipliers = torch.arange(
            1, self.n_harmonics + 2, device=f0.device
        ).float()
        harmonic_frequencies = f0 * harmonic_multipliers

        # Partials at or above sr / 2 cannot be represented and would alias
        # back into the audible band, so their amplitude is forced to zero.
        below_nyquist = (harmonic_frequencies < self.sr / 2).to(amplitudes.dtype)

        # Accumulate the fundamental's phase in cycles using float64 and wrap
        # it to [0, 1). Multiplying the wrapped phase by an integer harmonic
        # number yields the same sine value as accumulating each harmonic
        # separately, but avoids the float32 precision loss that builds up in
        # long cumulative sums of large phase increments.
        cycles = torch.cumsum(f0.double() / self.sr, dim=1)
        cycles = torch.remainder(cycles, 1.0).to(harmonic_frequencies.dtype)

        phases = 2 * np.pi * cycles * harmonic_multipliers
        sinusoids = torch.sin(phases)

        harmonic_signal = (sinusoids * amplitudes * below_nyquist).sum(dim=-1)
        return harmonic_signal

class FilteredNoiseGenerator(nn.Module):
    """Amplitude-modulated white-noise source (simplified noise branch).

    This is not a true frequency-domain filter: the per-band magnitudes of
    each frame are averaged into one gain value that modulates white noise.
    A full implementation would shape the noise spectrum with FFTs.

    Args:
        n_magnitudes: Number of noise-band magnitudes per frame. Stored for
            reference; the simplified filter averages over the bands.
        frame_size: Number of audio samples generated per control frame when
            ``forward`` is not given an explicit length. Defaults to 256.
    """

    def __init__(self, n_magnitudes, frame_size=256):
        super().__init__()
        self.n_magnitudes = n_magnitudes
        self.frame_size = frame_size

    def forward(self, magnitudes, n_samples=None):
        """Generate noise whose envelope follows the mean band magnitude.

        Args:
            magnitudes: (batch, frames, n_magnitudes) band magnitudes.
            n_samples: Optional output length in samples. Defaults to
                ``frames * frame_size``.

        Returns:
            (batch, n_samples) noise signal.
        """
        batch_size, time_steps, _ = magnitudes.shape
        if n_samples is None:
            n_samples = time_steps * self.frame_size

        noise = torch.randn(
            batch_size, n_samples,
            device=magnitudes.device, dtype=magnitudes.dtype,
        )

        # Averaging over bands and linear interpolation commute, so average
        # first and upsample a single channel instead of n_magnitudes.
        envelope = magnitudes.mean(dim=-1, keepdim=True).transpose(1, 2)
        envelope = F.interpolate(
            envelope, size=n_samples, mode='linear', align_corners=False
        ).squeeze(1)

        return noise * envelope

class DDSPSynthesizer(nn.Module):
    """Recurrent decoder that maps per-frame features to synthesis controls.

    A single GRU reads the concatenated features; two linear heads followed
    by a sigmoid produce the harmonic amplitudes and the noise magnitudes.

    Args:
        input_dim: Size of the concatenated feature vector per frame.
        hidden_dim: GRU hidden size.
        n_harmonics: Number of overtones; the amplitude head emits
            ``n_harmonics + 1`` values per frame.
        n_noise_magnitudes: Number of noise magnitudes emitted per frame.
    """

    def __init__(self, input_dim, hidden_dim, n_harmonics, n_noise_magnitudes):
        super().__init__()
        self.gru = nn.GRU(
            input_size=input_dim, hidden_size=hidden_dim, batch_first=True
        )
        self.harmonic_amp = nn.Linear(
            in_features=hidden_dim, out_features=n_harmonics + 1
        )
        self.noise_mag = nn.Linear(
            in_features=hidden_dim, out_features=n_noise_magnitudes
        )

    def forward(self, f0, loudness, mfcc):
        """Predict synthesis controls for every frame.

        Args:
            f0, loudness, mfcc: Feature tensors that are concatenated along
                the last axis; their last dimensions must sum to ``input_dim``.

        Returns:
            Dict with ``'amplitudes'`` and ``'magnitudes'``, both squashed
            into (0, 1) by a sigmoid.
        """
        conditioning = torch.cat((f0, loudness, mfcc), dim=-1)

        # Only the per-step outputs are needed; the final hidden state is dropped.
        hidden_states = self.gru(conditioning)[0]

        return {
            'amplitudes': self.harmonic_amp(hidden_states).sigmoid(),
            'magnitudes': self.noise_mag(hidden_states).sigmoid(),
        }

In [None]:
### --- NEW CELL --- ###
# Defines the Neural Cellular Automaton model.

class NCA(nn.Module):
    """1-D neural cellular automaton that iterates along the time axis.

    The update rule is two convolutions with a ReLU in between: a kernel-3
    convolution with circular padding (each cell sees its two neighbours and
    the sequence wraps around), followed by a 1x1 convolution back to the
    original channel count. Every step adds the rule's output to the state.

    Args:
        channels: Number of channels in the state.
        hidden_channels: Width of the hidden layer of the update rule.
    """

    def __init__(self, channels, hidden_channels=32):
        super().__init__()
        perceive = nn.Conv1d(
            channels, hidden_channels,
            kernel_size=3, padding=1, padding_mode='circular',
        )
        project = nn.Conv1d(hidden_channels, channels, kernel_size=1)

        # With a zeroed output projection the residual is exactly zero, so a
        # freshly constructed NCA returns its seed unchanged.
        nn.init.zeros_(project.weight)
        nn.init.zeros_(project.bias)

        self.update_rule = nn.Sequential(perceive, nn.ReLU(), project)

    def forward(self, x, steps):
        """Apply ``steps`` residual updates to the seed.

        Args:
            x: Seed of shape (batch, time_steps, channels).
            steps: Number of update iterations.

        Returns:
            Evolved state with the same (batch, time_steps, channels) layout.
        """
        # Conv1d wants channels before length.
        state = x.permute(0, 2, 1)
        for _ in range(steps):
            state = state + self.update_rule(state)
        return state.permute(0, 2, 1)

In [None]:
### --- MODIFIED CELL --- ###
# Integrated the NCA into the main DDSP model.

class DDSPModel(nn.Module):
    """End-to-end model: feature decoder, optional NCA, and the two synths.

    Args:
        input_dim: Size of the concatenated per-frame feature vector.
        hidden_dim: Hidden size of the decoder GRU.
        n_harmonics: Number of overtones (``n_harmonics + 1`` partials).
        n_noise_magnitudes: Number of noise magnitudes per frame.
        sr: Audio sampling rate in Hz.
    """

    def __init__(self, input_dim, hidden_dim, n_harmonics, n_noise_magnitudes, sr):
        super().__init__()
        self.synthesizer = DDSPSynthesizer(input_dim, hidden_dim, n_harmonics, n_noise_magnitudes)
        self.harmonic_osc = HarmonicOscillator(n_harmonics, sr)
        self.noise_gen = FilteredNoiseGenerator(n_noise_magnitudes)

        # The NCA operates on the harmonic amplitude tracks.
        self.nca = NCA(channels=n_harmonics + 1)

    @staticmethod
    def _upsample_frames(controls, n_samples):
        """Linearly interpolate (batch, frames, channels) controls to n_samples steps."""
        return F.interpolate(
            controls.transpose(1, 2), size=n_samples,
            mode='linear', align_corners=False,
        ).transpose(1, 2)

    def forward(self, features, nca_steps=0, target_len=None):
        """Synthesize audio from frame-level features.

        Args:
            features: Dict with ``'f0'``, ``'loudness'`` and ``'mfcc'``
                tensors shaped (batch, frames, channels).
            nca_steps: Number of NCA iterations applied to the harmonic
                amplitudes; 0 bypasses the NCA.
            target_len: Output length in samples. When omitted, the length of
                the notebook-level ``audio_tensor`` is used, as before.

        Returns:
            Tuple ``(final_audio, harmonic_signal, noise_signal)``, each of
            shape (batch, target_len).
        """
        if target_len is None:
            # Backward-compatible fallback to the global training waveform.
            target_len = audio_tensor.shape[1]

        # 1. The decoder predicts the synthesis parameters (the NCA "seed").
        synth_params = self.synthesizer(features['f0'], features['loudness'], features['mfcc'])

        harmonic_controls = synth_params['amplitudes']
        noise_controls = synth_params['magnitudes']

        # 2. Optionally let the NCA evolve the harmonic amplitudes.
        if nca_steps > 0:
            harmonic_controls = self.nca(harmonic_controls, steps=nca_steps)
            # The seed is already in [0, 1]. Clamping keeps the evolved
            # amplitudes in that range without distorting in-range values; a
            # second sigmoid would squash everything into roughly [0.5, 0.73].
            # NOTE: the NCA's output layer starts at zero and training runs
            # with nca_steps=0, so it leaves the controls unchanged until its
            # weights are trained or otherwise modified.
            harmonic_controls = harmonic_controls.clamp(0.0, 1.0)

        # 3. The oscillator advances phase by one *sample* period per step, so
        # the frame-rate controls must be brought up to audio rate before
        # synthesis. Synthesizing at frame rate and stretching the waveform
        # afterwards lowers the pitch by the hop factor.
        # This allocates (batch, target_len, n_harmonics + 1) tensors.
        f0_audio_rate = self._upsample_frames(features['f0'], target_len)
        amps_audio_rate = self._upsample_frames(harmonic_controls, target_len)
        harmonic_signal = self.harmonic_osc(f0_audio_rate, amps_audio_rate)

        # The noise branch produces its own length; resize it to match.
        noise_signal = self.noise_gen(noise_controls)
        noise_signal = F.interpolate(
            noise_signal.unsqueeze(1), size=target_len,
            mode='linear', align_corners=False,
        ).squeeze(1)

        # Mix signals
        final_audio = harmonic_signal + noise_signal
        return final_audio, harmonic_signal, noise_signal

## 4. Training

In [None]:
# Multi-Scale Spectral Loss
def spectral_loss(y_true, y_pred, n_ffts=(2048, 1024, 512, 256)):
    """Multi-scale spectral loss: summed L1 distance between STFT magnitudes.

    Args:
        y_true: Reference waveform, shape (batch, samples) or (samples,).
        y_pred: Predicted waveform with the same shape as ``y_true``.
        n_ffts: Iterable of FFT sizes; one STFT comparison is made per size.

    Returns:
        Scalar tensor holding the sum of the per-resolution L1 losses (a zero
        tensor when ``n_ffts`` is empty).
    """
    # Start from a tensor so the result supports tensor operations even when
    # n_ffts is empty.
    loss = y_pred.new_zeros(())
    for n_fft in n_ffts:
        # An explicit Hann window limits spectral leakage; omitting the
        # window makes torch.stft fall back to a rectangular one and warn.
        window = torch.hann_window(n_fft, device=y_pred.device, dtype=y_pred.dtype)
        mag_true = torch.stft(y_true, n_fft, window=window, return_complex=True).abs()
        mag_pred = torch.stft(y_pred, n_fft, window=window, return_complex=True).abs()
        loss = loss + F.l1_loss(mag_true, mag_pred)
    return loss

# Synthesis and optimisation settings
N_HARMONICS = 100
N_NOISE_MAGNITUDES = 65
HIDDEN_DIM = 256
# The decoder consumes f0, loudness and MFCCs concatenated on the last axis.
INPUT_DIM = sum(features[key].shape[-1] for key in ('f0', 'loudness', 'mfcc'))
LEARNING_RATE = 1e-3
EPOCHS = 1000

# Build the model on the GPU when one is visible, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = DDSPModel(INPUT_DIM, HIDDEN_DIM, N_HARMONICS, N_NOISE_MAGNITUDES, sr)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Inputs and target must live on the same device as the model.
features = {name: tensor.to(device) for name, tensor in features.items()}
audio_tensor = audio_tensor.to(device)

# Full-batch optimisation: every epoch is one pass over the single clip.
pbar = tqdm(range(EPOCHS))
for epoch in pbar:
    optimizer.zero_grad()

    # The NCA is bypassed (nca_steps=0) so the decoder alone learns to
    # reconstruct the target.
    pred_audio = model(features, nca_steps=0)[0]
    loss = spectral_loss(audio_tensor, pred_audio)

    loss.backward()
    optimizer.step()

    pbar.set_description(f"Loss: {loss.item():.4f}")

## 5. Generation: Reconstruct, Grow, and Evolve with NCA

In [None]:
### --- MODIFIED CELL --- ###
# This cell now generates both the original reconstruction and the new NCA-evolved version.

model.eval()
with torch.no_grad():
    # 1. Generate the standard DDSP reconstruction (no NCA)
    print("Generating standard DDSP reconstruction...")
    reconstructed_audio, _, _ = model(features, nca_steps=0)
    reconstructed_audio_np = reconstructed_audio.cpu().numpy().flatten()
    sf.write('ddsp_reconstructed.wav', reconstructed_audio_np, sr)
    print(" -> Saved ddsp_reconstructed.wav")

    # 2. Generate the NCA-evolved audio
    # --- CONTROLS ---
    NCA_STEPS = 5 # Try values like 2, 5, 10, or 20. Higher values = more evolution/distortion.
    # ----------------
    print(f"\nGenerating NCA-evolved audio with {NCA_STEPS} steps...")
    evolved_audio, _, _ = model(features, nca_steps=NCA_STEPS)
    evolved_audio_np = evolved_audio.cpu().numpy().flatten()
    sf.write('nca_evolved_timbre.wav', evolved_audio_np, sr)
    print(" -> Saved nca_evolved_timbre.wav")

    # 3. (Optional) Timbre-growing comparison: a sequence of short renderings
    # that keep progressively more harmonics (1, 3, 5, ...).
    print("\nGenerating timbre growth audio (for comparison)...")
    synth_params = model.synthesizer(features['f0'], features['loudness'], features['mfcc'])
    original_amplitudes = synth_params['amplitudes']

    # Every growth stage lasts one tenth of the original clip. The length does
    # not depend on the stage, so compute it once.
    target_len = audio_tensor.shape[1] // 10

    # The oscillator advances phase by one sample period per step, so the
    # frame-rate controls are resampled to one value per output sample
    # *before* synthesis. Rendering at frame rate and stretching the waveform
    # afterwards would play the wrong pitch.
    f0_stage = F.interpolate(
        features['f0'].transpose(1, 2), size=target_len,
        mode='linear', align_corners=False,
    ).transpose(1, 2)
    amps_stage = F.interpolate(
        original_amplitudes.transpose(1, 2), size=target_len,
        mode='linear', align_corners=False,
    ).transpose(1, 2)

    full_audio = []
    for i in tqdm(range(1, N_HARMONICS + 1, 2)):
        # Keep only the first i partials; all higher ones stay silent.
        temp_amps = torch.zeros_like(amps_stage)
        temp_amps[:, :, :i] = amps_stage[:, :, :i]

        harmonic_signal = model.harmonic_osc(f0_stage, temp_amps)
        full_audio.append(harmonic_signal.cpu().numpy().flatten())

    full_audio = np.concatenate(full_audio)
    sf.write('ddsp_growth_comparison.wav', full_audio, sr)
    print(" -> Saved ddsp_growth_comparison.wav")

## 6. Save Model & Download Files

In [None]:
# Bundle the trained weights with CPU copies of the conditioning features so
# both are stored in the same checkpoint file.
checkpoint = {
    'model_state_dict': model.state_dict(),
    'features': {name: tensor.cpu() for name, tensor in features.items()},
}
torch.save(checkpoint, 'ddsp_nca_model.pt')

# Hand each generated artifact to the Colab download helper in turn.
print("✅ Files ready for download!")
for artifact in (
    'ddsp_reconstructed.wav',
    'nca_evolved_timbre.wav',
    'ddsp_growth_comparison.wav',
    'ddsp_nca_model.pt',
):
    files.download(artifact)