<a href="https://colab.research.google.com/github/magenta/ddsp/blob/master/ddsp/colab/tutorials/3_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


##### Copyright 2020 Google LLC.

Licensed under the Apache License, Version 2.0 (the "License");





In [0]:
# Copyright 2020 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# DDSP Training

This notebook demonstrates the libraries in [https://github.com/magenta/ddsp/tree/master/ddsp/training](https://github.com/magenta/ddsp/tree/master/ddsp/training). It is a simple example, overfitting a single audio sample, for educational purposes. 

_For a full training pipeline, please use [ddsp/training/ddsp_run.py](https://github.com/magenta/ddsp/blob/master/ddsp/training/README.md#train-1) as in the [train_autoencoder.ipynb](https://github.com/magenta/ddsp/blob/master/ddsp/colab/demos/train_autoencoder.ipynb)_.



In [0]:
#@title Install and import dependencies

!pip install -qU ddsp

# Silence every Python warning for the rest of the session. The filter is
# installed without a category, so it is not limited to deprecation warnings.
import warnings
warnings.filterwarnings("ignore")

import time

import ddsp
import ddsp.training
from ddsp.colab.colab_utils import play, specplot, DEFAULT_SAMPLE_RATE
import gin
import matplotlib.pyplot as plt
import numpy as np
import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds

# The cells below use the TF1 graph/session API (`tf.Session`,
# `tf.reset_default_graph`), so TF2 behavior is switched off once, up front.
tf.disable_v2_behavior()
# Audio sample rate shared by the synthesizer configuration further down.
sample_rate = DEFAULT_SAMPLE_RATE  # 16000

# Get a Batch of Data

In [0]:
# Get a single example from NSynth.
# Takes a few seconds to load from GCS.
tf.reset_default_graph()
data_provider = ddsp.training.data.NSynthTfds(split='test')

# Keep the object returned by `get_batch` and the NumPy batch under separate
# names, so `batch` only ever refers to the dict of arrays that later cells
# pass to the model.
dataset = data_provider.get_batch(batch_size=1, shuffle=False)
batch = next(tfds.as_numpy(dataset))
audio = batch['audio']
# Axis 1 of the audio array is taken as the number of samples per example.
n_samples = audio.shape[1]

# The session is only needed while the spectrogram is plotted; the context
# manager closes it afterwards instead of leaving an unreferenced session open.
with tf.Session() as plot_sess:
  specplot(audio, sess=plot_sess)
play(audio)

# Train

### Model in Python

In [0]:
# Clear the default graph before (re)building the model in this cell.
tf.reset_default_graph()

# Short aliases for the training submodules used below.
preprocessing = ddsp.training.preprocessing
encoders = ddsp.training.encoders
decoders = ddsp.training.decoders
models = ddsp.training.models
TIME_STEPS = 1000

# ---- Neural networks --------------------------------------------------------
preprocessor = preprocessing.DefaultPreprocessor(time_steps=TIME_STEPS)

# (name, size) pairs for the decoder outputs; the names are the keys that the
# processor DAG below refers to.
decoder_output_splits = (
    ('amps', 1),
    ('harmonic_distribution', 20),
    ('noise_magnitudes', 20),
)
decoder = decoders.RnnFcDecoder(
    rnn_channels=256,
    rnn_type='gru',
    ch=256,
    layers_per_stack=1,
    output_splits=decoder_output_splits)

# ---- Signal processors ------------------------------------------------------
additive = ddsp.synths.Additive(
    n_samples=n_samples, sample_rate=sample_rate, name='additive')
noise = ddsp.synths.FilteredNoise(
    window_size=0, initial_bias=-10.0, name='noise')
add = ddsp.processors.Add(name='add')

# ---- Processor group --------------------------------------------------------
# Each entry pairs a processor with the list of keys it takes as inputs.
# 'noise/signal' and 'additive/signal' presumably name the outputs of the two
# synthesizers above (matching their `name` arguments) -- TODO confirm.
dag = [
    (additive, ['amps', 'harmonic_distribution', 'f0_hz']),
    (noise, ['noise_magnitudes']),
    (add, ['noise/signal', 'additive/signal']),
]
processor_group = ddsp.processors.ProcessorGroup(
    dag=dag, name='processor_group')

# ---- Loss -------------------------------------------------------------------
spectral_loss = ddsp.losses.SpectralLoss(
    loss_type='L1', mag_weight=1.0, logmag_weight=1.0)

# ---- Full model -------------------------------------------------------------
# No encoder is used: the decoder works on the preprocessor output directly.
model = models.Autoencoder(
    preprocessor=preprocessor,
    encoder=None,
    decoder=decoder,
    processor_group=processor_group,
    losses=[spectral_loss])

### Or model in Gin...

In [0]:
# Clear the default graph: this cell builds a fresh model that replaces the
# `model` created in the previous (pure Python) cell.
tf.reset_default_graph()

# Gin bindings that fill in the `Autoencoder` constructor arguments.
# NOTE(review): differences from the Python cell above that are visible here:
#   * `n_samples` is hard-coded to 64000 for both synthesizers instead of
#     being read from the loaded audio (`n_samples = audio.shape[1]`).
#   * `Additive.scale_fn` is bound explicitly here but not passed in Python.
#   * `FilteredNoise.window_size` is passed as 0 in Python but not bound here.
# Confirm whether the two definitions are meant to be equivalent.
gin_string = """
import ddsp
import ddsp.training

# Preprocessor
models.Autoencoder.preprocessor = @preprocessing.DefaultPreprocessor()
preprocessing.DefaultPreprocessor.time_steps = 1000


# Encoder
models.Autoencoder.encoder = None

# Decoder
models.Autoencoder.decoder = @decoders.RnnFcDecoder()
decoders.RnnFcDecoder.rnn_channels = 256
decoders.RnnFcDecoder.rnn_type = 'gru'
decoders.RnnFcDecoder.ch = 256
decoders.RnnFcDecoder.layers_per_stack = 1
decoders.RnnFcDecoder.output_splits = (('amps', 1),
                                       ('harmonic_distribution', 20),
                                       ('noise_magnitudes', 20))

# ProcessorGroup
models.Autoencoder.processor_group = @processors.ProcessorGroup()

processors.ProcessorGroup.dag = [
  (@additive/synths.Additive(),
    ['amps', 'harmonic_distribution', 'f0_hz']),
  (@noise/synths.FilteredNoise(),
    ['noise_magnitudes']),
  (@add/processors.Add(),
    ['noise/signal', 'additive/signal']),
]

# Additive Synthesizer
additive/synths.Additive.name = 'additive'
additive/synths.Additive.n_samples = 64000
additive/synths.Additive.scale_fn = @core.exp_sigmoid

# Filtered Noise Synthesizer
noise/synths.FilteredNoise.name = 'noise'
noise/synths.FilteredNoise.n_samples = 64000
noise/synths.FilteredNoise.initial_bias = -10.0

# Add
add/processors.Add.name = 'add'

models.Autoencoder.losses = [
    @losses.SpectralLoss(),
]
losses.SpectralLoss.loss_type = 'L1'
losses.SpectralLoss.mag_weight = 1.0
losses.SpectralLoss.logmag_weight = 1.0
"""

# Parse inside `unlock_config()` so the bindings can be (re)applied when this
# cell is run again.
with gin.unlock_config():
  gin.parse_config(gin_string)

# Autoencoder arguments are filled by gin.
model = ddsp.training.models.Autoencoder()

## Get training op

In [0]:
import os

learning_rate = 1e-3
use_tpu = True  # Set: Runtime -> Set Runtime Type -> (TPU or GPU)

# Check the runtime before doing any work, so that a notebook attached to the
# wrong runtime type fails immediately rather than after the graph is built.
if use_tpu:
  assert "COLAB_TPU_ADDR" in os.environ, "ERROR: Not connected to a TPU runtime; please set the runtime type to 'TPU'."
  TPU_ADDRESS = "grpc://" + os.environ["COLAB_TPU_ADDR"]

# Get model predictions for the batch.
start_time = time.time()
outputs = model(batch)
loss = outputs['total_loss']
train_op = ddsp.training.train_util.get_train_op(loss,
                                                 learning_rate=learning_rate,
                                                 use_tpu=use_tpu)
print('Setting up the graph took %.1f seconds' % (time.time() - start_time))

# Setup the session.
if use_tpu:
  sess = tf.Session(TPU_ADDRESS)
else:
  # Set: Runtime -> Set Runtime Type -> GPU
  sess = tf.Session()

# `tf.initialize_all_variables` is deprecated; `global_variables_initializer`
# is its documented replacement.
start_time = time.time()
sess.run(tf.global_variables_initializer())
print('Initializing model took %.1f seconds' % (time.time() - start_time))

## Train Loop

In [0]:
# Run a fixed number of optimization steps on the single batch, reporting the
# loss after every step.
n_train_steps = 300
train_fetches = [train_op, loss]
for step in range(n_train_steps):
  loss_ = sess.run(train_fetches)[1]
  print('i: {}\tLoss: {}'.format(step, loss_))

# Analyze results

In [0]:
# Evaluate everything in the model's `outputs` structure once with the trained
# weights, and report how long the pass took.
start_time = time.time()
predictions = sess.run(outputs)
elapsed_seconds = time.time() - start_time
print('Prediction took %.1f seconds' % elapsed_seconds)

In [0]:
batch_idx = 0


def get(key):
  """Looks up `key` in `predictions` and returns the entry for `batch_idx`.

  Args:
    key: Key (possibly a '/'-separated nested path such as
      'additive/controls/amplitudes') passed to `ddsp.core.nested_lookup`.

  Returns:
    The looked-up value indexed at `batch_idx`.
  """
  return ddsp.core.nested_lookup(key, predictions)[batch_idx]


# Pull out the signals and synthesizer controls for the selected example.
audio = get('audio')
audio_gen = get('audio_gen')
amps = get('additive/controls/amplitudes')
harmonic_distribution = get('additive/controls/harmonic_distribution')
f0_hz = get('f0_hz')
loudness = get('loudness_db')

print('Original Audio')
play(audio)
print('Resynthesized Audio')
play(audio_gen)

# Spectrograms of the original and the resynthesized audio.
specplot(audio, sess=sess)
plt.title('Audio')
specplot(audio_gen, sess=sess)
plt.title('Audio Synth')

# Panel names go in the titles (as in the figure below); the x-axis is the
# position along each plotted series, not the plotted quantity itself.
f, ax = plt.subplots(1, 2, figsize=(14, 4))
ax[0].semilogy(amps)
ax[0].set_title('Amps')
ax[0].set_ylim(1e-5, 2)
ax[1].plot(loudness)
ax[1].set_title('loudness')

f, ax = plt.subplots(1, 2, figsize=(14, 4))
ax[0].plot(harmonic_distribution)
ax[0].set_title('Harmonic Distribution')
ax[1].plot(f0_hz)
_ = ax[1].set_title('F0_Hz')
