<a href="https://colab.research.google.com/github/magenta/ddsp/blob/master/ddsp/colab/demos/timbre_transfer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##### Copyright 2020 Google LLC.

Licensed under the Apache License, Version 2.0 (the "License");





In [0]:
# Copyright 2020 Google LLC. All Rights Reserved.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# DDSP Timbre Transfer Demo

This notebook is a demo of timbre transfer using DDSP (Differentiable Digital Signal Processing). 
The model here is trained to generate audio conditioned on a time series of fundamental frequency and loudness. 

* [DDSP ICLR paper](https://openreview.net/forum?id=B1x1ma4tDr)
* [Audio Examples](http://goo.gl/magenta/ddsp/blob/master/ddsp/colab-examples) 

<img src="https://storage.googleapis.com/ddsp/additive_diagram/ddsp_autoencoder.png" alt="DDSP Autoencoder figure" width="700">


# Environment Setup


This notebook extracts these features from input audio (either uploaded files, or recorded from the microphone) and resynthesizes with the model.

Have fun! And please feel free to hack this notebook to make your own creative interactions.

### Instructions for running:

* Make sure to use a GPU runtime, click:  __Runtime >> Change Runtime Type >> GPU__
* Press the ▶️ button on the left of each of the cells
* View the code: Double-click any of the cells
* Hide the code: Double-click the right side of the cell


In [0]:
#@title #Install and Import

#@markdown Install ddsp, define some helper functions, and download the model. This transfers a lot of data and _should take a minute or two_.

# Storage paths for the pretrained models and local checkpoints.
# NOTE(review): neither name is referenced again in the visible cells (the
# model cell defines its own GCS_CKPT_DIR) -- confirm before removing.
GCS_DIR = 'gs://ddsp/models'
CKPT_DIR = '/content/ckpts'

# Install/upgrade ddsp from PyPI. The version is not pinned, so the installed
# API can differ between runs of this notebook.
print('Installing from pip package...')
!pip install -qU ddsp

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# Ignore a bunch of deprecation warnings
import warnings
warnings.filterwarnings("ignore")

import copy
import os
import time

import crepe
import ddsp
import ddsp.training
from ddsp.colab.colab_utils import (download, play, record, specplot, upload,
                                    DEFAULT_SAMPLE_RATE)
import gin
from google.colab import files
import librosa
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

# Helper Functions
# Disable TF2 behavior: the cells below use the graph/session style API
# (tf.Session, tf.placeholder, tf.reset_default_graph).
tf.compat.v1.disable_v2_behavior()
sample_rate = DEFAULT_SAMPLE_RATE  # 16000
# Passed as the first argument to tf.Session() in the later cells.
TARGET = ''

def reset_crepe():
  """Clear CREPE's cached models so that they get re-built on next use.

  Every entry of the module-level `crepe.core.models` mapping is set to None.
  The mapping object itself is kept and modified in place.
  """
  cached_models = crepe.core.models
  for name in cached_models:
    cached_models[name] = None

print('Done!')

In [0]:
#@title Record or Upload Audio
#@markdown * Either record audio from microphone or upload audio from file (.mp3 or .wav) 
#@markdown * Audio should be monophonic (single instrument / voice)
#@markdown * Extracts fundamental frequency (f0) and loudness features. 

record_or_upload = "Record" #@param ["Record", "Upload (.mp3 or .wav)"]

record_seconds =   5#@param {type:"number", min:1, max:10, step:1}

# `audio` is the input used by every later cell (feature extraction, model
# sizing, playback and spectrograms).
if record_or_upload == "Record":
  audio = record(seconds=record_seconds)
else:
  # Load audio sample here (.mp3 or .wav file)
  # Just use the first file.
  filenames, audios = upload()
  audio = audios[0]
print('\nExtracting audio features...')

# Plot.
# The spectrogram is computed in a temporary session that is closed when the
# `with` block exits; the input is then played back.
with tf.Session() as sess:
  specplot(audio, sess=sess)
play(audio)


# Setup the session.
# Start from an empty default graph, register a fresh session with Keras, and
# clear CREPE's cached models (see reset_crepe) so that they are re-built.
tf.reset_default_graph()
sess = tf.Session(TARGET)
tf.keras.backend.set_session(sess)
reset_crepe()

# Compute features.
# Later cells read the 'loudness_db', 'f0_hz', 'f0_confidence' and 'audio'
# entries of the returned dictionary.
start_time = time.time()
audio_features = ddsp.training.eval_util.compute_audio_features(audio)
# No modified conditioning yet: the "Modify conditioning" cell fills this in,
# and the resynthesis cell falls back to `audio_features` while it is None.
audio_features_mod = None
print('Audio features took %.1f seconds' % (time.time() - start_time))



# Plot Features.
# One stacked panel per extracted feature, all sharing the frame axis.
fig, ax = plt.subplots(nrows=3, ncols=1, sharex=True, figsize=(6, 8))
feature_curves = [
    ('loudness_db', audio_features['loudness_db']),
    ('f0 [midi]', librosa.hz_to_midi(audio_features['f0_hz'])),
    ('f0 confidence', audio_features['f0_confidence']),
]
for panel, (ylabel, curve) in zip(ax, feature_curves):
  panel.plot(curve)
  panel.set_ylabel(ylabel)
# Only the bottom panel carries the x label because the axis is shared.
_ = ax[2].set_xlabel('Time step [frame]')

In [0]:
#@title Choose a model

model = 'Violin' #@param ['Violin', 'Flute', 'Flute2', 'Upload your own (checkpoint folder as .zip)']
# Keep the selected name: `model` is rebound to the Autoencoder object later in
# this cell, while the conditioning cell still needs the name for its lookups.
MODEL = model

GCS_CKPT_DIR = 'gs://ddsp/models'

if model in ('Violin', 'Flute', 'Flute2'):
  # Pretrained checkpoints live in 'solo_<name>_ckpt' folders.
  model_dir = os.path.join(GCS_CKPT_DIR, 'solo_%s_ckpt' % model.lower())
else:
  # The dropdown offers an upload option, but this cell has no code path for
  # it, so fail with an explanation instead of a bare ValueError.
  raise ValueError(
      'Model "%s" is not supported by this cell; choose one of '
      'Violin, Flute or Flute2.' % model)

# Assumes only one checkpoint in the folder, 'model.ckpt-[iter]`.
ckpt_files = [f for f in tf.gfile.ListDirectory(model_dir) if 'model.ckpt' in f]
if not ckpt_files:
  # Without this check the indexing below fails with an opaque IndexError.
  raise ValueError('No "model.ckpt" files found in %s' % model_dir)
# Keep the first two dot-separated fields of the file name, dropping the rest.
ckpt_name = '.'.join(ckpt_files[0].split('.')[:2])
ckpt = os.path.join(model_dir, ckpt_name)

# Parse gin config
# Load the configuration file stored next to the checkpoint.
with gin.unlock_config():
  gin_file = os.path.join(model_dir, 'operative_config-0.gin')
  gin.parse_config_file(gin_file, skip_unknown=True)

# Ensure dimensions and sampling rates are equal
# hop_size is the number of audio samples per conditioning frame, derived from
# the values in the parsed config.
time_steps_train = gin.query_parameter('DefaultPreprocessor.time_steps')
n_samples_train = gin.query_parameter('Additive.n_samples')
hop_size = int(n_samples_train / time_steps_train)

# Use only whole hops of the input, so n_samples is a multiple of hop_size and
# may be shorter than the recorded audio.
time_steps = int(audio.shape[0] / hop_size)
n_samples = time_steps * hop_size

# Debug output, disabled:
# print("===Trained model===")
# print("Time Steps", time_steps_train)
# print("Samples", n_samples_train)
# print("Hop Size", hop_size)
# print("\n===Resynthesis===")
# print("Time Steps", time_steps)
# print("Samples", n_samples)
# print('')

# Override the lengths from the config file with the lengths of this input.
gin_params = [
    'Additive.n_samples = {}'.format(n_samples),
    'FilteredNoise.n_samples = {}'.format(n_samples),
    'DefaultPreprocessor.time_steps = {}'.format(time_steps),
]

with gin.unlock_config():
  gin.parse_config(gin_params)


# Trim all input vectors to correct lengths 
# NOTE: this rebinds entries of the `audio_features` dictionary created by the
# feature-extraction cell, i.e. that dictionary is modified here.
for key in ['f0_hz', 'f0_confidence', 'loudness_db']:
  audio_features[key] = audio_features[key][:time_steps]
audio_features['audio'] = audio_features['audio'][:n_samples]


# Set up the model just to predict audio given new conditioning
tf.reset_default_graph()


# Graph inputs with a leading batch dimension of 1. The shapes are fully
# static, so values fed for these placeholders must match them exactly.
ph_f0_hz = tf.placeholder(tf.float32, shape=[1, time_steps])
ph_loudness_db = tf.placeholder(tf.float32, shape=[1, time_steps])
ph_audio = tf.placeholder(tf.float32, shape=[1, n_samples])
ph_features = {
    "loudness_db": ph_loudness_db,
    "f0_hz": ph_f0_hz,
    "audio": ph_audio,
}

# NOTE: this rebinds `model` from the selected name (a string) to the model
# object; the name itself stays available as MODEL.
model = ddsp.training.models.Autoencoder()
predictions = model.get_outputs(ph_features, training=False)

# New session for the graph built above; replaces the `sess` created in the
# feature-extraction cell.
sess = tf.Session(TARGET)

# Load the checkpoint selected at the top of this cell into the session.
start_time = time.time()
model.restore(sess, ckpt)
print('\nLoading model took %.1f seconds' % (time.time() - start_time))


In [0]:
#@title Modify conditioning

#@markdown These models were not explicitly trained to perform timbre transfer, so they may sound unnatural if the incoming loudness and frequencies are very different than the training data (which will always be somewhat true). 

#@markdown This button will at least adjust the average loudness and pitch to be similar to the training data (although not for user trained models).

auto_adjust = True #@param{type:"boolean"}

#@markdown You can also make additional manual adjustments:
#@markdown * Shift the fundamental frequency to a more natural register.
#@markdown * Silence audio below a threshold on f0_confidence.
#@markdown * Adjust the overall loudness level.
f0_octave_shift =  0 #@param {type:"slider", min:-2, max:2, step:1}
f0_confidence_threshold =  0 #@param {type:"slider", min:0.0, max:1.0, step:0.05}
loudness_db_shift = 0 #@param {type:"slider", min:-20, max:20, step:1}

#@markdown You might get more realistic sounds by shifting a few dB, or try going extreme and see what weird sounds you can make...

# Work on per-entry copies so that the in-place edits made by the helpers below
# leave `audio_features` untouched; re-running this cell starts from the
# unmodified features again.
audio_features_mod = {k: v.copy() for k, v in audio_features.items()}


## Helper functions.
def shift_ld(audio_features, ld_shift=0.0):
  """Add a constant offset to the loudness curve.

  Args:
    audio_features: Dictionary with a 'loudness_db' entry. The entry is
      updated with an in-place add, so an array stored there is modified.
    ld_shift: Offset added to every loudness value.

  Returns:
    The same `audio_features` dictionary that was passed in.
  """
  loudness = audio_features['loudness_db']
  loudness += ld_shift
  audio_features['loudness_db'] = loudness
  return audio_features

def shift_f0(audio_features, f0_octave_shift=0.0):
  """Transpose f0 by a number of octaves and clip the result.

  Args:
    audio_features: Dictionary with an 'f0_hz' entry. The entry is scaled with
      an in-place multiply and then replaced by the clipped result.
    f0_octave_shift: Number of octaves to shift by; f0 is multiplied by
      2 ** f0_octave_shift.

  Returns:
    The same `audio_features` dictionary, with 'f0_hz' limited to the range
    from 0.0 up to the frequency of MIDI note 110.
  """
  f0_hz = audio_features['f0_hz']
  f0_hz *= 2.0 ** f0_octave_shift
  max_f0_hz = librosa.midi_to_hz(110.0)
  audio_features['f0_hz'] = np.clip(f0_hz, 0.0, max_f0_hz)
  return audio_features

def mask_by_confidence(audio_features, confidence_level=0.1):
  """Zero out f0 wherever the pitch confidence is below a threshold.

  Frames whose 'f0_confidence' is strictly less than `confidence_level` get
  their 'f0_hz' set to 0.0, in place. Loudness is not changed.

  Note kept from the original: for the violin model, the masking causes fast
  dips in loudness. This quick transient is interpreted by the model as the
  "plunk" sound.

  Args:
    audio_features: Dictionary with 'f0_hz' and 'f0_confidence' arrays.
    confidence_level: Confidence threshold.

  Returns:
    The same `audio_features` dictionary that was passed in.
  """
  low_confidence = audio_features['f0_confidence'] < confidence_level
  f0_hz = audio_features['f0_hz']
  f0_hz[low_confidence] = 0.0
  return audio_features



def smooth_loudness(audio_features, filter_size=3):
  """Apply a moving-average (box) filter to the loudness curve.

  Args:
    audio_features: Dictionary with a 'loudness_db' entry. The entry is
      replaced by a new, smoothed array; the original array is not modified.
    filter_size: Number of taps of the box filter, each weighted
      1 / filter_size.

  Returns:
    The same `audio_features` dictionary that was passed in.
  """
  box_filter = np.ones([filter_size]) / float(filter_size)
  smoothed = np.convolve(audio_features['loudness_db'], box_filter, mode='same')
  audio_features['loudness_db'] = smoothed
  return audio_features


if auto_adjust:
  # Adjust the peak loudness.
  # Shift the copy so that its peak equals the per-model target peak.
  model_ld_avg_max = {
      'Violin': -34.0,
      'Flute': -45.0,
      'Flute2': -44.0,
  }[MODEL]
  ld_max = np.max(audio_features['loudness_db'])
  ld_diff_max = model_ld_avg_max - ld_max
  audio_features_mod = shift_ld(audio_features_mod, ld_diff_max)

  # Further adjust the average loudness above a threshold.
  loudness_mod = audio_features_mod['loudness_db']
  model_ld_mean = {
      'Violin': -44.0,
      'Flute': -51.0,
      'Flute2': -53.0,
  }[MODEL]
  ld_thresh = -50.0
  # Freeze the set of "loud" frames now. shift_ld() edits this array in place,
  # so evaluating `loudness_mod > ld_thresh` again after the shift would pick
  # different frames. For models whose target mean lies below the threshold
  # it could pick none, giving a NaN mean pitch and an all-NaN f0.
  loud_frames = loudness_mod > ld_thresh
  ld_mean = np.mean(loudness_mod[loud_frames])
  ld_diff_mean = model_ld_mean - ld_mean
  audio_features_mod = shift_ld(audio_features_mod, ld_diff_mean)

  # Shift the pitch register.
  model_p_mean = {
      'Violin': 73.0,
      'Flute': 81.0,
      'Flute2': 74.0,
  }[MODEL]
  p = librosa.hz_to_midi(audio_features['f0_hz'])
  # f0 == 0 maps to -inf in MIDI; count those frames as pitch 0.
  p[p == -np.inf] = 0.0
  # Mean pitch over the same loud frames that were used for the loudness mean.
  p_mean = p[loud_frames].mean()
  p_diff = model_p_mean - p_mean
  p_diff_octave = p_diff / 12.0
  # Shift by a whole number of octaves only.
  # NOTE(review): floor is used above 1.5 octaves and ceil otherwise; the
  # rounding rule is kept as it was -- confirm the intent.
  round_fn = np.floor if p_diff_octave > 1.5 else np.ceil
  p_diff_octave = round_fn(p_diff_octave)
  audio_features_mod = shift_f0(audio_features_mod, p_diff_octave)

  

# Manual adjustments from the form sliders, applied after any automatic ones.
audio_features_mod = shift_ld(audio_features_mod, loudness_db_shift)
audio_features_mod = shift_f0(audio_features_mod, f0_octave_shift)
audio_features_mod = mask_by_confidence(audio_features_mod, f0_confidence_threshold)


# Plot Features.
# Each panel shows two curves: for loudness and f0 the original followed by
# the modified version, for confidence the curve followed by a flat line at
# the selected threshold.
fig, ax = plt.subplots(nrows=3, ncols=1, sharex=True, figsize=(6, 8))
confidence = audio_features_mod['f0_confidence']
panel_curves = [
    ('loudness_db',
     [audio_features['loudness_db'], audio_features_mod['loudness_db']]),
    ('f0 [midi]',
     [librosa.hz_to_midi(audio_features['f0_hz']),
      librosa.hz_to_midi(audio_features_mod['f0_hz'])]),
    ('f0 confidence',
     [confidence, np.ones_like(confidence) * f0_confidence_threshold]),
]
for panel, (ylabel, curves) in zip(ax, panel_curves):
  for curve in curves:
    panel.plot(curve)
  panel.set_ylabel(ylabel)
_ = ax[2].set_xlabel('Time step [frame]')

In [0]:
#@title #Resynthesize Audio

# Use the modified conditioning when the "Modify conditioning" cell has run,
# otherwise the features exactly as extracted.
af = audio_features if audio_features_mod is None else audio_features_mod

# Feed every feature with exactly the static shape its placeholder declares.
# Session.run() rejects a fed value whose shape does not match the placeholder,
# so the previous hard-coded `[np.newaxis, :, np.newaxis]` indexing (rank 3)
# could not be fed into the rank-2 f0/loudness placeholders. The reshape also
# fails loudly if the features and the model were built for different lengths.
feed_dict = {
    ph_features[key]: np.reshape(af[key], ph_features[key].shape.as_list())
    for key in ('loudness_db', 'f0_hz', 'audio')
}

# Run a batch of predictions.
# The batch holds a single example; [0] drops the batch dimension.
start_time = time.time()
audio_gen = sess.run(predictions['audio_gen'],
                     feed_dict=feed_dict)[0]
print('Prediction took %.1f seconds' % (time.time() - start_time))

# Plot
print('Original')
play(audio)

print('Resynthesis')
play(audio_gen)

specplot(audio, sess=sess)
plt.title("Original")

specplot(audio_gen, sess=sess)
_ = plt.title("Resynthesis")