<a href="https://colab.research.google.com/github/anon9119/anon-project-1/blob/main/colab/NEWT_Timbre_Transfer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Neural Waveshaping Synthesis: Timbre Transfer Demo
#@markdown _by [Ben Hayes](https://benhayes.net/)_

# Setup

In [None]:
#@title Install dependencies
#@markdown 
!pip install torchtext==0.9.1 pytorch-lightning
!pip install auraloss==0.2.1 black==20.8b1 click==7.1.2 gin-config==0.4.0 librosa==0.8.0 numpy==1.20.1 resampy==0.2.2 scipy==1.6.1 torchcrepe==0.0.12 wandb
!pip install youtube-dl  # More recent version
!pip install -q https://github.com/tugstugi/dl-colab-notebooks/archive/colab_utils.zip
!rm -rf anon-project-1
!git clone https://github.com/anon9119/anon-project-1.git
!cd anon-project-1
!python -m pip install git+https://github.com/anon9119/anon-project-1.git

In [None]:
#@title Make imports

import os
import time
import warnings
warnings.filterwarnings("ignore")

from dl_colab_notebooks.audio import record_audio
import gin
from google.colab import files
import IPython.display as ipd
import numpy as np
from scipy.io import wavfile
import torch

from neural_waveshaping_synthesis.data.utils.loudness_extraction import extract_perceptual_loudness
from neural_waveshaping_synthesis.data.utils.mfcc_extraction import extract_mfcc
from neural_waveshaping_synthesis.data.utils.f0_extraction import extract_f0_with_crepe
from neural_waveshaping_synthesis.data.utils.preprocess_audio import preprocess_audio, convert_to_float32_audio, make_monophonic, resample_audio
from neural_waveshaping_synthesis.models.neural_waveshaping import NeuralWaveshaping

# Prefer the GPU, but fall back to the CPU so the notebook still runs
# (slowly) on a runtime without CUDA instead of failing on the first `.to()`.
device_name = "cuda" if torch.cuda.is_available() else "cpu"

# Register the device under the name "device" for the gin config files.
try:
    gin.constant("device", device_name)
except ValueError:
    # gin refuses to register a constant twice, which happens whenever this
    # cell is re-run in the same kernel. The existing registration is kept.
    pass

gin.parse_config_file("anon-project-1/gin/models/newt.gin")
gin.parse_config_file("anon-project-1/gin/data/urmp_4second_crepe.gin")
device = torch.device(device_name)

# Maps the instrument name shown in the form to its checkpoint directory name.
checkpoints = dict(Violin="vn", Flute="fl", Trumpet="tpt")

In [None]:
#@title Load Checkpoint { run: "auto" }
#@markdown Choose from one of three pretrained checkpoints. In future you will be able to upload your own checkpoints too.
selected_checkpoint_name = "Violin" #@param ["Violin", "Flute", "Trumpet"]
selected_checkpoint = checkpoints[selected_checkpoint_name]

# Each instrument has its own directory holding the weights (`last.ckpt`)
# and two NumPy files (`data_mean.npy`, `data_std.npy`).
checkpoint_path = os.path.join("anon-project-1/checkpoints/nws", selected_checkpoint)

# Restore the model, move it to the chosen device and switch to eval mode.
model = NeuralWaveshaping.load_from_checkpoint(os.path.join(checkpoint_path, "last.ckpt"))
model = model.to(device)
model.eval()

# Load the two statistics arrays stored alongside the checkpoint.
data_mean, data_std = (
    np.load(os.path.join(checkpoint_path, stats_file))
    for stats_file in ("data_mean.npy", "data_std.npy")
)

# Audio Input

You now have a few options for getting source audio into the model.
Whichever you choose, monophonic audio will give you the best results. Polyphony is likely to result in chaos.

You only need to run one of these cells. Whichever one you ran last will be used as the model input. When you're done, jump down to **Prepare Audio** below.

To start with, why not jump in with the pre-populated YouTube URL?

In [None]:
#@title 1. Get Audio from YouTube

#@markdown It's hard to beat the default video link...

youtube_url = "https://www.youtube.com/watch?v=dYvPCgcFDIo" #@param
start_in_seconds = 6.5 #@param {type: "number"}
length_in_seconds = 20.0 #@param {type: "number"}

!rm *.wav
!youtube-dl --extract-audio --audio-format wav {youtube_url} #-o yt_audio.wav
!mv *.wav yt_audio.wav

rate, audio = wavfile.read("yt_audio.wav")
audio = convert_to_float32_audio(make_monophonic(audio))
audio = audio[int(rate * start_in_seconds):int(rate * (start_in_seconds + length_in_seconds))]
audio = resample_audio(audio, rate, model.sample_rate)
ipd.Audio(audio, rate=model.sample_rate)

OR

In [None]:
#@title 2. Upload an audio file
#@markdown For now, only .wav files are supported.

!rm -rf *.wav
uploaded = files.upload()
file_name = list(uploaded.keys())[0]

rate, audio = wavfile.read(file_name)
audio = convert_to_float32_audio(make_monophonic(audio))
audio = resample_audio(audio, rate, model.sample_rate)
ipd.Audio(audio, rate=model.sample_rate)

OR

In [None]:
#@title 3. Record audio
#@markdown Try singing or whistling into the microphone and becoming an instrument yourself!

record_seconds = 10 #@param {type: "number"}

# Pass the requested duration through to the recorder. Previously the form
# value was never used, so the recording length was whatever `record_audio`
# defaults to, regardless of the number entered above.
audio = record_audio(seconds=record_seconds, sample_rate=model.sample_rate)

# Playback widget for the recording.
ipd.Audio(audio, rate=model.sample_rate)

# Prepare Audio

In [None]:
#@title Extract Audio Features
#@markdown Here we extract F0 using CREPE and A-weighted loudness.

# Pitch tracking: the helper returns the F0 track together with a confidence
# value, which are unpacked into separate variables.
pitch_and_confidence = extract_f0_with_crepe(audio)
f0, confidence = pitch_and_confidence

# Loudness track computed from the same audio.
loudness = extract_perceptual_loudness(audio)



In [None]:
#@title Adjust Control Signals { run: "auto" }
#@markdown Our source audio might not quite match the characteristics of the training audio, so let's adjust it to fit
octave_shift = -1 #@param {type: "slider", min: -4, max: 4, step: 1}
loudness_floor = 0 #@param {type: "slider", min: 0, max: 1, step: 0.01}
loudness_scale = 0.5 #@param {type: "slider", min: 0, max: 2, step: 0.01}


def _as_device_tensor(values):
    """Copy `values` into a float32 tensor on the notebook's `device`."""
    return torch.tensor(values, device=device).float()


# Pitch: transpose by whole octaves (each octave is a factor of two).
f0_shifted = f0 * (2 ** octave_shift)

# Loudness: zero every value that is not above the floor, subtract the floor
# from the whole track, then apply the scale factor.
above_floor = loudness > loudness_floor
loudness_floored = loudness * above_floor - loudness_floor
loudness_scaled = loudness_floored * loudness_scale

# Standardise both controls with the statistics loaded alongside the
# checkpoint: index 0 is used for F0, index 1 for loudness.
f0_mean, loudness_mean = data_mean[0], data_mean[1]
f0_std, loudness_std = data_std[0], data_std[1]
f0_norm = (f0_shifted - f0_mean) / f0_std
loud_norm = (loudness_scaled - loudness_mean) / loudness_std

# The model is given the un-normalised F0 as well as the stacked, normalised
# control signals.
f0_t = _as_device_tensor(f0_shifted)
f0_norm_t = _as_device_tensor(f0_norm)
loud_norm_t = _as_device_tensor(loud_norm)

control = torch.stack([f0_norm_t, loud_norm_t], dim=0)

# Generation Time

In [None]:
#@title Synthesise Audio!
#@markdown Finally, run this cell to get some audio from the model.

# CUDA kernels are launched asynchronously. Without synchronising before and
# after the forward pass, the clock may be stopped before the GPU has finished,
# which makes the reported speed look better than it really is.
timing_on_gpu = device.type == "cuda"

if timing_on_gpu:
    torch.cuda.synchronize()
start_time = time.perf_counter()

# Pure inference: disabling autograd avoids building a graph that is never
# used, which saves both memory and time.
with torch.no_grad():
    out = model(f0_t.expand(1, 1, -1), control.unsqueeze(0))

if timing_on_gpu:
    torch.cuda.synchronize()
run_time = time.perf_counter() - start_time

# Real-time factor: seconds of input audio per second of compute.
rtf = (audio.shape[-1] / model.sample_rate) / run_time
print("Audio generated in %.2f seconds. That's %.1f times faster than the real time threshold!" % (run_time, rtf))

# Playback widget for the synthesised audio.
ipd.Audio(out.detach().cpu().numpy(), rate=model.sample_rate)