Copyright 2017 Google LLC.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.



In [None]:
# Mount Google Drive at /content/drive so the notebook can read and write
# files stored there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install magenta
print('Installing Magenta...\n')
!pip install -qU magenta
print('Installing ffmpeg...\n')
!echo "Yes" | apt-get install ffmpeg > /dev/null


print('Downloading Pretrained Models...\n')
# Copy checkpoints from google cloud
# Copying 1GB, takes a minute
print('Getting Instruments Model...\n')
!gsutil -q -m cp -R gs://download.magenta.tensorflow.org/models/nsynth/wavenet-ckpt.tar /content/
print('Getting Voices Model...\n')
!gsutil -q -m cp -R gs://download.magenta.tensorflow.org/models/nsynth/wavenet-voice-ckpt.tar.gz /content/
!cd /content/
!tar -xvf wavenet-ckpt.tar > /dev/null
!tar -xvf wavenet-voice-ckpt.tar.gz > /dev/null


print('Importing Modules...\n')
# Load modules and helper functions
import os
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Audio
%matplotlib inline

from google.colab import files
from magenta.models.nsynth import utils
from magenta.models.nsynth.wavenet import fastgen
from note_seq.notebook_utils import colab_play as play

Installing Magenta...

Installing ffmpeg...

Downloading Pretrained Models...

Getting Instruments Model...

Getting Voices Model...

Importing Modules...



In [None]:
#@title Choose a Model { vertical-output: true, run: "auto" }
Model = "Voices" #@param ["Instruments", "Voices"] {type:"string"}

# Checkpoint prefix for each choice offered by the form above.
ckpts = dict(
    Instruments='/content/wavenet-ckpt/model.ckpt-200000',
    Voices='/content/wavenet-voice-ckpt/model.ckpt-200000',
)

# Checkpoint used by the encoding and synthesis helpers below.
ckpt_path = ckpts[Model]
print('Using model pretrained on %s.' % Model)

Using model pretrained on Voices.


In [None]:
#@title Set Sound Length (in Seconds) { vertical-output: true, run: "auto" }
Length = 4.0 #@param {type:"number"}

# Sample rate, in samples per second.
SR = 16000
# Whole number of samples that make up `Length` seconds at `SR`.
SAMPLE_LENGTH = int(Length * SR)

In [None]:
def get_audio_list(file_list, audio_list, sample_length=None):
  """Peak-normalizes and zero-pads clips, and derives a name for each file.

  Args:
    file_list: Paths of the source files. Only the base name without its
      extension is used.
    audio_list: Sequence of sample arrays (assumed 1-D). It is not modified.
    sample_length: Minimum number of samples per clip; shorter clips are
      right-padded with zeros. Defaults to the notebook-level SAMPLE_LENGTH.

  Returns:
    A tuple `(audio, names)`. `audio` is the processed clips stacked with
    `np.array`; `names` lists the extension-less base names in `file_list`
    order.
  """
  if sample_length is None:
    sample_length = SAMPLE_LENGTH

  names = [os.path.splitext(os.path.basename(f))[0] for f in file_list]

  # Build a new list instead of overwriting the caller's entries.
  processed = []
  for clip in audio_list:
    clip = np.asarray(clip)

    # Peak normalize. A silent or empty clip has no peak to scale by, and
    # dividing by zero would fill the clip with NaNs, so leave it unscaled.
    peak = np.abs(clip).max() if clip.size else 0
    if peak > 0:
      clip = clip / peak

    # Right-pad short clips with zeros; longer clips are left as they are.
    shortfall = sample_length - len(clip)
    if shortfall > 0:
      clip = np.pad(clip, (0, shortfall), 'constant')

    processed.append(clip)

  return np.array(processed), names

In [None]:
def gen_encodings(audio_list, names):
  """Encodes the clips and adds the mean encoding of every pair of clips.

  Args:
    audio_list: Clips to encode; converted with `np.array` and passed to
      `fastgen.encode` together with the selected checkpoint.
    names: One name per clip, used to label the results.

  Returns:
    A tuple `(z_list, name_list)`. The first `len(names)` entries are the
    per-clip encodings, named 'recon_<name>'. They are followed by the average
    of each pair (i, j) with i < j, named 'interp_<name_i>_X_<name_j>'.
  """
  batch = np.array(audio_list)
  z = fastgen.encode(batch, ckpt_path, SAMPLE_LENGTH)
  print('Encoded %d files' % z.shape[0])

  n = len(names)
  # Every unordered pair of clips, in the order (0, 1), (0, 2), ..., (n-2, n-1).
  pairs = [(first, second)
           for first in range(n - 1)
           for second in range(first + 1, n)]

  # Reconstructions first, then the midpoint of each pair.
  z_list = list(z) + [(z[first] + z[second]) / 2.0 for first, second in pairs]
  name_list = ['recon_' + name for name in names]
  name_list += ['interp_' + names[first] + '_X_' + names[second]
                for first, second in pairs]

  total = len(name_list)
  print("%d total: %d reconstructions and %d interpolations" % (total, n, total - n))

  return z_list, name_list

For fun, we can take a look at the encodings of our audio files. They are compressed representations of the audio, but they have some structure in their own right. Each encoding consists of 16 numbers (kind of like 16 channels of audio), so the plot shows 16 different lines; the colors are arbitrary.

In [None]:
# #@title Visualize Audio and Encoding { vertical-output: true, run: "auto" }
# SoundFile = 0 #@param {type:"slider", min:0, max:10, step:1}
# file_number = SoundFile

# try:
#   print(names[file_number])
#   play(audio_list[file_number], sample_rate=SR)
#   # fig, axs = plt.subplots(2, 1, figsize=(12, 10))
#   plt.figure()
#   plt.plot(audio_list[file_number])
#   plt.title('Audio Signal')

#   plt.figure()
#   plt.plot(z_list[file_number])
#   plt.title('NSynth Encoding')
# except Exception as e:
#   print(e)

In [None]:
def synthesize_interpolations(z_list, name_list, directory):
  """Synthesizes one .wav file per encoding using the selected checkpoint.

  Args:
    z_list: Encodings to synthesize; stacked with `np.array` before being
      handed to `fastgen.synthesize`.
    name_list: Output base names without extension, one per entry of `z_list`.
    directory: Folder that receives the files. It is created if it does not
      exist, and a trailing path separator is optional.
  """
  print('Total Iterations to Complete: %d\n' % SAMPLE_LENGTH)

  # Make sure the target folder exists before synthesis starts.
  if directory:
    os.makedirs(directory, exist_ok=True)

  encodings = np.array(z_list)
  # Join path components instead of concatenating strings, so a directory
  # given without a trailing separator still yields <directory>/<name>.wav.
  save_paths = [os.path.join(directory, name + '.wav') for name in name_list]
  fastgen.synthesize(encodings,
                     save_paths=save_paths,
                     checkpoint_path=ckpt_path,
                     samples_per_save=int(SAMPLE_LENGTH / 10))

In [None]:
import pandas as pd

def get_input_paths(dir_path):
  """Lists the entries of a directory as a DataFrame, sorted by file name.

  Args:
    dir_path: Directory to scan. Sub-directories are not descended into.

  Returns:
    A DataFrame with one row per directory entry and two columns:
    "relative_path" (the entry's path as reported by `os.scandir`, i.e.
    `dir_path` joined with the entry name) and "filename".
  """
  # The context manager closes the scandir iterator as soon as we are done.
  # os.scandir yields entries in arbitrary order, so sort by name to make the
  # row order reproducible from run to run.
  with os.scandir(dir_path) as entries:
    rows = sorted(([entry.path, entry.name] for entry in entries),
                  key=lambda row: row[1])

  return pd.DataFrame(rows, columns=["relative_path", "filename"])

In [None]:
# Index the clips of the two ingredient folders. The synthesis loop further
# down reads one path from each frame per iteration.
df_test = get_input_paths("drive/MyDrive/GMMGroup/Ingredient_One")
df_testest = get_input_paths("drive/MyDrive/GMMGroup/Ingredient_Two")
# Display how many entries were found in the first folder.
len(df_test)

2

In [None]:
# Show the path of every entry found in the first ingredient folder.
for i in df_test.index:
  print(df_test.loc[i, "relative_path"])

drive/MyDrive/GMMGroup/testUpload/s01 (1).wav
drive/MyDrive/GMMGroup/testUpload/s20 (4).wav


In [None]:
# Combine the two ingredient folders pair by pair: the n-th clip of the first
# folder is encoded together with the n-th clip of the second one, and the
# reconstructions plus their interpolation are written to the output folder.
paths_one = list(df_test.relative_path)
paths_two = list(df_testest.relative_path)

# Only complete pairs can be processed; report anything that gets left out.
if len(paths_one) != len(paths_two):
  print('Warning: %d and %d input files found; only the first %d pairs are processed.'
        % (len(paths_one), len(paths_two), min(len(paths_one), len(paths_two))))

# Iterate over the paths themselves so each pass handles its own pair rather
# than depending on an index variable left over from another cell.
for path_one, path_two in zip(paths_one, paths_two):
  file_list = [path_one, path_two]
  audio_list = [utils.load_audio(path, sample_length=SAMPLE_LENGTH, sr=SR)
                for path in file_list]
  a_list, names = get_audio_list(file_list, audio_list)
  z_list, name_list = gen_encodings(a_list, names)
  synthesize_interpolations(z_list, name_list, "drive/MyDrive/GMMGroup/OutputSounds/")