In [1]:
import os
import csv
import umap
import json
import librosa
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from magenta.models.nsynth import utils
from magenta.models.nsynth.wavenet import fastgen
from os.path import join
from tqdm import tqdm
np.random.seed(8)

Instructions for updating:
Use tf.initializers.variance_scaling instead with distribution=uniform to get equivalent behavior.


In [2]:
data_root = 'drumData'
n_fft = 1024
hop_length = n_fft/4
use_logamp = False # boost the brightness of quiet sounds
reduce_rows = 10 # how many frequency bands to average into one
reduce_cols = 1 # how many time steps to average into one
crop_rows = 32 # limit how many frequency bands to use
crop_cols = 32 # limit how many time steps to use
limit = None # set this to 100 to only process 100 samples

In [7]:
drumNames = ["kick", "tom", "snare", "clap", "hi.hat", "ride", "crash"]
drumFingerPrints = {}
drumSamples = {}
for d in drumNames:
    %time drumSamples[d] = np.load(join(data_root, d+'_samples.npy'))
for d in drumNames:
    print (drumSamples[d].shape)

CPU times: user 1.14 ms, sys: 183 ms, total: 184 ms
Wall time: 494 ms
CPU times: user 655 µs, sys: 10.2 ms, total: 10.9 ms
Wall time: 43.8 ms
CPU times: user 995 µs, sys: 61.5 ms, total: 62.5 ms
Wall time: 251 ms
CPU times: user 565 µs, sys: 30.9 ms, total: 31.4 ms
Wall time: 132 ms
CPU times: user 515 µs, sys: 4.31 ms, total: 4.82 ms
Wall time: 17.6 ms
CPU times: user 546 µs, sys: 6.09 ms, total: 6.63 ms
Wall time: 24.4 ms
CPU times: user 1.43 ms, sys: 26.8 ms, total: 28.2 ms
Wall time: 69.8 ms
(5158, 12000)
(422, 12000)
(2546, 12000)
(1324, 12000)
(159, 12000)
(228, 12000)
(723, 12000)


In [4]:
def wavenet_encode(wave_files):
    '''
    neural_sample_rate = 16000
    audio = utils.load_audio(file_path, 
                             sample_length=400000, 
                             sr=neural_sample_rate)
    '''
    encoding = fastgen.encode(wave_files, './wavenet-ckpt/model.ckpt-200000', 12000)
    
    return encoding

In [5]:
crashes = drumSamples["crash"]
kicks = drumSamples["kick"]

sample_kick = kicks[0]
sample_crash = crashes[0]
print(sample_kick.shape)
print(kicks.shape)

(12000,)
(5158, 12000)


In [33]:
wavenet_kick = wavenet_encode(sample_kick)

INFO:tensorflow:Restoring parameters from ./wavenet-ckpt/model.ckpt-200000


In [34]:
print(wavenet_kick.shape)
print(np.max(wavenet_kick))
print(np.min(wavenet_kick))

(1, 23, 16)
15.8841
-8.46548


In [None]:
small_drumset = ["tom", "hi.hat", "ride", "crash"]
large_drumset = ["kick", "snare", "clap"]

for drum_name in small_drumset: 
    wavenet_features = wavenet_encode(drumSamples[drum_name])
    print (drumName, wavenet_features.shape)  
        file_path = './drumEmbeddings/' + drumName + '_wavenet.npy'
        np.save(file_path, wavenet_features)

INFO:tensorflow:Restoring parameters from ./wavenet-ckpt/model.ckpt-200000


In [11]:
for drumName in large_drumset:
    try:
        samples = drumSamples[drumName] 
        samples_wavenet = []
        (num_samples, sample_length) = samples.shape # e.g samples.shape=(672,12000)
        
        for i in tqdm(range(num_samples)):
            sample = samples[i]
            sample_wavenet = wavenet_encode(sample)
            samples_wavenet.append(sample_wavenet)
        
        #samples_wavenet = wavenet_encode(samples)  # `samples` is a (159,12000) matrix, `samples_wavenet` is (159,23,16)
        
        samples_wavenet = np.asarray(samples_wavenet)
        print (drumName, samples_wavenet.shape)  
        file_path = './drumEmbeddings/' + drumName + '_wavenet.npy'
        np.save(file_path, samples_wavenet)
    except:
            print("error!")
            errors += 1
        
print('errors:', errors)

'\nfor drumName in drumNames:\n    try:\n        samples = drumSamples[drumName] \n        samples_wavenet = []\n        (num_samples, sample_length) = samples.shape # e.g samples.shape=(672,12000)\n        \n        for i in tqdm(range(num_samples)):\n            sample = samples[i]\n            sample_wavenet = wavenet_encode(sample)\n            samples_wavenet.append(sample_wavenet)\n        \n        #samples_wavenet = wavenet_encode(samples)  # `samples` is a (159,12000) matrix, `samples_wavenet` is (159,23,16)\n        \n        samples_wavenet = np.asarray(samples_wavenet)\n        print (drumName, samples_wavenet.shape)  \n        file_path = \'./drumData/\' + drumName + \'_wavenet.npy\'\n        np.save(file_path, samples_wavenet)\n    except:\n            print("error!")\n            errors += 1\n        \nprint(\'errors:\', errors)\n'

In [36]:


def hanoi_encode(drumNames, checkpoint_path, sample_length=64000):
    # Load up the model for encoding and find the encoding of "wav_data"
    session_config = tf.ConfigProto(allow_soft_placement=True)
    batch_size = 1
    with tf.Graph().as_default(), tf.Session(config=session_config) as sess:
        
        hop_length = Config().ae_hop_length
        net = hanoi_load_nsynth(batch_size=batch_size, sample_length=sample_length)
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_path)
        
        for drumName in drumNames:
            samples = drumSamples[drumName] 
            samples_wavenet = []
            (num_samples, sample_length) = samples.shape # e.g samples.shape=(672,12000)
            
            for i in tqdm(range(num_samples)):
                wav_data = samples[i]
                wav_data, sample_length = utils.trim_for_encoding(wav_data, sample_length,hop_length)
                encodings = sess.run(net["encoding"], feed_dict={net["X"]: wav_data})
                #sample_wavenet = wavenet_encode(sample)
                samples_wavenet.append(encodings)
        samples_wavenet = np.asarray(samples_wavenet)
        print (drumName, samples_wavenet.shape)  
        file_path = './drumEmbeddings/' + drumName + '_wavenet.npy'
        np.save(file_path, samples_wavenet)

def hanoi_load_nsynth(batch_size=1, sample_length=64000):
    """Load the NSynth autoencoder network.
    Args:
    batch_size: Batch size number of observations to process. [1]
    sample_length: Number of samples in the input audio. [64000]
    Returns:
    graph: The network as a dict with input placeholder in {"X"}
    """
    config = Config()
    with tf.device("/gpu:0"):
        x = tf.placeholder(tf.float32, shape=[batch_size, sample_length])
        graph = config.build({"wav": x}, is_training=False)
        graph.update({"X": x})
    return graph

In [40]:
import tensorflow as tf
from magenta.models.nsynth import utils
from magenta.models.nsynth.wavenet.h512_bo16 import Config
from magenta.models.nsynth.wavenet.h512_bo16 import FastGenerationConfig

def testencode(wav_data, checkpoint_path, sample_length=64000):
    if wav_data.ndim == 1:
        wav_data = np.expand_dims(wav_data, 0)
        batch_size = 1
    elif wav_data.ndim == 2:
        batch_size = wav_data.shape[0]

    # Load up the model for encoding and find the encoding of "wav_data"
    session_config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Graph().as_default(), tf.Session(config=session_config) as sess:
        hop_length = Config().ae_hop_length
        wav_data, sample_length = utils.trim_for_encoding(wav_data, sample_length,
                                                      hop_length)
        print(sample_length)
        net = testload_nsynth(batch_size=batch_size, sample_length=sample_length)
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_path)
        encodings = sess.run(net["encoding"], feed_dict={net["X"]: wav_data})
    return encodings

def testload_nsynth(batch_size=1, sample_length=64000):
    """Load the NSynth autoencoder network.
    Args:
    batch_size: Batch size number of observations to process. [1]
    sample_length: Number of samples in the input audio. [64000]
    Returns:
    graph: The network as a dict with input placeholder in {"X"}
    """
    config = Config()
    with tf.device("/gpu:0"):
        x = tf.placeholder(tf.float32, shape=[batch_size, sample_length])
        graph = config.build({"wav": x}, is_training=False)
        graph.update({"X": x})
    return graph

In [41]:
encoding = testencode(kicks[0:5], './wavenet-ckpt/model.ckpt-200000', 12000)
print(encoding.shape)

11776
INFO:tensorflow:Restoring parameters from ./wavenet-ckpt/model.ckpt-200000
(5, 23, 16)


In [32]:
hanoi_encode(large_drumset, './wavenet-ckpt/model.ckpt-200000', 12000)

AssertionError: 