In [2]:
import os
import csv
import umap
import json
import librosa
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from magenta.models.nsynth import utils
from magenta.models.nsynth.wavenet import fastgen
from os.path import join
from tqdm import tqdm
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(8)

Instructions for updating:
Use tf.initializers.variance_scaling instead with distribution=uniform to get equivalent behavior.


In [3]:
# Notebook-wide settings.
data_root = 'drumData'  # directory holding the '<drum>_samples.npy' files
n_fft = 1024
# Floor division keeps hop_length an integer on Python 3 as well; plain `/`
# would yield the float 256.0 there.
hop_length = n_fft // 4
use_logamp = False # boost the brightness of quiet sounds
reduce_rows = 10 # how many frequency bands to average into one
reduce_cols = 1 # how many time steps to average into one
crop_rows = 32 # limit how many frequency bands to use
crop_cols = 32 # limit how many time steps to use
limit = None # set this to 100 to only process 100 samples

In [4]:
# Drum categories; each one is read from '<name>_samples.npy' under data_root.
drumNames = ["kick", "tom", "snare", "clap", "hi.hat", "ride", "crash"]
drumFingerPrints = {}  # created empty here; not filled in this cell
drumSamples = {}  # drum name -> array loaded from '<name>_samples.npy'
# Load every category's sample array, timing each load with the %time magic.
for d in drumNames:
    %time drumSamples[d] = np.load(join(data_root, d+'_samples.npy'))
# Print the shape of each loaded array, in drumNames order.
for d in drumNames:
    print (drumSamples[d].shape)

CPU times: user 2.17 ms, sys: 334 ms, total: 336 ms
Wall time: 496 ms
CPU times: user 549 µs, sys: 25.7 ms, total: 26.3 ms
Wall time: 47.5 ms
CPU times: user 1.78 ms, sys: 166 ms, total: 168 ms
Wall time: 448 ms
CPU times: user 1.46 ms, sys: 87.5 ms, total: 89 ms
Wall time: 128 ms
CPU times: user 1.01 ms, sys: 12.8 ms, total: 13.8 ms
Wall time: 35.9 ms
CPU times: user 1.73 ms, sys: 16.9 ms, total: 18.7 ms
Wall time: 34.3 ms
CPU times: user 926 µs, sys: 49.7 ms, total: 50.7 ms
Wall time: 79.8 ms
(5158, 12000)
(422, 12000)
(2546, 12000)
(1324, 12000)
(159, 12000)
(228, 12000)
(723, 12000)


In [5]:
def wavenet_encode(wave_files, checkpoint_path='./wavenet-ckpt/model.ckpt-200000', sample_length=12000):
    """Encode audio with the NSynth WaveNet encoder via ``fastgen.encode``.

    Args:
        wave_files: Audio handed straight to ``fastgen.encode``. Despite the
            name, this notebook passes NumPy sample arrays here (one
            12000-sample clip, or a ``(num_clips, 12000)`` matrix), not paths.
        checkpoint_path: Checkpoint to restore. Defaults to the checkpoint
            this notebook has always used.
        sample_length: Maximum number of samples per clip forwarded to
            ``fastgen.encode``. Defaults to 12000, the clip length of the
            drum sample arrays.

    Returns:
        The value returned by ``fastgen.encode``. For the 12000-sample clips
        in this notebook that is an array of shape ``(num_clips, 23, 16)``.
    """
    return fastgen.encode(wave_files, checkpoint_path, sample_length)

In [9]:
# Pick out two drum categories and the first clip of each.
crashes, kicks = drumSamples["crash"], drumSamples["kick"]
sample_kick, sample_crash = kicks[0], crashes[0]

# Shape of a single kick clip, then of the whole kick array.
print(sample_kick.shape)
print(kicks.shape)

(12000,)
(5158, 12000)


In [33]:
# Encode a single 12000-sample kick clip with the WaveNet encoder checkpoint.
wavenet_kick = wavenet_encode(sample_kick)

INFO:tensorflow:Restoring parameters from ./wavenet-ckpt/model.ckpt-200000


In [34]:
# Report the encoding's shape followed by its largest and smallest values.
print(wavenet_kick.shape)
print(wavenet_kick.max())
print(wavenet_kick.min())

(1, 23, 16)
15.8841
-8.46548


In [7]:
# Categories split by how many clips they hold; only the small ones are
# encoded in this cell, each as a single batch through wavenet_encode.
small_drumset = ["tom","hi.hat", "ride", "crash"]
large_drumset = ["kick", "snare", "clap"]

for drum_name in small_drumset:
    wavenet_features = wavenet_encode(drumSamples[drum_name])
    print (drum_name, wavenet_features.shape)
    # Each category's encodings go to ./drumEmbeddings/<name>_wavenet.npy.
    file_path = join('./drumEmbeddings/', drum_name + '_wavenet.npy')
    np.save(file_path, wavenet_features)

INFO:tensorflow:Restoring parameters from ./wavenet-ckpt/model.ckpt-200000
('hi.hat', (159, 23, 16))
INFO:tensorflow:Restoring parameters from ./wavenet-ckpt/model.ckpt-200000
('ride', (228, 23, 16))
INFO:tensorflow:Restoring parameters from ./wavenet-ckpt/model.ckpt-200000
('crash', (723, 23, 16))


In [6]:
import tensorflow as tf
from magenta.models.nsynth import utils
from magenta.models.nsynth.wavenet.h512_bo16 import Config
from magenta.models.nsynth.wavenet.h512_bo16 import FastGenerationConfig

def custom_encode(wav_data, checkpoint_path, drum_name, sample_length=64000):
    """Encode clips one at a time with the NSynth WaveNet encoder and save them.

    The graph is built once with a batch size of 1 and every clip is pushed
    through it individually, so memory use does not grow with the number of
    clips. The stacked encodings are written to
    ``./drumEmbeddings/<drum_name>_wavenet.npy`` and their shape is printed.

    Args:
        wav_data: NumPy array of audio, ``(num_clips, num_samples)``. A single
            1-D clip is also accepted and treated as a batch of one.
        checkpoint_path: Checkpoint restored into the encoder graph.
        drum_name: Label used to build the output file name.
        sample_length: Maximum number of samples to use from each clip. The
            clips are trimmed by ``utils.trim_for_encoding`` before encoding.

    Returns:
        None. The result is saved to disk rather than returned.
    """
    # Treat a lone 1-D clip as a batch of one, as fastgen.encode does.
    if wav_data.ndim == 1:
        wav_data = np.expand_dims(wav_data, 0)
    batch_size = 1
    samples_wavenet = []
    num_samples = wav_data.shape[0]
    # Load up the model for encoding and find the encoding of "wav_data"
    session_config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Graph().as_default(), tf.Session(config=session_config) as sess:
        hop_length = Config().ae_hop_length
        # Trim every clip once, up front, and size the graph from the trimmed
        # length instead of a hard-coded 11776. This keeps the placeholder and
        # the fed data in agreement for any clip length or sample_length.
        wav_data, sample_length = utils.trim_for_encoding(wav_data, sample_length, hop_length)
        net = testload_nsynth(batch_size=batch_size, sample_length=sample_length)
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_path)

        for i in tqdm(range(num_samples)):
            # Slice (rather than index) to keep the leading batch axis of 1.
            sample = wav_data[i:i + 1]
            encodings = sess.run(net["encoding"], feed_dict={net["X"]: sample})
            # Drop the batch axis, keeping the encoder's own channel width.
            encodings = encodings.reshape(-1, encodings.shape[-1])
            samples_wavenet.append(encodings)
        samples_wavenet = np.asarray(samples_wavenet)

        file_path = './drumEmbeddings/' + drum_name + '_wavenet.npy'
        np.save(file_path, samples_wavenet)
        print (drum_name, samples_wavenet.shape)
        
def testload_nsynth(batch_size=1, sample_length=64000):
    """Load the NSynth autoencoder network.
    Args:
    batch_size: Batch size number of observations to process. [1]
    sample_length: Number of samples in the input audio. [64000]
    Returns:
    graph: The network as a dict with input placeholder in {"X"}
    """
    config = Config()
    with tf.device("/gpu:0"):
        x = tf.placeholder(tf.float32, shape=[batch_size, sample_length])
        graph = config.build({"wav": x}, is_training=False)
        graph.update({"X": x})
    return graph

In [7]:
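# Outdated call kept for reference: it predates the drum_name parameter, so
# 12000 would be bound to drum_name instead of sample_length.
#custom_encode(kicks[0:20], './wavenet-ckpt/model.ckpt-200000', 12000)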

In [10]:
# Encode the first five clips of one drum category. The audio is looked up by
# drum_name so the saved '<drum_name>_wavenet.npy' file holds that category's
# encodings (previously kick clips were saved under the "clap" name).
drum_name = "clap"
custom_encode(drumSamples[drum_name][:5], './wavenet-ckpt/model.ckpt-200000',drum_name, 12000)

INFO:tensorflow:Restoring parameters from ./wavenet-ckpt/model.ckpt-200000


100%|██████████| 5/5 [00:14<00:00,  2.87s/it]

('clap', (5, 23, 16))



