In [21]:
import os
import csv
import umap
import json
import librosa
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from magenta.models.nsynth import utils
from magenta.models.nsynth.wavenet import fastgen
from os.path import join
from tqdm import tqdm
# Seed NumPy's global random number generator so NumPy-based randomness in
# later cells is repeatable from run to run.
np.random.seed(8)

In [22]:
# Notebook configuration: dataset location plus spectrogram / reduction settings.
data_root = 'drumData'  # directory holding the per-drum .npy arrays
n_fft = 1024
# Integer division on purpose: `n_fft/4` is the float 256.0 under Python 3.
# NOTE(review): presumably consumed as an STFT hop length (a whole number of
# samples) -- confirm against the cells that use it.
hop_length = n_fft // 4
use_logamp = False # boost the brightness of quiet sounds
reduce_rows = 10 # how many frequency bands to average into one
reduce_cols = 1 # how many time steps to average into one
crop_rows = 32 # limit how many frequency bands to use
crop_cols = 32 # limit how many time steps to use
limit = None # set this to 100 to only process 100 samples

In [7]:
# Load the sample array of every drum class from
# `<data_root>/<name>_samples.npy` into `drumSamples`, keyed by drum name.
drumNames = ["kick", "tom", "snare", "clap", "hi.hat", "ride", "crash"]
drumFingerPrints = {}  # created empty here; nothing in this cell fills it
drumSamples = {}
for d in drumNames:
    # `%time` is an IPython line magic: it prints the CPU/wall time of each load.
    %time drumSamples[d] = np.load(join(data_root, d+'_samples.npy'))

CPU times: user 1 ms, sys: 157 ms, total: 158 ms
Wall time: 487 ms
CPU times: user 660 µs, sys: 12.2 ms, total: 12.8 ms
Wall time: 39.9 ms
CPU times: user 1.41 ms, sys: 76.8 ms, total: 78.2 ms
Wall time: 239 ms
CPU times: user 615 µs, sys: 38.3 ms, total: 38.9 ms
Wall time: 125 ms
CPU times: user 504 µs, sys: 4.77 ms, total: 5.28 ms
Wall time: 15.1 ms
CPU times: user 617 µs, sys: 6.84 ms, total: 7.46 ms
Wall time: 21.3 ms
CPU times: user 1.49 ms, sys: 22.5 ms, total: 24 ms
Wall time: 67.8 ms


In [23]:
def wavenet_encode(audio, checkpoint_path='./wavenet-ckpt/model.ckpt-200000'):
    """Encode one audio clip with the pretrained NSynth WaveNet encoder.

    Args:
        audio: Array of audio samples for a single clip. `len(audio)` is
            passed to `fastgen.encode` as the sample length, so this is
            expected to be 1-D.
            NOTE(review): an earlier, now removed, loading snippet used
            sr=16000 -- presumably the encoder expects 16 kHz audio; confirm.
        checkpoint_path: Location of the pretrained model checkpoint.
            Defaults to the checkpoint this notebook has always used, so
            existing one-argument calls behave as before.

    Returns:
        The encoding reshaped to a 2-D array with 16 columns.

    Note:
        Each call goes through `fastgen.encode`, and the notebook log shows
        one "Restoring parameters" line per call, i.e. the checkpoint is
        reloaded every time. Calling this in a tight loop is slow.
    """
    encoding = fastgen.encode(audio, checkpoint_path, len(audio))

    # Drop the leading batch axis: every row is one 16-value embedding.
    return encoding.reshape((-1, 16))

In [24]:
# Take the first recording of two contrasting drum classes as quick test
# inputs for the encoder.
crashes, kicks = drumSamples["crash"], drumSamples["kick"]

sample_kick, sample_crash = kicks[0], crashes[0]

In [25]:
# Encode the single kick sample as a smoke test before the full batch run.
wavenet_kick = wavenet_encode(sample_kick)

INFO:tensorflow:Restoring parameters from ./wavenet-ckpt/model.ckpt-200000


In [26]:
print(wavenet_kick.shape)
print(np.max(wavenet_kick))
print(np.min(wavenet_kick))

(23, 16)
15.8841
-8.46548


In [29]:
# Encode every sample of every drum class with WaveNet and save one array per
# class to `<data_root>/<drumName>_wavenet.npy`.
# NOTE(review): `wavenet_encode` restores the checkpoint on every call (one
# "Restoring parameters" log line and roughly 20 s per sample in the recorded
# run), so a full pass over thousands of samples takes many hours.
drumNames = ["kick", "tom", "snare", "clap", "hi.hat", "ride", "crash"]
for drumName in tqdm(drumNames):
    samples = drumSamples[drumName]  # e.g. samples.shape = (672, 12000)
    samples_wavenet = []
    for sample in tqdm(samples):
        samples_wavenet.append(wavenet_encode(sample))
    # Fixed: this previously converted the undefined name `samples_mfcc`,
    # which fails on a fresh kernel (or silently saves stale data left over
    # from another session) instead of the encodings computed above.
    samples_wavenet = np.asarray(samples_wavenet)
    print (drumName, samples_wavenet.shape)
    # Write next to the input arrays, using the same root as the loading cell.
    file_path = join(data_root, drumName + '_wavenet.npy')
    np.save(file_path, samples_wavenet)





  0%|          | 0/7 [00:00<?, ?it/s][A[A[A[A




  0%|          | 0/5158 [00:00<?, ?it/s][A[A[A[A[A


[A[A[A

INFO:tensorflow:Restoring parameters from ./wavenet-ckpt/model.ckpt-200000







  0%|          | 1/5158 [00:19<27:17:00, 19.05s/it][A[A[A[A[A

INFO:tensorflow:Restoring parameters from ./wavenet-ckpt/model.ckpt-200000







  0%|          | 2/5158 [00:38<27:29:50, 19.20s/it][A[A[A[A[A

INFO:tensorflow:Restoring parameters from ./wavenet-ckpt/model.ckpt-200000







  0%|          | 3/5158 [00:59<28:04:08, 19.60s/it][A[A[A[A[A

INFO:tensorflow:Restoring parameters from ./wavenet-ckpt/model.ckpt-200000







  0%|          | 4/5158 [01:20<28:45:57, 20.09s/it][A[A[A[A[A

INFO:tensorflow:Restoring parameters from ./wavenet-ckpt/model.ckpt-200000







  0%|          | 5/5158 [01:40<28:42:04, 20.05s/it][A[A[A[A[A

INFO:tensorflow:Restoring parameters from ./wavenet-ckpt/model.ckpt-200000







  0%|          | 6/5158 [02:00<28:35:20, 19.98s/it][A[A[A[A[A

INFO:tensorflow:Restoring parameters from ./wavenet-ckpt/model.ckpt-200000






[A[A[A[A

KeyboardInterrupt: 

In [30]:
import os
import numpy as np
from scipy.io import wavfile
import tensorflow as tf

from magenta.models.nsynth import utils
from magenta.models.nsynth.wavenet.h512_bo16 import Config
from magenta.models.nsynth.wavenet.h512_bo16 import FastGenerationConfig

In [None]:
def hanoi_encode(wav_data, checkpoint_path, sample_length=64000):
    """Generate an array of embeddings from an array of audio.

    Args:
        wav_data: Numpy array [batch_size, sample_length]. A 1-D array is
            treated as a batch of one.
        checkpoint_path: Location of the pretrained model.
        sample_length: The total length of the final wave file, padded with 0s.

    Returns:
        encoding: a [mb, 125, 16] encoding (for 64000 sample audio file).

    Raises:
        ValueError: If `wav_data` is neither 1-D nor 2-D.
    """
    if wav_data.ndim == 1:
        wav_data = np.expand_dims(wav_data, 0)
        batch_size = 1
    elif wav_data.ndim == 2:
        batch_size = wav_data.shape[0]
    else:
        # Previously fell through with `batch_size` unbound.
        raise ValueError(
            "wav_data must be 1-D or 2-D, got %d dimensions" % wav_data.ndim)

    # Load up the model for encoding and find the encoding of "wav_data"
    session_config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Graph().as_default(), tf.Session(config=session_config) as sess:
        hop_length = Config().ae_hop_length
        wav_data, sample_length = utils.trim_for_encoding(
            wav_data, sample_length, hop_length)
        # Everything below must run inside the `with` block: the network has
        # to be built in this graph and evaluated while `sess` is still open.
        # NOTE(review): `load_nsynth` is not defined or imported in the visible
        # part of this notebook -- presumably `fastgen.load_nsynth` or a
        # function from another cell; confirm before running.
        net = load_nsynth(batch_size=batch_size, sample_length=sample_length)
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_path)
        encodings = sess.run(net["encoding"], feed_dict={net["X"]: wav_data})
    return encodings