In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import tensorflow.keras.backend as K
import tensorflow.keras.layers as layers
import pickle
import time


In [2]:
# Load the pre-processed training clips.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file,
# so only ever point this at a pickle you produced yourself.
# The `with` block closes the file handle as soon as the array is loaded.
with open('training_source_of_truth.pickle', 'rb') as pickle_in:
    training_audio = pickle.load(pickle_in)

In [3]:
# Show the dimensions of the loaded training array.
training_audio.shape

(3358, 220500, 1, 1)

In [4]:
# Peek at the first training example to sanity-check its values and dtype.
training_audio[0]

array([[[ 0.0000000e+00]],

       [[ 0.0000000e+00]],

       [[-3.0517578e-05]],

       ...,

       [[-6.3751221e-02]],

       [[-5.4992676e-02]],

       [[-4.8553467e-02]]], dtype=float32)

In [6]:
def make_generator_model(noise_dim=100):
    """Build the generator network: latent noise in, one raw waveform out.

    A bias-free dense layer projects the noise vector to a (5, 1, 256) tensor.
    Eight Conv2DTranspose stages then stretch the first axis by factors of
    1, 4, 5, 3, 3, 5, 7 and 7 (5 -> 220500) while the channel count shrinks
    from 256 to 1. Every kernel is exactly as long as its stride. Each hidden
    stage is followed by BatchNormalization and LeakyReLU; the last stage uses
    a tanh activation instead.

    As a debugging aid, the output shape after the reshape and after every
    transposed convolution is printed (numbered 0-8) and asserted.

    Args:
        noise_dim: Length of the latent input vector. Defaults to 100.

    Returns:
        An uncompiled tf.keras.Sequential model whose output shape is
        (None, 220500, 1, 1); None is the batch size.

    Raises:
        AssertionError: If any intermediate output shape differs from the
            expected one.
    """
    # (filters, upsampling factor) for every hidden transposed convolution.
    hidden_stages = ((128, 1), (128, 4), (128, 5), (64, 3), (32, 3), (16, 5), (8, 7))
    output_filters, output_factor = 1, 7

    model = tf.keras.Sequential()

    def report_and_check(stage, expected_shape):
        # Print the running output shape and fail fast if it drifts.
        print('{}: '.format(stage))
        print(model.output_shape)
        assert model.output_shape == expected_shape, (
            'stage {}: expected {}, got {}'.format(stage, expected_shape, model.output_shape))

    model.add(layers.Dense(5*1*256, use_bias=False, input_shape=(noise_dim,)))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())

    length = 5
    model.add(layers.Reshape((length, 1, 256)))
    report_and_check(0, (None, length, 1, 256))  # Note: None is the batch size

    stage = 0
    for stage, (filters, factor) in enumerate(hidden_stages, start=1):
        model.add(layers.Conv2DTranspose(filters, (factor, 1), strides=(factor, 1),
                                         padding='same', use_bias=False))
        length *= factor
        report_and_check(stage, (None, length, 1, filters))
        model.add(layers.BatchNormalization())
        model.add(layers.LeakyReLU())

    # Output stage: a single channel, with no normalisation after the tanh.
    model.add(layers.Conv2DTranspose(output_filters, (output_factor, 1),
                                     strides=(output_factor, 1), padding='same',
                                     use_bias=False, activation='tanh'))
    # Checked against the literal clip shape rather than the running product.
    report_and_check(stage + 1, (None, 220500, 1, 1))

    return model

In [7]:
def make_discriminator_model():
    """Build the discriminator: a (220500, 1, 1) clip in, one unscaled score out.

    Seven strided Conv2D stages shorten the first axis; each kernel is exactly
    as long as its stride. Every convolution is followed by LeakyReLU and
    Dropout(0.3). The result is flattened and fed to a single Dense unit with
    no activation, so the model emits a logit rather than a probability.

    Returns:
        An uncompiled tf.keras.Sequential model.
    """
    # (filters, kernel/stride length along the first axis), in layer order.
    conv_stages = ((8, 7), (16, 5), (32, 3), (64, 3), (128, 5), (128, 4), (128, 1))

    model = tf.keras.Sequential()
    for position, (filters, step) in enumerate(conv_stages):
        # Only the very first layer declares the input shape.
        first_layer_kwargs = {'input_shape': [220500, 1, 1]} if position == 0 else {}
        model.add(layers.Conv2D(filters, (step, 1), strides=(step, 1), padding='same',
                                **first_layer_kwargs))
        model.add(layers.LeakyReLU())
        model.add(layers.Dropout(0.3))

    model.add(layers.Flatten())
    model.add(layers.Dense(1))

    return model

In [8]:
# Build the generator; the call also prints each stage's output shape as a side effect.
generator = make_generator_model()

Instructions for updating:
Colocations handled automatically by placer.
0: 
(None, 5, 1, 256)
1: 
(None, 5, 1, 128)
3: 
(None, 20, 1, 128)
3: 
(None, 100, 1, 128)
4: 
(None, 300, 1, 64)
5: 
(None, 900, 1, 32)
4: 
(None, 4500, 1, 16)
5: 
(None, 31500, 1, 8)
5: 
(None, 220500, 1, 1)


In [9]:
# Build the discriminator that is trained to tell real clips from generated ones.
discriminator = make_discriminator_model()

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [10]:
# Shared binary cross-entropy loss object used by both loss functions.
# from_logits=True because the discriminator ends in a Dense(1) layer with no
# activation, so its outputs are raw logits rather than probabilities.
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [11]:
def discriminator_loss(real_output, fake_output):
    """Return the discriminator's loss for one batch.

    Scores for real clips are compared against all-ones labels and scores for
    generated clips against all-zeros labels; the two cross-entropy terms are
    added together.

    Args:
        real_output: Discriminator outputs for real clips.
        fake_output: Discriminator outputs for generated clips.

    Returns:
        The sum of the real-clip and generated-clip cross-entropy losses.
    """
    loss_on_real = cross_entropy(tf.ones_like(real_output), real_output)
    loss_on_fake = cross_entropy(tf.zeros_like(fake_output), fake_output)
    return loss_on_real + loss_on_fake

In [12]:
def generator_loss(fake_output):
    """Return the generator's loss for one batch.

    The generator is rewarded when the discriminator labels its clips as real,
    so the discriminator's scores are compared against all-ones labels.

    Args:
        fake_output: Discriminator outputs for generated clips.

    Returns:
        The cross-entropy between all-ones labels and `fake_output`.
    """
    wanted_labels = tf.ones_like(fake_output)
    return cross_entropy(wanted_labels, fake_output)

In [13]:
# Alternative kept for reference: Adam with a 1e-4 learning rate (currently disabled).
# generator_optimizer = tf.keras.optimizers.Adam(1e-4)
# discriminator_optimizer = tf.keras.optimizers.Adam(1e-4)

# One plain-SGD optimizer per network; train_step applies generator and
# discriminator gradients through these two objects separately.
# NOTE(review): tf.train.GradientDescentOptimizer is the TF 1.x optimizer API - confirm
# the TensorFlow version this notebook targets before upgrading.
generator_optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
discriminator_optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)

In [14]:
# TODO FIX CHECKPOINTs
# Checkpoints are saved under ./training_checkpoints using the "ckpt" file prefix.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# Track both networks and both optimizers in a single Checkpoint object so that
# one checkpoint.save() call captures all four together.
checkpoint = tf.train.Checkpoint(generator_optimizer=generator_optimizer,
                                 discriminator_optimizer=discriminator_optimizer,
                                 generator=generator,
                                 discriminator=discriminator)

In [15]:
EPOCHS = 50  # NOTE: reassigned to 10 in the final cell before train() is called
BATCH_SIZE = 50  # number of noise vectors drawn per train_step call
noise_dim = 100  # length of each latent noise vector
num_examples_to_generate = 16  # number of rows in the fixed `seed` batch below

# Fixed latent batch, drawn once and reused for every end-of-epoch sample so
# that the saved outputs can be compared from one epoch to the next.
seed = tf.random.normal([num_examples_to_generate, noise_dim])

In [16]:
# TODO uncomment line below later
#@tf.function
def train_step(images):
    """Run one adversarial update of the generator and the discriminator.

    Draws BATCH_SIZE latent vectors, generates clips from them, scores both the
    real and the generated clips with the discriminator, and applies one
    optimizer step to each network from its own loss.

    NOTE(review): the number of generated clips is always BATCH_SIZE, regardless
    of how many real clips `images` holds - confirm this imbalance is intended.

    Args:
        images: Batch of real clips in the shape the discriminator expects.

    Returns:
        None. The networks' trainable variables are updated in place.
    """
    latent_batch = tf.random.normal([BATCH_SIZE, noise_dim])

    # Two tapes, because each network is differentiated against its own loss.
    with tf.GradientTape() as gen_tape:
        with tf.GradientTape() as disc_tape:
            fake_clips = generator(latent_batch, training=True)

            scores_for_real = discriminator(images, training=True)
            scores_for_fake = discriminator(fake_clips, training=True)

            gen_loss = generator_loss(scores_for_fake)
            disc_loss = discriminator_loss(scores_for_real, scores_for_fake)

    generator_grads = gen_tape.gradient(gen_loss, generator.trainable_variables)
    discriminator_grads = disc_tape.gradient(disc_loss, discriminator.trainable_variables)

    generator_updates = zip(generator_grads, generator.trainable_variables)
    generator_optimizer.apply_gradients(generator_updates)

    discriminator_updates = zip(discriminator_grads, discriminator.trainable_variables)
    discriminator_optimizer.apply_gradients(discriminator_updates)


In [25]:
def train(dataset, epochs):
    """Train the GAN for `epochs` passes over `dataset`.

    Each epoch runs train_step on every element of `dataset`, then saves one
    generated sample from the fixed `seed`. A checkpoint is written every 15
    epochs. After the final epoch the notebook output is cleared (when an
    IPython `display` module has been imported into the notebook) and a final
    sample is saved.

    Args:
        dataset: Iterable of clips. Each element is reshaped to
            (-1, 220500, 1, 1), so both a single clip of 220500 samples and an
            already-batched group of clips are accepted.
        epochs: Number of passes over `dataset`.

    Returns:
        None.
    """
    # -1 infers the batch size: 1 for a single clip, B for a pre-batched element.
    clip_shape = [-1, 220500, 1, 1]

    for epoch in range(epochs):
        print(str(epoch + 1) + ' now in progress....')
        start = time.time()

        for image_batch in dataset:
            train_step(tf.reshape(image_batch, clip_shape))

        generate_and_save_audio(generator, epoch + 1, seed)

        # Save the model every 15 epochs
        if (epoch + 1) % 15 == 0:
            checkpoint.save(file_prefix=checkpoint_prefix)

        print('Time for epoch {} is {} sec'.format(epoch + 1, time.time() - start))

    # `display` is never imported in this notebook, so calling it unconditionally
    # raised NameError once training finished. Only clear the output when a
    # module exposing clear_output has actually been brought into scope.
    notebook_display = globals().get('display')
    if hasattr(notebook_display, 'clear_output'):
        notebook_display.clear_output(wait=True)

    # Generate after the final epoch
    generate_and_save_audio(generator, epochs, seed)


In [26]:
def generate_and_save_audio(model, epoch, test_input, output_dir='generated_audio', sample_rate=22050):
    """Generate audio from `test_input` and write it to a WAV file.

    The file is written to `<output_dir>/<epoch>_epoch_audio.wav`; the directory
    is created if it does not exist yet. Only the waveform returned by
    wav_extractor is saved.

    Args:
        model: Callable generator, invoked as `model(test_input, training=False)`.
        epoch: Epoch number, used in the output file name.
        test_input: Latent batch passed to `model`.
        output_dir: Directory that receives the WAV file.
        sample_rate: Sample rate written to the file. At the default 22050 Hz a
            220500-sample clip lasts 10 seconds.

    Returns:
        None.
    """
    # Notice `training` is set to False.
    # This is so all layers run in inference mode (batchnorm).
    predictions = model(test_input, training=False)
    audio = wav_extractor(predictions)

    # Without this the very first save fails on a fresh checkout.
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    wav_path = os.path.join(output_dir, str(epoch) + '_epoch_audio.wav')
    # NOTE(review): librosa.output was removed in librosa 0.8 - confirm the pinned version.
    librosa.output.write_wav(wav_path, audio, sample_rate)


In [27]:
def wav_extractor(generated_sound):
    """Return the first generated clip as a 1-D NumPy waveform.

    Args:
        generated_sound: 4-D batch of generated audio, indexed as
            [example, sample, 0, 0]. May be a NumPy array, an eager tensor
            exposing `.numpy()`, or a graph-mode tensor, which is evaluated in
            the Keras backend session.

    Returns:
        A new 1-D np.ndarray holding every sample of example 0.
    """
    if isinstance(generated_sound, np.ndarray):
        generated_array = generated_sound
    elif hasattr(generated_sound, 'numpy'):
        # Eager tensors carry their values directly.
        generated_array = generated_sound.numpy()
    else:
        # Graph-mode tensor: evaluate it in the session that Keras manages.
        generated_array = generated_sound.eval(session=K.get_session())

    # One slice replaces the per-sample Python loop over the clip.
    return np.array(generated_array[0, :, 0, 0])

In [None]:
# Override the EPOCHS value set earlier so that this run trains for 10 epochs only.
EPOCHS = 10
# for i in train_dataset:
#     train(i, EPOCHS)
# train() iterates over whatever it is given, so passing the array directly
# feeds it to train_step one clip at a time.
train(training_audio, EPOCHS)

0now in progress....
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.
