In [None]:
# @title Import libraries
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import librosa
import json
from IPython.display import Audio
import scipy
import matplotlib.pyplot as plt

# make it run on tfu version
print(f"Tensorflow version {tf.__version__}")


Tensorflow version 2.13.0


In [None]:
# Ask TensorFlow for the name of a visible GPU device; the recorded output is an
# empty string, i.e. no GPU was attached when this cell last ran.
tf.test.gpu_device_name()

''

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so the dataset folder and the model
# output folder under "My Drive" (used in later cells) are reachable.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#@title Hyperparameters
# Training hyperparameters, exposed as Colab form fields (Colab forms use the
# `#@param` marker).
learning_rate = 0.0005 #@param {type:"raw"}
num_epochs_to_train =  30#@param {type:"integer"}
batch_size = 32 #@param {type:"integer"}
vector_dimension = 128 #@param {type:"integer"}

# Audio / spectrogram settings.
# NOTE(review): hop, frame_length, fft_size, min_level_db, ref_level_db and
# shape are not referenced by any cell visible in this notebook; the STFT calls
# below hard-code n_fft=510 and use librosa's default hop length.
hop=256              # hop size in samples
frame_length=512     # NOTE(review): equals 2*hop, while fft_size equals 4*hop - confirm which one is the "window size"
fft_size = 1024
fs=44100              # sampling rate in Hz used for padding and playback
                      # NOTE(review): NSynth is documented as 16 kHz audio - confirm fs before playing NSynth clips with rate=fs
min_level_db=-100     # reference values to normalize data
ref_level_db=20
max_file_size = fs*3  # clip length in samples (3 seconds at fs)

# Upper-case aliases consumed by the training cell.
LEARNING_RATE = learning_rate
BATCH_SIZE = batch_size
EPOCHS = num_epochs_to_train
VECTOR_DIM=vector_dimension

shape=128           # length of time axis of split spectrograms


In [None]:
#@title Load NSynth Dataset

# Load the second half of the NSynth "train" split. Files are shuffled on read,
# and try_gcs=True asks TFDS to look for a prepared copy on its public GCS
# bucket before building the dataset locally.
data = tfds.load("nsynth", split='train[50%:]', shuffle_files=True, try_gcs=True)

In [None]:
# @title Utility functions

def padIfNecessary(in_file, max):
  """Force a 1-D signal to exactly `max` samples.

  Shorter inputs are right-padded with zeros (the zeros come from `np.zeros`,
  so the concatenated result follows NumPy's dtype promotion), longer inputs
  are cut to their first `max` samples, and inputs that already have the
  requested length are returned unchanged (same object).
  """
  shortfall = max - len(in_file)
  if shortfall > 0:
    # Too short: append the missing samples as silence.
    return np.concatenate((in_file, np.zeros(int(shortfall))))
  if shortfall < 0:
    # Too long: keep only the leading `max` samples.
    return in_file[:max]
  return in_file

def normalize(array, min, max):
    """Linearly rescale `array` so its values span [`min`, `max`].

    The array's own minimum is mapped to `min` and its own maximum to `max`.

    A constant array (for example the all-zero spectrogram of a silent clip)
    has no range to stretch; instead of dividing by zero and returning NaNs,
    every element is mapped to `min`.

    Args:
        array: NumPy array to rescale.
        min: lower bound of the target range.
        max: upper bound of the target range.

    Returns:
        A new array with the same shape as `array`.
    """
    lowest = array.min()
    span = array.max() - lowest
    # Guard against 0/0 for constant input: dividing the all-zero numerator
    # by 1 keeps the dtype of the normal path and yields zeros.
    unit = (array - lowest) / (span if span != 0 else 1)
    return unit * (max - min) + min

def denormalize(norm_array, source_min, source_max, new_min, new_max):
    """Map values from [source_min, source_max] onto [new_min, new_max].

    This is a plain affine transform: `source_min` lands on `new_min` and
    `source_max` lands on `new_max`.
    """
    gain = (new_max - new_min) / (source_max - source_min)
    offset = new_min - source_min * gain
    return (norm_array * gain) + offset


In [None]:
# @title Load and preprocess other dataset
import os
import random
import audiofile
# Folder of custom instrument recordings on the mounted Google Drive
# (Colab-specific absolute path; requires the drive.mount cell to have run).
directory = '/content/drive/My Drive/Instrument_Dataset'
def loadMyDataset(path, num_files=1000):
  """Load a random selection of audio files as equal-length signals.

  The directory listing is shuffled and the first `num_files` entries are read
  with `audiofile.read`. Every signal is padded or truncated to
  `max_file_size` samples so the result stacks into a regular 2-D array
  (storing the raw, variable-length signals produced a ragged array).

  Args:
    path: directory containing the audio files.
    num_files: maximum number of files to load.

  Returns:
    NumPy array with one row per loaded file.
  """
  # NOTE(review): every directory entry is treated as an audio file, and the
  # sampling rate reported by audiofile.read is ignored - confirm the folder
  # only holds recordings at the expected rate.
  all_files = os.listdir(path)
  random.shuffle(all_files)
  selected_files = all_files[:num_files]

  audios = []
  for filename in selected_files:
    y, _sr = audiofile.read(os.path.join(path, filename))
    audios.append(padIfNecessary(y, max_file_size))
  return np.array(audios)

def preprocessMyDs(path, num_files=1000, n_fft=510):
  """Load audio files and convert them to normalized STFT magnitudes.

  Each clip is forced to `max_file_size` samples (zero-padded or truncated) so
  that all spectrograms share one shape. The magnitude STFT of every clip is
  then rescaled to [0, 1] using that clip's own minimum and maximum.

  Args:
    path: directory containing the audio files.
    num_files: maximum number of files to load.
    n_fft: FFT size passed to `librosa.stft`.

  Returns:
    Tuple `(stfts, audios, min_val, max_val)`:
      stfts   - array of normalized magnitudes with a trailing axis of size 1.
      audios  - the signals exactly as returned by `loadMyDataset`.
      min_val - smallest raw magnitude seen, never above its starting value 0.
      max_val - largest raw magnitude seen, never below its starting value 0.
  """
  audios = loadMyDataset(path, num_files=num_files)
  stfts = []
  max_val = 0
  min_val = 0
  for audio in audios:
    # Pad short clips and cut long ones so every STFT has the same width.
    audio = padIfNecessary(audio, max_file_size)
    stft = np.abs(librosa.stft(audio, n_fft=n_fft))
    # Track the extrema of the raw magnitudes across the whole dataset.
    max_val = max(max_val, np.amax(stft))
    min_val = min(min_val, np.amin(stft))
    stfts.append(normalize(stft, 0, 1))
  stfts = np.array(stfts)
  return stfts[..., np.newaxis], audios, min_val, max_val


# Smoke-test the custom-dataset pipeline on 5 files.
# NOTE(review): data_min / data_max are assigned again by the NSynth
# preparation cell further down, so these values do not survive a full run.
data_stfts, data_audios, data_min, data_max = preprocessMyDs(directory, 5)
print(data_stfts.shape)

(132300,)
(132300,)
(132300,)
(132300,)
(132300,)
(256, 1042)
(256, 1042)
(256, 1042)
(256, 1042)
(256, 1042)
(5, 256, 1042, 1)


  audios = np.array(audios)


In [None]:
# Play back the second loaded clip at fs Hz as a sanity check.
Audio(data_audios[1], rate=fs)

In [None]:
# @title small test for preprocessing
def testData(data):
  """Run the STFT preprocessing over an iterable of audio signals.

  For each signal the magnitude STFT (n_fft=510) is computed, the running
  extrema of the raw magnitudes are updated, and the spectrogram is rescaled
  to [0, 1] by its own minimum and maximum.

  Args:
    data: iterable of 1-D audio arrays.

  Returns:
    Tuple `(stfts, audios, min_val, max_val)` where `stfts` has a trailing
    axis of size 1, `audios` is `data` itself, and `min_val` / `max_val` are
    the extrema of the raw magnitudes (both start at 0).
  """
  # The result list must exist before the loop; appending to an unassigned
  # local raised UnboundLocalError.
  stfts = []
  max_val = 0
  min_val = 0
  for audio in data:
    stft = np.abs(librosa.stft(audio, n_fft = 510))
    st_max = np.amax(stft)  # largest magnitude in this spectrogram
    st_min = np.amin(stft)  # smallest magnitude in this spectrogram
    if st_max > max_val:
      max_val = st_max
    if st_min < min_val:
      min_val = st_min
    stft = normalize(stft, 0, 1)
    stfts.append(stft)
    print(audio.shape, stft.shape)
  stfts = np.array(stfts)
  return stfts[..., np.newaxis], data, min_val, max_val

# NOTE(review): `data` is the TFDS NSynth dataset loaded above, whose elements
# are indexed with ['audio'] in prepareDataset; testData hands each element
# directly to librosa.stft - confirm whether raw audio arrays (e.g.
# data_audios) were meant to be passed here instead.
test_stfts, test_audios, test_min, test_max = testData(data)
test_stfts.shape, test_min, test_max

In [None]:
# @title prepare nsynth dataset
def prepareDataset(data, numberOfClips=10000):
  """Convert NSynth examples into a float32 tensor of normalized STFTs.

  Args:
    data: dataset supporting `.take()`, whose elements expose an 'audio'
      tensor.
    numberOfClips: number of audio clips to use for training.

  Returns:
    Tuple `(stfts, min_val, max_val)`: `stfts` is a float32 array with a
    trailing axis of size 1; `min_val` / `max_val` are the extrema of the raw
    magnitudes over all clips (both start at 0).
  """
  # NOTE: all spectrograms are held in memory at once, both as a Python list
  # and as the stacked array - lower numberOfClips if RAM is tight.
  stfts = []
  max_val = 0
  min_val = 0
  for example in data.take(numberOfClips):
    audio = example['audio'].numpy()
    # 500 zero samples on each side (was 512); together with n_fft=510 the
    # recorded run produced 256 x 512 spectrograms.
    audio = np.pad(audio, 500)
    stft = np.abs(librosa.stft(audio, n_fft=510))

    # Track dataset-wide extrema of the raw magnitudes.
    max_val = max(max_val, np.amax(stft))
    min_val = min(min_val, np.amin(stft))

    stfts.append(normalize(stft, 0, 1))
  stfts = np.array(stfts, dtype=np.float32)
  return stfts[..., np.newaxis], min_val, max_val

# Build the training tensor from NSynth. This rebinds data_min / data_max,
# replacing the values computed earlier from the custom dataset.
x_train, data_min, data_max = prepareDataset(data)
data_max, data_min

(237.67918, 0)

In [None]:
# @title Post process
def postProcessSingle(stft, min, max):
    """Turn one normalized magnitude spectrogram back into audio.

    The first channel of `stft` is taken, rescaled from [0, 1] to
    [`min`, `max`], and the waveform is estimated with Griffin-Lim using the
    same n_fft (510) as the forward transform.

    Note that `normalize` rescales each spectrogram by its own extrema, so
    rescaling with dataset-wide `min` / `max` is not an exact inverse.
    """
    magnitude = denormalize(stft[:,:,0], 0, 1, min, max)
    return librosa.griffinlim(magnitude, n_fft = 510)

def postProcessBatch(stfts, min_val=0, max_val=1):
    """Convert a batch of normalized spectrograms back to audio.

    Every item of `stfts` is passed to `postProcessSingle`, which removes the
    trailing channel axis itself, so the batch is iterated as-is.

    Args:
        stfts: sequence of spectrograms, one per clip, each with a trailing
            channel axis.
        min_val: magnitude that normalized 0 maps back to.
        max_val: magnitude that normalized 1 maps back to. With the defaults
            (0 and 1) the rescale step leaves the values unchanged.

    Returns:
        NumPy array with one reconstructed waveform per input spectrogram.
    """
    audios = [postProcessSingle(stft, min_val, max_val) for stft in stfts]
    return np.array(audios)


# Listen to the first five clips next to their Griffin-Lim reconstructions.
# Reads the outputs of the testData cell (the bare names `audios` / `stfts`
# exist only as locals inside functions and raised NameError here).
for val in range(5):
  original_clip = test_audios[val]
  spec = test_stfts[val]
  reconstructed_clip = postProcessSingle(spec, test_min, test_max)
  display(Audio(original_clip, rate=fs))
  display(Audio(reconstructed_clip, rate=fs))

'\nx = np.arange(0, 65000)\nfig, axs = plt.subplots(5, 2)\nfor val in range(5):\n\n  audi0 = test_audios[val]\n  spec =test_stfts[val]\n  aud = postProcessSingle(spec, test_min, test_max)\n  aud = np.pad(aud, (0,103))\n  print(aud.shape)\n  lol = Audio(audi0,rate=fs)\n  loll = Audio(aud, rate=fs)\n  #axs[val,0].plot(x, audi0)\n  #axs[val,1].plot(x, aud)\n  display(lol)\n  display(loll)\n'

In [None]:
#@title Variational autoencoder Class
from keras import Model
from keras.layers import Input, Conv2D, ReLU, BatchNormalization, Flatten, Dense, Reshape, Conv2DTranspose, Activation, Lambda
from keras.losses import MeanSquaredError
from keras import backend as K
import os
import pickle

def _calculate_reconstruction_loss(y_target, y_predicted):
    """Mean squared error per sample, averaged over axes 1, 2 and 3."""
    squared_error = K.square(y_target - y_predicted)
    return K.mean(squared_error, axis=[1, 2, 3])


def calculate_kl_loss(model):
    """Build a KL-divergence loss function bound to `model`.

    The returned callable ignores whatever arguments it is given (so it can be
    used both as a Keras metric and from inside the combined loss) and reads
    `model.mu` and `model.log_variance` to compute the KL term per sample.
    """
    def _calculate_kl_loss(*args):
        log_var = model.log_variance
        per_dim = 1 + log_var - K.square(model.mu) - K.exp(log_var)
        return -0.5 * K.sum(per_dim, axis=1)
    return _calculate_kl_loss


class VAE:
    """
    VAE represents a Deep Convolutional variational autoencoder architecture
    with mirrored encoder and decoder components.

    The encoder stacks Conv2D + ReLU + BatchNormalization blocks and ends in
    two Dense heads (`mu` and `log_variance`) from which a latent point is
    sampled. The decoder maps a latent point back through a Dense layer,
    Conv2DTranspose blocks and a final sigmoid activation.

    The loss reads `self.mu` / `self.log_variance` directly, which are symbolic
    tensors created while building the encoder.
    """

    def __init__(self,
                 input_shape,
                 conv_filters,
                 conv_kernels,
                 conv_strides,
                 latent_space_dim):
        """Store the architecture description and build the three models.

        Args:
            input_shape: shape of one input sample, e.g. [28, 28, 1].
            conv_filters: filters per conv block, e.g. [2, 4, 8].
            conv_kernels: kernel size per conv block, e.g. [3, 5, 3].
            conv_strides: stride per conv block, e.g. [1, 2, 2].
            latent_space_dim: size of the latent vector, e.g. 2.
        """
        self.input_shape = input_shape
        self.conv_filters = conv_filters
        self.conv_kernels = conv_kernels
        self.conv_strides = conv_strides
        self.latent_space_dim = latent_space_dim
        # Weight applied to the reconstruction term before adding the KL term.
        self.reconstruction_loss_weight = 1000000

        self.encoder = None
        self.decoder = None
        self.model = None

        # NOTE(review): the "-1" means the last entry of conv_filters /
        # conv_kernels / conv_strides is never used to build a layer -
        # confirm this is intentional.
        self._num_conv_layers = len(conv_filters)-1
        self._shape_before_bottleneck = None
        self._model_input = None

        self._build()

    def summary(self):
        """Print the Keras summaries of encoder, decoder and full model."""
        self.encoder.summary()
        self.decoder.summary()
        self.model.summary()

    def compile(self, learning_rate=0.0001, clip_norm=1):
        """Compile the full model with Adam and the combined VAE loss.

        Args:
            learning_rate: Adam learning rate.
            clip_norm: per-gradient norm clipping threshold handed to the
                optimizer as `clipnorm`.
        """
        # `tf.train.AdamOptimizer` is a TF1 name that no longer exists in
        # TF 2.x (AttributeError). Use the Keras Adam instead; prefer the
        # `legacy` namespace when present, since training here runs with eager
        # execution disabled.
        optimizers = getattr(tf.keras.optimizers, "legacy", tf.keras.optimizers)
        optimizer = optimizers.Adam(learning_rate=learning_rate,
                                    clipnorm=clip_norm)
        self.model.compile(optimizer=optimizer,
                           loss=self._calculate_combined_loss,
                           metrics=[_calculate_reconstruction_loss,
                                    calculate_kl_loss(self)])

    def train(self, x_train, batch_size, num_epochs):
        """Fit the autoencoder to reproduce `x_train` from itself."""
        self.model.fit(x_train,
                       x_train,
                       batch_size=batch_size,
                       epochs=num_epochs,
                       shuffle=True)

    def save(self, save_folder="."):
        """Write parameters.pkl and weights.h5 into `save_folder`."""
        self._create_folder_if_it_doesnt_exist(save_folder)
        self._save_parameters(save_folder)
        self._save_weights(save_folder)

    def load_weights(self, weights_path):
        """Load weights for the full model from `weights_path`."""
        self.model.load_weights(weights_path)

    def reconstruct(self, images):
        """Encode then decode `images`.

        Returns:
            Tuple `(reconstructed_images, latent_representations)`.
        """
        latent_representations = self.encoder.predict(images)
        reconstructed_images = self.decoder.predict(latent_representations)
        return reconstructed_images, latent_representations

    @classmethod
    def load(cls, save_folder="."):
        """Rebuild a model from a folder previously written by `save`."""
        parameters_path = os.path.join(save_folder, "parameters.pkl")
        # NOTE: unpickling can execute arbitrary code - only load folders that
        # were produced by `save` from a trusted source.
        with open(parameters_path, "rb") as f:
            parameters = pickle.load(f)
        # Use `cls` so subclasses are reconstructed as themselves.
        autoencoder = cls(*parameters)
        weights_path = os.path.join(save_folder, "weights.h5")
        autoencoder.load_weights(weights_path)
        return autoencoder

    def _calculate_combined_loss(self, y_target, y_predicted):
        """Weighted reconstruction loss plus KL loss."""
        reconstruction_loss = _calculate_reconstruction_loss(y_target, y_predicted)
        kl_loss = calculate_kl_loss(self)()
        combined_loss = self.reconstruction_loss_weight * reconstruction_loss\
                                                         + kl_loss
        return combined_loss

    def _create_folder_if_it_doesnt_exist(self, folder):
        """Create `folder` (and parents); do nothing if it already exists."""
        os.makedirs(folder, exist_ok=True)

    def _save_parameters(self, save_folder):
        """Pickle the constructor arguments, in constructor order."""
        parameters = [
            self.input_shape,
            self.conv_filters,
            self.conv_kernels,
            self.conv_strides,
            self.latent_space_dim
        ]
        save_path = os.path.join(save_folder, "parameters.pkl")
        with open(save_path, "wb") as f:
            pickle.dump(parameters, f)

    def _save_weights(self, save_folder):
        """Save the full model's weights as weights.h5."""
        save_path = os.path.join(save_folder, "weights.h5")
        self.model.save_weights(save_path)

    def _build(self):
        """Build encoder first (it fixes the decoder's reshape target)."""
        self._build_encoder()
        self._build_decoder()
        self._build_autoencoder()

    def _build_autoencoder(self):
        """Chain encoder and decoder into the trainable model."""
        model_input = self._model_input
        model_output = self.decoder(self.encoder(model_input))
        self.model = Model(model_input, model_output, name="autoencoder")

    def _build_decoder(self):
        """Assemble the decoder: input -> dense -> reshape -> transposes -> output."""
        decoder_input = self._add_decoder_input()
        dense_layer = self._add_dense_layer(decoder_input)
        reshape_layer = self._add_reshape_layer(dense_layer)
        conv_transpose_layers = self._add_conv_transpose_layers(reshape_layer)
        decoder_output = self._add_decoder_output(conv_transpose_layers)
        self.decoder = Model(decoder_input, decoder_output, name="decoder")

    def _add_decoder_input(self):
        """Input layer taking one latent vector per sample."""
        return Input(shape=(self.latent_space_dim,), name="decoder_input")

    def _add_dense_layer(self, decoder_input):
        """Dense layer sized to the flattened pre-bottleneck volume."""
        num_neurons = int(np.prod(self._shape_before_bottleneck)) # [1, 2, 4] -> 8
        dense_layer = Dense(num_neurons, name="decoder_dense")(decoder_input)
        return dense_layer

    def _add_reshape_layer(self, dense_layer):
        """Reshape the dense output back to the pre-bottleneck volume."""
        return Reshape(self._shape_before_bottleneck)(dense_layer)

    def _add_conv_transpose_layers(self, x):
        """Add conv transpose blocks."""
        # Walk the conv layer indices in reverse order and stop before index
        # 0; kernel/stride 0 are used by the decoder output layer instead.
        for layer_index in reversed(range(1, self._num_conv_layers)):
            x = self._add_conv_transpose_layer(layer_index, x)
        return x

    def _add_conv_transpose_layer(self, layer_index, x):
        """Add one Conv2DTranspose + ReLU + BatchNormalization block."""
        layer_num = self._num_conv_layers - layer_index
        conv_transpose_layer = Conv2DTranspose(
            filters=self.conv_filters[layer_index],
            kernel_size=self.conv_kernels[layer_index],
            strides=self.conv_strides[layer_index],
            padding="same",
            name=f"decoder_conv_transpose_layer_{layer_num}"
        )
        x = conv_transpose_layer(x)
        x = ReLU(name=f"decoder_relu_{layer_num}")(x)
        x = BatchNormalization(name=f"decoder_bn_{layer_num}")(x)
        return x

    def _add_decoder_output(self, x):
        """Final single-filter Conv2DTranspose followed by a sigmoid."""
        conv_transpose_layer = Conv2DTranspose(
            filters=1,
            kernel_size=self.conv_kernels[0],
            strides=self.conv_strides[0],
            padding="same",
            name=f"decoder_conv_transpose_layer_{self._num_conv_layers}"
        )
        x = conv_transpose_layer(x)
        output_layer = Activation("sigmoid", name="sigmoid_layer")(x)
        return output_layer

    def _build_encoder(self):
        """Assemble the encoder: input -> conv blocks -> bottleneck."""
        encoder_input = self._add_encoder_input()
        conv_layers = self._add_conv_layers(encoder_input)
        bottleneck = self._add_bottleneck(conv_layers)
        self._model_input = encoder_input
        self.encoder = Model(encoder_input, bottleneck, name="encoder")

    def _add_encoder_input(self):
        """Input layer shaped like one sample."""
        return Input(shape=self.input_shape, name="encoder_input")

    def _add_conv_layers(self, encoder_input):
        """Create all convolutional blocks in encoder."""
        x = encoder_input
        for layer_index in range(self._num_conv_layers):
            x = self._add_conv_layer(layer_index, x)
        return x

    def _add_conv_layer(self, layer_index, x):
        """Add a convolutional block to a graph of layers, consisting of
        conv 2d + ReLU + batch normalization.
        """
        layer_number = layer_index + 1
        conv_layer = Conv2D(
            filters=self.conv_filters[layer_index],
            kernel_size=self.conv_kernels[layer_index],
            strides=self.conv_strides[layer_index],
            padding="same",
            name=f"encoder_conv_layer_{layer_number}"
        )
        x = conv_layer(x)
        x = ReLU(name=f"encoder_relu_{layer_number}")(x)
        x = BatchNormalization(name=f"encoder_bn_{layer_number}")(x)
        return x

    def _add_bottleneck(self, x):
        """Flatten data and add bottleneck with Gaussian sampling (Dense
        layer).
        """
        # Remember the conv output volume (without the batch axis) so the
        # decoder can reshape back to it.
        self._shape_before_bottleneck = K.int_shape(x)[1:]
        x = Flatten()(x)
        self.mu = Dense(self.latent_space_dim, name="mu")(x)
        self.log_variance = Dense(self.latent_space_dim,
                                  name="log_variance")(x)

        def sample_point_from_normal_distribution(args):
            # Reparameterization: mu + sigma * epsilon with epsilon ~ N(0, 1)
            # and sigma = exp(log_variance / 2).
            mu, log_variance = args
            epsilon = K.random_normal(shape=K.shape(mu), mean=0.,
                                      stddev=1.)
            sampled_point = mu + K.exp(log_variance / 2) * epsilon
            return sampled_point

        x = Lambda(sample_point_from_normal_distribution,
                   name="encoder_output")([self.mu, self.log_variance])
        return x


In [None]:
# @title Verify input shape
# Recorded output is (10000, 256, 512, 1); the last three dimensions match the
# VAE input_shape used in the training cell.
x_train.shape

(10000, 256, 512, 1)

In [None]:
# @title Train the Model on the dataset
def train(train_data, learning_rate, batch_size, epochs):
  """Build, compile and fit a VAE on `train_data`.

  Args:
    train_data: array of training samples; everything after the first
      (sample) axis is used as the model's input shape.
    learning_rate: optimizer learning rate.
    batch_size: mini-batch size.
    epochs: number of training epochs.

  Returns:
    The trained VAE instance.
  """
  # The VAE loss reads symbolic tensors stored on the model, so training is
  # run with eager execution switched off.
  tf.compat.v1.disable_eager_execution()
  # Take the input shape from the data instead of hard-coding (256, 512, 1),
  # so the model always matches the tensor it is trained on.
  autoEncoder = VAE(input_shape=tuple(train_data.shape[1:]),
                    conv_filters=(512, 256, 64, 32),
                    conv_kernels=(3, 3, 3, 3),
                    conv_strides=(1, 2, 2, (2,1)),
                    latent_space_dim=VECTOR_DIM
                    )
  autoEncoder.summary()
  autoEncoder.compile(learning_rate, clip_norm = 0.95)
  autoEncoder.train(train_data, batch_size, epochs)
  return autoEncoder

# Train on the NSynth spectrograms using the hyperparameters set at the top of
# the notebook.
autoencoder = train(train_data=x_train,
                    learning_rate=LEARNING_RATE,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS)

Model: "encoder"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input (InputLayer)     [(None, 256, 512, 1  0           []                               
                                )]                                                                
                                                                                                  
 encoder_conv_layer_1 (Conv2D)  (None, 256, 512, 51  5120        ['encoder_input[0][0]']          
                                2)                                                                
                                                                                                  
 encoder_relu_1 (ReLU)          (None, 256, 512, 51  0           ['encoder_conv_layer_1[0][0]']   
                                2)                                                          

AttributeError: ignored

In [None]:
from codecs import encode
# Run the first 32 training spectrograms through the trained model and compare
# clip 1 before and after reconstruction by ear.
subset = x_train[:32]
reconstructed, enc = autoencoder.reconstruct(subset)
aud = postProcessSingle(subset[1], data_min, data_max)          # original spectrogram -> audio
aud1 = postProcessSingle(reconstructed[1], data_min, data_max)  # reconstructed spectrogram -> audio
#encoded = postProcessSingle(enc[1], data_min, data_max)
print(enc[0].shape)  # shape of one latent vector
display(Audio(aud, rate=fs))
display(Audio(aud1,rate=fs))
#display(Audio(encoded, rate=fs))


(128,)


In [None]:
# Persist constructor parameters (parameters.pkl) and weights (weights.h5) to
# Google Drive; requires the Drive mount cell to have run.
autoencoder.save("/content/drive/My Drive/Model1")