In [4]:
import tensorflow as tf
import numpy as np
import IPython.display as ipd
import librosa
import random
import os

In [5]:
# Colab-only: mount Google Drive at /content/drive so the dataset folder and
# the saved model referenced by later cells are reachable as ordinary paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# @title Load and preprocess data
# Preprocessing configuration read by the loader functions in this cell.
# Every clip is padded or truncated to this many samples (= 132300, the same
# length the VAE layers are built for).
max_file_size = 44100 * 3  # 3 seconds at your sample rate
# When True, loadMyDataset standardizes each clip (zero mean, unit std) before resizing it.
mean_norm=False
# Upper bound that loadMyDataset applies to its num_files argument.
max_num_files = 2527
def normalize(array):
    """Standardize ``array`` to zero mean and unit standard deviation.

    Parameters
    ----------
    array : array-like
        Numeric samples; the statistics are taken over all elements.

    Returns
    -------
    tuple
        ``(normalized_array, std_dev, mean)``. ``std_dev`` and ``mean`` are the
        statistics of the input, so passing them to ``denormalize`` restores
        the original values.

    Notes
    -----
    A constant input (for example a silent clip) has zero standard deviation.
    Dividing by it would fill the result with NaN, so in that case the data is
    only centred and the reported ``std_dev`` stays 0.
    """
    array = np.asarray(array)
    mean = np.mean(array)
    std_dev = np.std(array)
    centered = array - mean
    if std_dev == 0:
        # Nothing to scale: every sample equals the mean.
        return centered, std_dev, mean
    return centered / std_dev, std_dev, mean

def denormalize(data, original_std, original_mean):
    """Invert standardization: rescale by ``original_std``, then shift by ``original_mean``."""
    return data * original_std + original_mean


def adjust_size(in_file, max):
    """Pad with trailing zeros, or truncate, so the result has ``max`` samples.

    Parameters
    ----------
    in_file : numpy.ndarray
        Array of samples; its ``dtype`` is reused for the padding.
    max : int or float
        Target length. It is truncated to an integer once, so a float such as
        ``sample_rate * 2.5`` behaves the same whether the input has to be
        padded or cut (previously only the padding branch coerced it and the
        truncation branch raised ``TypeError``).

    Returns
    -------
    numpy.ndarray
        A new zero-padded array when the input is too short, a leading slice
        when it is too long, or ``in_file`` itself when it already fits.
    """
    # NOTE: the parameter name shadows the builtin ``max``; it is kept because
    # callers may pass it by keyword.
    target_len = int(max)
    current_len = len(in_file)

    if current_len > target_len:
        return in_file[:target_len]
    if current_len < target_len:
        # Pad with zeros of the input's own dtype so the result keeps that dtype.
        padding = np.zeros(target_len - current_len, dtype=in_file.dtype)
        return np.concatenate((in_file, padding))
    return in_file

# Folder on the mounted Drive holding the audio files; passed to loadMyDataset below.
directory = '/content/drive/My Drive/Instrument_Dataset'

def loadMyDataset(path, num_files=1000):
    """Load a random sample of audio files from ``path`` as one fixed-length batch.

    Each selected file is decoded with librosa at 44100 Hz, optionally
    standardized (when the module-level ``mean_norm`` flag is set), and padded
    or truncated to ``max_file_size`` samples.

    Parameters
    ----------
    path : str
        Directory containing the audio files. Sub-directories are ignored.
    num_files : int, optional
        Maximum number of files to load; capped at the module-level
        ``max_num_files``. Fewer are loaded if the directory holds fewer.

    Returns
    -------
    tuple
        ``(dim_added, audios)`` where ``audios`` is a NumPy array with one row
        per loaded file and ``max_file_size`` columns, and ``dim_added`` is the
        same data as a TensorFlow tensor with an extra trailing axis of size 1.

    Raises
    ------
    ValueError
        If no file ends up selected (empty directory or ``num_files`` <= 0);
        without this check the failure would surface later as an opaque
        shape error from ``tf.expand_dims``.
    """
    num_files = min(num_files, max_num_files)

    # os.listdir also returns sub-directories, which cannot be decoded as
    # audio. Its order is arbitrary, so sort before shuffling to make the
    # selection reproducible whenever `random` has been seeded.
    candidates = sorted(
        name for name in os.listdir(path)
        if os.path.isfile(os.path.join(path, name))
    )
    random.shuffle(candidates)
    selected_files = candidates[:num_files]
    if not selected_files:
        raise ValueError(f"No audio files selected from {path!r}")

    # Report progress against what will actually be loaded, which can be
    # smaller than the requested num_files.
    total = len(selected_files)

    audios = []
    for count, filename in enumerate(selected_files, start=1):
        y, _ = librosa.load(os.path.join(path, filename), sr=44100)

        # Optional per-clip standardization (zero mean, unit std).
        if mean_norm:
            y, _, _ = normalize(y)

        audios.append(adjust_size(y, max_file_size))
        print(f"{count} of {total} files processed", end="\r")

    # Stack the equal-length clips, then add the trailing axis for the model.
    audios = np.array(audios)
    dim_added = tf.expand_dims(audios, axis=2)

    return dim_added, audios
# Load a small random sample: `dataset` carries a trailing size-1 axis (fed to the model),
# `audios` is the same data without that axis (used for playback).
dataset, audios = loadMyDataset(directory, num_files=10)

In [7]:
# @title Model Structure
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.optimizers import Adam

class VAE(Model):
    """Fully connected variational autoencoder over fixed-length, single-channel waveforms.

    The encoder flattens a ``(num_samples, 1)`` input, passes it through two
    dense layers, and produces the mean and log-variance of a
    ``latent_dim``-dimensional Gaussian plus one sample drawn from it. The
    decoder mirrors the encoder and maps a latent vector back to a
    ``(num_samples, 1)`` output.

    Parameters
    ----------
    latent_dim : int
        Size of the latent vector.
    num_samples : int, optional
        Number of samples per waveform. Defaults to 132300 (3 s at 44100 Hz),
        the value that was previously hard-coded in every layer.
    output_activation : str, optional
        Activation of the decoder's last dense layer. Defaults to
        ``'sigmoid'``, the original behavior.
        NOTE(review): sigmoid confines outputs to (0, 1), so negative sample
        values can never be reconstructed. The clips in this notebook come
        straight from librosa and are presumably signed; confirm how the
        training data was scaled and consider ``'tanh'`` if it was not
        rescaled to (0, 1).
    **kwargs
        Forwarded to ``tensorflow.keras.Model``.

    Notes
    -----
    ``call`` returns only the reconstruction and this class adds no KL term;
    any training code has to obtain ``z_mean`` / ``z_log_var`` from
    ``self.encoder`` itself.
    """

    def __init__(self, latent_dim, num_samples=132300,
                 output_activation='sigmoid', **kwargs):
        super().__init__(**kwargs)
        self.latent_dim = latent_dim
        self.num_samples = num_samples
        self.output_activation = output_activation
        # The sub-models read the attributes above, so build them afterwards.
        self.encoder = self.build_encoder()
        self.decoder = self.build_decoder()
        self.optimizer = Adam(learning_rate=1e-3)
        self.loss_metric = MeanSquaredError()

    def build_encoder(self):
        """Return a model mapping a waveform to ``[z_mean, z_log_var, z]``."""
        layers = tf.keras.layers
        waveform = Input(shape=(self.num_samples, 1))
        hidden = layers.Flatten()(waveform)
        hidden = layers.Dense(512, activation='relu')(hidden)
        hidden = layers.Dense(128, activation='relu')(hidden)
        z_mean = layers.Dense(self.latent_dim)(hidden)
        z_log_var = layers.Dense(self.latent_dim)(hidden)
        z = Lambda(self.reparameterize)([z_mean, z_log_var])
        return Model(waveform, [z_mean, z_log_var, z])

    def build_decoder(self):
        """Return a model mapping a latent vector to a ``(num_samples, 1)`` waveform."""
        layers = tf.keras.layers
        latent = Input(shape=(self.latent_dim,))
        hidden = layers.Dense(128, activation='relu')(latent)
        hidden = layers.Dense(512, activation='relu')(hidden)
        flat = layers.Dense(self.num_samples,
                            activation=self.output_activation)(hidden)
        decoded = layers.Reshape((self.num_samples, 1))(flat)
        return Model(latent, decoded)

    def reparameterize(self, args):
        """Sample ``z = mean + exp(0.5 * log_var) * eps`` with ``eps ~ N(0, I)``.

        ``args`` is the pair ``(z_mean, z_log_var)``. Drawing the noise
        separately keeps the sample differentiable with respect to both.
        """
        z_mean, z_log_var = args
        epsilon = tf.random.normal(shape=(tf.shape(z_mean)[0], self.latent_dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

    def call(self, inputs):
        """Encode ``inputs``, decode the sampled latent, and return the reconstruction."""
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstructed = self.decoder(z)
        return reconstructed

In [9]:
# @title Load model
# Presumably the latent size the saved VAE was built with; the load call below does not use it.
latent_dim = 128
# Restore the exported SavedModel from Drive.
# NOTE(review): tf.saved_model.load returns a generic restored object, not an instance of the
# Keras `VAE` class defined above — it is only called directly later on; confirm that is intended.
loaded_model = tf.saved_model.load("/content/drive/My Drive/drunk_model_saved")

In [23]:
# @title Predict from input
# Position, within the loaded batch, of the clip to audition.
selected_file = 1
reference_audios = dataset
audio_comp = audios[selected_file]
print(reference_audios.shape)

# Run the whole batch through the restored model, keep only the chosen clip,
# and let np.squeeze drop any size-1 axes before playback.
generated_audio = np.squeeze(loaded_model(reference_audios)[selected_file])

# Original first, model output second, for an A/B listen.
display(ipd.Audio(audio_comp, rate=44100))
display(ipd.Audio(generated_audio, rate=44100))

(10, 132300, 1)
