In [2]:
!pip install audiofile

Collecting audiofile
  Downloading audiofile-1.3.0-py3-none-any.whl (1.1 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.1 MB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.1/1.1 MB[0m [31m17.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting audeer (from audiofile)
  Downloading audeer-1.20.1-py3-none-any.whl (23 kB)
Collecting audmath>=1.2.1 (from audiofile)
  Downloading audmath-1.4.0-py3-none-any.whl (23 kB)
Installing collected packages: audmath, audeer, audiofile
Successfully installed audeer-1.20.1 audiofile-1.3.0 audmath-1.4.0


In [3]:
import tensorflow as tf
from tensorflow import keras
from keras import layers
import numpy as np
import os
import random
import audiofile
import IPython.display as ipd

In [4]:
# Mount Google Drive at /content/drive; later cells read the dataset from
# and write the trained model to paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
# @title Hyperparameters

# --- Audio framing -----------------------------------------------------------
fs = 44100                          # sample rate used for playback (Hz)
max_seconds = 3                     # clip length every file is padded/cut to
max_file_size = fs * max_seconds    # clip length in samples

# --- Dataset -----------------------------------------------------------------
max_num_files = 2527                # upper bound on files the loader will read
mean_norm = False                   # True: standardize each clip when loading

# --- Model -------------------------------------------------------------------
latent_dim = 256                    # size of the latent vector
l2_strength = 5e-3                  # L2 kernel regularization factor
dropout_rate = 9e-3                 # NOTE(review): not referenced by the visible cells

# --- Optimization ------------------------------------------------------------
num_epochs = 30
batch_size = 16
learning_rate = 1e-4
grad_clip = 0.8                     # passed to Adam as `clipvalue`

In [6]:
# @title Load and preprocess data
# Target clip length in samples. Derived from the hyperparameter cell
# (max_seconds, fs) instead of repeating the literals, so changing the
# hyperparameters cannot silently disagree with the loader.
max_file_size = max_seconds * fs
def normalize(array):
    """Standardize ``array`` to zero mean and unit standard deviation.

    Args:
        array: numeric NumPy array (e.g. one audio clip).

    Returns:
        A tuple ``(normalized_array, std_dev, mean)`` where ``std_dev`` and
        ``mean`` are the statistics of the *input*, so the result can be
        mapped back with ``denormalize``.

    A constant input (for example pure silence) has a standard deviation of
    zero. Dividing by it would fill the result with NaN/inf, so in that case
    the centered signal (all zeros) is returned unscaled. The reported
    ``std_dev`` is still the true value (0), which keeps the
    ``denormalize`` round trip exact.
    """
    std_dev = np.std(array)
    mean = np.mean(array)
    # Guard against division by zero for constant signals.
    scale = std_dev if std_dev > 0 else 1.0
    normalized_array = (array - mean) / scale
    return normalized_array, std_dev, mean

def denormalize(data, original_std, original_mean):
    """Undo ``normalize``: rescale by the original std, then re-add the mean.

    Args:
        data: standardized values.
        original_std: standard deviation returned by ``normalize``.
        original_mean: mean returned by ``normalize``.

    Returns:
        ``data`` mapped back to the original scale and offset.
    """
    return data * original_std + original_mean


def adjust_size(in_file, max):
    """Force a 1-D array to exactly ``max`` elements.

    Longer inputs are cut to their first ``max`` elements, shorter inputs are
    right-padded with zeros of the same dtype, and inputs that already have
    the requested length are returned unchanged (same object).

    Args:
        in_file: NumPy array holding the samples.
        max: target length in samples.

    Returns:
        Array of length ``max``.
    """
    current_len = len(in_file)

    if current_len > max:
        return in_file[:max]

    if current_len < max:
        # Pad with zeros of the input's dtype so concatenation does not upcast.
        tail = np.zeros(int(max - current_len), dtype=in_file.dtype)
        return np.concatenate((in_file, tail))

    return in_file

# Folder on the mounted Google Drive whose entries are read by loadMyDataset.
directory = '/content/drive/My Drive/Instrument_Dataset'

def loadMyDataset(path, num_files=1000):
    """Load a random subset of audio files as fixed-length clips.

    Args:
        path: directory whose entries are read with ``audiofile.read``.
        num_files: maximum number of files to load; clamped to the
            notebook-level ``max_num_files``.

    Returns:
        A tuple ``(dim_added, audios)``: ``audios`` is a NumPy array of shape
        ``(n_files, max_file_size)`` and ``dim_added`` is the same data as a
        tensor with a trailing channel axis, ``(n_files, max_file_size, 1)``.

    Raises:
        ValueError: if ``path`` contains no files to load.
    """
    num_files = min(num_files, max_num_files)

    all_files = os.listdir(path)
    random.shuffle(all_files)
    selected_files = all_files[:num_files]
    # The directory may hold fewer files than requested; report progress
    # against what will actually be read.
    total = len(selected_files)

    audios = []
    for count, filename in enumerate(selected_files, start=1):
        # NOTE(review): the file's sample rate `sr` is not checked against
        # the notebook's `fs`; files at another rate are used as-is.
        y, sr = audiofile.read(os.path.join(path, filename))

        # audiofile returns (channels, samples) for multi-channel files.
        # Downmix to mono so every clip is 1-D and the clips can be stacked.
        if y.ndim > 1:
            y = y.mean(axis=0)

        # Optionally standardize the clip (zero mean, unit std) before sizing.
        if mean_norm:
            y, _, __ = normalize(y)

        shaped = adjust_size(y, max_file_size)
        print(f"{count} of {total} files processed", end="\r")
        audios.append(shaped)

    if not audios:
        raise ValueError(f"No audio files found in {path!r}")

    # Stack the equally sized clips and add a channel axis for Conv1D models.
    audios = np.array(audios)
    dim_added = tf.expand_dims(audios, axis=2)

    return dim_added, audios
# num_files=3000 is above max_num_files, so loadMyDataset clamps the request.
dataset, audios = loadMyDataset(directory, num_files=3000)




In [7]:
# Sanity check: `audios` is (files, samples); `dataset` adds a trailing channel axis.
audios.shape, dataset.shape

((2526, 132300), TensorShape([2526, 132300, 1]))

In [None]:
# Preview up to the first 8 clips. min() avoids an IndexError when fewer
# than 8 files were loaded.
for i in range(min(8, len(audios))):
  display(ipd.Audio(audios[i], rate=fs))
  print(dataset[i].shape)

In [12]:
# @title Create Vae class
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError

class VAE(Model):
    """Fully connected (variational) autoencoder over fixed-length waveforms.

    The encoder maps a clip of ``num_samples`` values to ``z_mean``,
    ``z_log_var`` and a sampled latent ``z``; the decoder maps ``z`` back to a
    clip of the same length.

    Args:
        latent_dim: size of the latent vector.
        num_samples: clip length in samples (defaults to 132300, i.e. 3 s at
            44.1 kHz, the value previously hard-coded in the layers).
        output_activation: activation of the final decoder layer. The
            previous ``'sigmoid'`` confined reconstructions to (0, 1), so
            negative samples could never be reproduced. ``'tanh'`` matches
            waveforms in [-1, 1]; use ``'linear'`` for standardized
            (unbounded) inputs.
        kl_weight: weight of the KL-divergence term added through
            ``add_loss``. With the default ``0.0`` no KL term is added and
            the model trains on the compiled reconstruction loss only, as
            before; set it above zero to train as a true VAE.
        **kwargs: forwarded to ``keras.Model``.
    """

    def __init__(self, latent_dim, num_samples=132300, output_activation='tanh',
                 kl_weight=0.0, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.latent_dim = latent_dim
        self.num_samples = num_samples
        self.output_activation = output_activation
        self.kl_weight = kl_weight
        self.encoder = self.build_encoder()
        self.decoder = self.build_decoder()
        # Kept so existing attribute access keeps working. The training
        # helper compiles the model with its own optimizer and loss.
        self.optimizer = Adam(learning_rate=1e-3)
        self.loss_metric = MeanSquaredError()

    def build_encoder(self):
        """Build the dense encoder returning ``[z_mean, z_log_var, z]``."""
        inputs = Input(shape=(self.num_samples,))
        x = inputs
        for units in (2048, 1024, 512, 256):
            x = Dense(units, activation='relu', kernel_regularizer=l2(l2_strength))(x)
        z_mean = Dense(self.latent_dim)(x)
        z_log_var = Dense(self.latent_dim)(x)
        z = Lambda(self.reparameterize)([z_mean, z_log_var])
        return Model(inputs, [z_mean, z_log_var, z])

    def build_decoder(self):
        """Build the dense decoder mapping a latent vector to a waveform."""
        inputs = Input(shape=(self.latent_dim,))
        x = inputs
        for units in (256, 512, 1024, 2048):
            x = Dense(units, activation='relu', kernel_regularizer=l2(l2_strength))(x)
        outputs = Dense(self.num_samples, activation=self.output_activation,
                        kernel_regularizer=l2(l2_strength))(x)
        return Model(inputs, outputs)

    def reparameterize(self, args):
        """Sample ``z = mean + exp(0.5 * log_var) * eps`` with ``eps ~ N(0, I)``."""
        z_mean, z_log_var = args
        epsilon = tf.random.normal(shape=(tf.shape(z_mean)[0], self.latent_dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

    def call(self, inputs):
        """Encode ``inputs``, sample a latent vector and decode it."""
        z_mean, z_log_var, z = self.encoder(inputs)
        if self.kl_weight:
            # KL(q(z|x) || N(0, I)), summed over latent dims, averaged over the batch.
            kl = -0.5 * tf.reduce_mean(
                tf.reduce_sum(
                    1.0 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var),
                    axis=-1,
                )
            )
            self.add_loss(self.kl_weight * kl)
        reconstructed = self.decoder(z)
        return reconstructed


In [12]:
# @title Create conv1d dense combo VAE
from tensorflow.keras.layers import Input, Conv1D, Flatten, Dense, Reshape, Lambda, LeakyReLU
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError

class VAE(Model):
    """Conv1D + Dense (variational) autoencoder over fixed-length waveforms.

    Args:
        latent_dim: size of the latent vector.
        num_samples: clip length in samples (defaults to 132300, the value
            previously hard-coded in the layers).
        pool_size: average-pooling factor applied after each encoder
            convolution. Without downsampling, ``Flatten`` produced
            132300 * 256 features, so each latent ``Dense`` layer needed
            roughly 8.7 billion weights and the model could not be allocated.
            With the default of 16 the flattened size shrinks by about 256x.
        output_activation: activation of the final decoder layer. The
            previous ``'sigmoid'`` confined reconstructions to (0, 1), so
            negative samples could never be reproduced. ``'tanh'`` matches
            waveforms in [-1, 1]; use ``'linear'`` for standardized inputs.
        kl_weight: weight of the KL-divergence term added through
            ``add_loss``. With the default ``0.0`` no KL term is added and
            only the compiled reconstruction loss is used, as before.
        **kwargs: forwarded to ``keras.Model``.
    """

    def __init__(self, latent_dim, num_samples=132300, pool_size=16,
                 output_activation='tanh', kl_weight=0.0, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.latent_dim = latent_dim
        self.num_samples = num_samples
        self.pool_size = pool_size
        self.output_activation = output_activation
        self.kl_weight = kl_weight
        self.encoder = self.build_encoder()
        self.decoder = self.build_decoder()
        # Kept so existing attribute access keeps working. The training
        # helper compiles the model with its own optimizer and loss.
        self.optimizer = Adam(learning_rate=1e-3)
        self.loss_metric = MeanSquaredError()

    def build_encoder(self):
        """Build the conv encoder returning ``[z_mean, z_log_var, z]``."""
        inputs = Input(shape=(self.num_samples, 1))
        x = Conv1D(64, 3, activation=LeakyReLU(alpha=0.2), padding='same')(inputs)
        # Downsample in time before widening the channels. padding='same'
        # makes the pooled length ceil(n / pool_size), so any clip length works.
        x = tf.keras.layers.AveragePooling1D(pool_size=self.pool_size, padding='same')(x)
        x = Conv1D(256, 3, activation=LeakyReLU(alpha=0.2), padding='same')(x)
        x = tf.keras.layers.AveragePooling1D(pool_size=self.pool_size, padding='same')(x)
        x = Flatten()(x)
        z_mean = Dense(self.latent_dim)(x)
        z_log_var = Dense(self.latent_dim)(x)
        z = Lambda(self.reparameterize)([z_mean, z_log_var])
        return Model(inputs, [z_mean, z_log_var, z])

    def build_decoder(self):
        """Build the decoder mapping a latent vector to a ``(num_samples, 1)`` clip."""
        inputs = Input(shape=(self.latent_dim,))
        x = Dense(self.num_samples)(inputs)
        x = Reshape((self.num_samples, 1))(x)
        # NOTE(review): these convolutions still run at full time resolution
        # with up to 256 channels, which is memory-heavy per batch element.
        x = Conv1D(256, 3, activation=LeakyReLU(alpha=0.2), padding='same')(x)
        x = Conv1D(64, 3, activation=LeakyReLU(alpha=0.2), padding='same')(x)
        decoded = Conv1D(1, 3, activation=self.output_activation, padding='same')(x)
        return Model(inputs, decoded)

    def reparameterize(self, args):
        """Sample ``z = mean + exp(0.5 * log_var) * eps`` with ``eps ~ N(0, I)``."""
        z_mean, z_log_var = args
        epsilon = tf.random.normal(shape=(tf.shape(z_mean)[0], self.latent_dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

    def call(self, inputs):
        """Encode ``inputs``, sample a latent vector and decode it."""
        z_mean, z_log_var, z = self.encoder(inputs)
        if self.kl_weight:
            # KL(q(z|x) || N(0, I)), summed over latent dims, averaged over the batch.
            kl = -0.5 * tf.reduce_mean(
                tf.reduce_sum(
                    1.0 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var),
                    axis=-1,
                )
            )
            self.add_loss(self.kl_weight * kl)
        reconstructed = self.decoder(z)
        return reconstructed


In [15]:
# @title Train the model
from tensorflow.keras.callbacks import EarlyStopping

def train_vae(data, learning_rate, num_epochs, latent_dim, batch_size):
    """Build a ``VAE`` and train it to reconstruct ``data``.

    Args:
        data: training clips; used as both input and target (autoencoder).
        learning_rate: Adam learning rate.
        num_epochs: maximum number of epochs.
        latent_dim: latent size passed to ``VAE``.
        batch_size: mini-batch size.

    Returns:
        The trained ``VAE``. Early stopping restores the weights of the
        epoch with the lowest validation loss.

    Gradients are clipped element-wise with the notebook-level ``grad_clip``.
    The previous ``!nvidia-smi`` shell escapes inside the function body were
    leftover debugging and are not valid Python outside IPython; run
    ``!nvidia-smi`` in its own cell if GPU status is needed.
    """
    vae = VAE(latent_dim)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, clipvalue=grad_clip)
    vae.compile(loss='mean_squared_error', optimizer=optimizer)

    # Stop once validation loss has not improved for 5 epochs.
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # The last 20% of `data` is held out for validation.
    vae.fit(
        data,
        data,
        batch_size=batch_size,
        epochs=num_epochs,
        callbacks=[early_stopping],
        validation_split=0.2,
    )
    return vae

In [18]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow_model_optimization.python.core.sparsity import keras as sparsity

def train_vae(data, learning_rate, num_epochs, latent_dim, batch_size, pruning_params=None):
    """Train a ``VAE`` on ``data``, optionally with magnitude pruning.

    Args:
        data: training clips; used as both input and target (autoencoder).
        learning_rate: Adam learning rate.
        num_epochs: maximum number of epochs.
        latent_dim: latent size passed to ``VAE``.
        batch_size: mini-batch size.
        pruning_params: keyword arguments for
            ``sparsity.prune_low_magnitude``; falsy disables pruning.

    Returns:
        The trained model. When pruning is enabled, the pruning wrappers are
        stripped before the model is returned.

    NOTE(review): ``VAE`` is a subclassed model; confirm that
    ``prune_low_magnitude`` accepts it before relying on the pruning path.
    """
    vae = VAE(latent_dim)

    # Adam with element-wise gradient clipping, shared by both code paths.
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, clipvalue=grad_clip)
    vae.compile(loss='mean_squared_error', optimizer=optimizer)

    callbacks = [EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)]
    # `callbacks` is held by reference, so the pruning callback appended
    # below is included when fit() runs.
    fit_kwargs = dict(
        batch_size=batch_size,
        epochs=num_epochs,
        callbacks=callbacks,
        validation_split=0.2,
    )

    if not pruning_params:
        # Plain training of the unpruned model.
        vae.fit(data, data, **fit_kwargs)
        return vae

    # Wrap the model for pruning, recompile the wrapper, then train it.
    pruned_model = sparsity.prune_low_magnitude(vae, **pruning_params)
    pruned_model.compile(loss='mean_squared_error', optimizer=optimizer)
    callbacks.append(sparsity.UpdatePruningStep())
    pruned_model.fit(data, data, **fit_kwargs)

    # Remove the pruning wrappers for inference.
    return sparsity.strip_pruning(pruned_model)


ModuleNotFoundError: ignored

In [16]:
# Train on the loaded clips using the values from the hyperparameter cell.
trained = train_vae(dataset, learning_rate, num_epochs=num_epochs, latent_dim=latent_dim,batch_size=batch_size)

ResourceExhaustedError: ignored

In [16]:
# Encode one clip, decode it again and compare the two by ear.
reference_audio = audios[100]
normalized_audio, og_std, og_mean = normalize(reference_audio)

# Feed the encoder the same representation the model was trained on.
encoder_input = normalized_audio if mean_norm else reference_audio
# The encoder returns [z_mean, z_log_var, z]; index 0 selects z_mean.
latent_representation = trained.encoder.predict(encoder_input.reshape(1, -1))[0]

# Generate audio from the latent representation
generated_audio = trained.decoder.predict(latent_representation)
print(generated_audio.shape)
if mean_norm:
    generated_audio = denormalize(generated_audio, og_std, og_mean)
generated_audio = np.squeeze(generated_audio)
print(generated_audio.shape)
print(normalized_audio.shape)

print("Reference Audio")
display(ipd.Audio(reference_audio, rate=fs))
print("Reconstructed Audio")
display(ipd.Audio(generated_audio, rate=fs))
generated_audio

(1, 132300)
(132300,)
(132300,)
Reference Audio


Reconstructed Audio


array([0.40683606, 0.40682888, 0.40682912, ..., 0.40686077, 0.40690425,
       0.40691212], dtype=float32)

In [33]:
# @title Save model architecture and weights
# Export the full model as a TensorFlow SavedModel directory on Drive.
tf.saved_model.save(trained, "/content/drive/My Drive/drunk_model_saved")
# Also store the weights alone in HDF5 format.
# NOTE(review): the load cell reads "drunk_model.json", which no visible cell writes.
trained.save_weights("/content/drive/My Drive/drunk_model_weights.h5")


In [None]:
import json
# @title Load model
from keras.models import model_from_json

# The saved JSON config names the custom, subclassed `VAE` class, so Keras
# needs that class in `custom_objects` to rebuild the model.
custom_objects = {"VAE": VAE}

# Load the model architecture from the JSON file. The context manager closes
# the file even if reading fails.
with open("/content/drive/My Drive/drunk_model.json", 'r') as json_file:
    model_json = json_file.read()
print(model_json)

# Create a new model from the JSON configuration
vae = model_from_json(model_json, custom_objects=custom_objects)

# HDF5 weights can only be loaded into a subclassed model after its variables
# exist, so run one dummy batch through the model first.
vae(tf.zeros((1,) + tuple(vae.encoder.input_shape[1:])))

# Load the model weights
vae.load_weights("/content/drive/My Drive/drunk_model_weights.h5")

# Compile with the same loss used for training. The previous placeholder
# string 'your_loss_function' is not a valid Keras loss identifier.
vae.compile(optimizer='adam', loss='mean_squared_error')

{"class_name": "VAE", "config": {"latent_dim": 256}, "keras_version": "2.13.1", "backend": "tensorflow"}


AttributeError: ignored