In [None]:
import os
import platform
import random
import time

import IPython.display as ipd
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import soundfile as sf
import tensorflow as tf
from sklearn import metrics

import data
import model_utils

%matplotlib inline
%config IPCompleter.greedy=True

# Training Parameters

In [None]:
# Sample rate passed to the data creation and audio loading calls
sr = 22050

# Genre whose effect is modelled
genre = "Metal"

# Number of samples in one frame fed to the model during training
frame = 64

# Chunk == sample
chunk = 1

# Length of the generated data, in seconds
duration = 30

# Base name used when the model is saved
model_name_save = 'csc475_gan_baseline'

# When True, reuse the audio already stored in the dataset/experiments folder
use_saved_data = False

# Frame ordering: sequential (True) or random (False)
sequential_input = False

# Shuffling is simply the opposite of sequential ordering
shuffle_state = not sequential_input

# Fraction of the signal held out for testing
test_ratio = 0.2

# Toggle mu-law companding
mu_law = False

# Training batch size
batch_size_para = 64

# Number of training epochs
epochs_ = 50

# Test segment to inspect visually
# (original note: between 0 and 5 for the default 2 minute duration)
index = 4

In [None]:
# Disabling GPU for Mac M1 chips running tensorflow metal (RNNs run slowly on GPU).
# The cell used to hide the GPU unconditionally even though it must not be used on
# other machines, so a "Run All" on any other host silently trained on the CPU.
# The platform guard restricts it to Apple-silicon macOS.
if platform.system() == 'Darwin' and platform.machine() == 'arm64':
    tf.config.experimental.set_visible_devices([], 'GPU')

# Loading Training Data

In [None]:
# Metadata tables handed to data.create_data
file_data_path, effect_data_path = (
    os.path.join("dataset", csv_name) for csv_name in ("fileData.csv", "effectData.csv")
)

# Where the dry (clean) and wet (effect) audio are cached
clean_audio_path, effect_audio_path = (
    os.path.join("dataset", "experiments", wav_name)
    for wav_name in ("clean_data.wav", "effect_data.wav")
)

if not use_saved_data:
    # Build fresh data and cache both signals as WAV files
    signal, wet = data.create_data(
        genre,
        effect_data_path,
        file_data_path,
        mu_comp=mu_law,
        srate=sr,
        duration=duration,
        type="random",
    )
    for wav_path, samples in ((clean_audio_path, signal), (effect_audio_path, wet)):
        scipy.io.wavfile.write(wav_path, rate=sr, data=samples)
else:
    # Reuse the cached WAV files
    signal, _ = librosa.load(clean_audio_path, sr=sr)
    wet, _ = librosa.load(effect_audio_path, sr=sr)

# Number of frames in the training share of the signal
training_dataset_ = int((len(signal) / frame) * (1 - test_ratio))

# Number of frames in the testing share (not the proper testset, details below)
testing_dataset_ = int((len(signal) / frame) * test_ratio)

# Switch for the optional filtering stage
filtered = False

# High pass FIR design
numtaps, cutoff = 91, 0.015
b = scipy.signal.firwin(numtaps, cutoff, width=None, window='hamming', pass_zero='highpass')

# Low pass FIR design
numtaps, cutoff = 41, 0.92
b2 = scipy.signal.firwin(numtaps, cutoff, width=None, window='hamming', pass_zero='lowpass')

# Optionally high pass audio to emphasize high frequency information, low pass to avoid aliasing artifacts
if filtered:
    # High pass first, then low pass; each filter is applied to both signals
    for taps in (b, b2):
        signal = scipy.signal.lfilter(taps, 1, signal)
        wet = scipy.signal.lfilter(taps, 1, wet)

In [None]:
# Baseline: how far the untouched dry signal already is from the wet target
print('Comparing the original dry audio to the wet audio as a reference' )
for template, scorer in (
    ('Mean absolute error: %.4f', metrics.mean_absolute_error),
    ('Mean squared error: %.4f', metrics.mean_squared_error),
    ('Coefficient of determination (R2 score): %.4f', metrics.r2_score),
):
    print(template % scorer(signal, wet))

In [None]:
# Sample counts of the two contiguous partitions
train_length = int(len(signal) * (1 - test_ratio))
test_length = int(len(signal) * test_ratio)

# The test partition starts right where the training partition ends
features_train, features_test = signal[:train_length], signal[train_length:train_length + test_length]
targets_train, targets_test = wet[:train_length], wet[train_length:train_length + test_length]

# Durations of the training partitions in seconds
length_in_seconds = features_train.size / sr
length_for_wet = targets_train.size / sr
for message, seconds in (
    ('Length of training dry audio is {} seconds', length_in_seconds),
    ('Length of training wet audio is {} seconds', length_for_wet),
):
    print(message.format(seconds))

# Constructing the baseline models

In [None]:
# GAN architecture: generator network
class Generator(tf.keras.models.Model):
    """Three-layer dense generator.

    Two ReLU hidden layers (256 and 512 units) feed a tanh output layer
    that emits ``frame * chunk`` values per input row.
    """

    def __init__(self):
        super().__init__()
        dense = tf.keras.layers.Dense
        self.dense1 = dense(256, activation='relu')
        self.dense2 = dense(512, activation='relu')
        self.dense3 = dense(frame * chunk, activation='tanh')

    def call(self, inputs):
        """Pass ``inputs`` through the three dense layers in order."""
        hidden = self.dense2(self.dense1(inputs))
        return self.dense3(hidden)

class Discriminator(tf.keras.models.Model):
    """Three-layer dense discriminator.

    Two ReLU hidden layers (512 and 256 units) feed a single sigmoid unit,
    so the network outputs one value in (0, 1) per input row.
    """

    def __init__(self):
        super().__init__()
        dense = tf.keras.layers.Dense
        self.dense1 = dense(512, activation='relu')
        self.dense2 = dense(256, activation='relu')
        self.dense3 = dense(1, activation='sigmoid')

    def call(self, inputs):
        """Pass ``inputs`` through the three dense layers in order."""
        hidden = self.dense2(self.dense1(inputs))
        return self.dense3(hidden)

# Initialize generator, discriminator, and GAN
generator = Generator()
discriminator = Discriminator()

# NOTE(review): this chains the generator straight into the discriminator, whereas
# train_step feeds the discriminator the generated audio concatenated with the
# effect signal -- confirm the stacked model's input widths are what is intended.
model = tf.keras.Sequential([generator, discriminator], name=model_name_save)

# Define loss functions
# The discriminator's last layer already applies a sigmoid, so its outputs are
# probabilities, not logits. With from_logits=True the loss applied a second
# sigmoid on top of them; from_logits=False matches the network's actual output.
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=False)

# Define optimizers
generator_optimizer = tf.keras.optimizers.Adam(1e-4)
discriminator_optimizer = tf.keras.optimizers.Adam(1e-4)

# NOTE(review): neither sub-model has been called on data at this point; depending
# on the Keras version, summary() may raise or report unbuilt layers -- confirm.
model.summary()

# Training the Model

In [None]:
# Width of the random vector fed to the generator alongside the effect signal
noise_dim = 100

def generate_noise(batch_size, noise_dim):
    """Return a ``(batch_size, noise_dim)`` tensor drawn with ``tf.random.normal``."""
    noise_shape = [batch_size, noise_dim]
    return tf.random.normal(noise_shape)

@tf.function
def train_step(audio, effects):
    """Run one adversarial update of the generator and the discriminator.

    The generator receives random noise concatenated with ``effects``; the
    discriminator scores both ``audio`` and the generated audio, each
    concatenated with ``effects``. Both networks are then updated with their
    own optimizer. Nothing is returned.

    Args:
        audio: batch of real audio rows.
        effects: batch of conditioning rows, one per audio row.
    """
    n_examples = tf.shape(audio)[0]
    noise = generate_noise(n_examples, noise_dim)

    # One tape per network so each loss is differentiated independently
    with tf.GradientTape() as gen_tape:
        with tf.GradientTape() as disc_tape:
            generator_input = tf.concat([noise, effects], axis=1)
            generated_audio = generator(generator_input, training=True)

            real_output = discriminator(tf.concat([audio, effects], axis=1), training=True)
            fake_output = discriminator(tf.concat([generated_audio, effects], axis=1), training=True)

            # Generator wants fakes labelled 1; discriminator wants real=1, fake=0
            gen_loss = cross_entropy(tf.ones_like(fake_output), fake_output)
            disc_loss = (
                cross_entropy(tf.ones_like(real_output), real_output)
                + cross_entropy(tf.zeros_like(fake_output), fake_output)
            )

    gen_vars = generator.trainable_variables
    disc_vars = discriminator.trainable_variables

    gradients_of_generator = gen_tape.gradient(gen_loss, gen_vars)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, disc_vars)

    generator_optimizer.apply_gradients(zip(gradients_of_generator, gen_vars))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, disc_vars))

# Define function to apply effect to audio
def apply_effect(audio, effect):
    """Generate audio from the trained generator, conditioned on ``effect``.

    The previous version always built a single ``(1, noise_dim)`` noise row and
    concatenated it with ``effect`` along axis 1, which fails for the 1-D
    signals this notebook passes in (rank mismatch). A 1-D ``effect`` is now cut
    into rows of ``frame`` samples (zero-padding the last row), one noise row is
    drawn per frame, and the generated frames are flattened and trimmed back to
    the input length. A 2-D ``effect`` is used as given; for a single row the
    result equals the old ``generated_audio.numpy()[0]``.

    Args:
        audio: unused; kept so existing calls keep working.
            NOTE(review): the generator is conditioned only on ``effect``,
            mirroring train_step -- confirm this is the intended direction.
        effect: 1-D signal or 2-D array of conditioning rows.

    Returns:
        1-D numpy array with the generated samples.
    """
    # float32 matches the dtype of the noise tensor so tf.concat accepts both
    conditioning = np.asarray(effect, dtype=np.float32)

    n_samples = None
    if conditioning.ndim == 1:
        n_samples = conditioning.size
        padding = -n_samples % frame
        conditioning = np.pad(conditioning, (0, padding)).reshape(-1, frame)

    noise = generate_noise(conditioning.shape[0], noise_dim)
    generated_audio = generator(tf.concat([noise, conditioning], axis=1), training=False)
    flat_audio = generated_audio.numpy().reshape(-1)

    # Trimming drops the samples produced from zero padding.
    # NOTE(review): this assumes one output sample per input sample (chunk == 1).
    return flat_audio if n_samples is None else flat_audio[:n_samples]

# train GAN
def train_gan(audio, effects, epochs, batch_size, frame_size=None):
    """Train the GAN on framed audio for ``epochs`` passes over the data.

    Both signals are cut into consecutive, non-overlapping rows of
    ``frame_size`` samples and passed to ``train_step`` in batches of
    ``batch_size`` rows. The previous loop sliced ``batch_size`` *samples* and
    wrapped them as a batch of one, so every step trained on a single example
    and the slice only matched the generator's ``frame * chunk`` output width
    while ``batch_size`` happened to equal ``frame``.

    Args:
        audio: 1-D signal whose frames are presented to the discriminator as real.
        effects: 1-D conditioning signal aligned with ``audio``.
        epochs: number of passes over the framed data.
        batch_size: number of frames per training step.
        frame_size: samples per frame; defaults to the notebook-level ``frame``.
    """
    if frame_size is None:
        frame_size = frame

    # float32 matches the noise tensor created inside train_step
    audio = np.asarray(audio, dtype=np.float32).reshape(-1)
    effects = np.asarray(effects, dtype=np.float32).reshape(-1)

    # Keep only whole frames that exist in both signals
    n_frames = min(audio.size, effects.size) // frame_size
    usable = n_frames * frame_size
    audio_frames = audio[:usable].reshape(n_frames, frame_size)
    effect_frames = effects[:usable].reshape(n_frames, frame_size)

    # Ceiling division: the final batch may hold fewer than batch_size frames
    n_batches = (n_frames + batch_size - 1) // batch_size

    for epoch in range(epochs):
        print('Epoch {}'.format(epoch + 1))
        start = time.time()

        for counter, first in enumerate(range(0, n_frames, batch_size), start=1):
            last = first + batch_size
            train_step(
                tf.convert_to_tensor(audio_frames[first:last]),
                tf.convert_to_tensor(effect_frames[first:last]),
            )
            print('Batch {} / {}'.format(counter, n_batches), end='\r')

        print ('Time for epoch {} is {} sec'.format(epoch + 1, time.time()-start))

# Kick off training on the training partition with the configured epochs and batch size
train_gan(
    audio=features_train,
    effects=targets_train,
    epochs=epochs_,
    batch_size=batch_size_para,
)

# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('model accuracy')
# plt.ylabel('loss')
# plt.xlabel('epoch')
# plt.legend(['train', 'val'], loc='upper left')
# plt.show()

In [None]:
# Generator output for the test and training partitions
tar_pred = apply_effect(features_test, targets_test)
train_tar_pred = apply_effect(features_train, targets_train)

# Mean squared error (lower the better)
print('Mean squared error: {}'.format(metrics.mean_squared_error(targets_test, tar_pred)))

# Remaining scores share one print format:
#   mean / median absolute error  -> lower is better
#   R2 and explained variance     -> 1 is a perfect prediction, can get arbitrarily worse
for template, scorer in (
    ('Mean absolute error: %.4f', metrics.mean_absolute_error),
    ('Median absolute error: %.4f', metrics.median_absolute_error),
    ('Coefficient of determination (R2 score): %.4f', metrics.r2_score),
    ('Explained variance score: %.4f', metrics.explained_variance_score),
):
    print(template % scorer(targets_test, tar_pred))

model_utils.plot_result(targets_train, targets_test, train_tar_pred, tar_pred)

In [None]:
# Write the model under dataset/models using the configured name (ignore warnings)
model_file = os.path.join('dataset', 'models', model_name_save + '.keras')
model.save(model_file)

# Evaluating the Model

In [None]:
# NOTE(review): dry_test and wet_test are not assigned in any cell visible here;
# they must exist before this cell runs -- confirm where they are created.

# Time the preparation and prediction of the selected test segment
start = time.time()
prepared_audio = model_utils.prepare_audio_seq(dry_test, index=index, frame=frame)
testfile = model.predict(prepared_audio).flatten()
stop = time.time()

inference = stop - start
print('Inference of 5 Seconds of audio took {} seconds with a samplerate of {}'.format(inference, sr))

# Dry input and wet reference for the same segment
original, original_wet = dry_test[index], wet_test[index]

if mu_law:
    # Undo mu-law companding on all three signals; predictions are rounded first
    original = librosa.mu_expand(dry_test[index])
    testfile = librosa.mu_expand(np.round(testfile))
    original_wet = librosa.mu_expand(wet_test[index])

# Plot and play each signal in turn
for waveform, title in (
    (original, 'Input'),
    (testfile, 'Predicted Output'),
    (original_wet, 'Correct Output'),
):
    model_utils.plot_waveform(waveform, title)
    ipd.display(ipd.Audio(waveform, rate=sr))

In [None]:
# Overlay dry, predicted and target wet audio over two short sample windows
comparison_title = model_name_save + ' ' + genre
for window_start, window_end in ((10000, 10200), (40000, 40200)):
    model_utils.compare_waveforms(
        original,
        testfile,
        original_wet,
        comparison_title,
        window_start,
        window_end,
    )

In [None]:
# Spectrograms of the input, the target wet audio and the prediction, in that order
for spectrogram_audio, spectrogram_title in (
    (original, 'Input ' + model_name_save),
    (original_wet, 'Original Output ' + model_name_save + ' ' + genre),
    (testfile, 'Predicted Output ' + model_name_save + ' ' + genre),
):
    model_utils.plot_spectrogram_hz(spectrogram_audio, sr, spectrogram_title, 'hz')

In [None]:
def model_description(model):
    """Print the Keras summary of ``model`` followed by the notebook's training settings.

    Args:
        model: Keras model whose ``summary()`` is printed.

    Returns:
        None; everything is written to standard output.
    """
    # summary() prints on its own and returns None, so wrapping it in print()
    # used to emit an extra "None" line.
    model.summary()
    print(model_name_save)
    print('The effect modeled: {}'.format(genre))
    print('Size of input audio frame is {}'.format(frame))
    print('Total length of audio in training dataset: {} seconds'.format(features_train.size / sr))
    # features_train is a 1-D slice of the signal, so shape[0] counted samples;
    # dividing the sample count by the frame size gives the number of frames.
    print('Total number of frames in the training set: {}'.format(features_train.size // frame))
    print('Number of epochs: {}'.format(epochs_))
    print('Batch size during training: {}'.format(batch_size_para))
    print('Sequential input: {}'.format(sequential_input))

# Function to compute metrics on multiple segments of the test set. k_fold determines how many
# segments are analysed. If randomized is False, it computes metrics for segments 0 to k_fold - 1,
# so the whole test set is covered when k_fold is set to dry_test.shape[0].
# If randomized is True, k_fold segments are drawn at random from the test set.
def avg_metrics_on_predictions(k_fold=5, randomized=False):
    """Predict ``k_fold`` test segments and report per-segment and overall metrics.

    Changes from the previous version:
      * random indices come from ``random.randrange`` over the test set;
        ``random.randint(0, k_fold)`` includes its upper bound and could pick an
        index one past the end when ``k_fold`` equals the number of segments;
      * the result buffers have exactly ``k_fold`` rows, so never-filled zero
        rows no longer enter the overall metrics when ``k_fold`` is smaller than
        the test set;
      * the model description is printed directly instead of formatting its
        ``None`` return value into the message;
      * the reported audio duration covers the segments actually predicted.

    Args:
        k_fold: number of segments to evaluate (at least 1).
        randomized: draw segments at random instead of taking the first ``k_fold``.

    Returns:
        Tuple ``(MAE_, R2_, EN_MAE_, ESR_, inference_time)`` where
        ``inference_time`` is prediction time divided by seconds of audio predicted.

    Raises:
        ValueError: if ``k_fold`` is smaller than 1.
    """
    if k_fold < 1:
        raise ValueError('k_fold must be at least 1, got {}'.format(k_fold))

    n_segments = dry_test.shape[0]
    predicted = np.zeros((k_fold, dry_test.shape[1]))
    original_wet = np.zeros((k_fold, dry_test.shape[1]))
    r2 = []
    mae = []
    timer = 0

    for i in range(k_fold):
        segment = random.randrange(n_segments) if randomized else i

        # Only preparation and prediction count towards the inference time
        start = time.time()
        to_predict = model_utils.prepare_audio_seq(dry_test, segment, frame=frame)
        prediction = model.predict(to_predict).flatten()
        timer += time.time() - start

        predicted[i] = prediction
        original_wet[i] = wet_test[segment]

        r2.append(metrics.r2_score(original_wet[i], predicted[i]))
        mae.append(metrics.mean_absolute_error(original_wet[i], predicted[i]))

    print('The model:')
    model_description(model)
    print('R2 individual scores for segments is {}'.format(np.array(r2)))
    print('Mae individual scores for segments is {}'.format(np.array(mae)))
    print('Overall average metrics for original wet audio vs predicted on test set:' )

    MAE_ = metrics.mean_absolute_error(original_wet, predicted)
    R2_ = metrics.r2_score(original_wet, predicted)
    EN_MAE_ = model_utils.energy_normalized_mae(original_wet, predicted)
    ESR_ = model_utils.esr(original_wet, predicted)

    print('Energy Normalized Mae: {}'.format(EN_MAE_))
    print('Mae: {}'.format(MAE_))
    print('R2: {}'.format(R2_) )
    print('ESR: {}'.format(ESR_))

    # Equals dry_test.size / sr when every segment is evaluated
    audio_seconds = predicted.size / sr
    print('Inference time for {} seconds of audio was {} seconds'.format(audio_seconds, timer))
    inference_time = timer / audio_seconds

    return MAE_, R2_, EN_MAE_, ESR_, inference_time

In [None]:
# Evaluate every test segment in order and keep the aggregate metrics
n_test_segments = dry_test.shape[0]
(MAE_, R2_, EN_MAE_, ESR_, inference_time) = avg_metrics_on_predictions(
    k_fold=n_test_segments,
    randomized=False,
)

# Compiling Results into a CSV

In [None]:
# Folder that collects the experiment artefacts
experiments_path = os.path.join('dataset', 'experiments')

# Store the predicted segment and its wet reference as WAV files
for wav_name, wav_samples in (
    (model_name_save + genre +'.wav', testfile),
    (genre +  str(index) + '.wav', original_wet),
):
    sf.write(os.path.join(experiments_path, wav_name), wav_samples, sr)

In [None]:
# One results table per genre
csv_path = os.path.join(experiments_path, 'experiments-'+ genre + '.csv')

if not os.path.isfile(csv_path):
    # First run: start an empty table with named columns
    dataset = pd.DataFrame(columns=[
        'Model Name', 'Effect', 'Frame', 'Sequential Input', 'Training Dataset',
        'Hidden Units', 'Batch Size', 'Epochs', 'MAE', 'R2', 'Inference Time',
        'EN MAE'
    ])
else:
    # Existing file is read without a header row, so any header line in the
    # file is loaded as ordinary data and the columns are numbered
    dataset = pd.read_csv(csv_path, header=None)

In [None]:
# Adding a row to the dataframe
# `hidden_units` is not assigned in any cell visible in this notebook, so the
# original row construction raised NameError. The widths of the generator's two
# hidden Dense layers are recorded instead, e.g. "256-512".
# NOTE(review): confirm this is the value wanted in the 'Hidden Units' column.
generator_hidden_units = '-'.join(
    str(layer.units) for layer in (generator.dense1, generator.dense2)
)

s_row = pd.Series([
    model_name_save, genre, frame, sequential_input, training_dataset_, generator_hidden_units,
    batch_size_para, epochs_, MAE_, R2_, inference_time, EN_MAE_
], index=dataset.columns)

# Append the above pandas Series object as a row to the existing pandas DataFrame
dataset.loc[len(dataset)] = s_row

# Displaying the dataframe
dataset

In [None]:
# Write the table back to disk; the header row is only emitted when the
# CSV file does not exist yet.
write_header = not os.path.isfile(csv_path)
dataset.to_csv(csv_path, header=write_header, index=False)

# Create Data for Demoing the Model

In [None]:
# Build a 15 second pentatonic-scale example for the demo
# (re-run until the first 5 seconds sound decent)
clean, effect = data.create_data(
    genre,
    effect_data_path,
    file_data_path,
    mu_comp=mu_law,
    srate=sr,
    duration=15,
    type="pentatonic",
)
length = int(len(clean) / frame)

ipd.display(ipd.Audio(clean, rate=sr))

# NOTE(review): create_dataset is called without a module prefix and is not
# defined in any cell visible here -- confirm which module provides it.
clean_test, effect_test, _, _, _, _ = create_dataset(clean, effect, 0, length, frame)

In [None]:
# Predict audio with model and save files for later use
# prepare_audio_seq lives in model_utils and is not imported as a bare name, so
# the unqualified call raised NameError. It is called here the same way as in
# the evaluation cells, with the notebook's frame size passed explicitly.
prepared_audio = model_utils.prepare_audio_seq(clean_test, index=0, frame=frame)
predicted = model.predict(prepared_audio)
predicted = predicted.flatten()

ipd.display(ipd.Audio(predicted, rate=sr))
ipd.display(ipd.Audio(effect_test[0], rate=sr))

sf.write(os.path.join(experiments_path, 'Predicted_Demo.wav'), predicted, sr)
sf.write(os.path.join(experiments_path, 'Effect_Demo.wav'), effect_test[0], sr)
sf.write(os.path.join(experiments_path, 'Clean_Demo.wav'), clean_test[0], sr)

# Using Model on Audio Recorded by Team

In [None]:
# Load in files we made as to test
def predict_whole_audio(model, audio_file):
    """Load a recording from the experiments folder and run the model over all of it.

    Changes from the previous version:
      * ``prepare_audio_seq`` is called through ``model_utils`` with the
        notebook's frame size, as the evaluation cells do; the bare name is not
        imported and raised NameError;
      * the file is loaded at the notebook sample rate ``sr`` explicitly instead
        of relying on librosa's default rate;
      * per-segment predictions are collected and joined once instead of growing
        an array with ``np.hstack`` on every iteration.

    Args:
        model: model exposing ``predict``.
        audio_file: file name inside ``experiments_path``.

    Returns:
        Tuple ``(audio, predicted_whole)``: the loaded signal and the
        concatenated predictions as a 1-D float64 array (empty when there are
        no segments).
    """
    audio, _ = librosa.load(os.path.join(experiments_path, audio_file), sr=sr)
    audio_length = int(len(audio) / frame)
    # NOTE(review): create_dataset is called without a module prefix and is not
    # defined in any cell visible here -- confirm which module provides it.
    audio_test, _, _, _, _, _ = create_dataset(audio, audio, 0, audio_length, frame, test_ratio=1.0)

    segments = []
    for i in range(audio_test.shape[0]):
        prepared_audio = model_utils.prepare_audio_seq(audio_test, index=i, frame=frame)
        segments.append(model.predict(prepared_audio).flatten())

    # The empty float64 seed keeps the result dtype of the old hstack-based loop
    # and makes the zero-segment case return an empty array.
    predicted_whole = np.concatenate([np.empty(0), *segments])

    return audio, predicted_whole

# Run the model over each of the team's recordings, announcing each one first
team_predictions = []
for message, wav_name in (
    ("Predicting Mono1", 'mono1.wav'),
    ("Predicting Mono2", 'mono2.wav'),
    ("Predicting Poly", 'poly.wav'),
):
    print(message)
    team_predictions.append(predict_whole_audio(model, wav_name))

# Each entry is an (original, predicted) pair
(mono1, mono1_predicted), (mono2, mono2_predicted), (poly, poly_predicted) = team_predictions

In [None]:
# Play each recording followed by the model's output for it
for heading, recorded_audio, predicted_audio in (
    ("Mono1 File", mono1, mono1_predicted),
    ("Mono2 File", mono2, mono2_predicted),
    ("Poly File", poly, poly_predicted),
):
    print(heading)
    ipd.display(ipd.Audio(recorded_audio, rate=sr))
    ipd.display(ipd.Audio(predicted_audio, rate=sr))

# Store the predictions next to the other experiment files
for wav_name, predicted_audio in (
    ('mono1_predicted.wav', mono1_predicted),
    ('mono2_predicted.wav', mono2_predicted),
    ('poly_predicted.wav', poly_predicted),
):
    sf.write(os.path.join(experiments_path, wav_name), predicted_audio, sr)