In [1]:
import json
import numpy as np
import keras.backend as K
from keras.models import Model
from keras.layers import Input, LSTM, RepeatVector
import soundfile
import librosa

Using TensorFlow backend.


In [2]:
# Cuda dingen, alleen runnen als je cuda hebt
import tensorflow as tf

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

from tensorflow.python.client import device_lib
device_lib.list_local_devices()

Num GPUs Available:  1


[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 3074494930933498576,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 3143997849
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 8718163073303112109
 physical_device_desc: "device: 0, name: GeForce GTX 970, pci bus id: 0000:01:00.0, compute capability: 5.2"]

In [3]:
train_path = 'Downsampled/nl/MFCC_json_files/MFCC_train.json'
test_path = 'Downsampled/nl/MFCC_json_files/MFCC_test.json'
validate_path = 'Downsampled/nl/MFCC_json_files/MFCC_validate.json'

def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        json_data = json.load(f)
    
    MFCC_dataset = []
    for key in json_data:
        MFCC_array = np.array(json_data[key])
        MFCC_dataset.append(MFCC_array.T)
    return MFCC_dataset

def get_max_frame_length(MFCC_dataset):
    return max([MFCC.shape[0] for MFCC in MFCC_dataset])

def pad_data(MFCC_dataset, number_of_frames):
    padded_MFCC_dataset = []
    for MFCC in MFCC_dataset:
        new_MFCC = np.pad(MFCC, ((number_of_frames-MFCC.shape[0], 0), (0, 0)), 'constant')
        padded_MFCC_dataset.append(new_MFCC)
    return np.array(padded_MFCC_dataset)

In [13]:
MFCC_train_set = load_data(train_path)
MFCC_test_set = load_data(test_path)
MFCC_validate_set = load_data(validate_path)

max_frames_train = get_max_frame_length(MFCC_train_set)
max_frames_test = get_max_frame_length(MFCC_test_set)
max_frames_validate = get_max_frame_length(MFCC_validate_set)

new_number_of_frames = max([max_frames_train, max_frames_test, max_frames_validate])

padded_MFCC_train_set = pad_data(MFCC_train_set, new_number_of_frames)
padded_MFCC_test_set = pad_data(MFCC_test_set, new_number_of_frames)
padded_MFCC_validate_set = pad_data(MFCC_validate_set, new_number_of_frames)

In [15]:
padded_MFCC_train_set.shape

(3409, 401, 12)

In [18]:
batch_size = 1
epochs = 5
latent_dim = 150
input_dim = 12
timesteps = new_number_of_frames

inputs = Input(shape=(timesteps, input_dim))
encoded = LSTM(latent_dim)(inputs)

decoded = RepeatVector(timesteps)(encoded)
decoded = LSTM(input_dim, return_sequences=True)(decoded)

sequence_autoencoder = Model(inputs, decoded)
encoder = Model(inputs, encoded)

sequence_autoencoder = Model(inputs, decoded)
sequence_autoencoder.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [19]:
sequence_autoencoder.fit(padded_MFCC_train_set,
                         padded_MFCC_train_set,
                         batch_size=batch_size,
                         epochs=epochs)

Epoch 1/5
 272/3409 [=>............................] - ETA: 59:05 - loss: -1438.5563

KeyboardInterrupt: 

In [55]:
prediction = sequence_autoencoder.predict(padded_MFCC_test_set)

In [57]:
wav_signal = librosa.feature.inverse.mfcc_to_audio(prediction[0])
soundfile.write('test_sound.wav', wav_signal, 22050)