In [5]:
import json
import numpy as np
import keras.backend as K
from keras.models import Model, load_model
from keras.layers import Input, LSTM, RepeatVector
from keras.callbacks import EarlyStopping
import soundfile
import librosa
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Check if GPU is available for tensorflow.
# The cell's last expression is the device list, so the notebook displays it.
# NOTE(review): device_lib is imported from the `tensorflow.python` package
# tree, which looks like an internal module path -- confirm it is still
# available in the TensorFlow version in use.
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 12100731930851906461,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 3143997849
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 10465110706734145639
 physical_device_desc: "device: 0, name: GeForce GTX 970, pci bus id: 0000:01:00.0, compute capability: 5.2"]

In [10]:
def load_data(file_path):
    '''
    Read a JSON file of MFCC matrices and return them together with their keys.

    The JSON document at file_path is indexed by key; every value is turned
    into a numpy array and transposed before it is stored.

    Returns a tuple (MFCC_dataset, keys): the list of transposed arrays and
    the list of keys, both in the iteration order of the JSON document.
    '''
    with open(file_path, 'r', encoding='utf-8') as handle:
        entries = json.load(handle)

    keys = list(entries)
    MFCC_dataset = [np.array(entries[key]).T for key in keys]
    return MFCC_dataset, keys

def get_max_frame_length(MFCC_dataset):
    '''
    Return the largest first-axis size (shape[0]) among the items of MFCC_dataset
    '''
    frame_counts = (item.shape[0] for item in MFCC_dataset)
    return max(frame_counts)

def data_length_histogram(MFCC_dataset, cutoff, plot=False):
    '''
    Keep roughly `cutoff` of the datapoints by dropping length outliers,
    to reduce the size of the dataset when padding is applied.

    Items are removed one at a time from the extremes of the length
    distribution: at every step the shortest and the longest length still
    present are compared, and the first item of whichever length occurs less
    often is dropped (the longest one when the counts are equal).

    Parameters:
        MFCC_dataset: sequence of items whose len() is their frame count.
        cutoff: fraction of items to keep; values above 1 are read as a
            percentage (90 -> 0.9).
        plot: when True, draw the length histogram before (top) and after
            (bottom) filtering.

    Returns:
        A list with the kept items in their original order. A plain list is
        used because the items have different shapes; np.delete would first
        have to turn them into one array, which recent NumPy versions refuse
        for inhomogeneous shapes.
    '''
    cutoff = cutoff / 100 if cutoff > 1 else cutoff
    items = list(MFCC_dataset)
    total = len(items)
    if total == 0:
        # Nothing to filter (and max() below would fail on an empty list).
        return items

    lengths = [len(mfcc) for mfcc in items]
    binlen = range(max(lengths) + 1)
    # Round away float noise before truncating: (1 - 0.9) * 10 evaluates to
    # 0.9999999999999998 and would otherwise remove 0 items instead of 1.
    n_to_remove = int(round((1 - cutoff) * total, 9))
    n_to_remove = max(0, min(n_to_remove, total))

    if plot:
        plt.subplot(2, 1, 1)
        plt.hist(lengths, bins=binlen)

    # Group item indices by length. Each group is stored reversed so that
    # pop() hands back the earliest remaining item of that length.
    indices_by_length = {}
    for index, length in enumerate(lengths):
        indices_by_length.setdefault(length, []).append(index)
    for indices in indices_by_length.values():
        indices.reverse()

    sorted_lengths = sorted(indices_by_length)
    low, high = 0, len(sorted_lengths) - 1
    dropped = set()
    for _ in range(n_to_remove):
        shortest = indices_by_length[sorted_lengths[low]]
        longest = indices_by_length[sorted_lengths[high]]
        drop_shortest = len(shortest) < len(longest)
        group = shortest if drop_shortest else longest
        dropped.add(group.pop())
        if not group:
            # That length no longer occurs: move on to the next extreme.
            if drop_shortest:
                low += 1
            else:
                high -= 1

    kept = [item for index, item in enumerate(items) if index not in dropped]

    if plot:
        # Plot the lengths that actually remain after filtering.
        plt.subplot(2, 1, 2)
        plt.hist([len(mfcc) for mfcc in kept], bins=binlen)
        plt.show()

    return kept

def pad_data(MFCC_dataset, number_of_frames):
    '''
    Zero-pad every item of MFCC_dataset at the front of its first axis so
    that it ends up with number_of_frames rows.

    Returns a tuple (padded, shapes): the padded items stacked into one
    array, and an array holding the shape each item had before padding.
    '''
    items = list(MFCC_dataset)
    original_shapes = [item.shape for item in items]
    padded_MFCC_dataset = [
        np.pad(item, ((number_of_frames - item.shape[0], 0), (0, 0)), 'constant')
        for item in items
    ]
    return np.array(padded_MFCC_dataset), np.array(original_shapes)

def unpad_data(padded_MFCC_dataset, original_shapes):
    '''
    Strip the leading padding rows from every item of padded_MFCC_dataset.

    original_shapes[i][0] is the number of rows item i had before padding;
    only that many trailing rows are kept. Returns a list of the trimmed items.
    '''
    return [
        padded[padded.shape[0] - original_shapes[position][0]:]
        for position, padded in enumerate(padded_MFCC_dataset)
    ]

def scale_data(dataset, feature_range=(0, 1)):
    '''
    Rescale every item of dataset to within the specified range.

    A MinMaxScaler is fitted on each item separately, so the minimum and
    maximum used for scaling are those of that item alone (per column).

    Parameters:
        dataset: iterable of 2-D arrays accepted by MinMaxScaler.fit.
        feature_range: target (min, max) passed on to MinMaxScaler.

    Returns:
        (scaled_dataset, min_values, max_values). min_values[i] and
        max_values[i] are the scaler's data_min_ / data_max_ for item i and
        are what unscale_data needs to invert the scaling. scaled_dataset is
        a regular array when all items share one shape, and a 1-D object
        array holding the individual items otherwise.
    '''
    scaler = MinMaxScaler(feature_range=feature_range)
    scaled_items = []
    min_values = []
    max_values = []
    for data in dataset:
        scaler.fit(data)
        min_values.append(scaler.data_min_)
        max_values.append(scaler.data_max_)
        scaled_items.append(scaler.transform(data))

    if len({item.shape for item in scaled_items}) > 1:
        # Items differ in shape: np.array(list) raises for such ragged input
        # on recent NumPy, so fill an object array one element at a time.
        scaled_dataset = np.empty(len(scaled_items), dtype=object)
        for position, item in enumerate(scaled_items):
            scaled_dataset[position] = item
    else:
        scaled_dataset = np.array(scaled_items)
    return scaled_dataset, np.array(min_values), np.array(max_values)

def unscale_data(scaled_dataset, min_values, max_values, feature_range=(0, 1)):
    '''
    Map scaled items from feature_range back to their original value range.

    Parameters:
        scaled_dataset: sequence of scaled arrays.
        min_values, max_values: per-item minima and maxima; entry i is
            applied to item i and broadcast against it.
        feature_range: the (min, max) range the data was scaled to.

    Returns:
        The unscaled items as a regular array when they all share one shape,
        or as a 1-D object array holding the individual items otherwise.
    '''
    range_low, range_high = feature_range
    unscaled_items = []
    for i, data in enumerate(scaled_dataset):
        # First undo the feature_range mapping, then the min/max normalisation.
        data_std = (data - range_low) / (range_high - range_low)
        unscaled_items.append(data_std * (max_values[i] - min_values[i]) + min_values[i])

    if len({np.shape(item) for item in unscaled_items}) > 1:
        # Items differ in shape: np.array(list) raises for such ragged input
        # on recent NumPy, so fill an object array one element at a time.
        unscaled_dataset = np.empty(len(unscaled_items), dtype=object)
        for position, item in enumerate(unscaled_items):
            unscaled_dataset[position] = item
        return unscaled_dataset
    return np.array(unscaled_items)

In [11]:
# Setting data paths
train_path = 'Downsampled/nl/MFCC_json_files/MFCC_train.json'
test_path = 'Downsampled/nl/MFCC_json_files/MFCC_test.json'
validate_path = 'Downsampled/nl/MFCC_json_files/MFCC_validate.json'

# Loading the data
MFCC_train_set, keys_train_set = load_data(train_path)
MFCC_test_set, keys_test_set = load_data(test_path)
MFCC_validate_set, keys_validate_set = load_data(validate_path)

# Filter data based on length (keep 90% of each split).
# NOTE(review): only the MFCC lists are filtered; the keys_*_set lists keep
# every key, so after this step index i of a key list no longer has to belong
# to index i of the matching MFCC list.
MFCC_train_set = data_length_histogram(MFCC_train_set, 90, plot=False)
MFCC_test_set = data_length_histogram(MFCC_test_set, 90, plot=False)
MFCC_validate_set = data_length_histogram(MFCC_validate_set, 90, plot=False)

# Scaling the data (each item is scaled on its own; min/max are kept so the
# scaling can be inverted with unscale_data)
scaled_MFCC_train_set, min_values_train, max_values_train = scale_data(MFCC_train_set)
scaled_MFCC_test_set, min_values_test, max_values_test = scale_data(MFCC_test_set)
scaled_MFCC_validate_set, min_values_validate, max_values_validate = scale_data(MFCC_validate_set)

# Padding the data: all three splits are padded to the longest item overall,
# so they share one sequence length.
max_frames_train = get_max_frame_length(scaled_MFCC_train_set)
max_frames_test = get_max_frame_length(scaled_MFCC_test_set)
max_frames_validate = get_max_frame_length(scaled_MFCC_validate_set)

new_number_of_frames = max([max_frames_train, max_frames_test, max_frames_validate])

# Keep the pre-padding shapes: unpad_data needs them to strip the padding again.
prepped_MFCC_train_set, original_shapes_train = pad_data(scaled_MFCC_train_set, new_number_of_frames)
prepped_MFCC_test_set, original_shapes_test = pad_data(scaled_MFCC_test_set, new_number_of_frames)
prepped_MFCC_validate_set, original_shapes_validate = pad_data(scaled_MFCC_validate_set, new_number_of_frames)

In [9]:
# Setting the model parameters
batch_size = 10
epochs = 10
latent_dim = 500  # size of the encoded vector produced by the second encoder LSTM
input_dim = 12  # presumably the number of MFCC coefficients per frame -- TODO confirm
timesteps = new_number_of_frames  # every padded sequence has this many frames
# Width of the intermediate LSTM layers: halfway between the flattened input
# size (input_dim * timesteps) and latent_dim.
temp_dim = round(((input_dim * timesteps) + latent_dim) / 2)

# Making the model
# Encoder: the first LSTM returns the full sequence, the second one returns a
# single vector of size latent_dim.
inputs = Input(shape=(timesteps, input_dim))
encoded = LSTM(temp_dim, return_sequences=True, activation='sigmoid')(inputs)
encoded = LSTM(latent_dim, activation='sigmoid')(encoded)

# Decoder: repeat the encoded vector once per timestep and map it back to
# input_dim values per timestep.
decoded = RepeatVector(timesteps)(encoded)
decoded = LSTM(temp_dim, return_sequences=True, activation='sigmoid')(decoded)
decoded = LSTM(input_dim, return_sequences=True, activation='sigmoid')(decoded)

# Two models over the same layers: the full autoencoder, and the encoder half
# that stops at the encoded vector.
sequence_autoencoder = Model(inputs, decoded)
encoder = Model(inputs, encoded)

# Only a loss is configured; no additional metrics are requested.
sequence_autoencoder.compile(optimizer='rmsprop', loss='mse')

In [10]:
# Set callbacks
# Stop after 2 epochs without improvement and restore the best weights.
# No `monitor` is passed, so EarlyStopping uses its default quantity
# (presumably the validation loss -- confirm against the Keras docs).
callbacks = [EarlyStopping(patience=2, mode="min", restore_best_weights=True),]

# Training the model
# Autoencoder setup: the padded sets serve as both input and target.
sequence_autoencoder.fit(prepped_MFCC_train_set,
                         prepped_MFCC_train_set,
                         batch_size=batch_size,
                         epochs=epochs,
                         callbacks=callbacks,
                         validation_data=(prepped_MFCC_validate_set, prepped_MFCC_validate_set))

Train on 3069 samples, validate on 658 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


<keras.callbacks.callbacks.History at 0x23e6e733eb8>

In [11]:
# Persist the trained autoencoder under the relative path 'nl_2lstm'.
sequence_autoencoder.save('nl_2lstm')

In [8]:
# Load a previously saved autoencoder.
# NOTE(review): this reads 'Modellen Remco/nl_2lstm' while the save cell writes
# 'nl_2lstm' -- confirm this is the intended model file.
seqseq = load_model('Modellen Remco/nl_2lstm')

In [12]:
print('\n# Evaluate on test data')
# Reconstruction loss of the loaded model on the padded test set. The model in
# this notebook is compiled with a loss only (no metrics) and the recorded
# output is a single number, so the value is labelled as the loss alone.
results = seqseq.evaluate(prepped_MFCC_test_set, prepped_MFCC_test_set, batch_size=10)
print('test loss:', results)


# Evaluate on test data
test loss, test acc: 0.06167837891354025


In [18]:
# Reconstruct the test set. The model input is declared with a fixed number of
# timesteps, so it must be fed the padded set (prepped_*), not the unpadded
# scaled set whose items differ in length.
prediction = sequence_autoencoder.predict(prepped_MFCC_test_set)

In [19]:
# Map the predictions back from the (0, 1) range to the original value range,
# using the per-item minima and maxima recorded when the test set was scaled.
reconstructed_MFCC = unscale_data(prediction, min_values_test, max_values_test)

In [18]:
# Show the key stored at position 200 of the test key list.
# NOTE(review): keys_test_set is the unfiltered key list, whereas the test
# data was filtered by data_length_histogram; position 200 here may therefore
# not belong to item 200 of the filtered/reconstructed data -- verify.
print(keys_test_set[200])

19509909


In [27]:
# Convert one reconstructed test item back to audio and write it to disk.
sample_index = 200
reconstruction = reconstructed_MFCC[sample_index]
# pad_data prepends the padding rows, so the real frames are the last
# n_frames rows; drop the padding before inverting (a no-op when the
# reconstruction is not padded).
n_frames = len(MFCC_test_set[sample_index])
reconstruction = reconstruction[max(0, reconstruction.shape[0] - n_frames):]
# mfcc_to_audio is given the transposed matrix, i.e. one column per frame.
wav_signal = librosa.feature.inverse.mfcc_to_audio(reconstruction.T)
# NOTE(review): 22050 is written as the sample rate -- confirm it matches the
# rate of the (downsampled) recordings the MFCCs were computed from.
soundfile.write('test_sound.wav', wav_signal, 22050)