In [12]:
import json
import numpy as np
import keras.backend as K
from keras.models import Model
from keras.layers import Input, LSTM, RepeatVector
import soundfile
import librosa
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [13]:
train_path = 'Downsampled/nl/MFCC_json_files/MFCC_train.json'
test_path = 'Downsampled/nl/MFCC_json_files/MFCC_test.json'
validate_path = 'Downsampled/nl/MFCC_json_files/MFCC_validate.json'

def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        json_data = json.load(f)
    
    MFCC_dataset = []
    keys = []
    for key in json_data:
        keys.append(key)
        MFCC_array = np.array(json_data[key])
        MFCC_dataset.append(MFCC_array.T)
    return MFCC_dataset, keys

def get_max_frame_length(MFCC_dataset):
    return max([MFCC.shape[0] for MFCC in MFCC_dataset])

def data_length_histogram(MFCC_dataset, cutoff, plot=False):
    cutoff = cutoff/100 if cutoff > 1 else cutoff
    lengths = [len(mfcc) for mfcc in MFCC_dataset]
    binlen = range(max(lengths)+1)
    total = len(lengths)
    n_to_remove = int((1 - cutoff) * total)
    
    if plot:
        plt.subplot(2, 1, 1)
        plt.hist(lengths, bins=binlen)
    
    for _ in range(n_to_remove):
        lengths = [len(mfcc) for mfcc in MFCC_dataset]
        u_list = np.array(np.unique(np.array(lengths), return_counts=True, return_index=True)).T
        u_list = np.array(sorted(u_list, key=lambda x: x[0]))
        if u_list[0][2] < u_list[-1][2]:
            MFCC_dataset = np.delete(MFCC_dataset, u_list[0][1])
        else:
            MFCC_dataset = np.delete(MFCC_dataset, u_list[-1][1])
    
    if plot:
        plt.subplot(2, 1, 2)
        plt.hist(lengths, bins=binlen)
        plt.show()
        
    return MFCC_dataset

def pad_data(MFCC_dataset, number_of_frames):
    padded_MFCC_dataset = []
    for MFCC in MFCC_dataset:
        new_MFCC = np.pad(MFCC, ((number_of_frames-MFCC.shape[0], 0), (0, 0)), 'constant')
        padded_MFCC_dataset.append(new_MFCC)
    return np.array(padded_MFCC_dataset)

MFCC_train_set, keys_train_set = load_data(train_path)
MFCC_test_set, keys_test_set = load_data(test_path)
MFCC_validate_set, keys_validate_set = load_data(validate_path)

MFCC_train_set = data_length_histogram(MFCC_train_set, 90, plot=False)
MFCC_test_set = data_length_histogram(MFCC_test_set, 90, plot=False)
MFCC_validate_set = data_length_histogram(MFCC_validate_set, 90, plot=False)

In [14]:
max_frames_train = get_max_frame_length(MFCC_train_set)
max_frames_test = get_max_frame_length(MFCC_test_set)
max_frames_validate = get_max_frame_length(MFCC_validate_set)

new_number_of_frames = max([max_frames_train, max_frames_test, max_frames_validate])

padded_MFCC_train_set = pad_data(MFCC_train_set, new_number_of_frames)
padded_MFCC_test_set = pad_data(MFCC_test_set, new_number_of_frames)
padded_MFCC_validate_set = pad_data(MFCC_validate_set, new_number_of_frames)

In [15]:
def scale_data(dataset, feature_range=(-1, 1)):
    scaler = MinMaxScaler(feature_range=feature_range)
    scaled_dataset = []
    min_values = []
    max_values = []
    for data in dataset:
        scaler.fit(data)
        min_values.append(scaler.data_min_)
        max_values.append(scaler.data_max_)
        scaled_data = scaler.transform(data)
        scaled_dataset.append(scaled_data)
    return np.array(scaled_dataset), np.array(min_values), np.array(max_values)

def unscale_data(scaled_dataset, min_values, max_values, feature_range=(-1, 1)):
    unscaled_dataset = []
    for i, data in enumerate(scaled_dataset):
        data_std = (data - feature_range[0]) / (feature_range[1] - feature_range[0])
        unscaled_data = data_std * (max_values[i] - min_values[i]) + min_values[i]
        unscaled_dataset.append(unscaled_data)
    return np.array(unscaled_dataset)

scaled_MFCC_train_set, min_values_train, max_values_train = scale_data(padded_MFCC_train_set)
scaled_MFCC_test_set, min_values_test, max_values_test = scale_data(padded_MFCC_test_set)
scaled_MFCC_validate_set, min_values_validate, max_values_validate = scale_data(padded_MFCC_validate_set)

In [22]:
batch_size = 4
epochs = 40
latent_dim = 500
input_dim = 12
timesteps = new_number_of_frames

inputs = Input(shape=(timesteps, input_dim))
encoded = LSTM(latent_dim)(inputs)

decoded = RepeatVector(timesteps)(encoded)
decoded = LSTM(input_dim, return_sequences=True)(decoded)

sequence_autoencoder = Model(inputs, decoded)
encoder = Model(inputs, encoded)

sequence_autoencoder.compile(optimizer='rmsprop', loss='mse')

In [23]:
sequence_autoencoder.fit(scaled_MFCC_train_set,
                         scaled_MFCC_train_set,
                         batch_size=batch_size,
                         epochs=epochs,
                         validation_data=(scaled_MFCC_validate_set, scaled_MFCC_validate_set))

Train on 2873 samples, validate on 411 samples
Epoch 1/40
Epoch 2/40

KeyboardInterrupt: 

In [18]:
prediction = sequence_autoencoder.predict(scaled_MFCC_test_set)

In [19]:
reconstructed_MFCC = unscale_data(prediction, min_values_test, max_values_test)

In [18]:
print(keys_test_set[200])

19509909


In [27]:
wav_signal = librosa.feature.inverse.mfcc_to_audio(reconstructed_MFCC[200].T)
soundfile.write('test_sound.wav', wav_signal, 22050)