In [None]:
import numpy as np
import os
from tqdm import tqdm
from scipy.io import wavfile
import pandas as pd
import random
import time

import speech_features as sf

In [None]:
# Column order of the DataFrame returned by create_dataframe.
_DATAFRAME_COLUMNS = (
    'filename', 'class', 'duration', 'frequency sampling', 'number of samples',
    'samples', 'filter banks', 'filter banks shape', 'mfcc', 'mfcc shape',
    'mfcc deltas', 'mfcc deltas shape', 'ssc', 'ssc shape', 'purpose',
)


def _is_wav_file(filename):
    """Return True when everything from the first '.' of `filename` onward is exactly ".wav"."""
    return filename[filename.find('.'):] == ".wav"


def _read_split_list(path):
    """Return the right-stripped lines of the text file at `path` as a set."""
    with open(path) as handle:
        return {line.rstrip() for line in handle}


def _fit_to_length(signal, length):
    """Return `signal` centred in (or centre-cropped to) exactly `length` samples.

    Shorter signals are zero-padded on both sides; longer ones keep their
    central `length` samples; signals already of the right size are returned
    unchanged.
    """
    number_of_samples = len(signal)
    if number_of_samples == length:
        return signal
    if number_of_samples < length:
        padded = np.zeros(length)
        offset = (length - number_of_samples) // 2
        padded[offset:offset + number_of_samples] = signal
        return padded
    start = (number_of_samples - length) // 2
    return signal[start:start + length]


def _random_noise_clip(noise_signals, length):
    """Return a random `length`-sample slice of a randomly chosen background-noise signal.

    Raises:
        ValueError: if `noise_signals` is empty (or, from `random.randint`, if
            the chosen noise signal is shorter than `length`).
    """
    if not noise_signals:
        raise ValueError("no background noise .wav files available")
    background_noise = noise_signals[random.randint(0, len(noise_signals) - 1)]
    initial_index = random.randint(0, len(background_noise) - length)
    return background_noise[initial_index:initial_index + length]


def _build_feature_row(sig, fs, filename, class_name, purpose,
                       feature_args, cepstrum_args, mean_normalization):
    """Compute every feature of `sig` and return one DataFrame row as a dict.

    `feature_args` is forwarded to sf.logfbank, sf.mfcc and sf.ssc;
    `cepstrum_args` is forwarded to sf.mfcc only.
    """
    fbank_feat = sf.logfbank(sig, fs, **feature_args)
    mfcc_feat = sf.mfcc(sig, fs, **dict(feature_args, **cepstrum_args))
    mfcc_delta_feat = sf.delta(mfcc_feat, 2)
    mfcc_delta_delta_feat = sf.delta(mfcc_delta_feat, 2)
    # The stacked matrix is built BEFORE mean normalization, so 'mfcc deltas'
    # holds the un-normalized coefficients (kept as in the original code).
    mfcc_complete = np.concatenate((mfcc_feat, mfcc_delta_feat, mfcc_delta_delta_feat), axis=1)
    ssc_feat = sf.ssc(sig, fs, **feature_args)

    if mean_normalization:
        fbank_feat -= (np.mean(fbank_feat, axis=0) + 1e-8)
        mfcc_feat -= (np.mean(mfcc_feat, axis=0) + 1e-8)

    number_of_samples = len(sig)
    return {
        'filename': filename,
        'class': class_name,
        'duration': number_of_samples / fs,
        'frequency sampling': fs,
        'number of samples': number_of_samples,
        'samples': np.array(sig),
        'filter banks': np.matrix(fbank_feat),
        'filter banks shape': fbank_feat.shape,
        'mfcc': np.matrix(mfcc_feat),
        'mfcc shape': mfcc_feat.shape,
        'mfcc deltas': np.matrix(mfcc_complete),
        'mfcc deltas shape': mfcc_complete.shape,
        'ssc': np.matrix(ssc_feat),
        'ssc shape': ssc_feat.shape,
        'purpose': purpose,
    }


def _rows_to_dataframe(rows):
    """Build a DataFrame with the _DATAFRAME_COLUMNS columns from a list of row dicts."""
    return pd.DataFrame(data={column: [row[column] for row in rows]
                              for column in _DATAFRAME_COLUMNS})


def create_dataframe(root="./audio_files", samples=-1, training_percentage=0.8,
                     mean_normalization=True, nfilt=40, nfft=400, preemph=0.97,
                     winlen=0.025, winstep=0.01, numcep=12, ceplifter=22,
                     save=False, verbose=False, exclude=None):
    """Build a DataFrame of audio samples and their features.

    Three kinds of rows are produced:

    1. synthetic "silence" samples: random one-second slices of the files in
       `<root>/_background_noise_`, scaled by a random factor in [0.05, 0.3);
    2. one row per .wav file found in every sub-folder of `root` that is not
       listed in `exclude`, zero-padded / centre-cropped to one second;
    3. augmented rows (an existing sample plus scaled background noise) for
       every class/split that has fewer rows than its target count.

    Args:
        root: dataset directory; must contain one folder per class, a
            `_background_noise_` folder, `validation_list.txt` and
            `testing_list.txt`.
        samples: target number of samples per class; -1 disables augmentation
            and creates 2000 silence samples.
        training_percentage: fraction of `samples` assigned to training; the
            remainder is split evenly between validation and testing.
        mean_normalization: subtract the per-column mean from the filter-bank
            and MFCC matrices.
        nfilt, nfft, preemph, winlen, winstep: forwarded to sf.logfbank,
            sf.mfcc and sf.ssc.
        numcep, ceplifter: forwarded to sf.mfcc.
        save: when True, write the DataFrame to 'dataframe.h5'.
        verbose: print the settings and per-class statistics.
        exclude: iterable of folder names to skip (None means skip nothing).

    Returns:
        (df, settings): the DataFrame and the list
        [samples, training_percentage, folders + ["silence"], mean_normalization,
         nfilt, nfft, preemph, winlen, winstep, numcep, ceplifter].
    """
    exclude = [] if exclude is None else exclude

    folders = [f for f in os.listdir(root) if os.path.isdir(os.path.join(root, f))]
    # Sets give O(1) membership tests for the per-file split lookup below.
    validation_files = _read_split_list(os.path.join(root, "validation_list.txt"))
    testing_files = _read_split_list(os.path.join(root, "testing_list.txt"))

    if verbose:
        print("### CLASSES ###")
        print(folders)
        print("\n")
        print("### DATASET SETTINGS ###")
        print("# of samples per class = " + str(samples))
        print("% of training samples = " + str(training_percentage))
        print("Length of the window = " + str(winlen))
        print("Step of the window = " + str(winstep))
        print("# of filters = " + str(nfilt))
        print("FFT size = " + str(nfft))
        print("Pre-emphasis coefficient = " + str(preemph))
        print("# of Cepstrum to return = " + str(numcep))
        print("Lifter coefficient = " + str(ceplifter))
        print("\n")

    feature_args = dict(winlen=winlen, winstep=winstep, nfilt=nfilt, nfft=nfft, preemph=preemph)
    cepstrum_args = dict(numcep=numcep, ceplifter=ceplifter)

    # Background noise, scaled by 1/32767 (assumes 16-bit PCM files - TODO confirm).
    noise_file_path = os.path.join(root, "_background_noise_")
    noise_signals = [wavfile.read(os.path.join(noise_file_path, f))[1] / 32767
                     for f in os.listdir(noise_file_path) if _is_wav_file(f)]

    # Sampling rate assumed for generated silence and for augmentation.
    fs = 16000
    rows = []

    # Silence creation
    silence_max = 2000 if samples == -1 else samples
    silence_training = training_percentage * silence_max
    silence_validation = (silence_max - silence_training) / 2
    silence_counters = {"training": 0, "validation": 0, "testing": 0}

    if verbose:
        print ("### silence (" + str(silence_max) + " samples) ###")
    for i in range(silence_max):
        noise = _random_noise_clip(noise_signals, fs)
        sig = noise * np.random.uniform(high=0.3, low=0.05)

        if i < silence_training:
            purpose = "training"
        elif i < silence_training + silence_validation:
            purpose = "validation"
        else:
            purpose = "testing"
        silence_counters[purpose] += 1

        rows.append(_build_feature_row(sig, fs, "", "silence", purpose,
                                       feature_args, cepstrum_args, mean_normalization))

    if verbose:
        print(" - created " + str(silence_counters["training"]) + " training samples, " +
              str(silence_counters["validation"]) + " validation samples and " +
              str(silence_counters["testing"]) + " test samples")

    # Recorded samples: one row per .wav file of every non-excluded folder.
    for folder in folders:
        if folder in exclude:
            if verbose:
                print("### " + folder + " NOT USED ###")
            continue

        folder_path = os.path.join(root, folder)
        wav_files = [f for f in os.listdir(folder_path) if _is_wav_file(f)]
        class_counters = {"training": 0, "validation": 0, "testing": 0}
        if verbose:
            print("### " + folder + " (" + str(len(wav_files)) + " files) ###")

        for wav_file in wav_files:
            file_fs, signal = wavfile.read(os.path.join(folder_path, wav_file))
            # Normalize and force the clip to last exactly one second.
            signal = _fit_to_length(signal / 32767, file_fs)

            # The split lists store paths as "<folder>/<file>".
            relative_name = folder + "/" + wav_file
            if relative_name in validation_files:
                purpose = "validation"
            elif relative_name in testing_files:
                purpose = "testing"
            else:
                purpose = "training"
            class_counters[purpose] += 1

            # Features are computed from this file's own signal.
            rows.append(_build_feature_row(signal, file_fs, wav_file, folder, purpose,
                                           feature_args, cepstrum_args, mean_normalization))

        if verbose:
            print(" - created " + str(class_counters["training"]) + " training samples, " +
                  str(class_counters["validation"]) + " validation samples and " +
                  str(class_counters["testing"]) + " test samples")

    df = _rows_to_dataframe(rows)

    # Augmentation: top up every class/split to its target count by adding
    # scaled background noise to randomly chosen existing samples.
    samples_training = training_percentage * samples
    samples_validation = (samples - samples_training) / 2
    split_targets = (
        ("training", "training", samples_training),
        ("validation", "validation", samples_validation),
        ("testing", "test", samples_validation),
    )

    augmented_rows = []
    for purpose, label, target in split_targets:
        print("### Creating new " + label + " samples ###")
        df_split = df[df["purpose"] == purpose]
        # Group by the scalar key so `name` is the class string, not a 1-tuple.
        for name, group in df_split.groupby("class"):
            needed_samples = max(0, int(target - len(group)))
            df_samples = group["samples"].values

            print(" - " + str(name) + ", needed samples: " + str(needed_samples))

            for _ in range(needed_samples):
                signal = df_samples[random.randint(0, len(df_samples) - 1)]
                noise = _random_noise_clip(noise_signals, fs)
                sig = signal + (noise * np.random.uniform(high=0.2, low=0.05))

                augmented_rows.append(_build_feature_row(sig, fs, "", name, purpose,
                                                         feature_args, cepstrum_args,
                                                         mean_normalization))

    if augmented_rows:
        df = pd.concat([df, _rows_to_dataframe(augmented_rows)], ignore_index=True)

    # NOTE(review): `folders` still contains the excluded folders, so the
    # recorded class list may name folders that contributed no rows.
    settings = [samples, training_percentage, folders + ["silence"],
                mean_normalization, nfilt, nfft, preemph,
                winlen, winstep, numcep, ceplifter]

    if save:
        print("Saving file...")
        df.to_hdf('dataframe.h5', key='df', mode='w')
        print("Done!")

    return df, settings

In [None]:
def save_dataset(dataframe, settings, path = "./dataset", coefficients = ["filter banks", "mfcc", "mfcc deltas", "ssc"]):
    """Export the training / validation / testing splits of `dataframe` as .npy files.

    A new folder named after the current time in milliseconds is created
    inside `path`. It receives a `settings.txt` summary plus, for every entry
    of `coefficients`, six files `<coeff>_{training,validation,test}_set.npy`
    and `<coeff>_{training,validation,test}_labels.npy` (spaces in the
    coefficient name are replaced by underscores).

    Args:
        dataframe: DataFrame with a "purpose" column ("training",
            "validation" or "testing"), a "class" column and one column per
            requested coefficient holding equally-shaped matrices.
        settings: list laid out as returned by create_dataframe:
            [samples, training_percentage, classes, mean_normalization,
             nfilt, nfft, preemph, winlen, winstep, numcep, ceplifter].
        path: parent directory of the dataset folder (created if missing).
        coefficients: names of the feature columns to export (not modified).

    The labels are one-hot encoded over the sorted classes of the WHOLE
    dataframe, so the label matrices of the three splits always share the same
    columns, even when a class is absent from one split.
    """
    dataset_folder = int(round(time.time() * 1000))
    output_dir = os.path.join(path, str(dataset_folder))
    # The timestamped folder never exists yet: create it before writing into it.
    os.makedirs(output_dir, exist_ok=True)

    settings_lines = [
        "### DATASET SETTINGS ###",
        "# of samples per class = " + str(settings[0]),
        "% of training samples = " + str(settings[1]),
        "Classes: " + str(settings[2]),
        "Length of the window = " + str(settings[7]),
        "Step of the window = " + str(settings[8]),
        "# of filters = " + str(settings[4]),
        "FFT size = " + str(settings[5]),
        "Pre-emphasis coefficient = " + str(settings[6]),
        "# of Cepstrum to return = " + str(settings[9]),
        "Lifter coefficient = " + str(settings[10]),
        "Mean normalization: " + str(settings[3]),
    ]
    with open(os.path.join(output_dir, "settings.txt"), "w") as file:
        file.write("\n".join(settings_lines) + "\n")

    # Fixed, sorted class list shared by every split (same column order that
    # pd.get_dummies produces for plain string labels).
    label_dtype = pd.CategoricalDtype(categories=sorted(dataframe["class"].unique()))

    # (value of the "purpose" column, variable suffix used in the log, file-name stem)
    splits = (
        ("training", "train", "training"),
        ("validation", "validation", "validation"),
        ("testing", "test", "test"),
    )

    for coeff in coefficients:
        filename_coeff = coeff.replace(" ", "_")

        print("***" + coeff.upper() + "***")

        for purpose, suffix, stem in splits:
            print("### " + purpose.upper() + " ###")
            print("- dataframe filtering")
            df_split = dataframe[dataframe["purpose"] == purpose]

            print("- X_" + suffix + " creation")
            # Stack the per-sample matrices into one (samples, rows, cols) array.
            X = np.array(list(df_split[coeff].values))

            print("- y_" + suffix + " creation")
            y = pd.get_dummies(df_split["class"].astype(label_dtype)).values

            print("X_" + suffix + " shape: ", X.shape)
            print("y_" + suffix + " shape: ", y.shape)

            print(" - saving files")
            np.save(os.path.join(output_dir, filename_coeff + "_" + stem + "_set.npy"), X)
            np.save(os.path.join(output_dir, filename_coeff + "_" + stem + "_labels.npy"), y)

        print("")

In [None]:
# create_dataframe draws from both the `random` module and NumPy's global
# generator (noise clip selection, noise gain, augmentation picks). Seed both
# so the generated silence / augmented samples do not change on every
# Restart & Run All.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Build the 3000-samples-per-class dataset (the background-noise folder is not
# a class) and export it as .npy files.
df, settings = create_dataframe(verbose=True, samples=3000, exclude=["_background_noise_"])
save_dataset(df, settings)