In [237]:
# Import Dependencies
from os import listdir
from os.path import isdir, join
import librosa
import random
import numpy as np
import matplotlib.pyplot as plt
import python_speech_features
import tensorflow as tf
from tensorflow import keras
from tqdm import tqdm

In [238]:
# Dataset Path
dataset_path = "../../Dataset/speech_dataset"
for index, name in enumerate(tqdm(listdir(dataset_path))):
    pass

100%|██████████| 35/35 [00:00<00:00, 118196.97it/s]


In [239]:
# Create an all targets list
all_targets = [name for name in listdir(dataset_path) if isdir(join(dataset_path, name))]

# Truncated Background Noise if exists
#targets.remove("_background_noise_")

# See how many files are in each
num_samples = 0
for target in all_targets:
    num_samples += len(listdir(join(dataset_path, target)))
print('Total samples:', num_samples)

Total samples: 105829


In [240]:
# Settings
DEBUG = False
targetList = targets
featureSetsFile = "mfcc.npz" # NPZ file 
sampleToWorkOn  = 0.1   # Percentage of dataset to use
trainSize       = 0.8
validationSize  = 0.1   # Ratio of Validation Dataset
testSize        = 0.1   # Ratio of test dataset
sampleRate      = 8000  # Sample Rate
numMFCC         = 16    # Number MFCC
lenMFCC         = 16    # Recording duration, 16 means 1 second

In [241]:
# Create list of filenames with ground truth of y
# Examples for filenames c1d39ce8_nohash_1.wav is in folder dog
# DOG is in index 0
# so the file name c1d39ce8_nohash_1.wav is identified by dog with label 0
filenames = []
y = []

for index, target in enumerate(targetList):
    filenames.append(listdir(join(dataset_path, target)))
    sampleNum = len(filenames[index])
    y.append(np.ones(sampleNum) * index)

In [242]:
# Check ground truth y vector
if DEBUG :
    print(y)
    
sampleTotal = 0
for item in y:
    sampleNum   = len(item)
    sampleTotal += sampleNum
    
print("Total : {}".format(sampleTotal))

Total : 105829


In [243]:
# Convert the 2d array to 1d array
filenames = [item for sublist in filenames for item in sublist]
y = [item for sublist in y for item in sublist]

In [244]:
# Associate filenames with true output and shuffle
filenamesY = list(zip(filenames, y))
random.shuffle(filenamesY)
filenames, y = zip(*filenamesY)

In [245]:
# Only keep the specified number of samples 
print(len(filenames))
filenames = filenames[:int(len(filenames) * sampleToWorkOn)]
print(len(filenames))

105829
10582


In [246]:
# Calculate Validation and Test Set Ratio size
sizeOfTrain      = int(len(filenames) * trainSize)
sizeOfValidation = int(len(filenames) * validationSize)
sizeOfTest       = int(len(filenames) * testSize)

# Break the Training, Validation, and Test Dataset
filenamesTrain      = filenames[:sizeOfTrain]
filenamesValidation = filenames[sizeOfTrain: (sizeOfTrain + sizeOfValidation)]
filenamesTest       = filenames[-1 * sizeOfTest:]

yTrain      = y[:sizeOfTrain]
yValidation = y[sizeOfTrain: (sizeOfTrain + sizeOfValidation)]
yTest       = y[-1 * sizeOfTest:]

print("Filenames len : ", len(filenamesTrain), len(filenamesValidation), len(filenamesTest))
print("Label         : ", len(yTrain),len(yValidation), len(yTest))

Filenames len :  8465 1058 1058
Label         :  8465 1058 1058


In [258]:
# Function : Create MFCC from path
def calcMFCC(path):
    
    # Load Wavefile
    signal, fs = librosa.load(path, sr = sampleRate)
    
    mfccs = python_speech_features.base.mfcc(signal,
                                             samplerate=fs,
                                             winlen=0.256,
                                             winstep=0.050,
                                             numcep=numMFCC,
                                             nfilt = 26,
                                             nfft  = 2048,
                                             preemph=0.0,
                                             ceplifter=0,
                                             appendEnergy=False,
                                             winfunc=np.hanning)
    return mfccs.transpose()

def librosaCalcMFCC(path):
    
    # Load Wavefile
    signal, fs = librosa.load(path, sr = sampleRate)
    print(signal)
    # Using Librosa for extract Features
    mfccs = librosa.feature.mfcc(signal, fs, n_mfcc=numMFCC)
    return mfccs

def librosaCalcMeanMFCC(mfccs):
    return np.mean(mfccs.T,axis=0)

In [261]:
# TEST: Construct test set by computing MFCC of each wave file
propCnt = 0
xTest = []
yTest = []

for index, filename in enumerate(filenamesTrain):
    
    # Break when hit 500
    if index >= 500:
        break
    
    # Create the path from given filename and target item
    path = join(dataset_path, targetList[int(yTrain[index])], filename)
    
    # Create MFCCS
    mfccs = librosaCalcMFCC(path)
    
    if mfccs.shape[1] == lenMFCC:
        xTest.append(librosaCalcMeanMFCC(mfccs))
        yTest.append(yTrain[index])
    else:
        print("Dropped : ", index, mfccs.shape)
        propCnt += 1
    

[ 6.3827189e-05  1.4324569e-04  5.1897270e-05 ...  4.5337505e-04
  7.3693611e-04 -6.3810451e-04]
[-0.00080846 -0.00190635 -0.001181   ... -0.00263951 -0.00157926
 -0.00148964]
[-4.9322308e-04 -4.5364466e-04  4.0499150e-04 ... -3.3455231e-04
 -1.5844422e-04 -7.6022498e-06]
[ 0.0002432   0.00054144  0.00071414 ... -0.00065441 -0.0005218
 -0.00035293]
[-3.4042652e-05  1.1335365e-04  2.5664992e-04 ... -2.7893239e-04
 -4.4611431e-04 -3.3800380e-04]
[0.00040451 0.00059419 0.00052022 ... 0.00579396 0.00559899 0.00598806]
[-0.00035961 -0.00109951 -0.00025986 ...  0.0011864  -0.00040442
 -0.0018603 ]
[ 9.5615520e-05  1.3719844e-04  2.0127975e-04 ...  5.9088485e-05
  1.6769319e-04 -1.8408817e-04]
[ 0.0001806   0.00052535  0.00059547 ...  0.00171484  0.00053885
 -0.00097556]
[ 3.5792557e-04  6.6830131e-04  6.9645722e-04 ... -3.5181860e-05
  2.4343487e-04  2.7777540e-04]
[-0.00107294 -0.0038816  -0.00307207 ...  0.00332634  0.0020104
  0.00014433]
[-0.00010388 -0.00029759 -0.00035441 ...  0.000180

[-1.6001610e-04 -1.4297123e-04 -6.4549742e-05 ... -3.0606394e-04
 -1.1362198e-04 -4.2543310e-04]
[-0.00035284 -0.00070196 -0.0010157  ... -0.03348159 -0.03295289
 -0.03039019]
[-5.4350389e-06  5.1100586e-05  6.6539629e-05 ... -4.0218114e-05
 -6.0257280e-05 -3.8225615e-05]
[-0.00106134 -0.0013454  -0.00129177 ... -0.00194774 -0.00375454
 -0.00309441]
[ 0.00012148 -0.00015787 -0.00016996 ...  0.00039999  0.00018673
  0.00025657]
[ 0.00030734  0.00064978  0.00108555 ... -0.00043374 -0.00034512
 -0.00049439]
[0.00013805 0.00017805 0.0001284  ... 0.00028006 0.00032487 0.00062622]
[ 4.2185871e-08 -7.6205069e-08  1.1652745e-07 ...  1.0515237e-05
 -3.5089502e-05  3.2036096e-06]
[ 4.3001855e-06  6.6187535e-04 -4.9353048e-04 ... -2.6200435e-04
 -1.7183770e-04 -3.8154959e-04]
[-7.9830979e-06 -4.8331691e-05 -5.3937649e-05 ...  1.1381633e-05
  3.5021019e-06  3.2239925e-05]
[-0.00012763 -0.00056342 -0.00060519 ...  0.00045637  0.00014451
  0.00024683]
[ 6.2955558e-05  5.9872105e-05  1.9083474e-05 ..

[-0.00261271 -0.00827669 -0.0124301  ...  0.00543012  0.02080207
  0.02125171]
[ 0.00144498  0.00231973  0.00248928 ... -0.00309044 -0.00290468
 -0.00238127]
[4.6406476e-06 1.0118053e-04 1.9180853e-04 ... 7.2199677e-05 5.4255463e-05
 4.4150744e-05]
[ 0.00234616  0.00423758  0.00582609 ... -0.00433919 -0.00061656
 -0.00332215]
[-0.00060637 -0.00058458  0.0003648  ...  0.00103786  0.00047587
  0.00113748]
[-1.2351023e-05 -3.7902824e-05 -1.7453094e-04 ...  2.5750572e-04
  9.9945981e-05  3.8124379e-04]
[-0.00056856 -0.00204598 -0.00317334 ... -0.0029193  -0.00354979
 -0.00563001]
[-0.00051533 -0.00107356 -0.00072464 ... -0.00140455 -0.00122799
 -0.001204  ]
[ 1.48151258e-07  3.95130648e-08 -4.76072017e-07 ... -9.21257833e-06
  6.32945448e-05  1.03425344e-04]
[-0.00048166 -0.00067226 -0.00078939 ... -0.00073593 -0.00071345
 -0.00088404]
[-0.00451874 -0.00704507 -0.00649711 ... -0.0258032  -0.09538031
 -0.14682637]
[-0.00152725 -0.00304817 -0.00308785 ... -0.00253903 -0.00224643
 -0.00299625

[0.00027916 0.00042572 0.00038189 ... 0.00034675 0.00059903 0.00090269]
[-0.00023926 -0.00042029 -0.00051656 ... -0.00038354 -0.00057799
 -0.00070866]
[-3.3382054e-05  4.3693100e-05  5.9353955e-05 ...  2.1172262e-04
  3.7824731e-02  8.1777841e-02]
[ 0.00103134  0.00261259  0.00493456 ...  0.00079009 -0.00078171
 -0.00181609]
[ 0.01485856  0.02186837  0.01886167 ... -0.03564543 -0.04056512
 -0.03547468]
[ 0.0018361   0.00262585  0.00214148 ...  0.00616436  0.00083444
 -0.00691423]
[ 8.0367907e-05  1.6992830e-04  1.2638094e-04 ... -3.3194330e-04
 -4.9026735e-04 -2.5382894e-04]
[-9.5967269e-05 -1.3262709e-04 -1.6920570e-04 ...  9.1277361e-05
  1.2599862e-04  1.2384505e-04]
[ 4.6022978e-06 -4.0878945e-06 -3.6966929e-05 ... -9.7574521e-05
 -1.1157866e-04 -1.0819777e-04]
[2.0891255e-04 8.7875006e-04 4.8989512e-04 ... 1.7987360e-04 2.6775585e-04
 6.5609293e-06]
[-0.00020099  0.0002334   0.00010647 ... -0.00032253 -0.00036525
 -0.00020411]
[-5.3066109e-05 -3.9843544e-05  1.7356902e-05 ... -4.8

 -0.00071514]
[-0.00268317 -0.0031826  -0.0023361  ... -0.00023566 -0.00387202
 -0.00233451]
[0.00086593 0.00206303 0.00231313 ... 0.00362927 0.00266955 0.0030226 ]
[-1.2106587e-04 -8.7044093e-05 -3.3317885e-04 ... -2.4128162e-04
 -1.4055612e-04  2.1507767e-04]
[ 3.1587624e-04 -2.1573479e-04 -7.3972071e-04 ... -3.4119144e-05
 -1.0293069e-03 -7.2620099e-04]
[ 0.0000000e+00  0.0000000e+00  0.0000000e+00 ... -1.0639418e-06
  2.1694170e-05  3.4597237e-05]
[ 0.00037336  0.00037939  0.00013131 ... -0.00668896 -0.00631447
 -0.00648743]
[-0.00032165  0.01873744  0.01786771 ... -0.00412088 -0.01521735
 -0.01839661]
[-3.8177543e-05 -4.8778813e-05 -1.8323351e-04 ... -1.8089380e-04
 -1.2166939e-04 -9.0360627e-05]
[ 1.3724297e-04  2.2304970e-04  2.6875601e-05 ... -1.0626140e-04
 -1.4497236e-04  2.5542694e-04]
[0.00346594 0.00658552 0.00408563 ... 0.00834107 0.00897899 0.00901977]
[ 0.00068607  0.00939433  0.00918982 ... -0.0062296   0.00269408
  0.00076949]
[-0.00047254 -0.00423966 -0.00563651 ... 

[ 0.00647319  0.02326813  0.03268779 ... -0.0046411   0.00421025
  0.01751581]
[-0.00064699 -0.00280395 -0.00040227 ...  0.00422468  0.00316387
  0.00155551]
[ 0.00131179  0.00517504  0.00525209 ...  0.00234088 -0.00022095
 -0.00228472]
[-0.00123086 -0.00225499 -0.00153765 ... -0.00234275 -0.00316112
 -0.00310796]
[-4.9950737e-05  4.0240641e-04  4.1476629e-05 ...  3.7259940e-04
  3.1466148e-04  3.3641458e-04]
[-4.4001623e-05 -7.2575662e-05 -9.1356029e-05 ...  1.2124891e-03
  7.7872071e-04  4.7795320e-04]
[-1.5824806e-08  1.6930027e-08 -1.7277117e-08 ... -6.4879700e-06
  4.2729021e-06  0.0000000e+00]
Dropped :  482 (16, 14)
[-2.2650245e-05 -4.7558628e-05 -3.0412082e-05 ... -5.4299311e-05
 -6.2340143e-05 -6.2271582e-05]
[ 0.00375998  0.00923623  0.01040842 ... -0.00580176 -0.00584775
 -0.00636104]
[-0.00871956 -0.01502968 -0.01163783 ...  0.0136424   0.01298343
  0.0146389 ]
[ 1.77431491e-06 -1.02881813e-05 -1.34687274e-04 ... -1.38855465e-02
  3.59564344e-03 -3.36082943e-04]
[ 0.0004494

In [260]:
print(xTest[0].shape)
print("% of problematic samples : ", propCnt / 500)

(16,)
% of problematic samples :  0.08


In [250]:
from playsound import playsound

# Get the path for audio wav file
#idx  = 200
#path = 
    
def extractMFCC(path):
    
    # Create the mfccs for the path
    mfccs = librosaCalcMFCC(path)
    
    # Show the MFCCS data
    fig = plt.figure()
    plt.imshow(mfccs, cmap="magma", origin="lower")
    return mfccs

In [251]:
from tqdm import tqdm

def extractFeatures(inFiles, inY):
    probCnt = 0
    outputX = []
    outputY = []
    
    for index, filename in (enumerate(tqdm(inFiles))):

        # Create the path from given filename and target item
        path = join(dataset_path, targetList[int(inY[index])], filename)
    
        # Check to make sure if it's a valid audio file
        if not path.endswith(".wav"):
            continue
        
        # Create MFCCS
        mfccs = librosaCalcMFCC(path)
        
        if mfccs.shape[1] == lenMFCC:
            outputX.append(mfccs)
            outputY.append(inY[index])
            
        else:
            #print("Dropped : ", index,mfccs.shape)
            probCnt += 1
            
    return outputX, outputY, probCnt

In [252]:
# Create train, validation, and test sets
x_train, y_train, prob = extractFeatures(filenamesTrain, yTrain)
print('Removed percentage:', prob / len(yTrain))

100%|██████████| 8465/8465 [06:52<00:00, 20.50it/s]


Removed percentage: 0.08222090962787951


In [253]:
x_val, y_val, prob = extractFeatures(filenamesValidation, yValidation)
print('Removed percentage:', prob / len(yValidation))

100%|██████████| 1058/1058 [00:54<00:00, 19.38it/s]

Removed percentage: 0.07750472589792061





In [215]:
x_test, y_test, prob = extractFeatures(filenamesTest, yTest)
print('Removed percentage:', prob / len(yTest))

  0%|          | 3/1058 [00:04<24:44,  1.41s/it]


FileNotFoundError: [Errno 2] No such file or directory: '../../Dataset/speech_dataset/dog/7cf14c54_nohash_2.wav'

In [254]:
# Save the feature and truth vector
np.savez(featureSetsFile, 
         x_train=x_train,
         y_train=y_train,
         x_val=x_val,
         y_val=y_val)

In [255]:
featureSets = np.load(featureSetsFile)
featureSets.files

['x_train', 'y_train', 'x_val', 'y_val']

In [256]:
len(featureSets['x_train'])

7769

In [257]:
print(featureSets['x_val'])

[[[-7.25969849e+02 -7.26068481e+02 -6.59551514e+02 ... -7.00086914e+02
   -7.07084900e+02 -7.12952332e+02]
  [ 5.25147867e+00  4.90287304e+00  8.96009293e+01 ...  7.02440786e+00
    6.12832355e+00  8.84836292e+00]
  [ 4.40290356e+00  3.98606586e+00  6.93258209e+01 ... -9.18589115e+00
   -1.08306675e+01 -3.30080199e+00]
  ...
  [-1.69124269e+00 -2.23980093e+00 -4.88618231e+00 ...  5.15847397e+00
    3.38997030e+00  1.54002047e+00]
  [-1.58687711e+00 -1.82382262e+00 -4.70918846e+00 ...  5.22296715e+00
    2.69502664e+00 -2.74960446e+00]
  [-3.44252396e+00 -3.40610433e+00 -8.09551239e+00 ... -1.28959351e+01
   -1.34932327e+01 -1.18514862e+01]]

 [[-6.56499573e+02 -6.33133911e+02 -5.25914673e+02 ... -6.55781616e+02
   -6.56047974e+02 -6.55436157e+02]
  [-2.38684130e+00 -1.78339958e+01  5.48939972e+01 ...  5.03278255e+00
    4.96612883e+00  5.40577888e+00]
  [ 3.74901199e+00 -9.98030281e+00  1.89405098e+01 ...  3.90711021e+00
    4.42873287e+00  3.93001771e+00]
  ...
  [ 4.75789070e-01 -7.0

In [236]:
# Checking with not trained data

# Load the model
model_path = "wake_word.model"
model = keras.models.load_model(model_path)

# Get the path for audio wav file
folderName = "Test"
fileName   = "1.wav"
path = join(dataset_path, folderName, fileName)
print(path)

# Put the data to array
xData     = []
mfcc_test = extractMFCC(path)
xData.append(mfcc_test)
finalReshape = xData[0].reshape(xData[0].shape[0],
                                xData[0].shape[1], 
                                1)

predictions = model.predict(np.expand_dims(finalReshape, 0))
predictions = predictions[0][0]
print(predictions)

../../Dataset/speech_dataset/Test/1.wav




FileNotFoundError: [Errno 2] No such file or directory: '../../Dataset/speech_dataset/Test/1.wav'