### End-to-end keyword spotting – Demo

In [1]:
import warnings
warnings.filterwarnings('ignore')
import os
import scipy.io.wavfile as wav
import numpy as np
from demo import *
from postprocessing import compute_mfcc_frames, probability_smoothing, reduce_false_alarms
import keras
import IPython
from models import *

Using TensorFlow backend.


In [2]:
# Keyword groups the demo can spot. Each inner list is one "set"; the sets are
# numbered from 1 in the prose but stored 0-based here.
all_kws_sets = [
    ['people', 'because', 'think', 'world'],
    ['something', 'different', 'actually', 'important'],
    ['another', 'percent', 'problem', 'technology'],
    ['information', 'experience', 'government', 'computer'],
]

# Saved model file for each keyword set, in the same order as all_kws_sets
# (entry i belongs to all_kws_sets[i]).
all_path2_model = [
    'set1_results/cnn_parada_set1_5_epochs_new_db.h5',
    'set2_results/cnn_parada_set2_4_epochs_new_db.h5',
    'set3_results/cnn_parada_set3_4_epochs_new_db.h5',
    'set4_results/cnn_parada_set4_3_epochs_new_db.h5',
]

**0. Setup parameters**

Choose which keyword set to spot by assigning its number (1–4) to `_set` in the next cell:

- Set 1 : *people*, *because*, *think*, *world*
- Set 2 : *something*, *different*, *actually*, *important*
- Set 3 : *another*, *percent*, *problem*, *technology*
- Set 4 : *information*, *experience*, *government*, *computer*

In [115]:
# Number of the keyword set to run the demo on. It is 1-based: the cells below
# subtract 1 before indexing all_kws_sets / all_path2_model, so valid values are 1..4.
_set = 3

In [116]:
# `_set` counts from 1, the Python lists from 0.
set_index = _set - 1

# Keywords of the chosen set and the model file saved for that same set.
keywords = all_kws_sets[set_index]
path2_model = all_path2_model[set_index]

# Restore the saved Keras model from disk.
model = keras.models.load_model(path2_model)

In [186]:
# --- Test recording ----------------------------------------------------------
# Bare file name (no leading separator) so that os.path.join can assemble the
# path from its components. The original passed one pre-concatenated string to
# os.path.join, which made the call a no-op.
# NOTE(review): the file name hard-codes "set3"; update it when `_set` changes.
wav_filename = 'ali_set3_1.wav'
# Recordings are looked up in one sub-directory per keyword set: .../test_phrases/set<N>/
path2test_file = os.path.join('/aimlx/Datasets/test_phrases', 'set' + str(_set), wav_filename)

# --- Feature extraction (forwarded to compute_mfcc_frames) --------------------
num_features = 40
xdim = 98
# Also forwarded to keywords_found. Presumably the hop between consecutive
# frames, in seconds -- TODO confirm against compute_mfcc_frames.
shift = 0.001
# Forwarded to keywords_found as `window_dur`.
frame_dur = 1.0

# --- Post-processing ---------------------------------------------------------
# Forwarded to probability_smoothing.
w_smooth = 3
# Keywords are labelled 0..len(keywords)-1, so the next integer is used as the
# "not a keyword" label.
non_keyword_label = len(keywords)
# Forwarded to frames2segments and keywords_found.
segment_size = 5

**1. Load test audio file and compute MFCC frames**

In [187]:
# Read the recording: `fs` is the sample rate, `sig` the raw samples.
fs, sig = wav.read(path2test_file)

# Convert the signal into the stack of MFCC frames that is fed to the model.
# verbose=1 is passed through to the helper (a progress bar is shown while it runs).
sig_frames = compute_mfcc_frames(
    sig,
    fs,
    xdim=xdim,
    shift=shift,
    num_features=num_features,
    verbose=1,
)

100%|██████████| 8892/8892 [00:18<00:00, 487.32it/s]


In [188]:
# Sanity check on the frame tensor. For this recording the result is
# (8892, 98, 40, 1): the middle two dimensions equal xdim and num_features.
sig_frames.shape

(8892, 98, 40, 1)

In [189]:
# Embed an audio player for the complete test recording at its sample rate.
IPython.display.Audio(sig, rate=fs)

**2. Predict label of each frame**

In [190]:
# Run the loaded model on every MFCC frame; verbose=1 asks Keras to report progress.
# Presumably one row of class scores per frame (keywords + non-keyword) -- TODO confirm.
y_pred = model.predict(sig_frames, verbose=1)



**3. Post processing and segment-level integration**

In [191]:
# Step 1: clean up the raw per-frame scores with the project helper, then
# smooth them using the w_smooth setting.
y_pred_modified = reduce_false_alarms(y_pred)
y_pred_smoothed_post = probability_smoothing(
    y_pred_modified,
    w_smooth=w_smooth,
)

# Step 2: hard decision per frame -- index of the highest smoothed score.
y_prediction_smoothed_post = np.argmax(y_pred_smoothed_post, axis=1)

# Step 3: integrate the frame labels into segment-level labels.
predicted_segments = frames2segments(
    y_prediction_smoothed_post,
    non_keyword_label,
    segment_size,
)

100%|██████████| 8890/8890 [00:00<00:00, 15358.06it/s]
100%|██████████| 8892/8892 [00:00<00:00, 21466.84it/s]


In [192]:
# Number of segments assigned to each label. Labels 0..len(keywords)-1 are the
# keywords; label len(keywords) (here 4) is non_keyword_label.
np.unique(predicted_segments, return_counts=True)

(array([0, 1, 2, 3, 4]), array([  33,   44,   12,   67, 1622]))

**4. Extract keyword occurrences**

In [193]:
# Collect the occurrences of every keyword from the segment labels.
# The helper also prints a "Found N occurence(s)" summary line per keyword.
all_kw_occurrences = keywords_found(
    keywords,
    predicted_segments,
    window_dur=frame_dur,
    shift=shift,
    segment_size=segment_size,
)

Found 1 occurence(s) of the keyword: another
Found 2 occurence(s) of the keyword: percent
Found 1 occurence(s) of the keyword: problem
Found 1 occurence(s) of the keyword: technology


In [194]:
# Keyword you want to listen to.
# It must be one of the entries of `keywords` (it is looked up with keywords.index below).
kw = 'percent' 

In [195]:
# Integer label of the selected keyword, computed once instead of once per entry.
kw_label = keywords.index(kw)

# Entries of all_kw_occurrences are indexed as entry[0] = keyword label and
# entry[1] = the occurrences of that keyword; keep the entries for our label.
matching_entries = [entry for entry in all_kw_occurrences if entry[0] == kw_label]
if not matching_entries:
    # Previously this case failed with a bare "list index out of range".
    raise IndexError("No occurrence of the keyword '{}' was found in this recording".format(kw))

kw_occurrences = matching_entries[0][1]
# Iterator consumed by the playback cell below: one occurrence per run of that cell.
occurences_gen = iter(kw_occurrences)

In [196]:
# Pull the next occurrence of the selected keyword; re-running this cell moves
# on to the following one (StopIteration once they are exhausted).
start, end = next(occurences_gen)

# start/end are scaled by the sample rate to obtain sample indices
# (presumably they are expressed in seconds -- TODO confirm against keywords_found).
first_sample = int(start * fs)
last_sample = int(end * fs)

# Play only the slice of the recording that contains the keyword.
IPython.display.Audio(sig[first_sample:last_sample], rate=fs)