### End2End keyword spotting - Demo

In [1]:
import warnings
warnings.filterwarnings('ignore')
import os
import scipy.io.wavfile as wav
import numpy as np
from demo import *
from postprocessing import compute_mfcc_frames, probability_smoothing, reduce_false_alarms
import keras
import IPython
from models import *

Using TensorFlow backend.


In [2]:
# Keyword sets the models were trained on; set N is all_kws_sets[N - 1].
all_kws_sets = [
    ['people', 'because', 'think', 'world'],
    ['something', 'different', 'actually', 'important'],
    ['another', 'percent', 'problem', 'technology'],
    ['information', 'experience', 'government', 'computer'],
]

# One CNN checkpoint per keyword set, index-aligned with all_kws_sets so that
# all_path2_model[_set - 1] always pairs with all_kws_sets[_set - 1].
all_path2_model = [
    'set1_results/cnn_parada_set1_5_epochs_new_db.h5',
    'set2_results/cnn_parada_set2_4_epochs_new_db.h5',
    'set3_results/cnn_parada_set3_4_epochs_new_db.h5',
    'set4_results/cnn_parada_set4_3_epochs_new_db.h5',
]

# Alternative DNN checkpoints, keyed by set number. They are kept out of
# all_path2_model because an extra entry there shifts every later index
# (set 4 would otherwise resolve to the set-3 DNN).
dnn_path2_model = {
    3: 'set3_results/dnn_set3_4_epochs_new_db.h5',
}

**0. Setup parameters**

- Set 1 : *people*, *because*, *think*, *world*
- Set 2 : *something*, *different*, *actually*, *important*
- Set 3 : *another*, *percent*, *problem*, *technology*
- Set 4 : *information*, *experience*, *government*, *computer*

In [89]:
# 1-based number of the keyword set to demo (Set 1 to Set 4, as listed above);
# it is used as all_kws_sets[_set - 1] and in the test-file directory name.
_set = 1

In [91]:
# Keyword list for the selected set (_set is 1-based).
keywords = all_kws_sets[_set - 1]

# Pick the CNN checkpoint trained for the same set as `keywords`. Matching on
# the path prefix instead of the list position keeps model and keywords in sync
# even when all_path2_model lists more than one checkpoint for a set.
# To try a different checkpoint (for example
# 'set2_results/dnn_set2_4_epochs_new_db.h5'), assign path2_model explicitly
# after this lookup.
cnn_prefix = 'set{0}_results/cnn_parada_set{0}_'.format(_set)
matching_models = [path for path in all_path2_model if path.startswith(cnn_prefix)]
if not matching_models:
    raise ValueError('No CNN checkpoint listed in all_path2_model for set {}'.format(_set))
path2_model = matching_models[0]

# Load the trained Keras model from the checkpoint file.
model = keras.models.load_model(path2_model)

In [84]:
# Array loaded from 'test_ted_talks_50.npy'. Its only reference in this notebook
# is the commented-out TED-LIUM path (talks[4] + '.wav'), so the entries are
# presumably talk identifiers — TODO confirm.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which
# can execute code; only load this file from a trusted source.
talks = np.load('test_ted_talks_50.npy', allow_pickle=True)

In [94]:
# Test recording to analyse; it lives in the directory of the selected set.
wav_filename = '/ali_set1_1.wav'
# Join real path components (os.path.join with a single, already concatenated
# argument does nothing). The leading '/' of wav_filename is stripped because
# os.path.join would otherwise treat it as an absolute path and drop the directory.
test_phrases_dir = os.path.join('/aimlx/Datasets/test_phrases', 'set' + str(_set))
path2test_file = os.path.join(test_phrases_dir, wav_filename.lstrip('/'))

# Alternative input: one of the TED-LIUM talks listed in `talks`.
#path2test_file = os.path.join('/aimlx/Datasets/TEDLIUM_release-3/data/wav', talks[4] + '.wav')

# Passed as `shift` to compute_mfcc_frames and keywords_found
# (presumably the hop between consecutive frames, in seconds — TODO confirm).
shift = 0.001
# Passed as `w_smooth` to probability_smoothing (presumably the smoothing window length in frames — TODO confirm).
w_smooth = 3
# Label used for "no keyword": the first index after the keyword labels 0..len(keywords)-1.
non_keyword_label = len(keywords)
# Passed as `segment_size` to frames2segments and keywords_found
# (presumably the number of frames merged into one segment — TODO confirm).
segment_size = 5

**1. Load test audio file and compute MFCC frames**

In [95]:
# Read the test recording: fs is the sample rate, sig the sample array.
fs, sig = wav.read(path2test_file)
# Cut the signal into MFCC frames with the project helper. The shape recorded
# below is (8176, 98, 40, 1), i.e. xdim and num_features appear as axes 1 and 2.
sig_frames = compute_mfcc_frames(sig, fs, xdim=98, shift=shift, num_features=40, verbose=1)

100%|██████████| 8176/8176 [00:17<00:00, 480.04it/s]


In [96]:
# Sanity check: show the shape of the frame array that is fed to the model.
sig_frames.shape

(8176, 98, 40, 1)

In [97]:
# Embed an audio player for the whole test recording.
IPython.display.Audio(sig, rate=fs)

**2. Predict label of each frame**

In [98]:
# Run the model on every frame. The result is later reduced with
# np.argmax(..., axis=1), so it presumably holds one row of class scores per frame.
y_pred = model.predict(sig_frames, verbose=1)



**3. Post processing and segment-level integration**

In [99]:
# Post-process the raw frame scores with the project helpers
# (presumably suppressing unlikely keyword detections — TODO confirm against postprocessing.py).
y_pred_modified = reduce_false_alarms(y_pred)
# Smooth the scores with the configured w_smooth parameter.
y_pred_smoothed_post = probability_smoothing(y_pred_modified, w_smooth=w_smooth)
# Pick the highest-scoring class index for each row.
y_prediction_smoothed_post = np.argmax(y_pred_smoothed_post, axis=1) 

# Integrate the per-frame labels into segment-level labels.
predicted_segments = frames2segments(y_prediction_smoothed_post, non_keyword_label, segment_size)

100%|██████████| 8174/8174 [00:00<00:00, 14796.09it/s]
100%|██████████| 8176/8176 [00:00<00:00, 20501.75it/s]


In [100]:
# Show which labels occur in the segment sequence and how often each one occurs.
np.unique(predicted_segments, return_counts=True)

(array([0, 1, 3, 4]), array([ 140,   84,   58, 1353]))

**4. Extract keyword occurrences**

In [101]:
# Collect the detected keyword occurrences from the segment labels; the helper
# also prints a per-keyword summary (see the recorded output below).
# NOTE(review): window_dur=1.0 is presumably a duration in seconds — confirm in demo.py.
all_kw_occurrences = keywords_found(keywords, predicted_segments, window_dur=1.0, shift=shift, segment_size=segment_size)

Found 1 occurence(s) of the keyword: people
Found 1 occurence(s) of the keyword: because
Found 1 occurence(s) of the keyword: world


In [108]:
# Keyword you want to listen to; it must be one of the entries of `keywords`,
# because the next cell looks it up with keywords.index(kw).
kw = 'world' 

In [109]:
# Index of the chosen keyword in `keywords`; entries of all_kw_occurrences are
# matched on this value. Computed once instead of once per entry.
kw_label = keywords.index(kw)

# Each entry is indexed as entry[0] (label) and entry[1] (its occurrences).
matching_entries = [entry for entry in all_kw_occurrences if entry[0] == kw_label]
if not matching_entries:
    # No entry for this keyword (e.g. it was never detected): fail with a
    # readable message instead of a bare "list index out of range".
    raise IndexError("No occurrences of the keyword '{}' in all_kw_occurrences".format(kw))
kw_occurrences = matching_entries[0][1]

# Iterator consumed by the playback cell, one occurrence per run of that cell.
occurences_gen = iter(kw_occurrences)

In [110]:
# Take the next (start, end) pair for the selected keyword. Re-running this cell
# steps through the occurrences; next() raises StopIteration once they are
# exhausted (re-run the previous cell to start over).
start, end = next(occurences_gen)
# start and end are multiplied by the sample rate fs to get sample indices, so
# they are presumably times in seconds. Play just that slice of the recording.
IPython.display.Audio(sig[int(start*fs): int(end*fs)], rate=fs)