### End-to-end keyword spotting - Demo

In [1]:
import warnings
warnings.filterwarnings('ignore')
import os
import scipy.io.wavfile as wav
import numpy as np
from demo import *
from postprocessing import compute_mfcc_frames, probability_smoothing, reduce_false_alarms
import keras
import IPython
from models import *

Using TensorFlow backend.


In [2]:
# Keyword sets; set number N lives at index N - 1.
all_kws_sets = [
    ['people', 'because', 'think', 'world'],
    ['something', 'different', 'actually', 'important'],
    ['another', 'percent', 'problem', 'technology'],
    ['information', 'experience', 'government', 'computer'],
]

# One model file per keyword set, index-aligned with all_kws_sets so that
# all_path2_model[_set - 1] always returns the model of set `_set`.
all_path2_model = [
    'set1_results/cnn_parada_set1_5_epochs_new_db.h5',
    'set2_results/cnn_parada_set2_4_epochs_new_db.h5',
    'set3_results/cnn_parada_set3_4_epochs_new_db.h5',
    'set4_results/cnn_parada_set4_3_epochs_new_db.h5',
]

# Alternative model file for set 3. It used to sit inside all_path2_model at
# index 3, which shifted the set-4 entry to index 4 and made
# all_path2_model[_set - 1] return this set-3 file for _set = 4.
path2_dnn_model_set3 = 'set3_results/dnn_set3_4_epochs_new_db.h5'

**0. Setup parameters**

- Set 1 : *people*, *because*, *think*, *world*
- Set 2 : *something*, *different*, *actually*, *important*
- Set 3 : *another*, *percent*, *problem*, *technology*
- Set 4 : *information*, *experience*, *government*, *computer*

In [82]:
# Number of the keyword set to use (1-based; converted to a list index with `_set - 1`).
_set = 2

In [83]:
# Keywords of the selected set (`_set` is 1-based, the list is 0-based).
keywords = all_kws_sets[_set - 1]
# The per-set lookup is disabled; a fixed model file is loaded instead.
#path2_model = all_path2_model[_set - 1]
# NOTE(review): this path is tied to set 2 and does not follow `_set`.
# Changing `_set` without changing this path pairs the model with the wrong keywords.
path2_model = 'set2_results/dnn_set2_4_epochs_new_db.h5'
model = keras.models.load_model(path2_model)

In [84]:
# Presumably the identifiers of the test TED talks (see the commented-out TEDLIUM path) — TODO confirm.
# NOTE(review): allow_pickle=True unpickles object arrays, which can execute
# arbitrary code; only load this file from a trusted source.
talks = np.load('test_ted_talks_50.npy', allow_pickle=True)

In [85]:
# Name of the test recording inside the per-set directory. No leading '/':
# os.path.join discards every component that precedes an absolute one.
wav_filename = 'zak_set2_1.wav'
# Resolves to /aimlx/Datasets/test_phrases/set<_set>/<wav_filename>.
# NOTE(review): the file name itself still mentions set 2; update it together with `_set`.
path2test_file = os.path.join('/aimlx/Datasets/test_phrases', 'set' + str(_set), wav_filename)

#path2test_file = os.path.join('/aimlx/Datasets/TEDLIUM_release-3/data/wav', talks[4] + '.wav')
# Frame shift handed to compute_mfcc_frames and keywords_found (presumably in seconds — TODO confirm).
shift = 0.001
# Window size handed to probability_smoothing.
w_smooth = 3
# Label of the "no keyword" class: one past the last keyword index.
non_keyword_label = len(keywords)
# Segment size handed to frames2segments and keywords_found.
segment_size = 5

In [86]:
from ipywebrtc import AudioStream, AudioRecorder, CameraStream

In [87]:
# Media stream requested from the browser; it is the input of the recorder created next.
# A video constraint can be added with: 'video': {'width': 640, 'height': 480}
camera = CameraStream(constraints=dict(facing_mode='user', audio=True))

In [88]:
# Recorder widget fed by the stream above; the bare name on the last line renders it.
# NOTE(review): the cells shown here read their audio from `path2test_file`, not from this recorder.
recorder = AudioRecorder(stream=camera, codecs='opus')
recorder

AudioRecorder(audio=Audio(value=b'', format='webm'), codecs='opus', stream=CameraStream(constraints={'audio': …

**1. Load test audio file and compute MFCC frames**

In [89]:
# scipy's wav.read returns the sample rate and the sample array, in that order.
fs, sig = wav.read(path2test_file)
# One MFCC window per `shift` step; xdim and num_features appear as dimensions 1 and 2
# of the resulting array (see the shape printed in the next cell).
sig_frames = compute_mfcc_frames(sig, fs, xdim=98, shift=shift, num_features=40, verbose=1)

100%|██████████| 5409/5409 [00:11<00:00, 475.34it/s]


In [90]:
# Sanity check of the model input shape.
sig_frames.shape

(5409, 98, 40, 1)

In [91]:
# Audio player for the whole test recording.
IPython.display.Audio(sig, rate=fs)

**2. Predict label of each frame**

In [92]:
# One model output per MFCC window; presumably one score per class along axis 1,
# since post-processing takes an argmax over that axis — TODO confirm.
y_pred = model.predict(sig_frames, verbose=1)



**3. Post processing and segment-level integration**

In [93]:
# Post-process the raw model outputs (helpers imported from `postprocessing`).
y_pred_modified = reduce_false_alarms(y_pred)
y_pred_smoothed_post = probability_smoothing(y_pred_modified, w_smooth=w_smooth)
# Hard label per frame: index of the largest smoothed value along axis 1.
y_prediction_smoothed_post = np.argmax(y_pred_smoothed_post, axis=1) 

# Group frame labels into segments; frames2segments is presumably provided by `from demo import *` — TODO confirm.
predicted_segments = frames2segments(y_prediction_smoothed_post, non_keyword_label, segment_size)

100%|██████████| 5407/5407 [00:00<00:00, 14409.86it/s]
100%|██████████| 5409/5409 [00:00<00:00, 21687.11it/s]


In [94]:
# Distinct segment labels and how often each occurs; label len(keywords) is the non-keyword class.
np.unique(predicted_segments, return_counts=True)

(array([1, 2, 3, 4]), array([ 21,  25, 101, 934]))

**4. Extract keyword occurrences**

In [95]:
# Collect keyword occurrences; the entries are later indexed as [0] = keyword index, [1] = its occurrences.
all_kw_occurrences = keywords_found(keywords, predicted_segments, window_dur=1.0, shift=shift, segment_size=segment_size)

Found 1 occurence(s) of the keyword: different
Found 1 occurence(s) of the keyword: actually
Found 1 occurence(s) of the keyword: important


In [102]:
# Keyword you want to listen to; it must be one of `keywords`,
# because it is looked up there with keywords.index(kw).
kw = 'important' 

In [103]:
# Index of the chosen keyword; raises ValueError if `kw` is not in `keywords`.
kw_index = keywords.index(kw)
# First entry whose keyword index matches, or None when the keyword was not detected.
kw_entry = next((entry for entry in all_kw_occurrences if entry[0] == kw_index), None)
if kw_entry is None:
    # Same exception type as the former `[...][0]` lookup, with an actionable message.
    raise IndexError('No occurrence of the keyword "{}" was found in this recording'.format(kw))
kw_occurrences = kw_entry[1]
# Iterator consumed one occurrence at a time by the playback cell.
occurences_gen = iter(kw_occurrences)

In [104]:
# Each run of this cell advances to the next occurrence of `kw`;
# next() raises StopIteration once all occurrences have been played.
start, end = next(occurences_gen)
# start/end are multiplied by the sample rate, i.e. they are times in seconds.
IPython.display.Audio(sig[int(start*fs): int(end*fs)], rate=fs)