# Real time demo - trigger word detection

**"Activate"** is the trigger word trained for the model.
To use a different trigger word, you need to re-train the model to recognize it. For more detail, refer to [Trigger word detection - v1.ipynb](./Trigger%20word%20detection%20-%20v1.ipynb)

In [19]:
import numpy as np
import time
from pydub import AudioSegment
import random
import sys
import io
import os
import glob
import IPython
from td_utils import *
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import cv2
import keyboard
# To generate wav file from np array.
from scipy.io.wavfile import write
%matplotlib inline

In [20]:
# Input dimensions of the model, matching what get_spectrogram() produces
# (nfft=200, noverlap=120) for a 10 s buffer sampled at 44100 Hz:
#   time steps  = (441000 - 120) // (200 - 120) = 5511
#   frequencies = 200 // 2 + 1 = 101
# Use 1101 for 2sec input audio
# NOTE(review): neither constant is referenced elsewhere in the visible cells.
Tx = 5511 # The number of time steps input to the model from the spectrogram
n_freq = 101 # Number of frequencies input to the model at each time step of the spectrogram

In [21]:
# Presumably the length of the model's output sequence for a 10 s input;
# the model definition is not part of this notebook -- TODO confirm.
# Use 272 for 2sec input audio
Ty = 1375# The number of time steps in the output of our model

## Build the model

In [22]:
from keras.callbacks import ModelCheckpoint
from keras.models import Model, load_model, Sequential
from keras.layers import Dense, Activation, Dropout, Input, Masking, TimeDistributed, LSTM, Conv1D
from keras.layers import GRU, Bidirectional, BatchNormalization, Reshape
from keras.optimizers import Adam

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Load a pre-trained model

In [26]:
# Load the pre-trained Keras model from disk.
# Note: the background worker sound() loads its own copy from the same path.
model = load_model('./models/tr_model.h5')

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where





In [None]:
# Haar cascade used by the camera loop to detect a hand in the "Break"/"Gas"
# boxes (the file name suggests a right-palm detector -- TODO confirm).
hand_cascade = cv2.CascadeClassifier('rpalm.xml')
# CascadeClassifier does not raise when the file is missing or unreadable; it
# just stays empty and fails later inside detectMultiScale. Fail fast instead.
if hand_cascade.empty():
    raise IOError("Could not load Haar cascade file 'rpalm.xml'")

### Detect trigger word functions

In [71]:
def detect_triggerword_spectrum(x, model):
    """
    Run the model on a single spectrogram and return its flattened output.

    Argument:
    x -- spectrum of shape (freqs, Tx)
    i.e. (Number of frequencies, The number time steps)
    model -- object exposing predict(); it is called with one batch of
    shape (1, Tx, freqs)

    Returns:
    predictions -- whatever model.predict() returned, flattened to 1-D
    """
    # The spectrogram is frequency-major; the model wants time-major input
    # with a leading batch axis of size one.
    batch = x.swapaxes(0, 1)[np.newaxis, ...]
    return model.predict(batch).reshape(-1)

def has_new_triggerword(predictions, chunk_duration, feed_duration, threshold=0.6):
    """
    Function to detect a new trigger word in the latest chunk of input audio.
    It looks for a rising edge (below-threshold sample immediately followed
    by an above-threshold sample) within the part of the predictions that
    belongs to the last/latest chunk.

    Argument:
    predictions -- predicted labels from model (any 1-D array-like of scores)
    chunk_duration -- time in second of a chunk
    feed_duration -- time in second of the input to model
    threshold -- scores strictly above this value count as positive

    Returns:
    True if new trigger word detected in the latest chunk, False otherwise
    (including when the predictions are empty or too short for the latest
    chunk to cover even one sample)
    """
    above = np.asarray(predictions) > threshold
    chunk_predictions_samples = int(len(above) * chunk_duration / feed_duration)
    # A window of zero samples must not fall through to `above[-0:]`, which
    # would select the WHOLE buffer and report edges from older chunks.
    if chunk_predictions_samples <= 0:
        return False
    chunk_predictions = above[-chunk_predictions_samples:]
    # Rising edge: previous sample negative, current sample positive.
    rising = ~chunk_predictions[:-1] & chunk_predictions[1:]
    return bool(rising.any())

# Record audio stream from mic

In [94]:
fs = 44100  # Microphone sampling rate, in samples per second.
chunk_duration = 1  # Seconds of audio delivered by each read from the mic.
# Seconds of audio handed to the model per prediction; has to be a whole
# multiple of chunk_duration.
feed_duration = 10

# The same two durations expressed as sample counts.
chunk_samples = int(chunk_duration * fs)
feed_samples = int(feed_duration * fs)

# The model window must hold an integer number of chunks.
assert (feed_duration / chunk_duration).is_integer()

In [95]:
def get_spectrogram(data):
    """
    Function to compute a spectrogram.

    Argument:
    data -- one channel / dual channel audio data as numpy array; for dual
    channel data (2-D) only the first column is used

    Returns:
    pxx -- spectrogram, 2-D array, columns are the periodograms of successive segments.

    Raises:
    ValueError -- if data is neither 1-D nor 2-D
    """
    nfft = 200 # Length of each window segment
    fs = 8000 # Sampling frequencies
    noverlap = 120 # Overlap between windows
    nchannels = data.ndim
    if nchannels == 1:
        samples = data
    elif nchannels == 2:
        samples = data[:, 0]
    else:
        # Previously this fell through and failed on an unbound `pxx`.
        raise ValueError(
            "audio data must be 1-D or 2-D, got %d dimensions" % nchannels)
    pxx, _, _ = mlab.specgram(samples, nfft, fs, noverlap=noverlap)
    return pxx

-

In [96]:
def plt_spectrogram(data):
    """
    Function to compute and plot a spectrogram.

    Argument:
    data -- one channel / dual channel audio data as numpy array; for dual
    channel data (2-D) only the first column is used

    Returns:
    pxx -- spectrogram, 2-D array, columns are the periodograms of successive segments.

    Raises:
    ValueError -- if data is neither 1-D nor 2-D
    """
    nfft = 200 # Length of each window segment
    fs = 8000 # Sampling frequencies
    noverlap = 120 # Overlap between windows
    nchannels = data.ndim
    if nchannels == 1:
        samples = data
    elif nchannels == 2:
        samples = data[:, 0]
    else:
        # Previously this fell through and failed on an unbound `pxx`.
        raise ValueError(
            "audio data must be 1-D or 2-D, got %d dimensions" % nchannels)
    # plt.specgram draws on the current axes and also returns the data.
    pxx, _, _, _ = plt.specgram(samples, nfft, fs, noverlap=noverlap)
    return pxx

...

### Audio stream

In [97]:
def get_audio_input_stream(callback):
    """
    Open a callback-driven, single-channel 16-bit input stream on device 0.

    Argument:
    callback -- function handed to PyAudio as the stream callback

    Returns:
    stream -- the object returned by PyAudio().open(); it samples at `fs`
    and uses buffers of `chunk_samples` frames
    """
    audio = pyaudio.PyAudio()
    stream_options = dict(
        format=pyaudio.paInt16,
        channels=1,
        rate=fs,
        input=True,
        frames_per_buffer=chunk_samples,
        input_device_index=0,
        stream_callback=callback,
    )
    return audio.open(**stream_options)

-

In [120]:
import pyaudio
from queue import Queue
from threading import Thread
import sys
import time


# Queue used to communicate between the audio callback (producer) and the
# sound() worker thread (consumer)
q = Queue()
# Set by sound() on the first detected trigger word; enables the camera loop.
activate=False
# Set by sound() when a trigger word is detected while already active;
# ends both sound() and the camera loop.
deactivate=False
# Cleared by callback() once `timeout` has passed; sound() loops while it is True.
run = True

# Chunks whose mean absolute amplitude is below this value are treated as
# silence by callback() and are not added to the buffer.
silence_threshold =500

# Run the demo for a limited time: absolute timestamp (60 seconds from now)
# after which callback() clears `run`
timeout = time.time() + 60

# Data buffer for the input waveform (starts as feed_samples of silence)
data = np.zeros(feed_samples, dtype='int16')

def callback(in_data, frame_count, time_info, status):
    """
    Stream callback: append each non-silent chunk to the rolling buffer.

    Argument:
    in_data -- raw bytes of the captured chunk, read as int16 samples
    frame_count, time_info, status -- supplied by the stream; unused here

    Returns:
    (in_data, pyaudio.paContinue) in every case

    Side effects: clears the global `run` once `timeout` has passed, writes
    '-' (silent chunk) or '.' (kept chunk) to stdout, and, when the buffer
    grows beyond feed_samples, trims it to the newest feed_samples samples
    and puts it on the queue `q`.
    """
    global run, data
    if time.time() > timeout:
        run = False
    chunk = np.frombuffer(in_data, dtype='int16')
    is_silent = np.abs(chunk).mean() < silence_threshold
    sys.stdout.write('-' if is_silent else '.')
    if not is_silent:
        data = np.append(data, chunk)
        if len(data) > feed_samples:
            data = data[-feed_samples:]
            # Process data async by sending it through the queue.
            q.put(data)
    return (in_data, pyaudio.paContinue)

# Open the microphone stream with callback() as its handler and start capturing.
stream = get_audio_input_stream(callback)
stream.start_stream()

def sound():
    """
    Worker loop: watch the audio queue for the trigger word.

    Loads the model, then, while the global `run` is True, takes audio
    buffers from `q`, computes their spectrogram and checks the model's
    predictions for a new trigger word. On every detection it sends
    'Alt+Tab' twice and writes '1' to stdout. The first detection sets the
    global `activate`; a detection while already active sets `deactivate`
    and ends the loop.

    The audio stream is stopped and closed exactly once when the function
    exits, whatever the reason.
    """
    from queue import Empty  # the file only imports Queue at top level

    global run, activate, deactivate
    try:
        model = load_model('./models/tr_model.h5')
        while run:
            try:
                # Poll with a timeout so that `run` is re-checked even when
                # callback() enqueues nothing (it skips silent chunks).
                data = q.get(timeout=1)
            except Empty:
                continue
            spectrum = get_spectrogram(data)
            preds = detect_triggerword_spectrum(spectrum, model)
            if not has_new_triggerword(preds, chunk_duration, feed_duration):
                continue
            keyboard.press_and_release('Alt+Tab')
            keyboard.press_and_release('Alt+Tab')
            sys.stdout.write('1')
            if activate:
                # Second trigger word: ask the camera loop to stop as well.
                deactivate = True
                break
            activate = True
    except (KeyboardInterrupt, SystemExit):
        run = False
    finally:
        # Single cleanup point (the old code stopped/closed the stream twice
        # after the except path and not at all on other errors).
        try:
            stream.stop_stream()
            stream.close()
        except OSError:
            # The main thread also stops/closes this stream when it exits;
            # PyAudio reports operations on a closed stream as OSError.
            pass
# Run the trigger-word listener in the background while this thread drives
# the camera-based controls.
t1 = Thread(target=sound,args=())
t1.start()
cap = cv2.VideoCapture(0)
font = cv2.FONT_HERSHEY_SIMPLEX
try:
    while True:
        # The camera is only processed after the first trigger word.
        if activate:
            ret, img = cap.read()
            if not ret:
                # No frame available: stop cleanly instead of crashing in
                # cv2.resize on a None image.
                sys.stderr.write('Could not read a frame from the camera; stopping.\n')
                break
            img=cv2.resize(img,(1200,800))
            img=cv2.flip(img,1)
            # Draw both control boxes in red; a box is redrawn in green
            # below when a hand is detected inside it.
            cv2.rectangle(img, (0,400), (400, 800), (0,0,255), 4)
            cv2.putText(img,'Break',(0,400), font, 2,(0,0,255),4)
            cv2.rectangle(img, (800, 400), (1200, 800), (0,0,255), 4)
            cv2.putText(img,'Gas',(800,400), font, 2,(0,0,255),4)
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            break_box=gray[400:800,0:400]
            acc_box=gray[400:800,800:1200]
            # The right-hand box is mirrored before detection (presumably
            # because the cascade is trained on one hand orientation).
            acc_box=cv2.flip(acc_box,1)
            break_true = len(hand_cascade.detectMultiScale(break_box, 1.1, 2))>0
            acc_true = len(hand_cascade.detectMultiScale(acc_box, 1.1, 2))>0
            if break_true:
                cv2.rectangle(img, (0,400), (400, 800), (0,255,0), 4)
                cv2.putText(img,'Break',(0,400), font, 2,(0,255,0),4)
                keyboard.press('Left Arrow')
            else:
                keyboard.release('Left Arrow')
            if acc_true:
                cv2.rectangle(img, (800,400), (1200, 800), (0,255,0), 4)
                cv2.putText(img,'Gas',(800,400), font, 2,(0,255,0),4)
                keyboard.press('Right Arrow')
            else:
                keyboard.release('Right Arrow')
            img=cv2.resize(img,(400,300))
            cv2.imshow('game',img)
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
        if deactivate:
            break
finally:
    # Always give back the camera and windows, even if the loop raised.
    cap.release()
    cv2.destroyAllWindows()
    # Do not leave an arrow key held down after the demo ends.
    keyboard.release('Left Arrow')
    keyboard.release('Right Arrow')
    try:
        stream.stop_stream()
        stream.close()
    except OSError:
        # sound() closes the stream too when it finishes; PyAudio reports
        # operations on a closed stream as OSError.
        pass

...---



---.1-..-----..1

In [117]:
# Manual cleanup cell: stop and close the audio stream, release the camera
# and close any OpenCV windows (presumably for use after interrupting the
# demo cell above).
stream.stop_stream()
stream.close()
cap.release()
cv2.destroyAllWindows()