In [1]:
import tflite_runtime.interpreter as tflite
from time import time
import numpy as np
#import tensorflow as tf

# TFLITE_FILE_PATH = 'logmelcalc.tflite'

logmelcalc_interpreter = tflite.Interpreter(model_path="logmelcalc.tflite")
logmelcalc_interpreter.allocate_tensors()

input_index = logmelcalc_interpreter.get_input_details()[0]["index"]
output_details = logmelcalc_interpreter.get_output_details()

s=time()
for i in range(1000):
    t = np.expand_dims(np.random.random(16000).astype("float32"),axis=0)
    logmelcalc_interpreter.set_tensor(input_index,t)
    logmelcalc_interpreter.invoke()
    output_data = logmelcalc_interpreter.get_tensor(output_details[0]['index'])
print("tflite optimized:",(time()-s)/1000)

# s = time()
# for i in range(1000):
#     q=logmelcalc.predict(t)
# print("No optimization:",(time()-s)/1000)

tflite optimized: 0.00296718168258667


In [2]:
baseModel_interpreter = tflite.Interpreter(model_path="./baseModel_revised.tflite")
baseModel_interpreter.allocate_tensors()

base_model_inp = baseModel_interpreter.get_input_details()
base_model_out = baseModel_interpreter.get_output_details()

s=time()
for i in range(1000):
    baseModel_interpreter.set_tensor(
        base_model_inp[0]["index"],
        np.expand_dims(np.random.random((98,64)),axis=(0,-1)).astype("float32")
    )
#     t = np.expand_dims(np.random.random(16000).astype("float32"),axis=0)
#     baseModel_interpreter.set_tensor(input_index,t)
    baseModel_interpreter.invoke()
    output_data = baseModel_interpreter.get_tensor(base_model_out[0]['index'])
print("tflite optimized:",(time()-s)/1000)


tflite optimized: 0.006154918909072876


In [3]:
def audioToVector(inpAudio:np.array) -> np.array :
    
    logmelcalc_interpreter.set_tensor(input_index,np.expand_dims(inpAudio/inpAudio.max(),axis=0).astype("float32"))
    logmelcalc_interpreter.invoke()
    logmel_output = logmelcalc_interpreter.get_tensor(output_details[0]['index'])
    baseModel_interpreter.set_tensor(
        base_model_inp[0]["index"],
        np.expand_dims(logmel_output,axis=(0,-1)).astype("float32")
    )
    baseModel_interpreter.invoke()
    output_data = baseModel_interpreter.get_tensor(base_model_out[0]['index'])
    
    return output_data


In [4]:

def randomCrop(x:np.array,length=16000)->np.array :
    assert(x.shape[0]>length)
    frontBits = random.randint(0,x.shape[0]-length) 
    return x[frontBits:frontBits+length]

def addPadding(x:np.array,length=16000)->np.array :
    assert(x.shape[0]<length)
    bitCountToBeAdded = length - x.shape[0]
    frontBits = random.randint(0,bitCountToBeAdded)
    #print(frontBits, bitCountToBeAdded-frontBits)
    new_x = np.append(np.zeros(frontBits),x)
    new_x = np.append(new_x,np.zeros(bitCountToBeAdded-frontBits))
    return new_x

def removeExistingPadding(x:np.array)->np.array:
    lastZeroBitBeforeAudio = 0 
    firstZeroBitAfterAudio = len(x)
    for i in range(len(x)):
      if x[i]==0:
        lastZeroBitBeforeAudio = i
      else:
        break
    for i in range(len(x)-1,1,-1):
      if x[i]==0:
        firstZeroBitAfterAudio = i
      else:
        break
    return x[lastZeroBitBeforeAudio:firstZeroBitAfterAudio]

def fixPaddingIssues(x:np.array,length=16000)-> np.array:
    x = removeExistingPadding(x)
    #print("Preprocessing Shape",x.shape[0])
    if(x.shape[0]>16000):
      return randomCrop(x,length=length)
    elif(x.shape[0]<16000):
      return addPadding(x,length=length)
    else:
      return x


In [56]:
from os import listdir
from os.path import isdir
import librosa

WAKE_WORD = "wakewords/google"

assert(isdir(WAKE_WORD))

import random
import numpy as np

sound_files = sorted([ WAKE_WORD+"/"+x for x in listdir(WAKE_WORD) if ".mp3" in x ])

uk_embeddings = []
us_embeddings = []

embeddings = []

for sound_file in sound_files:
    x , _ = librosa.load(sound_file,sr=16000)
    x = fixPaddingIssues(x)
    embeddings.append(audioToVector(x)[0])
embeddings = np.array(embeddings)



In [55]:
x , _ = librosa.load("reference_audios/python/python-chittu1.wav",sr=SAMPLE_RATE)
t=augumentAudio(x[:SAMPLE_RATE if len(x)>SAMPLE_RATE else -1],sample_rate=SAMPLE_RATE)
print(t.shape)
ipd.Audio(augumentAudio(x,sample_rate=SAMPLE_RATE),rate=SAMPLE_RATE)

(16000,)


In [58]:
import pyaudio
import numpy as np
from time import time, sleep
# import cv2
from IPython.display import clear_output
from matplotlib import pyplot as plt
import IPython.display as ipd
import librosa.display

CHUNK = 2000
RATE = 16000
SECOND = 30
p=pyaudio.PyAudio()
stream=p.open(format=pyaudio.paInt16,channels=1,rate=RATE,input=True,
              frames_per_buffer=CHUNK)

inpAudio = np.zeros(RATE)

print("Start Speaking")
#for i in range(int(SECOND*RATE/CHUNK)): #go for a few seconds

def mapToConfidence(array: np.array, threshold = 0.25):
    array[array>threshold] = threshold
    conf = (threshold - array)/threshold
    
    out = 0.0
    for i in conf :
        out += (1-out)*i
    return out

while True :
    data = np.frombuffer(stream.read(CHUNK),dtype=np.int16)
    inpAudio = np.append(inpAudio[CHUNK:],data)
    #copy = inpAudio.copy()
        #dist = np.linalg.norm(reference_embedding,realtime_embedding)        
    """
    realtime_embedding = audioToVector(inpAudio)
    realtime_dist = np.sqrt(np.sum((mean_embeddings - realtime_embedding)**2,axis=1))
    #print(realtime_dist)
    if((realtime_dist<thresholds).any()):
        print("Yes",np.min(realtime_dist),np.argmin(realtime_dist))
    """
    
    
    #realtime_dist = np.sqrt(np.sum((embeddings - realtime_embedding)**2,axis=1))
    #    #print(realtime_dist)
    #if((realtime_dist<0.1).any()):
    #    print("Yes",np.min(realtime_dist),np.argmin(realtime_dist))
    
    copy = inpAudio.copy()
    upperPoint = max( 
        (
            inpAudio/inpAudio.max()
        )[:1600]
    )
    if (upperPoint <= 0.2):
        realtime_embedding = audioToVector(inpAudio)
        realtime_dist = np.sqrt(np.sum((embeddings - realtime_embedding)**2,axis=1))
        confidence = mapToConfidence(np.sort(realtime_dist)[:3])
        #print(confidence)
        if(confidence>0.85):
            print("Yep",confidence)
        
    
print(output_data.shape)


Start Speaking
Yep 0.8555049502505461
Yep 0.9464780198452638
Yep 0.9846356966552527
Yep 0.9358076319176261
Yep 0.8572522695985985
Yep 0.9012858558609542
Yep 0.883693771515393
Yep 0.925247088284492
Yep 0.9326274360112189
Yep 0.9293416630641613
Yep 0.8676957512217858
Yep 0.8862979279136987
Yep 0.9361186503545275
Yep 0.9124970385241923


KeyboardInterrupt: 