1. Record an audio clip, pre-process it, and run inference in the cloud.
2. Create the input-tensor dump of that audio.
3. Feed the input tensor to the Arduino Nano 33.
4. Run inference on a resource-constrained embedded system; here we use an Arduino Nano 33 BLE with TinyML.

Set up the necessary libraries

In [1]:
import os
import zipfile
import tensorflow as tf
import numpy as np
import seaborn as sns
import pathlib
from IPython import display
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report

def squeeze(audio, labels):
    """Remove the last axis of ``audio`` and hand ``labels`` back untouched.

    Args:
        audio: Tensor whose last axis is squeezed away (``axis=-1``).
        labels: Passed through unchanged.

    Returns:
        A ``(squeezed_audio, labels)`` tuple.
    """
    return tf.squeeze(audio, axis=-1), labels

def get_spectrogram(waveform):
    """Convert a waveform into a magnitude spectrogram with a channel axis.

    A short-time Fourier transform is taken with a 255-sample frame and a
    128-sample step; only the magnitude is kept, and a new trailing axis is
    appended to the result.

    Args:
        waveform: Audio samples accepted by ``tf.signal.stft``.

    Returns:
        The STFT magnitude with one extra axis of size 1 at the end.
    """
    stft = tf.signal.stft(waveform, frame_length=255, frame_step=128)
    magnitude = tf.abs(stft)
    return tf.expand_dims(magnitude, axis=-1)

# Dimension reduction: the spectrogram (noted as [124, 129, 1] for a 1 s clip)
# is resized to [64, 64, 1] before it is fed to the model.
resizing_layer = tf.keras.layers.Resizing(height=64, width=64)

# Normalisation applied to the resized spectrogram.
# NOTE(review): no adapt() call or explicit mean/variance is visible in this
# export; confirm the layer carries the same statistics used during training.
normalise_layer = tf.keras.layers.Normalization()

In [3]:
# Load the TFLite model used for inference.
# The input/output detail lists are read once here; Predict_label() later
# uses their "index" entries to address the interpreter's tensors.

interpreter = tf.lite.Interpreter(model_path="SC_LiteModel_notOpt.tflite")
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
# Allocate tensor buffers before any set_tensor()/invoke() call.
interpreter.allocate_tensors()

In [4]:
# Expected audio sample: mono, 1 s long, 16 kHz sample rate, .wav, PCM 16-bit.
# Class names, looked up by the index of the highest score in the model output.
label_names = np.array(
    [
        "Forward",
        "Reverse",
        "Unknown",
    ]
)

def Get_AudioTensor(path):
    """Load a WAV file and build the input tensor for the model.

    The file is decoded as mono audio of exactly 16000 samples, played back
    inline in the notebook, converted to a spectrogram, resized, normalised
    and given a leading batch axis.

    Args:
        path: Location of the .wav file (anything ``str()`` can render).

    Returns:
        The pre-processed spectrogram with a batch axis of size 1 in front.

    Warns:
        UserWarning: if the sample rate stored in the file is not 16000 Hz.
            The clip is still processed, but the fixed-length decode and the
            playback rate then no longer correspond to one second of audio.
    """
    import warnings

    expected_rate = 16000  # Hz; also the number of samples kept (1 s clip)

    wav_bytes = tf.io.read_file(str(path))
    waveform, file_rate = tf.audio.decode_wav(
        wav_bytes, desired_channels=1, desired_samples=expected_rate
    )

    # Previously the decoded rate was silently discarded, so a recording made
    # at another rate produced a meaningless prediction without any hint.
    if int(file_rate) != expected_rate:
        warnings.warn(
            f"{path} has sample rate {int(file_rate)} Hz, "
            f"expected {expected_rate} Hz"
        )

    # Drop the channel axis directly; no placeholder label is needed here.
    audio = tf.squeeze(waveform, axis=-1)
    display.display(display.Audio(audio, rate=expected_rate))

    x = get_spectrogram(audio)
    x = resizing_layer(x)
    x = normalise_layer(x)
    return tf.expand_dims(x, axis=0)

def Predict_label(input_tensor):
    """Run the loaded TFLite interpreter once and return its first output.

    Args:
        input_tensor: Value written into the interpreter's first input tensor.

    Returns:
        The contents of the interpreter's first output tensor after invoke().
    """
    input_index = input_details[0]["index"]
    output_index = output_details[0]["index"]

    interpreter.set_tensor(input_index, input_tensor)
    interpreter.invoke()
    return interpreter.get_tensor(output_index)

def RunInference(path):
    """Pre-process the audio at ``path``, run inference and report the result.

    Prints the input and output tensor shapes, the raw scores of the first
    output row, and the label whose score is highest.

    Args:
        path: Location of the audio file handed to Get_AudioTensor().

    Returns:
        An ``(input_tensor, output_tensor)`` tuple.
    """
    features = Get_AudioTensor(path)
    print("Input Tensor shape : ",features.shape)

    scores = Predict_label(features)
    print("Output Tensor shape : ",scores.shape)

    first_row = scores[0]
    print("Output tensor - ",first_row)

    best_index = np.argmax(first_row)
    print("Prediction - ",label_names[best_index])

    return features, scores

In [5]:
# Get the c array of the Input_tensor and get it stored in a hex file - For memory dumping in Arduino
import binascii
import struct

def convert_to_c_array(bytes) -> str:
    """Render a bytes-like object as the body of a C byte-array initializer.

    Every byte becomes an upper-case ``0xNN`` token. Tokens are separated by
    ", " and wrapped ten per row, with rows joined by ",\\n  ".

    Args:
        bytes: Bytes-like object to render.

    Returns:
        The formatted token list; an empty string for empty input.
    """
    hex_text = binascii.hexlify(bytes).decode("UTF-8").upper()

    # hexlify always yields two hex digits per byte, so pair them up.
    tokens = [
        "0x" + high + low
        for high, low in zip(hex_text[0::2], hex_text[1::2])
    ]

    rows = []
    for start in range(0, len(tokens), 10):
        rows.append(", ".join(tokens[start:start + 10]))
    return ",\n  ".join(rows)

def StoreAudioTensor(TensorInput):
    """Dump a tensor as raw float32 bytes into ./AudioInputTensor.h.

    The values are written in row-major order as little-endian float32, one
    ``0xNN`` token per byte and four values (16 bytes) per line, inside a
    ``const unsigned char Audio_0 [] PROGMEM`` definition. A single ``0x00``
    byte is appended after the data to terminate the initializer list, so the
    C array is one byte longer than the tensor data.

    Note that StoreAudioTensorInFloat() writes to the same file name, so
    whichever of the two runs last determines the file's content.

    Args:
        TensorInput: Tensor or array-like of numeric values, of any rank.

    Returns:
        None. The header file is (over)written in the working directory.
    """
    # One bulk conversion replaces per-element tensor indexing; "<f4" gives
    # the same little-endian float32 bytes as struct.pack('<f', value).
    raw = np.asarray(TensorInput, dtype="<f4").tobytes()

    bytes_per_line = 16  # four float32 values per output line
    pieces = []
    for start in range(0, len(raw), bytes_per_line):
        chunk = raw[start:start + bytes_per_line]
        pieces.append("".join(f"0x{value:02X}, " for value in chunk))
        # Only completed groups of four values are followed by a line break.
        if len(chunk) == bytes_per_line:
            pieces.append("\n ")
    c_array = "".join(pieces)

    c_file = "const unsigned char Audio_0 [] PROGMEM = {\n " + c_array + "0x00\n};"
    print("writing into ./AudioInputTensor.h")
    # Context manager guarantees the header is flushed and closed.
    with open("AudioInputTensor.h", "w") as header:
        header.write(c_file)
    print("Done")

def StoreAudioTensorInFloat(TensorInput):
    """Dump a tensor as C float literals into ./AudioInputTensor.h.

    The values are written in row-major order, four per line, inside a
    ``const float Audio_0 []`` definition. A single ``0.0f`` is appended after
    the data to terminate the initializer list, so the C array holds one more
    element than the tensor.

    Each literal is the shortest decimal text that reads back as exactly the
    same float32 value. The earlier ``np.array2string`` formatting capped the
    output at 8 fractional digits, which dropped significant digits for small
    magnitudes and made the dumped input differ from the tensor that was
    actually fed to the model.

    Note that StoreAudioTensor() writes to the same file name, so whichever of
    the two runs last determines the file's content.

    Args:
        TensorInput: Tensor or array-like of numeric values, of any rank.

    Returns:
        None. The header file is (over)written in the working directory.

    Raises:
        ValueError: if any value is NaN or infinite, because such values have
            no plain C float literal and would yield an uncompilable header.
    """
    # One bulk conversion replaces per-element tensor indexing.
    values = np.asarray(TensorInput, dtype=np.float32).ravel()
    if not np.all(np.isfinite(values)):
        raise ValueError(
            "TensorInput contains NaN or infinite values, "
            "which cannot be written as C float literals"
        )

    pieces = []
    for position, value in enumerate(values, start=1):
        # str() of a float32 scalar always carries a '.' or an exponent,
        # so appending 'f' yields a valid C literal.
        pieces.append(str(value) + "f, ")
        if position % 4 == 0:
            pieces.append("\n ")
    c_array = "".join(pieces)

    c_file = "const float Audio_0 [] = {\n " + c_array + "0.0f\n};"
    print("writing into ./AudioInputTensor.h")
    # Context manager guarantees the header is flushed and closed.
    with open("AudioInputTensor.h", "w") as header:
        header.write(c_file)
    print("Done")

In [9]:
# Pick one recorded sample and run it through the inference pipeline.
AudioPath = "live_test3.wav"

AudioTensor, LabelTensor = RunInference(AudioPath)
# Alternative dump (raw bytes instead of float literals), left disabled:
# StoreAudioTensor(AudioTensor)

Input Tensor shape :  (1, 64, 64, 1)
Output Tensor shape :  (1, 3)
Output tensor -  [3.6554030e-01 6.2328955e-04 6.3383633e-01]
Prediction -  Unknown


In [10]:
# Write the input tensor from the inference run as C float literals
# into ./AudioInputTensor.h.
StoreAudioTensorInFloat(AudioTensor)

writing into ./AudioInputTensor.h
Done
