In [None]:
"""將tensorflow檔轉成tensorflow lite，使機器負載量變小"""
from tensorflow import lite
from tensorflow.keras import models

# Parameters
keras_model_filename = './h5_normalize/recording9_fbank.h5' #訓練好的模型
tflite_filename = './tflite_normalize/recording9_fbank.tflite' #預建置檔案

# Convert model to TF Lite model
model = models.load_model(keras_model_filename) #載入本來的模型
converter = lite.TFLiteConverter.from_keras_model(model) #將模型載入轉換器
tflite_model = converter.convert() #進行轉換
open(tflite_filename, 'wb').write(tflite_model) #輸出轉換後的模型


In [6]:
#即時
import sounddevice as sd
import numpy as np
import scipy.signal
import python_speech_features
import tensorflow as tf
import librosa
#import RPi.GPIO as GPIO

# Parameters
word_threshold = 0.7 #預測值>0.7
rec_duration = 0.5 #每一段錄音持續時間
sample_rate = 16000 #取樣率(依MIC不同而改變)
num_channels = 1 #音訊深度
model_path = './tflite_normalize/recording9_fbank.tflite'
words = ['backgroundNoise', 'ㄏㄧㄡ', 'ㄟ', '他', '你', '其他', '吼', '啦', '嗯', '好', '我', '的', '著', '那', '阿']

# Sliding window
window = np.zeros(8000)#取樣音頻數據變數

# GPIO 
#GPIO.setwarnings(False)
#GPIO.setmode(GPIO.BOARD)
#GPIO.setup(8, GPIO.OUT, initial=GPIO.LOW)

# Load model (interpreter)
interpreter = tf.lite.Interpreter(model_path)
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# This gets called every 0.5 seconds
def sd_callback(rec, frames, time, status):

    #GPIO.output(led_pin, GPIO.LOW)
    
    # Notify if errors
    if status:
        print('Error:', status)
    
    # Remove 2nd dimension from recording sample
    #壓縮成1D張量
    rec = np.squeeze(rec)

    #將音訊輸入到window
    window = rec
    S = np.abs(librosa.stft(window)) #將整個window音訊做stft，並轉成絕對值
    
    if np.sum(S) >= 3500: #判斷S的總和是否>=3500，如果>=3500，代表有講話
        
        window = window.astype(np.float)

        window = (window - window.mean()) / (window.max() - window.min())
    
        # Compute features
        features = python_speech_features.base.logfbank(window,
                                                        samplerate=16000,
                                                        winlen=0.025,
                                                        winstep=0.01,
                                                        nfilt=26,
                                                        nfft=512,
                                                        lowfreq=0,
                                                        highfreq=None,
                                                        preemph=0.97)
        
        # Make prediction from model
        in_tensor = np.float32(features.reshape(1, features.shape[0], features.shape[1], 1))
        #設定輸入張量
        interpreter.set_tensor(input_details[0]['index'], in_tensor)
        #進行預測
        interpreter.invoke()
        #取得輸出張量
        output_data = interpreter.get_tensor(output_details[0]['index'])
        
        val = output_data[0]#取得預測值
        val = val.tolist() #np.ndarray to list
        list_val_max = max(val) #取得最大值
        list_val_maxIndex = val.index(max(val)) #取得最大值的索引  
        
        if(list_val_max > word_threshold):
            print(words[list_val_maxIndex])#輸出相對應的字詞
            print("MAX:" + str(list_val_max))#輸出預測值當中最大的值
    
# Start streaming from microphone
with sd.InputStream(channels=num_channels,
                    samplerate=sample_rate,
                    blocksize=int(sample_rate * rec_duration),
                    callback=sd_callback):
    while True:
        
        pass


backgroundNoise
MAX:0.9948199391365051
的
MAX:0.9882595539093018
backgroundNoise
MAX:0.9999996423721313
backgroundNoise
MAX:0.9775601625442505
嗯
MAX:0.9849178194999695
你
MAX:0.9983115196228027
其他
MAX:0.829176664352417
backgroundNoise
MAX:0.9903884530067444
backgroundNoise
MAX:1.0
backgroundNoise
MAX:0.9999833106994629
backgroundNoise
MAX:0.9870778322219849
其他
MAX:0.7135568261146545
你
MAX:0.8458034992218018
其他
MAX:0.9446187019348145
你
MAX:0.9990611672401428
你
MAX:0.8070201277732849
ㄟ
MAX:0.8645926117897034
backgroundNoise
MAX:0.9998080134391785
ㄏㄧㄡ
MAX:0.9747885465621948
我
MAX:0.9314422607421875
ㄟ
MAX:0.9985009431838989
你
MAX:0.9999996423721313
你
MAX:0.9999996423721313
backgroundNoise
MAX:0.7746793627738953
backgroundNoise
MAX:0.9893921613693237
ㄏㄧㄡ
MAX:0.9856022000312805
其他
MAX:0.8326053619384766
嗯
MAX:1.0
那
MAX:0.9875824451446533
其他
MAX:0.9861457943916321
backgroundNoise
MAX:0.9979947805404663
ㄟ
MAX:0.9999992847442627
ㄟ
MAX:0.9999915361404419
backgroundNoise
MAX:0.9992175102233887
ㄟ
MA

KeyboardInterrupt: 

In [18]:
#非即時
import numpy as np
import scipy.signal
import python_speech_features
import tensorflow as tf
import librosa

def sound(window,s,m):

    S = np.abs(librosa.stft(window)) #將整個window音訊做stft，並轉成絕對值
    
    if np.sum(S) >= 3500: #判斷S的總和是否>=3500，如果>=3500，代表有明顯的聲音
        window = window.astype(np.float)

        window = (window - window.mean()) / (window.max() - window.min())

        features = python_speech_features.base.logfbank(window,
                                                        samplerate=16000,
                                                        winlen=0.025,
                                                        winstep=0.01,
                                                        nfilt=26,
                                                        nfft=512,
                                                        lowfreq=0,
                                                        highfreq=None,
                                                        preemph=0.97)
        
        # Make prediction from model
        in_tensor = np.float32(features.reshape(1, features.shape[0], features.shape[1], 1))
        #設定輸入張量
        interpreter.set_tensor(input_details[0]['index'], in_tensor)
        #進行預測
        interpreter.invoke()
        #取得輸出張量
        output_data = interpreter.get_tensor(output_details[0]['index'])
        
        val = output_data[0]#取得預測值
        val = val.tolist() #np.ndarray to list
        list_val_max = max(val) #取得最大值
        list_val_maxIndex = val.index(max(val)) #取得最大值的索引  
        
        if(list_val_max > 0.7):
            print(words[list_val_maxIndex] + " " + str(m) + "分" + str(s) + "秒" + " 預測值:" + str(list_val_max))#輸出相對應的字詞
        #if(best_candidate_probability > 0.7): # treshold
        #    if(str(all_targets[best_candidate_index]) != "backgroundNoise"):
        #        data.append(str(all_targets[best_candidate_index]) + " " + str(m) + "分" + str(s) + "秒" + " 預測值:" + str(best_candidate_probability)) 
        
#main
# Parameters
model_path = './tflite_normalize/recording9_fbank.tflite'
words = ['backgroundNoise', 'ㄏㄧㄡ', 'ㄟ', '他', '你', '其他', '吼', '啦', '嗯', '好', '我', '的', '著', '那', '阿']

data = []
start = 0 #一開始的索引值
end = 4000 #一開始的索引值
s = 0 #秒
m = 0 #分
duration = 60 #讀音檔的總時間
sample_rate = 16000 #取樣率

#載入音檔
y, sr = librosa.load("./TestWAV/TEST.wav",sr=sample_rate) 

#用0填滿時間，以讓滑動視窗都有東西
total_sec = len(y) / 16000 #算音檔總時間
if total_sec < 1: #如果音檔<1s
    new_y = np.zeros(16000) #new_y補足1s
    new_y[0:len(y)] = y #將原先的音訊載入到新的陣列
elif total_sec - (int(total_sec)) != 0: #如果音檔非整數秒，ex:原先音訊長度1.2s ，1.2 - 1 != 0 ， 將長度補到2s
    total_sec = (int(total_sec) + 1) #新時間為原時間整數+1
    new_y = np.zeros(total_sec * 16000) #補足到整數秒
    new_y[0:len(y)] = y #將原先的音訊載入到新的陣列
else:
    new_y = y

# Sliding window
window = np.zeros(8000)#取樣音頻數據變數

# Load model (interpreter)
interpreter = tf.lite.Interpreter(model_path)
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

while True:
    s = s + 0.25 #增加秒數
    if(s >= 60): #60秒 轉成 1分
        s = 0
        m = m + 1

    window[:4000] = window[4000:] #把音訊載入window
    window[4000:] = new_y[start:end] #把音訊載入window
    
    sound(window,s,m) #呼叫sound()

    if(end == len(new_y)): #如果移動到最後，break
        break
    
    start = start + 4000 #向後移動
    end = end + 4000 #向後移動    

嗯 0分0.5秒 預測值:0.9999884366989136
嗯 0分0.75秒 預測值:1.0
嗯 0分1.0秒 預測值:0.9999039173126221
