In [24]:
import sounddevice as sd
import numpy as np
import scipy.signal
import python_speech_features
import tensorflow as tf
from PyQt5.QtCore import QTime
import librosa
from tensorflow.keras import layers, models

# Called by the driver loop in this file once per hop (every 0.25 s of audio),
# each time with the most recent 0.5 s (8000 samples at 16 kHz) of signal.
def sound(window, s, m, energy_threshold=500, confidence_threshold=0.7):
    """Classify one audio window and record a confident keyword detection.

    The window is first gated on spectral energy. If it is loud enough, it is
    normalised, converted to log mel-filterbank features, and passed to the
    module-level ``model``. When the top class is confident enough and is not
    ``"backgroundNoise"``, a formatted result line is appended to the
    module-level ``data`` list.

    Args:
        window: 1-D NumPy array of audio samples (not modified; a float copy
            is made before normalisation).
        s: Seconds part of the timestamp written into the result line.
        m: Minutes part of the timestamp written into the result line.
        energy_threshold: Minimum sum of STFT magnitudes for the window to be
            treated as containing audible sound.
        confidence_threshold: The normalised top-class probability must be
            strictly greater than this for a detection to be recorded.

    Returns:
        None. Detections are reported by appending to ``data``.
    """
    # Energy gate: take the STFT magnitude of the whole window and sum it.
    # Written as "not >=" so a NaN total is skipped, exactly as before.
    spectrum = np.abs(librosa.stft(window))
    if not np.sum(spectrum) >= energy_threshold:
        return

    # np.float was removed in NumPy 1.24; np.float64 is the type it aliased.
    samples = window.astype(np.float64)

    # A perfectly flat window has zero range; dividing by it would only yield
    # NaN features, so there is nothing to classify.
    span = samples.max() - samples.min()
    if span == 0:
        return

    # Remove the DC offset and scale by the peak-to-peak range.
    samples = (samples - samples.mean()) / span

    features = python_speech_features.base.logfbank(
        samples,
        samplerate=16000,
        winlen=0.025,
        winstep=0.01,
        nfilt=26,
        nfft=512,
        lowfreq=0,
        highfreq=None,
        preemph=0.97,
    )

    # Reshape (frames, filters) -> (1, frames, filters, 1) for the model input.
    features = np.float32(
        features.reshape(1, features.shape[0], features.shape[1], 1)
    )

    # One score per label; the class count follows all_targets instead of a
    # hard-coded 14 so the two cannot drift apart silently.
    prediction = model.predict(features).reshape((len(all_targets),))
    prediction /= prediction.sum()

    best_candidate_index = prediction.argmax()
    best_candidate_probability = prediction[best_candidate_index]
    label = str(all_targets[best_candidate_index])

    if best_candidate_probability > confidence_threshold and label != "backgroundNoise":
        # !s forces str() on the NumPy scalar so the printed precision matches
        # plain string concatenation.
        data.append(f"{label} {m}分{s}秒 預測值:{best_candidate_probability!s}")

# main
# Parameters
model_path = './h5/recordingTest.h5'
all_targets = ['backgroundNoise', 'ㄏㄧㄡ', 'ㄟ', '他', '你', '吼', '啦', '嗯', '好', '我', '的', '著', '那', '阿']

data = []  # result lines appended by sound()
data_ho = []
hop = 4000  # samples consumed per step
start = 0  # index of the first sample of the current hop
end = hop  # index one past the last sample of the current hop
s = 0  # seconds part of the running timestamp
m = 0  # minutes part of the running timestamp
duration = 6.5  # maximum number of seconds to read from the audio file
sample_rate = 16000  # sampling rate in Hz
step_seconds = hop / sample_rate  # 0.25 s of audio per hop

# Load the audio file (resampled to sample_rate, truncated to duration).
y, sr = librosa.load("./TestWAV/B.wav", sr=sample_rate, duration=duration)

# Sliding window holding the previous hop followed by the current hop.
window = np.zeros(2 * hop)

# Load model
model = models.load_model(model_path)


# Walk the signal one full hop at a time. Bounding the loop by len(y), rather
# than comparing `end` with 16000 * duration, keeps it correct when the file
# is shorter than `duration` or its length is not a whole number of hops; a
# trailing partial hop is ignored.
while end <= len(y):
    s = s + step_seconds  # advance the timestamp
    if s >= 60:  # roll 60 seconds over into one minute
        s = 0
        m = m + 1

    window[:hop] = window[hop:]  # shift the previous hop to the front
    window[hop:] = y[start:end]  # append the new hop

    sound(window, s, m)

    if end + hop > len(y):  # no further full hop available
        break

    start = start + hop  # slide forward
    end = end + hop  # slide forward


for line in data:
    print(line)

ㄟ 0分0.75秒 預測值:0.99052024
你 0分1.5秒 預測值:0.9999995
你 0分1.75秒 預測值:0.9999027
著 0分2.0秒 預測值:0.99904364
你 0分2.5秒 預測值:0.9128454
著 0分3.75秒 預測值:0.99880385
你 0分4.25秒 預測值:0.7080329
你 0分4.5秒 預測值:0.99999905
你 0分4.75秒 預測值:0.96504885
你 0分5.25秒 預測值:0.9996731
嗯 0分6.0秒 預測值:0.99992204
啦 0分6.25秒 預測值:0.9999999
他 0分6.5秒 預測值:0.9546656
