
| 類別               | 數量 (原始) | 數量 (擴增後的訓練集) | 數量 (擴增後的驗證集)                    |
|--------------------|--------------|------------------------|------------------------------------------|
| 0: Environment     | 1400           | 4142                    | 20                                       |
| 1: en_help         | 169           | 512                    | 30                                       |
| 2: ch_help         | 493           | 1520                    | 30                                       |
| 3: ja_help         | 86          | 203                    | 30                                       |
| 4: tw_help         | 278           | 828                    | 40                            |
| 5: alarm           | 64          | 214                    | 31                                       |


- 先將粵語與客語的移除 模型整體收斂的更好 模型效能也有增加
- 還有許多人們在聊天的環境音

<!-- Counter({0: 1400, 2: 493, 4: 278, 7: 201, 1: 169, 3: 86, 5: 64, 6: 30}) -->

In [54]:
import numpy as np
import os
import tensorflow as tf
import soundfile as sf
from IPython.display import Audio
# from scipy.io import wavfile
import random
import librosa


In [55]:
sr = 16000

model_path = "/home/sail/sound_project/sound_ai_v3/sys/step_5_conv_int8/save_tflite_model_s5/20241029_18/6C_model_v3qat_model_uec_v2.2_sail1014_20241029183111.tflite"

labels = ['other', 'en_help', 'ch_help', 'ja_help', 'tw_help', 'alarm']

In [56]:
# def normalize(audio, audio_max_value=max_value): max_value = 1.4 #.47121312618255615
#     """Normalize the audio data to the range of -1 to 1."""
#     return audio / float(audio_max_value)

def normalize(audio):
    """Normalize the audio data to the range of -1 to 1."""
    max_val = audio.max()
    min_val = audio.min()
    if max_val == min_val:
        return np.zeros_like(audio) 
    normalized_sound = 2 * (audio - min_val) / (max_val - min_val) - 1
    return normalized_sound

def preprocess_setup():
    funcs = [lambda x: normalize(x)]
    return funcs

def preprocess(sound, funcs):
    for f in funcs:
        sound = f(sound)
    return sound

def quantize_input(data, input_tensor_info):
    """Quantize data from float to int8 using the scale and zero_point."""
    scale, zero_point = input_tensor_info['quantization']
    if scale > 0:
        data = data / scale + zero_point
        data = np.round(data).astype(np.int8)  # Ensure conversion to INT8
    return data

def softmax(logits):
    """Convert logits to probability values."""
    exp_logits = np.exp(logits - np.max(logits))
    return exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

def padding_zero(audio, sr=sr, seconds=1, pad_type='a'):
    if len(audio) < sr*seconds:
        if pad_type=='ab':
            total_padding = sr*seconds - len(audio)
            return np.pad(audio, (total_padding // 2, total_padding - (total_padding // 2)), 'constant', constant_values=(0, 0))
        elif pad_type=='a':
            total_padding = sr*seconds - len(audio)
            return np.pad(audio, (0, total_padding), 'constant', constant_values=(0, 0))
    else:
        return clip_1s(audio)
    
def clip_1s(audio, sr=sr, type='start'):
    if type =='start':
        return audio[:sr]
    elif type == 'end':
        return audio[-sr:]
    elif type == 'random':
        start = random.randint(0, len(audio) - sr)
        return audio[start:start+sr]
    else:
        # return audio[type:type+sr]
        raise ValueError('type must be start, end or random.')    
    
def load_data(wav_path, sr=sr, type='librosa'):
    if type == 'librosa':
        return librosa.load(wav_path, sr=sr)[0]
    elif type == 'wavfile':
        return wavfile.read(wav_path)[1]    

In [57]:
audio_data_list = []
def main(audio_file):

    audio_data_ = load_data(audio_file, sr=sr, type='librosa') #sf.read(audio_file)
    if audio_data_.ndim == 2:
        audio_data_ = np.mean(audio_data_, axis=1)    
    audio_data = padding_zero(audio_data_, sr=sr, seconds=1, pad_type='ab')
    
    
    funcs = preprocess_setup()
    audio_data = preprocess(audio_data, funcs)
    audio_data_forTony = audio_data
    print(audio_data_forTony.shape)
    
    # model
    interpreter = tf.lite.Interpreter(model_path=model_path)
    interpreter.allocate_tensors()

    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    audio_data = quantize_input(audio_data, input_details[0])
    audio_data = audio_data.reshape(input_details[0]['shape'])
    

    interpreter.set_tensor(input_details[0]['index'], audio_data)
    interpreter.invoke()
    output_data = interpreter.get_tensor(output_details[0]['index'])
    
    # Calculate probabilities
    probabilities = softmax(output_data)

    # Determine the predicted label
    predicted_index = np.argmax(probabilities)
    predicted_label = labels[predicted_index]
    for target_label in labels:
        if target_label in audio_file:
            break
        else:
            target_label = 'other'
    # predicted_probability = probabilities[0, predicted_index]
    # print(audio_data.shape)
    print(f"\nPrediction Results:\n{'-'*40}")
    print(f"Audio File: {audio_file}")

    if target_label == predicted_label:
        print("\033[92m" + f"Class Index      : {predicted_index}" + "\033[0m")  # Green for match
        print("\033[92m" + f"Target Label     : {target_label}" + "\033[0m")     # Green for match
        print("\033[92m" + f"Predicted Label  : {predicted_label}" + "\033[0m")    
    else:
        print(f"Class Index      : {predicted_index}")
        print(f"target Label  : {target_label}")
        print(f"Predicted Label  : {predicted_label}")

        
    # Print formatted probabilities for each label
    print("Probabilities:")
    for i, prob in enumerate(probabilities[0]):
        print(f"  {labels[i]:<15}: {prob:.8f}")
    print(f"{'-'*40}\n")
    return target_label , predicted_label, audio_data_forTony

def get_wav_files(directories):
    wav_files = []
    
    # 遍歷每個指定的目錄
    for directory in directories:
        # 使用 os.walk 遞迴遍歷目錄及其子目錄
        for root, dirs, files in os.walk(directory):
            for file in files:
                if file.endswith(".wav") and 'hk_' not in file:
                    # print(root)
                    # 如果找到 .wav 文件，加入到 wav_files 列表中
                    wav_files.append(os.path.join(root, file))
    
    return wav_files

In [58]:
DATA_path = '/home/sail/sound_project/DATA/using_data_v3/v3_traindata/for_training/test'

wav_files = get_wav_files([DATA_path])
len(wav_files)


44

In [59]:
if __name__ == "__main__":
    target_list , predicted_list, audio_tony_list, audio_data_list = [], [], [], []
    for audio_file in wav_files:
        target_label , predicted_label,  audio_data_ = main(audio_file = audio_file)

        target_list.append(target_label); predicted_list.append(predicted_label)
        audio_data_list.append(audio_data_)

    print(f'{sum(t == p for t, p in zip(target_list, predicted_list))} out of {len(target_list)} tests predicted correctly')


(16000,)

Prediction Results:
----------------------------------------
Audio File: /home/sail/sound_project/DATA/using_data_v3/v3_traindata/for_training/test/ch_help_TEST_0.wav
[92mClass Index      : 2[0m
[92mTarget Label     : ch_help[0m
[92mPredicted Label  : ch_help[0m
Probabilities:
  other          : 0.01797485
  en_help        : 0.00000000
  ch_help        : 0.98193359
  ja_help        : 0.00000000
  tw_help        : 0.00001639
  alarm          : 0.00000000
----------------------------------------

(16000,)

Prediction Results:
----------------------------------------
Audio File: /home/sail/sound_project/DATA/using_data_v3/v3_traindata/for_training/test/tw_help_TEST_7.wav
Class Index      : 0
target Label  : tw_help
Predicted Label  : other
Probabilities:
  other          : 1.00000000
  en_help        : 0.00000000
  ch_help        : 0.00000000
  ja_help        : 0.00000000
  tw_help        : 0.00000012
  alarm          : 0.00000000
----------------------------------------



## 

In [60]:
# wav_files = ['/home/sail/sound_project/DATA/v2.2_traindata/for_training/val/'+f for f in os.listdir('/home/sail/sound_project/DATA/v2.2_traindata/for_training/val/') if f.endswith('.wav')]

wav_files = [DATA_path+'/alarm_TEST_0.wav',
            # DATA_path+'/en_help_74.wav',
            #  DATA_path+'/ch_help_45.wav',
            #  DATA_path+'/ja_help_83.wav',
            # DATA_path+'/tw_help_60.wav',
            # DATA_path+'/other_99.wav',
            #  DATA_path+'/other_39.wav',
            
             ]

In [61]:
def load_data(audio_file):
    audio_data_, _ = sf.read(audio_file)
    if audio_data_.ndim == 2:  
        audio_data_ = np.mean(audio_data_, axis=1)      
    return audio_data_

In [None]:
audio_path = '/home/sail/sound_project/DATA/v2.2.2_traindata/no_padding_only_clip1s/Environment_0.wav'
Audio(load_data(audio_path),rate=sr)

In [None]:
import wave

for i, audio_data in enumerate(audio_data_list):
    output_file = f'/home/sail/sound_project/golden_sample/raw/{wav_files[i].split("/")[-1]}'
    sf.write(output_file, audio_data, sr)
    # audio_data_int = (audio_data * 32767).astype(np)


In [None]:
import wave

for i, audio_data in enumerate(audio_tony_list):
    output_file = f'/home/sail/sound_project/golden_sample/{wav_files[i].split("/")[-1]}'
    sf.write(output_file, audio_data, sr)
    # audio_data_int = (audio_data * 32767).astype(np)
