# In [None]:
# 必要モジュールのimport
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data

import os
import sys
import glob
import time
import numpy as np
import pandas as pd
import scipy
import scipy.signal as signal
import librosa
import soundfile as sf

from tqdm import tqdm
from natsort import natsorted

from models import FCMaskEstimator, BLSTMMaskEstimator, BLSTMMaskEstimator2, UnetMaskEstimator_kernel3, \
    UnetMaskEstimator_kernel3_single_mask, CNNMaskEstimator_kernel3, UnetMaskEstimator_kernel3_single_mask_two_speakers, \
    UnetMaskEstimator_kernel3_single_mask_dereverb, MCComplexUnet
from beamformer import estimate_covariance_matrix, condition_covariance, estimate_steering_vector, sparse, \
    ds_beamformer, mvdr_beamformer, mvdr_beamformer_two_speakers, gev_beamformer, mwf, localize_music

from utils.utilities import AudioProcess, AudioProcessForComplex, wave_plot
from utils.loss_func import solve_inter_channel_permutation_problem
# 話者識別用モデル
from utils.embedder import SpeechEmbedder
# 音源分離用モジュール 
from asteroid.models import BaseModel

sys.path.append('..')
from MyLibrary.MyFunc import audio_eval, ASR, asr_eval

os.environ["CUDA_VISIBLE_DEVICES"]="0"

# In [None]:
if __name__ == "__main__":
    
    # ---- Script configuration: every value below is read by the evaluation code that follows ----
    sample_rate = 16000 # Sampling frequency of the audio files
    fft_size = 512 # Frame size of the fast Fourier transform
    hop_length = 160 # Slide width between frames of the fast Fourier transform
    # Channels used when picking a mask (to be moved to a config file or args eventually) TODO
    target_aware_channel = 0
    noise_aware_channel = 4
    # Number of samples per batch when the audio is processed in batches
    batch_length = 48000
    # Position of the target speaker (horizontal angle, with the front of the microphone taken as 0 deg)
    true_target_azimuth = 0
    # Whether (True) or not (False) to match the peak amplitude of the processed audio to that of the unprocessed mixture
    fit_max_value = False
    
    ######################### Sound source localization settings ########################
    # Frequency band [Hz] used for computing the spatial spectrum
    # NOTE(review): `freq_range` is not referenced anywhere else in the code visible here.
    freq_range = [200, 3000]
    # Spatial arrangement of the microphones on the TAMAGO-03 microphone array
    mic_alignments = np.array(
    [
        [0.035, 0.0, 0.0],
        [0.035/np.sqrt(2), 0.035/np.sqrt(2), 0.0],
        [0.0, 0.035, 0.0],
        [-0.035/np.sqrt(2), 0.035/np.sqrt(2), 0.0],
        [-0.035, 0.0, 0.0],
        [-0.035/np.sqrt(2), -0.035/np.sqrt(2), 0.0],
        [0.0, -0.035, 0.0],
        [0.035/np.sqrt(2), -0.035/np.sqrt(2), 0.0]
    ])
    """mic_alignments: (num_microphones, 3D coordinates [m])"""
    # Transpose so that the coordinate axis comes first and the microphone axis second
    mic_alignments = mic_alignments.T
    """mic_alignments: (3D coordinates [m], num_microphones)"""
    #############################################################
    
    # Directory that holds the audio files to evaluate (one sub-directory per interference azimuth)
    test_data_dir = "./audio_data/NoisySpeechDataset_multi_wav_test_original_length_two_speakers_20210714/test" # no reverberation, multiple speakers (latest version)
#     test_data_dir = "./audio_data/NoisySpeechDataset_multi_wav_test_original_length_two_speakers_rt0300_20210714/test" # with reverberation, multiple speakers (latest version)
#     test_data_dir = "../data/NoisySpeechDataset_multi_wav_test_original_length_two_speakers_spatial_resolution_check_20210721/test" # no reverberation, multiple speakers (0 to 15 deg in 3 deg steps)
    azimuth_list = natsorted(os.listdir(test_data_dir)) # 0, 15, 30, ..., 90
#     azimuth_list.pop(1)
    print("azimuth_list:", azimuth_list)
    # Directory that holds the reference transcripts used for the speech recognition accuracy evaluation
    reference_label_dir = "../data/NoisySpeechDatabase/testset_txt/"
    
    # Pretrained speaker separation model, chosen from https://huggingface.co/models?filter=asteroid
#     pretrained_param_speaker_separation = "JorisCos/ConvTasNet_Libri2Mix_sepclean_16k" # ConvTasNet 16kHz
    pretrained_param_speaker_separation = "JorisCos/ConvTasNet_Libri2Mix_sepnoisy_16k" # ConvTasNet 16kHz noisy <- this one seems to be more accurate
    # Path to the pretrained speaker identification (embedder) model
    embedder_path = "./utils/embedder.pt"
    # Path to a sample utterance of the speaker whose voice should be extracted
    ref_speech_path = "./utils/ref_speech/sample.wav"
    
    # Type of mask estimation model ('CNN' is also handled by the model selection further down)
    model_type = 'Unet_single_mask' # 'FC' or 'BLSTM' or 'Unet' or 'Unet_single_mask' or 'Unet_single_mask_two_speakers' or 'Unet_single_mask_dereverb' or 'Complex_Unet'
    # Type of beamformer
    beamformer_type = 'MVDR' # 'DS' or 'MVDR' or 'GEV', or 'MWF' or 'Sparse'
    # Type of dereverberation method
    dereverb_type = None # None or 'WPE' or 'WPD'
    
    # Directory where the speech recognition results are saved; its name encodes
    # the dataset, mask model, separation model, beamformer and dereverberation type
    recog_result_dir = "./recog_result/{}_{}_{}_{}_dereverb_{}/".format(test_data_dir.split('/')[-2], model_type, pretrained_param_speaker_separation.split('/')[-1], beamformer_type, str(dereverb_type))
    os.makedirs(recog_result_dir, exist_ok=True)
    
    # Model settings
    # Path of the checkpoint file that stores the trained parameters (history of earlier experiments kept below)
#     checkpoint_path = "./ckpt/ckpt_NoisySpeechDataset_for_unet_fft_512_multi_wav_Unet_aware_1208/ckpt_epoch110.pt" # U-Net small
#     checkpoint_path = "./ckpt/ckpt_NoisySpeechDataset_for_unet_fft_512_multi_wav_Unet_aware_1211/ckpt_epoch160.pt" # U-Net small2
#     checkpoint_path = "./ckpt/ckpt_NoisySpeechDataset_for_unet_fft_512_multi_wav_all_Unet_aware_1215/ckpt_epoch150.pt" # U-Net all
#     checkpoint_path = "./ckpt/ckpt_NoisySpeechDataset_for_unet_fft_512_multi_wav_FC_1201/ckpt_epoch120.pt" # FC small
#     checkpoint_path = "./ckpt/ckpt_NoisySpeechDataset_for_unet_fft_512_multi_wav_BLSTM_1201/ckpt_epoch70.pt" # BLSTM small data (wrong version)
#     checkpoint_path = "./ckpt/ckpt_NoisySpeechDataset_for_unet_fft_512_multi_wav_BLSTM2_1231/ckpt_epoch100.pt" # BLSTM2 small data (wrong version)
#     checkpoint_path = "./ckpt/ckpt_NoisySpeechDataset_for_unet_fft_512_multi_wav_Unet_SCM_aware_20210105/ckpt_epoch90.pt" # U-Net SCM
#     checkpoint_path = "./ckpt/ckpt_NoisySpeechDataset_for_unet_fft_512_multi_wav_Unet_aware_20210111/ckpt_epoch100.pt" # U-Net small data training 1209 dataset (best model)
#     checkpoint_path = "./ckpt/ckpt_NoisySpeechDataset_for_unet_fft_512_multi_wav_Unet_SCM_conditioned_median_20210120/ckpt_epoch110.pt" # U-Net-SCM median small data training 1209 dataset
#     checkpoint_path = "./ckpt/ckpt_NoisySpeechDataset_for_unet_fft_512_multi_wav_Unet_SCM_raw_conditioned_aware_20210227/ckpt_epoch20.pt" # U-Net-SCM-raw small data training 1209 dataset
#     checkpoint_path = "./ckpt/ckpt_NoisySpeechDataset_for_unet_fft_512_multi_wav_CNN_aware_20210310/ckpt_epoch200.pt" # CNN small data training 1209 dataset
#     checkpoint_path = "./ckpt/ckpt_NoisySpeechDataset_for_unet_fft_512_multi_wav_BLSTM_median_20210312/ckpt_epoch120.pt" # BLSTM small data (correct version)
#     checkpoint_path = "./ckpt/ckpt_NoisySpeechDataset_for_unet_fft_512_multi_wav_Unet_single_mask_median_20210315/ckpt_epoch170.pt" # U-Net-single-mask small data  (best model new version)
#     checkpoint_path = "./ckpt/ckpt_NoisySpeechDataset_for_unet_fft_512_multi_wav_Unet_single_mask_SCM_conditioned_median_20210318/ckpt_epoch150.pt" # U-Net-single-mask-SCM small data
#     checkpoint_path = "./ckpt/ckpt_NoisySpeechDataset_multi_wav_test_original_length_two_speakers_Unet_single_mask_median_lr_000001_20210613/ckpt_epoch500.pt" # U-Net-single-mask-SCM small data 2speakers
    checkpoint_path = "./ckpt/ckpt_NoisySpeechDataset_multi_wav_test_original_length_Unet_single_mask_median_multisteplr00001start_20210701/ckpt_epoch190.pt" # mask base proposed baseline model
#     checkpoint_path = "./ckpt/ckpt_NoisySpeechDataset_multi_wav_test_original_length_Unet_single_mask_median_dereverb_multisteplr00001start_rt0300_20210723/ckpt_epoch140.pt"
    # checkpoint_path = "./ckpt/ckpt_NoisySpeechDataset_multi_wav_test_original_length_ComplexUnet_multisteplr00001start_20210914/ckpt_epoch70.pt" # Complex U-Net
    
    # Pick the mask-estimation network together with the way a single-channel
    # mask is chosen ('aware' / 'median' / 'single') and whether the model
    # input has to be padded.
    # Each entry maps model_type -> (model class, channel_select_type, padding).
    #
    # NOTE(review): the previous 'Complex_Unet' branch instantiated
    # `ComplexUnetMaskEstimator`, a name this file never imports, so selecting
    # it raised NameError. `MCComplexUnet` is the only complex U-Net imported
    # from `models`, so it is assumed to be the intended class -- confirm this
    # and its constructor arguments before relying on the 'Complex_Unet' entry.
    model_configs = {
        'BLSTM': (BLSTMMaskEstimator2, 'median', False),  # BLSTMMaskEstimator is the older alternative
        'FC': (FCMaskEstimator, 'aware', False),
        'CNN': (CNNMaskEstimator_kernel3, 'aware', True),
        'Unet': (UnetMaskEstimator_kernel3, 'aware', True),
        'Unet_single_mask': (UnetMaskEstimator_kernel3_single_mask, 'single', True),
        'Unet_single_mask_two_speakers': (UnetMaskEstimator_kernel3_single_mask_two_speakers, 'single', True),
        'Unet_single_mask_dereverb': (UnetMaskEstimator_kernel3_single_mask_dereverb, 'single', True),
        'Complex_Unet': (MCComplexUnet, 'single', True),
    }
    # Fail immediately on a typo instead of hitting an unrelated NameError on
    # `model` / `channel_select_type` further down.
    if model_type not in model_configs:
        raise ValueError(
            "Unknown model_type {!r}; expected one of {}".format(model_type, sorted(model_configs))
        )
    model_class, channel_select_type, padding = model_configs[model_type]
    model = model_class()
    # 音声処理クラスのインスタンスを作成
    audio_processor = AudioProcess(sample_rate, fft_size, hop_length, channel_select_type, padding)
    # GPUが使える場合はGPUを使用、使えない場合はCPUを使用
#     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    device = torch.device("cpu")
    print("使用デバイス：" , device)
    # 学習済みのパラメータをロード
    model_params = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(model_params['model_state_dict'])
    # print("モデルのパラメータ数：", count_parameters(model))
    # MaskEstimatorを使って推論
    # ネットワークを推論モードへ
    model.eval()
    # 音声認識用のインスタンスを生成
    asr_ins = ASR(lang='eng')
    # 話者分離モデルの学習済みパラメータをダウンロード
    speaker_separation_model = BaseModel.from_pretrained(pretrained_param_speaker_separation)
    # 話者識別モデルの学習済みパタメータをロード（いずれはhparamsでパラメータを指定できる様にする TODO）
    embedder = SpeechEmbedder()
    embed_params = torch.load(embedder_path, map_location=device)
    embedder.load_state_dict(embed_params)
    embedder.eval()
    # 声を分離抽出したい人の発話サンプルをロード
    ref_speech_data, _ = sf.read(ref_speech_path)
    # 発話サンプルの特徴量（ログメルスペクトログラム）をベクトルに変換
    ref_complex_spec = audio_processor.calc_complex_spec(ref_speech_data)
    ref_log_mel_spec = audio_processor.calc_log_mel_spec(ref_complex_spec)
    ref_log_mel_spec = torch.from_numpy(ref_log_mel_spec).float()
    ref_dvec = embedder(ref_log_mel_spec[0]) # 入力は1ch分
    # # PyTorchのテンソルからnumpy配列に変換
    # ref_dvec = ref_dvec.detach().numpy().copy() # CPU
    """ref_dvec: (embed_dim=256,)"""
    
    # One summary row per interference direction is accumulated here
    eval_logs = []
    # Evaluate each direction of arrival of the interfering speaker separately
    for interference_azimuth in azimuth_list:
        ###################### Noise reduction + audio evaluation #########################
        # Per-file separation metrics of the mixture (mix) and of the estimate (est)
        sdr_mix_list, sir_mix_list, sar_mix_list = [], [], []
        sdr_est_list, sir_est_list, sar_est_list = [], [], []
        # Per-file word error rates (clean target / mixture / estimate)
        wer_clean_list, wer_mix_list, wer_est_list = [], [], []
        # Per-file localization errors (clean target / mixture / estimate)
        localization_error_target_list = []
        localization_error_mixed_list = []
        localization_error_estimated_list = []

        # Running totals of the processing time and of the real-time factor (RTF)
        processing_duration_sum = 0
        rtf_sum = 0

        # Mixture files of this direction in natural sort order, e.g. p232_016_mixed.wav
        mixed_audio_pattern = os.path.join(test_data_dir, interference_azimuth, "*_mixed.wav")
        mixed_audio_path_list = natsorted(glob.glob(mixed_audio_pattern))
        for mixed_audio_path in tqdm(mixed_audio_path_list):
            # Start of the timed section for this file
            iter_start_time = time.perf_counter()
            # Load the multichannel mixture
            mixed_audio_data, _ = sf.read(mixed_audio_path)
            # mixed_audio_data: (num_samples, num_channels)
            mixed_complex_spec = audio_processor.calc_complex_spec(mixed_audio_data)
            # mixed_complex_spec: (num_channels, freq_bins, time_steps)
            # Localization error of the unprocessed mixture. This call is inside the
            # timed section; in exchange, the localization of the processed signal
            # later on is kept outside of it.
            speaker_azimuth_mixed = localize_music(mixed_complex_spec, mic_alignments, sample_rate, fft_size)
            localization_error_mixed = np.abs(speaker_azimuth_mixed - true_target_azimuth)
            localization_error_mixed_list.append(localization_error_mixed)

            # Optional dereverberation of the mixture spectrogram
            # NOTE(review): 'WPD' is advertised as a valid dereverb_type but has no
            # branch here, so it is silently treated like None -- confirm intent.
            if dereverb_type == 'WPE':
                mixed_complex_spec, _ = audio_processor.dereverberation_wpe_multi(mixed_complex_spec)

            # Split the waveform into mini-batches of amplitude spectrograms for the model
            mixed_amp_spec_batch = audio_processor.preprocess_mask_estimator(mixed_audio_data, batch_length)
            # mixed_amp_spec_batch: (batch_size, num_channels, freq_bins, time_frames)
            # Estimate the time-frequency masks of speech and of the remaining noise.
            # Pure inference: disable autograd so no graph is built, which would only
            # waste memory and inflate the measured real-time factor.
            with torch.no_grad():
                speech_mask_output, noise_mask_output = model(mixed_amp_spec_batch)
            # speech_mask_output / noise_mask_output: (batch_size, num_channels, freq_bins, time_frames)
            # Re-join the batched masks along time and apply them to the mixture
            multichannel_speech_spec, _ = audio_processor.postprocess_mask_estimator(mixed_complex_spec, speech_mask_output, batch_length, target_aware_channel)
            # multichannel_speech_spec: (num_channels, freq_bins, time_frames)
            multichannel_noise_spec, estimated_noise_mask = audio_processor.postprocess_mask_estimator(mixed_complex_spec, noise_mask_output, batch_length, noise_aware_channel)
            # multichannel_noise_spec: (num_channels, freq_bins, time_frames)
            # Back to the time domain for the denoised speech
            multichannel_denoised_data = audio_processor.spec_to_wave(multichannel_speech_spec, mixed_audio_data)
            # multichannel_denoised_data: (num_samples, num_channels)

            # Keep only the first channel, but preserve the channel axis
            multichannel_denoised_data = multichannel_denoised_data[:, 0][:, np.newaxis]
            # multichannel_denoised_data: (num_samples, num_channels=1)

            # Speaker separation on the denoised single-channel signal
            separated_audio_data = audio_processor.speaker_separation(speaker_separation_model, multichannel_denoised_data)
            # separated_audio_data: (num_sources, num_samples, num_channels)
            # start_time_speeaker_selector = time.perf_counter()
            # Decide which separated stream belongs to the target speaker
            target_speaker_id, speech_amp_spec_all = audio_processor.speaker_selector(embedder, separated_audio_data, ref_dvec)
            # speech_amp_spec_all: (num_sources, num_channels, freq_bins, time_frames)
            # print("ID of the target speaker:", target_speaker_id)
            # finish_time_speeaker_selector = time.perf_counter()
            # duration_speeaker_selector = finish_time_speeaker_selector - start_time_speeaker_selector
            # rtf = duration_speeaker_selector / (mixed_audio_data.shape[0] / sample_rate)
            # print("Real-time factor (Speaker Selector): {:.3f}".format(rtf))

            # Amplitude spectrogram of the estimated noise
            noise_amp_spec = audio_processor.calc_amp_spec(multichannel_noise_spec)
            # noise_amp_spec: (num_channels, freq_bins, time_frames)
            # Ideal-ratio-mask style masks. (Eventually the spatial covariance should be
            # computed from the signals directly, or a cIRM should be used.) TODO
            # The denominator (total power of all speakers plus noise, floored at 1e-7
            # to avoid division by zero) is shared by every mask, so compute it once.
            total_power = np.maximum(np.sum(speech_amp_spec_all**2, axis=0) + noise_amp_spec ** 2, 1e-7)
            estimated_target_mask = np.sqrt(speech_amp_spec_all[target_speaker_id] ** 2 / total_power)
            # estimated_target_mask: (num_channels, freq_bins, time_frames)
            # The interference mask is the sum of the masks of all non-target speakers
            estimated_interference_mask = np.zeros_like(estimated_target_mask)
            for speaker_idx in range(speech_amp_spec_all.shape[0]):
                if speaker_idx != target_speaker_id:
                    estimated_interference_mask += np.sqrt(speech_amp_spec_all[speaker_idx] ** 2 / total_power)
            # estimated_interference_mask: (num_channels, freq_bins, time_frames)

            # Reduce the multichannel masks to a single-channel mask
            if channel_select_type == 'aware':
                # Use the channel closest to the target / interference respectively
                # (trying other channel choices is a possible experiment)
                estimated_target_mask = estimated_target_mask[target_aware_channel, :, :]
                estimated_interference_mask = estimated_interference_mask[noise_aware_channel, :, :]
            elif channel_select_type == 'median' or channel_select_type == 'single':
                # Median pooling across channels
                estimated_target_mask = np.median(estimated_target_mask, axis=0)
                estimated_interference_mask = np.median(estimated_interference_mask, axis=0)
            # estimated_target_mask / estimated_interference_mask: (freq_bins, time_frames)
            
            # Estimate one spatial covariance matrix per source from its mask
            target_covariance_matrix = estimate_covariance_matrix(mixed_complex_spec, estimated_target_mask)
            interference_covariance_matrix = estimate_covariance_matrix(mixed_complex_spec, estimated_interference_mask)
            noise_covariance_matrix = estimate_covariance_matrix(mixed_complex_spec, estimated_noise_mask)
            # Conditioning is required: without it the performance drops considerably
            # (applying it to the noise covariance matrix alone is sufficient)
            noise_covariance_matrix = condition_covariance(noise_covariance_matrix, 1e-6)
            # noise_covariance_matrix /= np.trace(noise_covariance_matrix, axis1=-2, axis2=-1)[..., None, None]
            # Run the selected beamformer (only the MVDR branch uses the interference covariance)
            if beamformer_type == 'MVDR':
                estimated_target_spec = mvdr_beamformer_two_speakers(mixed_complex_spec, target_covariance_matrix, interference_covariance_matrix, noise_covariance_matrix)
                # estimated_interference_spec = mvdr_beamformer_two_speakers(mixed_complex_spec, interference_covariance_matrix, target_covariance_matrix, noise_covariance_matrix)
            elif beamformer_type == 'GEV':
                estimated_target_spec = gev_beamformer(mixed_complex_spec, target_covariance_matrix, noise_covariance_matrix)
            elif beamformer_type == "DS":
                target_steering_vectors = estimate_steering_vector(target_covariance_matrix)
                estimated_target_spec = ds_beamformer(mixed_complex_spec, target_steering_vectors)
            elif beamformer_type == "MWF":
                estimated_target_spec = mwf(mixed_complex_spec, target_covariance_matrix, noise_covariance_matrix)
            elif beamformer_type == 'Sparse':
                # Plain masking; meant for checking that the mask is estimated properly
                estimated_target_spec = sparse(mixed_complex_spec, estimated_target_mask)
            else:
                # Previously this only printed a message and then crashed with a
                # NameError on `estimated_target_spec`; fail with the real cause instead.
                raise ValueError(
                    "Please specify the correct beamformer type (got {!r})".format(beamformer_type)
                )
            # estimated_target_spec: (num_channels, freq_bins, time_frames)

            # Convert the multichannel spectrogram back to waveforms
            multichannel_estimated_target_voice_data = audio_processor.spec_to_wave(estimated_target_spec, mixed_audio_data)
            # multichannel_estimated_interference_voice_data = audio_processor.spec_to_wave(estimated_interference_spec, mixed_audio_data)
            # multichannel_estimated_target_voice_data: (num_samples, num_channels)
            
            # Optionally rescale the output so that its peak amplitude matches the
            # peak amplitude of the unprocessed mixture.
            # The previous code referenced `multichannel_estimated_voice_data` and
            # `max_amp_preprocess`, neither of which is defined anywhere, so enabling
            # `fit_max_value` always raised NameError.
            if fit_max_value:
                max_amp_preprocess = np.abs(mixed_audio_data).max()
                max_amp_postprocess = np.abs(multichannel_estimated_target_voice_data).max()
                # An all-zero output cannot be rescaled; leave it untouched
                if max_amp_postprocess > 0:
                    multichannel_estimated_target_voice_data = (
                        multichannel_estimated_target_voice_data * (max_amp_preprocess / max_amp_postprocess)
                    )
                
            # 処理の終了時間
            iter_finish_time = time.perf_counter()
            # 1ループ当たりの処理時間（音声波形→STFT→雑音除去→iSTFT→音声波形）
            iter_processing_duration = iter_finish_time - iter_start_time
            processing_duration_sum += iter_processing_duration
            # 実時間比（Real Time Factor）の算出
            iter_rtf = iter_processing_duration / (mixed_audio_data.shape[0] / sample_rate)
            rtf_sum += iter_rtf
            
            # 処理後の推定音の定位誤差を記録
            speaker_azimuth_estimated = localize_music(estimated_target_spec, mic_alignments, sample_rate, fft_size)
            localization_error_estimated = np.abs(speaker_azimuth_estimated - true_target_azimuth)
            localization_error_estimated_list.append(localization_error_estimated)
            
            # ファイル名を取得
            file_num = os.path.basename(mixed_audio_path).split('.')[0].rsplit('_', maxsplit=1)[0] # （例） p232_029_p257_236
            target_file_num = file_num.rsplit('_', maxsplit=2)[0] # （例） p232_029
            # オーディオデータを保存
            estimated_target_voice_path = "./estimated_target_voice.wav"
#             estimated_target_voice_path = "./estimated_target_voice/{}/{}_estimated_target_voice.wav".format(interference_azimuth, file_num)
            sf.write(estimated_target_voice_path, multichannel_estimated_target_voice_data, sample_rate)
            # 干渉雑音の方位角を取得
            target_voice_path = os.path.join(test_data_dir, interference_azimuth, target_file_num + "_target.wav") # （例）p232_029_target.wav
            interference_audio_path = os.path.join(test_data_dir, interference_azimuth, file_num + "_interference.wav") # （例）p232_029_p257_236_interference.wav
            
            # 処理前の目的話者の定位誤差を記録
            target_voice_data, _ = sf.read(target_voice_path)
            target_complex_spec = audio_processor.calc_complex_spec(target_voice_data)
            speaker_azimuth_target = localize_music(target_complex_spec, mic_alignments, sample_rate, fft_size)
            localization_error_target = np.abs(speaker_azimuth_target - true_target_azimuth)
            localization_error_target_list.append(localization_error_target)
            
            # ---- Source separation metrics of the mixture and of the estimate ----
            sdr_mix, sir_mix, sar_mix, sdr_est, sir_est, sar_est = audio_eval(
                sample_rate, target_voice_path, interference_audio_path, mixed_audio_path, estimated_target_voice_path
            )
            for score_list, score in zip(
                (sdr_mix_list, sir_mix_list, sar_mix_list, sdr_est_list, sir_est_list, sar_est_list),
                (sdr_mix, sir_mix, sar_mix, sdr_est, sir_est, sar_est),
            ):
                score_list.append(score)

            # ---- Speech recognition ----
            # Recognize the clean target, the mixture and the estimate (in this order);
            # each transcript has periods and commas removed, is upper-cased and split
            # into words, e.g. "It is marvellous." -> ['IT', 'IS', 'MARVELLOUS']
            target_voice_recog_text, mixed_audio_recog_text, estimated_voice_recog_text = (
                asr_ins.speech_recognition(audio_path).replace('.', '').replace(',', '').upper().split()
                for audio_path in (target_voice_path, mixed_audio_path, estimated_target_voice_path)
            )
            # Reference transcript, normalized the same way as the recognition results
            reference_label_path = os.path.join(reference_label_dir, target_file_num + '.txt')
            with open(reference_label_path, 'r', encoding="utf8") as ref:
                reference_label_text = ref.read().replace('.', '').replace(',', '').upper().split()
            # Word error rate of each transcript; asr_eval is also handed the path
            # under recog_result_dir where the result is to be stored
            for suffix, recog_text, wer_list in (
                ('_clean.txt', target_voice_recog_text, wer_clean_list),
                ('_mix.txt', mixed_audio_recog_text, wer_mix_list),
                ('_est.txt', estimated_voice_recog_text, wer_est_list),
            ):
                wer_list.append(asr_eval(reference_label_text, recog_text, os.path.join(recog_result_dir, file_num + suffix)))
            # Remove the scratch file so that estimates do not pile up
            os.remove(estimated_target_voice_path)
            
            # 音源定位性能の評価
            

        # Number of files evaluated for this interference direction
        num_file = len(mixed_audio_path_list)
        # An empty directory used to abort the whole run with ZeroDivisionError in the
        # averages below; report NaN for that direction instead and keep going.
        if num_file == 0:
            print("WARNING: no '*_mixed.wav' file found for azimuth {}; averages are reported as NaN".format(interference_azimuth))
        mean_processing_duration = processing_duration_sum / num_file if num_file else float('nan')
        mean_rtf = rtf_sum / num_file if num_file else float('nan')

        # Console report for this direction
        print("#" * 50)
        print("使用デバイス：", device)
        print("干渉音の方向：{}deg".format(interference_azimuth))
        print("合計処理時間：{:.3f}sec".format(processing_duration_sum))
        print("平均処理時間：{:.3f}sec".format(mean_processing_duration))
        print("合計実時間比：{:.3f}".format(rtf_sum))
        print("平均実時間比：{:.3f}".format(mean_rtf))
        print("============================音源分離性能===============================")
        print("平均 | SDR_mix: {:.3f}, SIR_mix: {:.3f}, SAR_mix: {:.3f}".format(np.mean(sdr_mix_list), np.mean(sir_mix_list), np.mean(sar_mix_list)))
        print("平均 | SDR_est: {:.3f}, SIR_est: {:.3f}, SAR_est: {:.3f}".format(np.mean(sdr_est_list), np.mean(sir_est_list), np.mean(sar_est_list)))
        print("標準偏差 | SDR_mix: {:.3f}, SIR_mix: {:.3f}, SAR_mix: {:.3f}".format(np.std(sdr_mix_list), np.std(sir_mix_list), np.std(sar_mix_list)))
        print("標準偏差 | SDR_est: {:.3f}, SIR_est: {:.3f}, SAR_est: {:.3f}".format(np.std(sdr_est_list), np.std(sir_est_list), np.std(sar_est_list)))
        print("============================音声認識性能===============================")
        print("平均 | WER_clean: {:.3f}".format(np.mean(wer_clean_list)))
        print("平均 | WER_mix: {:.3f}".format(np.mean(wer_mix_list)))
        print("平均 | WER_est: {:.3f}".format(np.mean(wer_est_list)))
        print("標準偏差 | WER_clean: {:.3f}".format(np.std(wer_clean_list)))
        print("標準偏差 | WER_mix: {:.3f}".format(np.std(wer_mix_list)))
        print("標準偏差 | WER_est: {:.3f}".format(np.std(wer_est_list)))
        print("============================音源定位性能===============================")
        print("平均 | LE_clean：{:.3f}deg".format(np.mean(localization_error_target_list)))
        print("平均 | LE_mix：{:.3f}deg".format(np.mean(localization_error_mixed_list)))
        print("平均 | LE_est：{:.3f}deg".format(np.mean(localization_error_estimated_list)))
        print("標準偏差 | LE_clean：{:.3f}deg".format(np.std(localization_error_target_list)))
        print("標準偏差 | LE_mix：{:.3f}deg".format(np.std(localization_error_mixed_list)))
        print("標準偏差 | LE_est：{:.3f}deg".format(np.std(localization_error_estimated_list)))

        # Summary row of this direction; the keys become the Excel column headers
        log_azimuth_wise = {
            "干渉音の方向": interference_azimuth,
            "平均実時間比": mean_rtf,
            "SDRの平均（混合音）": np.mean(sdr_mix_list),
            "SDRの平均（推定音）": np.mean(sdr_est_list),
            "SDRの標準偏差（混合音）": np.std(sdr_mix_list),
            "SDRの標準偏差（推定音）": np.std(sdr_est_list),
            "SIRの平均（混合音）": np.mean(sir_mix_list),
            "SIRの平均（推定音）": np.mean(sir_est_list),
            "SIRの標準偏差（混合音）": np.std(sir_mix_list),
            "SIRの標準偏差（推定音）": np.std(sir_est_list),
            "WERの平均（目的音）": np.mean(wer_clean_list),
            "WERの平均（混合音）": np.mean(wer_mix_list),
            "WERの平均（推定音）": np.mean(wer_est_list),
            "WERの標準偏差（目的音）": np.std(wer_clean_list),
            "WERの標準偏差（混合音）": np.std(wer_mix_list),
            "WERの標準偏差（推定音）": np.std(wer_est_list),
            "LEの平均（目的音）": np.mean(localization_error_target_list),
            "LEの平均（混合音）": np.mean(localization_error_mixed_list),
            "LEの平均（推定音）": np.mean(localization_error_estimated_list),
            "LEの標準偏差（目的音）": np.std(localization_error_target_list),
            "LEの標準偏差（混合音）": np.std(localization_error_mixed_list),
            "LEの標準偏差（推定音）": np.std(localization_error_estimated_list),
        }
        eval_logs.append(log_azimuth_wise)
        # Rewrite the Excel file after every direction so that partial results
        # survive an interrupted run; it is stored next to the checkpoint file
        df = pd.DataFrame(eval_logs)
        excel_file_name = "eval_result_{}_{}_{}_{}_dereverb_{}.xlsx".format(test_data_dir.split('/')[-2], model_type, pretrained_param_speaker_separation.split('/')[-1], beamformer_type, str(dereverb_type))
        log_save_path = os.path.join(checkpoint_path.rsplit('/', maxsplit=1)[0], excel_file_name)
        df.to_excel(log_save_path, index=False)