# ASR evaluation
we perform subjective evaluation based on 
1. CER/WER by strong ASR
2. estimated SI-SNR
The dataset includes "clean"(AISHELL), "noisy"(wild V2S), "processed"(V2S_tmp)

In [1]:
import scipy.io.wavfile as wavfile
import librosa
import os
from tqdm import tqdm
import shutil
import matplotlib.pyplot as plt
import numpy as np


In [2]:
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
import torch
torch.cuda.set_device(1)
inference_16k_pipline = pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_paraformer_asr_nat-aishell1-pytorch', device='gpu')

rec_result = inference_16k_pipline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
print(rec_result)

2023-09-19 11:11:43,633 - modelscope - INFO - PyTorch version 1.13.0 Found.
2023-09-19 11:11:43,634 - modelscope - INFO - Loading ast index from /home/lixing/.cache/modelscope/ast_indexer
2023-09-19 11:11:43,699 - modelscope - INFO - Loading done! Current index file version is 1.8.4, with md5 bc2412af0ceee67320f5157c651de533 and a total number of 902 components indexed
2023-09-19 11:11:45,274 - modelscope - INFO - Model revision not specified, use the latest revision: v1.1.8
2023-09-19 11:11:45,621 - modelscope - INFO - initiate model from /home/lixing/.cache/modelscope/hub/damo/speech_paraformer_asr_nat-aishell1-pytorch
2023-09-19 11:11:45,622 - modelscope - INFO - initiate model from location /home/lixing/.cache/modelscope/hub/damo/speech_paraformer_asr_nat-aishell1-pytorch.
2023-09-19 11:11:45,625 - modelscope - INFO - initialize model from /home/lixing/.cache/modelscope/hub/damo/speech_paraformer_asr_nat-aishell1-pytorch
2023-09-19 11:11:48,494 - modelscope - INFO - Decoding with w

{'text': '欢迎大家来体验打磨愿推出的语音识别模型'}


In [3]:
'''
prepare processed data into _tmp folder
1. separate to 1-st channel (vibration) or 2-nd channel (microphone)
'''
new_rate = 16000
dataset = '../V2S/'
dataset_target = '../V2S_tmp/'
for speaker in os.listdir(dataset):
    if not os.path.exists(dataset_target + speaker):
        os.mkdir(dataset_target + speaker)
    for date in os.listdir(dataset + speaker):
        # print(speaker, date)
        if not os.path.exists(dataset_target + speaker + "/" + date):
            os.mkdir(dataset_target + speaker + "/" + date)
        for wav in os.listdir(dataset + speaker + "/" + date):
            if wav[-3:] != "wav":
                shutil.copy(dataset + speaker + "/" + date + "/" + wav, dataset_target + speaker + "/" + date + "/" + wav)
                continue
            path = dataset + speaker + "/" + date + "/" + wav
            data, sample_rate = librosa.load(path, mono=False, sr=None)
            samples = round(len(data) * float(new_rate) / sample_rate)
            new_data = librosa.resample(data[0], orig_sr=sample_rate, target_sr=new_rate, scale=True)
            wavfile.write(dataset_target + speaker + "/" + date + "/" + wav, new_rate, new_data.T)

In [None]:
dataset = '../V2S/'
dataset_target = '../V2S_tmp/'
for speaker in os.listdir(dataset):
    for date in os.listdir(dataset + speaker):
        for wav in os.listdir(dataset + speaker + "/" + date):
            if wav[-3:] != "wav":
                continue
            path = dataset + speaker + "/" + date + "/" + wav
            data_before, sample_rate = librosa.load(path, mono=False, sr=None)
            data_before = data_before[1]
            path = dataset_target + speaker + "/" + date + "/" + wav
            data_after, sample_rate = librosa.load(path, mono=False, sr=None)
            plt.subplot(1, 2, 1)
            plt.plot(data_before)
            plt.subplot(1, 2, 2)
            plt.plot(data_after)
            plt.show()
            break
    break

In [3]:
'''
evaluation 
python inference.py
'''
import jiwer
data = '../V2S_tmp/'
output = {}
for speaker in os.listdir(data):
    directory = os.path.join(data, speaker)
    for date in os.listdir(directory):
        hypotheses = []
        references = []
        directory_date = os.path.join(directory, date)
        labels = os.path.join(directory_date, 'labels.txt')
        labels = open(labels, 'r').readlines()
        for label in tqdm(labels):
            l = label.strip().split(' ')
            file = l[0]
            text = ''.join(l[1:])
            file = os.path.join(directory_date, file+'.wav')
            try: 
                rec_result = inference_16k_pipline(audio_in=file,)
                hypotheses.append(rec_result['text'])
                references.append(text)
            except:
                pass
        wer = jiwer.cer(hypotheses, references)
        output[speaker + '_' + date] = round(wer * 100, 2)
print(output)   

  0%|          | 0/75 [00:00<?, ?it/s]2023-09-19 11:11:53,626 - modelscope - INFO - Decoding with wav files ...
2023-09-19 11:11:53,660 - modelscope - INFO - Computing the result of ASR ...
2023-09-19 11:11:53,661 - modelscope - INFO - Decoding with wav files ...
2023-09-19 11:11:53,682 - modelscope - INFO - Computing the result of ASR ...
2023-09-19 11:11:53,682 - modelscope - INFO - Decoding with wav files ...
2023-09-19 11:11:53,701 - modelscope - INFO - Computing the result of ASR ...
2023-09-19 11:11:53,701 - modelscope - INFO - Decoding with wav files ...
2023-09-19 11:11:53,723 - modelscope - INFO - Computing the result of ASR ...
2023-09-19 11:11:53,724 - modelscope - INFO - Decoding with wav files ...
2023-09-19 11:11:53,746 - modelscope - INFO - Computing the result of ASR ...
  7%|▋         | 5/75 [00:00<00:01, 41.35it/s]2023-09-19 11:11:53,747 - modelscope - INFO - Decoding with wav files ...
2023-09-19 11:11:53,769 - modelscope - INFO - Computing the result of ASR ...
2023

{'Bian_Chen_2023-09-07': 10.26, 'Shaoyang_Yang_2023-09-10': 23.5, 'Haozheng_Hou_2023-09-11': 27.17, 'Bowen_Zheng_2023-09-07': 6.73, 'Kaiwei_Liu_2023-09-11': 25.88, 'Youdong_Wang_2023-09-11': 14.56, 'Bufang_Yang_2023-09-09': 21.49, 'Bufang_Yang_2023-09-05': 13.79, 'Sitong_Cheng_2023-09-07': 4.58, 'Lixing_He_2023-09-10': 14.6, 'Lixing_He_2023-09-11': 32.04, 'Lixing_He_2023-09-09': 13.36, 'Lixing_He_2023-09-05': 6.71}





In [None]:

folder = '../ABCS/'
split = 'dev'
data = os.path.join(folder, 'Audio', split)
label_folder = os.path.join(folder, 'script', split)
hypotheses = []
references = []
for labels in os.listdir(label_folder):
    speaker = labels.split('_')[0]
    labels = open(os.path.join(label_folder, labels), 'r').readlines()
    directory = os.path.join(data, speaker)
    for label in tqdm(labels):
        l = label.strip().split(' ')
        file = l[0]
        text = ''.join(l[1:])
        file = os.path.join(directory, file+'.wav')
        rec_result = inference_16k_pipline(audio_in=file,)
        print(rec_result['text'], text)
        hypotheses.append(rec_result['text'])
        references.append(text)
        break

import jiwer
wer = jiwer.cer(hypotheses, references)
print(f"CER: {wer * 100:.2f} %") 

In [14]:
'''
evaluation for estimated SISNR
python inference.py
'''
from speechbrain.pretrained.interfaces import SNREstimator as snrest
import torch
import torchaudio

# 3- Estimate the performance
snr_est_model = snrest.from_hparams(source="speechbrain/REAL-M-sisnr-estimator",savedir='pretrained_models/REAL-M-sisnr-estimator')
# snrhat = snr_est_model.estimate_batch(mix, est_sources)
# print(snrhat) # Estimates are in dB / 10 (in the range 0-1, e.g., 0 --> 0dB, 1 --> 10dB)

mix_data = '../V2S/'
data = '../V2S_tmp/'
speaker_whitelist = ['Bufang_Yang', 'Lixing_He']
date_whitelist = ['2023-09-05', '2023-09-07']
for speaker in os.listdir(data):
    if speaker not in speaker_whitelist:
        continue
    directory = os.path.join(data, speaker)
    for date in os.listdir(directory):
        if date not in date_whitelist:
            continue
        directory_date = os.path.join(directory, date)
        labels = os.path.join(directory_date, 'labels.txt')
        labels = open(labels, 'r').readlines()
        for label in tqdm(labels):
            l = label.strip().split(' ')
            file = l[0]
            text = ''.join(l[1:])
            file = os.path.join(directory_date, file+'.wav')
            est_source, fs = torchaudio.load(file)
            # mix_file = file.replace(data, mix_data)
            # mix, fs = torchaudio.load(mix_file)
            # mix = mix[1:, :]
            # est_source *= mix.max() / est_source.max()
            # est_noise = mix - est_source
            # est_sources = torch.stack([est_source, est_noise], dim=-1)

            est_sources = est_source[:, :, None].repeat(1, 1, 2)
            mix = est_source * 2
            # fig, axs = plt.subplots(2)
            # axs[0].plot(mix[0, :])
            # axs[1].plot(est_sources[0, :])
            # plt.show()
            snrhat = snr_est_model.estimate_batch(mix, est_sources)
            print(snrhat)
            break


  0%|          | 0/143 [00:00<?, ?it/s]


tensor([0.4867, 0.4867])


  0%|          | 0/134 [00:00<?, ?it/s]

tensor([1.1396, 1.1395])



