# ASR evaluation
We perform an objective evaluation based on the CER/WER obtained from a strong ASR model.

In [1]:
import scipy.io.wavfile as wavfile
import librosa
import os
from tqdm import tqdm
import shutil
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
import torch
# Use the first CUDA device when one is available; otherwise fall back to the
# CPU so that the notebook still runs on machines without a GPU.
if torch.cuda.is_available():
    torch.cuda.set_device(0)
    asr_device = 'gpu'
else:
    asr_device = 'cpu'

# Paraformer ASR pipeline (AISHELL-1 model) shared by all evaluation cells below.
inference_16k_pipline = pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_paraformer_asr_nat-aishell1-pytorch', device=asr_device)

# Smoke test: transcribe a public sample so a broken setup is noticed immediately.
rec_result = inference_16k_pipline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
print(rec_result)

2024-01-18 11:22:16,715 - modelscope - INFO - PyTorch version 2.1.2 Found.
2024-01-18 11:22:16,716 - modelscope - INFO - Loading ast index from /home/lixing/.cache/modelscope/ast_indexer
2024-01-18 11:22:16,731 - modelscope - INFO - Loading done! Current index file version is 1.11.0, with md5 72658170e900d37d3b1ed615d8d439fe and a total number of 953 components indexed
  from .autonotebook import tqdm as notebook_tqdm
2024-01-18 11:22:20,234 - modelscope - INFO - initiate model from /home/lixing/.cache/modelscope/hub/damo/speech_paraformer_asr_nat-aishell1-pytorch
2024-01-18 11:22:20,234 - modelscope - INFO - initiate model from location /home/lixing/.cache/modelscope/hub/damo/speech_paraformer_asr_nat-aishell1-pytorch.
2024-01-18 11:22:20,235 - modelscope - INFO - initialize model from /home/lixing/.cache/modelscope/hub/damo/speech_paraformer_asr_nat-aishell1-pytorch


Please install rotary_embedding_torch by: 
 pip install -U rotary_embedding_torch
If you want use h5py dataset, please pip install h5py, and try it again


2024-01-18 11:22:23,173 - modelscope - INFO - Decoding with wav files ...
2024-01-18 11:22:23,605 - modelscope - INFO - Computing the result of ASR ...


{'text': '欢迎大家来体验打磨愿推出的语音识别模型'}


In [None]:
'''
Prepare processed data in the _tmp folder.

Mirrors the <speaker>/<date>/ tree of `dataset` into `dataset_target`:
1. every file whose name ends in "wav" is reduced to a single channel
   (index 0 = vibration, index 1 = microphone, per the original note),
   resampled to `new_rate` and written under the same name;
2. every other file (e.g. the label files) is copied unchanged.
'''
new_rate = 16000
channel = 1  # which recorded channel to keep: 0 (vibration) or 1 (microphone)
dataset = '../V2S/'
dataset_target = '../V2S_tmp/'
for speaker in os.listdir(dataset):
    speaker_dir = os.path.join(dataset, speaker)
    if not os.path.isdir(speaker_dir):
        # Stray files at the top level are not speakers; listing them would fail.
        continue
    # makedirs also creates dataset_target itself on the first run.
    os.makedirs(os.path.join(dataset_target, speaker), exist_ok=True)
    for date in os.listdir(speaker_dir):
        source_dir = os.path.join(speaker_dir, date)
        if not os.path.isdir(source_dir):
            continue
        target_dir = os.path.join(dataset_target, speaker, date)
        os.makedirs(target_dir, exist_ok=True)
        for wav in os.listdir(source_dir):
            source_path = os.path.join(source_dir, wav)
            target_path = os.path.join(target_dir, wav)
            if not wav.endswith("wav"):
                shutil.copy(source_path, target_path)
                continue
            # sr=None keeps the native sampling rate; mono=False keeps all channels.
            data, sample_rate = librosa.load(source_path, mono=False, sr=None)
            if data.ndim < 2 or channel >= data.shape[0]:
                # A mono file comes back 1-D, so indexing it would select a
                # single sample instead of a channel.
                raise ValueError(f"{source_path} has no channel {channel}")
            new_data = librosa.resample(data[channel], orig_sr=sample_rate, target_sr=new_rate, scale=True)
            wavfile.write(target_path, new_rate, new_data)

In [2]:
'''
evaluation
python inference.py

Transcribes every utterance listed in <speaker>/<date>/labels.txt with the ASR
pipeline created above and stores one character error rate (in percent,
rounded to two decimals) per session under the key "<speaker>_<date>".
The resulting dict is pickled to saved_dict.pkl.
'''
import pickle

import jiwer

data = '../V2S_tmp/'
output = {}
for speaker in os.listdir(data):
    directory = os.path.join(data, speaker)
    for date in os.listdir(directory):
        hypotheses = []
        references = []
        directory_date = os.path.join(directory, date)
        # Context manager so the label file is closed after reading.
        with open(os.path.join(directory_date, 'labels.txt'), 'r') as label_file:
            labels = label_file.readlines()
        for label in tqdm(labels):
            # Each line is "<utterance id> <transcript tokens...>".
            fields = label.strip().split(' ')
            text = ''.join(fields[1:])
            if not fields[0] or not text:
                # Blank or transcript-less lines cannot be scored.
                continue
            file = os.path.join(directory_date, fields[0] + '.wav')
            try:
                rec_result = inference_16k_pipline(audio_in=file,)
                hypothesis = rec_result['text']
            except Exception as err:
                # Keep going on a bad utterance, but report it instead of
                # hiding it; KeyboardInterrupt is no longer swallowed.
                tqdm.write(f"skipped {file}: {err!r}")
                continue
            hypotheses.append(hypothesis)
            references.append(text)
        if not references:
            # Nothing was transcribed for this session, so there is no rate to report.
            tqdm.write(f"no scored utterances for {speaker}_{date}")
            continue
        # jiwer expects the reference first and the hypothesis second; the
        # error rate is normalised by the reference length.
        cer = jiwer.cer(references, hypotheses)
        output[speaker + '_' + date] = round(cer * 100, 2)
with open('saved_dict.pkl', 'wb') as f:
    pickle.dump(output, f)

  0%|          | 0/50 [00:00<?, ?it/s]2024-01-18 11:22:31,675 - modelscope - INFO - Decoding with wav files ...
2024-01-18 11:22:31,740 - modelscope - INFO - Computing the result of ASR ...
2024-01-18 11:22:31,742 - modelscope - INFO - Decoding with wav files ...
2024-01-18 11:22:31,811 - modelscope - INFO - Computing the result of ASR ...
  4%|▍         | 2/50 [00:00<00:03, 14.57it/s]2024-01-18 11:22:31,812 - modelscope - INFO - Decoding with wav files ...
2024-01-18 11:22:31,875 - modelscope - INFO - Computing the result of ASR ...
2024-01-18 11:22:31,877 - modelscope - INFO - Decoding with wav files ...
2024-01-18 11:22:31,965 - modelscope - INFO - Computing the result of ASR ...
  8%|▊         | 4/50 [00:00<00:03, 13.56it/s]2024-01-18 11:22:31,967 - modelscope - INFO - Decoding with wav files ...
2024-01-18 11:22:32,024 - modelscope - INFO - Computing the result of ASR ...
2024-01-18 11:22:32,025 - modelscope - INFO - Decoding with wav files ...
2024-01-18 11:22:32,081 - modelscope

In [None]:

# Evaluate the ASR pipeline on one split of the ABCS corpus and report the CER.
import jiwer

folder = '../ABCS/'
split = 'dev'
# Number of utterances transcribed per label file. 1 keeps the quick check
# this cell has always performed; set to None to score every utterance.
max_utterances_per_speaker = 1
data = os.path.join(folder, 'Audio', split)
label_folder = os.path.join(folder, 'script', split)
hypotheses = []
references = []
for label_name in os.listdir(label_folder):
    # The speaker id is the part of the label file name before the first "_".
    speaker = label_name.split('_')[0]
    with open(os.path.join(label_folder, label_name), 'r') as label_file:
        labels = label_file.readlines()
    if max_utterances_per_speaker is not None:
        labels = labels[:max_utterances_per_speaker]
    directory = os.path.join(data, speaker)
    for label in tqdm(labels):
        # Each line is "<utterance id> <transcript tokens...>".
        fields = label.strip().split(' ')
        text = ''.join(fields[1:])
        file = os.path.join(directory, fields[0] + '.wav')
        rec_result = inference_16k_pipline(audio_in=file,)
        print(rec_result['text'], text)
        hypotheses.append(rec_result['text'])
        references.append(text)

# jiwer expects the reference first and the hypothesis second; the error rate
# is normalised by the reference length.
cer = jiwer.cer(references, hypotheses)
print(f"CER: {cer * 100:.2f} %")