# Audio Metrics Test

## Generate Audio

In [None]:
import torch
from vc.trainer_encodec_vc_inference import get_ar_prediction_sampling_rate, pack_inputs_v2
from types import SimpleNamespace
from vc.encodec_model.nar_bart_model import NARBartForConditionalGeneration
from transformers import BartForConditionalGeneration, AutoTokenizer
from dpo_eval import extract_data_from_json
import os

# --- Run configuration ---------------------------------------------------
base_path = "/work/b0990106x/trl"
ts = "test_sampling_rate"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Directory that receives the generated audio so the reward model can evaluate it.
agent_output_dir = os.path.join(base_path, "output", ts)
os.makedirs(agent_output_dir, exist_ok=True)

args_predict = SimpleNamespace(
    output_path=f"{base_path}/output/{ts}/example.wav",
    seed=0,
    device=device,
)

# --- Pretrained checkpoints (autoregressive and non-autoregressive) ------
ar_checkpoint = "lca0503/speech-chatgpt-base-ar-v2-epoch10-wotrans"
nar_checkpoint = "lca0503/speech-chatgpt-base-nar-v2-epoch4-wotrans"

nar_model = NARBartForConditionalGeneration.from_pretrained(nar_checkpoint)
ar_tokenizer = AutoTokenizer.from_pretrained(ar_checkpoint)
nar_tokenizer = AutoTokenizer.from_pretrained(nar_checkpoint)
ar_model = BartForConditionalGeneration.from_pretrained(ar_checkpoint, return_dict=True)

# --- Data ----------------------------------------------------------------
all_src_encodec, all_instruction, all_tgt_encodec = extract_data_from_json('dpo_data/src_encodec.json')

# Only the first two dataset examples (source codes + instruction) are used here.
src_encodec = all_src_encodec[0:2]
instruction = all_instruction[0:2]

# --- Generation: one call per example, collecting the third return value --
output_checkpoints = []
for i, single_src_encodec in enumerate(src_encodec):
    single_instruction = instruction[i]
    _, _, output_checkpoint = get_ar_prediction_sampling_rate(
        args_predict,
        ar_model,
        nar_model,
        ar_tokenizer,
        nar_tokenizer,
        single_src_encodec,
        single_instruction,
        episode_counter=i,
        temperature=1.0,
    )
    output_checkpoints.append(output_checkpoint)

## PESQ

In [None]:
from pesq import pesq
from scipy.io import wavfile

# The two clips to compare: the first is used as the PESQ reference, the second
# as the degraded signal.
output_checkpoint_1 = "/work/b0990106x/trl/output/test_sampling_rate/example_save_0.wav"
output_checkpoint_2 = "/work/b0990106x/trl/output/test_sampling_rate/example_save_1.wav"

# Keep one sample-rate variable per file: reusing a single name would let the
# second read silently overwrite the first and hide a mismatch.
ref_rate, ref = wavfile.read(output_checkpoint_1)
print(ref_rate)
deg_rate, deg = wavfile.read(output_checkpoint_2)
print(deg_rate)

# PESQ compares two signals at one common sampling rate, so refuse to score
# clips that were saved at different rates.
if ref_rate != deg_rate:
    raise ValueError(
        f"Sample rate mismatch: reference is {ref_rate} Hz but degraded is {deg_rate} Hz; "
        "resample both clips to the same rate before computing PESQ."
    )
rate = ref_rate

# NOTE(review): per the pesq package docs only 8000 Hz and 16000 Hz are accepted,
# and wide-band ('wb') mode needs 16000 Hz — confirm the clips are saved at a
# supported rate.
print(pesq(rate, ref, deg, 'wb'))
print(pesq(rate, ref, deg, 'nb'))


## ASR Whisper

### Requires:
#### `pip install --user transformers==4.36`

In [1]:
import warnings
warnings.filterwarnings("ignore")
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import torch
import torchaudio

from transformers import WhisperForConditionalGeneration, WhisperProcessor

In [2]:
# Clip from a different output folder (0728-1417); defined here but not
# referenced by the other code visible in this notebook.
output_temp = "/work/b0990106x/trl/output/0728-1417/example_save_eval_-1_data_0.wav"
# Clips from the test_sampling_rate output folder.
output_checkpoint_1 = "/work/b0990106x/trl/output/test_sampling_rate/example_save_0.wav"
output_checkpoint_2 = "/work/b0990106x/trl/output/test_sampling_rate/example_save_1.wav"

def load_recorded_audio(path_audio,input_sample_rate=24000,output_sample_rate=24000):
    """Load an audio file and return its first channel resampled to ``output_sample_rate``.

    Args:
        path_audio: Path of the audio file, passed to ``torchaudio.load``.
        input_sample_rate: Sample rate the caller expects the file to have.
            Kept for backward compatibility and used only as a sanity check;
            the rate reported by the file itself is what drives resampling.
        output_sample_rate: Sample rate of the returned signal.

    Returns:
        A 1-D NumPy array: index 0 of the resampled waveform, i.e. the first
        channel for torchaudio's default channel-first layout.
    """
    import warnings  # local import so the function does not depend on another cell

    waveform, sample_rate = torchaudio.load(path_audio)

    # Resampling from an assumed rate that differs from the real one would
    # produce audio at the wrong speed/pitch, so trust the file's own rate.
    if sample_rate != input_sample_rate:
        warnings.warn(
            f"{path_audio} is sampled at {sample_rate} Hz, not the expected "
            f"{input_sample_rate} Hz; resampling from the file's actual rate."
        )

    waveform_resampled = torchaudio.functional.resample(
        waveform, orig_freq=sample_rate, new_freq=output_sample_rate
    )
    sample = waveform_resampled.numpy()[0]
    return sample

# Run Whisper on CPU in full (float32) precision.
torch_dtype = torch.float32
device = torch.device('cpu')

# Model and processor come from the same pretrained checkpoint.
whisper_checkpoint = "openai/whisper-medium"
processor = WhisperProcessor.from_pretrained(whisper_checkpoint)
model = WhisperForConditionalGeneration.from_pretrained(whisper_checkpoint)
model.to(device)

# Options for the speech-recognition pipeline, gathered in one place.
asr_options = dict(
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)
whisper = pipeline("automatic-speech-recognition", **asr_options)

# Transcribe the first generated clip and show the raw pipeline result.
text = whisper(output_checkpoint_1)

print(text)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'text': " There's even a white row of beehives in the orchard under the walnut trees.", 'chunks': [{'timestamp': (0.0, 12.96), 'text': " There's even a white row of beehives in the orchard under the walnut trees."}]}


#### Word Error Rate
#### Requires: `pip install jiwer`

In [6]:
# ASR result copied from the Whisper cell's recorded output, pinned here as a
# literal so the WER computation does not require re-running the model.
text = {
    'text': " There's even a white row of beehives in the orchard under the walnut trees.",
    'chunks': [
        {
            'timestamp': (0.0, 12.96),
            'text': " There's even a white row of beehives in the orchard under the walnut trees.",
        },
    ],
}

# Ground-truth transcript the hypothesis is scored against.
reference = "There is even a white row of beehives in the orchard, under the walnut trees."

# The hypothesis is the full transcription string; show it for a visual check.
hypothesis = text['text']
print(hypothesis)

 There's even a white row of beehives in the orchard under the walnut trees.


In [7]:
import jiwer

# Text normalisation applied identically to the reference and the hypothesis
# before scoring; the steps run in the order listed.
normalization_steps = [
    jiwer.ExpandCommonEnglishContractions(),
    jiwer.RemoveEmptyStrings(),
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
    jiwer.RemovePunctuation(),
    jiwer.ReduceToListOfListOfWords(),
]
transforms = jiwer.Compose(normalization_steps)

# Word error rate of the ASR hypothesis against the reference transcript.
wer = jiwer.wer(reference, hypothesis, truth_transform=transforms, hypothesis_transform=transforms)
print("Word Error Rate (WER) :", wer)

Word Error Rate (WER) : 0.06666666666666667


In [1]:
# `torch.manual_seed` below needs the module itself, not only `randn`; import it
# here so the cell also runs on a fresh kernel.
import torch
from torch import randn

try:
    from torchmetrics.audio import DeepNoiseSuppressionMeanOpinionScore
except ImportError as err:
    # torchmetrics.audio does not expose the metric in this environment.
    # NOTE(review): likely causes are an older torchmetrics release or missing
    # optional audio dependencies — confirm against the installed version.
    raise ImportError(
        "DeepNoiseSuppressionMeanOpinionScore could not be imported from torchmetrics.audio. "
        "The installed torchmetrics may be too old or may lack its optional audio "
        "dependencies; try `pip install -U 'torchmetrics[audio]'`."
    ) from err

# Seed the global RNG so the random test signal is reproducible.
g = torch.manual_seed(1)
preds = randn(8000)
# Positional arguments follow the torchmetrics example: sampling rate 8000 Hz,
# personalized=False.
dnsmos = DeepNoiseSuppressionMeanOpinionScore(8000, False)
dnsmos(preds)

  from .autonotebook import tqdm as notebook_tqdm


ImportError: cannot import name 'DeepNoiseSuppressionMeanOpinionScore' from 'torchmetrics.audio' (/home/b0990106x/miniconda3/envs/trl/lib/python3.10/site-packages/torchmetrics/audio/__init__.py)