# downloads

In [44]:
!pip install transformers



In [45]:
!mkdir generated_speech

In [46]:
cd generated_speech

/content/NISQA/generated_speech


In [47]:
!gdown 1TdR-i5ZGhaPMKlalDeDBD_zNSjDwKpOz # bark sentence-0 wav
!gdown 16xSMLidkfPAGh2bMQihbtm-banaGKUko # bark generated speech
!gdown 1-Fnqxo0hdoBH4ZzU6rurPZrNeiHIvkrQ # bark voice prompt

!gdown 177-Vob0v1V-FjqZaaLVeeldcJ4ZkVVnv # rtvc sentence-0 wav
!gdown 1Ce6JFl7IjnTOWdtmvEeP21i_hv8QGVud # rtvc generated speech

!gdown 1-Scs8UrScADhYwisPh8GhRpzdz44O-k9 # tortoise sentence-0 wav

Downloading...
From: https://drive.google.com/uc?id=1TdR-i5ZGhaPMKlalDeDBD_zNSjDwKpOz
To: /content/NISQA/generated_speech/bark_result_sentence_0.wav
100% 571k/571k [00:00<00:00, 24.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=16xSMLidkfPAGh2bMQihbtm-banaGKUko
To: /content/NISQA/generated_speech/bark_result_speech.wav
100% 72.4M/72.4M [00:00<00:00, 97.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-Fnqxo0hdoBH4ZzU6rurPZrNeiHIvkrQ
To: /content/NISQA/generated_speech/10sec_vocal.wav
100% 1.29M/1.29M [00:00<00:00, 20.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=177-Vob0v1V-FjqZaaLVeeldcJ4ZkVVnv
To: /content/NISQA/generated_speech/rtvc_result_sentence_0.wav
100% 624k/624k [00:00<00:00, 9.13MB/s]
Downloading...
From: https://drive.google.com/uc?id=1Ce6JFl7IjnTOWdtmvEeP21i_hv8QGVud
To: /content/NISQA/generated_speech/rtvc_result_speech.wav
100% 48.9M/48.9M [00:02<00:00, 18.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-Scs8UrScADhYwisPh8G

In [79]:
!gdown 1-_xiPZsN0o5uuxSKex94yUYezRtcvccM # translated_sentences

Downloading...
From: https://drive.google.com/uc?id=1-_xiPZsN0o5uuxSKex94yUYezRtcvccM
To: /content/NISQA/translation_result_sentences.pkl
  0% 0.00/9.63k [00:00<?, ?B/s]100% 9.63k/9.63k [00:00<00:00, 30.5MB/s]


In [48]:
cd /content

/content


# library

In [75]:
import librosa
import torch
import torchaudio
from IPython.display import Audio

# Evaluate

## Similarity between orig voice and generated voice

In [50]:
from transformers import Wav2Vec2FeatureExtractor, WavLMForXVector
import torch

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('microsoft/wavlm-base-sv')
model = WavLMForXVector.from_pretrained('microsoft/wavlm-base-sv')

### bark

In [51]:
original, sr = torchaudio.load("/content/generated_speech/10sec_vocal.wav")
# stereo --> mono
original = torch.mean(original, dim=0, keepdim=True)
# sr = 48000 --> 16000
default_sr = 16000
resampler = torchaudio.transforms.Resample(sr, default_sr)
original = resampler(original)

generated, sr = torchaudio.load("/content/generated_speech/bark_result_sentence_0.wav")
# stereo --> mono
generated = torch.mean(generated, dim=0, keepdim=True)
# sr = 48000 --> 16000
default_sample_rate = 16000
resampler = torchaudio.transforms.Resample(sr, default_sample_rate)
generated = resampler(generated)

min_length = min(original.shape[1], generated.shape[1])
generated = generated[:,:min_length]
original = original[:,:min_length]

In [52]:
Audio(original.numpy().squeeze(), rate=default_sr)

In [53]:
Audio(generated.numpy().squeeze(), rate=default_sr)

In [54]:
# audio files are decoded on the fly
inputs = feature_extractor(torch.cat((original, generated)), return_tensors="pt", sampling_rate=default_sr)
embeddings = model(inputs.input_values.squeeze(0)).embeddings
embeddings = torch.nn.functional.normalize(embeddings, dim=-1).cpu()
# the resulting embeddings can be used for cosine similarity-based retrieval
cosine_sim = torch.nn.CosineSimilarity(dim=-1)
similarity = cosine_sim(embeddings[0], embeddings[1])
print(similarity)
print(cosine_sim(original, generated))

tensor(0.8036, grad_fn=<SumBackward1>)
tensor([0.0120])


### rtvc

In [55]:
original, sr = torchaudio.load("/content/generated_speech/10sec_vocal.wav")
# stereo --> mono
original = torch.mean(original, dim=0, keepdim=True)
# sr = 48000 --> 16000
default_sample_rate = 16000
resampler = torchaudio.transforms.Resample(sr, default_sample_rate)
original = resampler(original)

generated, sr = torchaudio.load("/content/generated_speech/rtvc_result_sentence_0.wav")
# stereo --> mono
generated = torch.mean(generated, dim=0, keepdim=True)
# sr = 48000 --> 16000
default_sample_rate = 16000
resampler = torchaudio.transforms.Resample(sr, default_sample_rate)
generated = resampler(generated)

min_length = min(original.shape[1], generated.shape[1])
generated = generated[:,:min_length]
original = original[:,:min_length]

In [56]:
Audio(original.numpy().squeeze(), rate=default_sr)

In [57]:
Audio(generated.numpy().squeeze(), rate=default_sr)

In [58]:
# audio files are decoded on the fly
inputs = feature_extractor(torch.cat((original, generated)), return_tensors="pt", sampling_rate=16000)
embeddings = model(inputs.input_values.squeeze(0)).embeddings
embeddings = torch.nn.functional.normalize(embeddings, dim=-1).cpu()
# the resulting embeddings can be used for cosine similarity-based retrieval
cosine_sim = torch.nn.CosineSimilarity(dim=-1)
similarity = cosine_sim(embeddings[0], embeddings[1])
print(similarity)
print(cosine_sim(original, generated))

tensor(0.8551, grad_fn=<SumBackward1>)
tensor([0.0027])


### tortoise

In [59]:
original, sr = torchaudio.load("/content/generated_speech/10sec_vocal.wav")
# stereo --> mono
original = torch.mean(original, dim=0, keepdim=True)
# sr = 48000 --> 16000
default_sample_rate = 16000
resampler = torchaudio.transforms.Resample(sr, default_sample_rate)
original = resampler(original)

generated, sr = torchaudio.load("/content/generated_speech/tortoise_high_quality_24000_sentence_1.wav")
# stereo --> mono
generated = torch.mean(generated, dim=0, keepdim=True)
# sr = 48000 --> 16000
default_sample_rate = 16000
resampler = torchaudio.transforms.Resample(sr, default_sample_rate)
generated = resampler(generated)

min_length = min(original.shape[1], generated.shape[1])
generated = generated[:,:min_length]
original = original[:,:min_length]

In [60]:
Audio(original.numpy().squeeze(), rate=default_sr)

In [61]:
Audio(generated.numpy().squeeze(), rate=default_sr)

In [62]:
# audio files are decoded on the fly
inputs = feature_extractor(torch.cat((original, generated)), return_tensors="pt", sampling_rate=16000)
embeddings = model(inputs.input_values.squeeze(0)).embeddings
embeddings = torch.nn.functional.normalize(embeddings, dim=-1).cpu()
# the resulting embeddings can be used for cosine similarity-based retrieval
cosine_sim = torch.nn.CosineSimilarity(dim=-1)
similarity = cosine_sim(embeddings[0], embeddings[1])
print(similarity)

tensor(0.8855, grad_fn=<SumBackward1>)


In [63]:
print(cosine_sim(original, generated))

tensor([0.0143])


## NISQA

In [64]:
!git clone https://github.com/gabrielmittag/NISQA.git

fatal: destination path 'NISQA' already exists and is not an empty directory.


In [77]:
ls

[0m[01;34mconfig[0m/            LICENSE    run_evaluate.py  [01;34mshort_sentences[0m/
env.yml            [01;34mnisqa[0m/     run_predict.py   [01;34mweights[0m/
[01;34mgenerated_speech[0m/  README.md  run_train.py


In [76]:
cd NISQA

[Errno 2] No such file or directory: 'NISQA'
/content/NISQA


In [73]:
!mkdir /content/short_sentences
!cp /content/generated_speech/*_sentence*.wav /content/short_sentences

In [78]:
!python run_predict.py --mode predict_dir --pretrained_model weights/nisqa.tar --data_dir /content/short_sentences --num_workers 0 --bs 10 --output_dir /content

Device: cpu
Model architecture: NISQA_DIM
Loaded pretrained model from weights/nisqa.tar
# files: 3
---> Predicting ...
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
                                       deg  mos_pred  noi_pred  dis_pred  col_pred  loud_pred   model
                rtvc_result_sentence_0.wav  1.883422  1.856739  1.990683  2.432577   3.139570 NISQAv2
                bark_result_sentence_0.wav  3.634589  3.902316  4.102291  3.708409   3.700254 NISQAv2
tortoise_high_quality_24000_sentence_1.wav  4.724054  4.384707  4.456821  3.967483   4.050156 NISQAv2


## ASR WER

In [80]:
import torch
import torchaudio
from transformers import pipeline
import pickle

device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [82]:
pipe = pipeline(
    "automatic-speech-recognition", model="openai/whisper-medium", device=device
  )

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

Downloading (…)main/normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

In [83]:
def asr(path2data):
  signal, sample_rate = torchaudio.load(path2data)
  # stereo --> mono
  signal = torch.mean(signal, dim=0, keepdim=True)
  # sr = 48000 --> 16000
  default_sample_rate = 16000
  resampler = torchaudio.transforms.Resample(sample_rate, default_sample_rate)
  signal = resampler(signal)
  signal = signal.numpy().squeeze()

  return pipe(signal.copy())

In [84]:
path2bark = "/content/generated_speech/bark_result_sentence_0.wav"
asr_bark = asr(path2bark)
path2rtvc = "/content/generated_speech/rtvc_result_sentence_0.wav"
asr_rtvc = asr(path2rtvc)
path2tortoise = "/content/generated_speech/tortoise_high_quality_24000_sentence_1.wav"
asr_tortoise = asr(path2tortoise)

In [85]:
import pickle

path2sentences = '/content/translation_result_sentences.pkl'
with open(f'{path2sentences}', 'rb') as file:
    sentences = pickle.load(file)

In [86]:
def calculate_wer(reference, hypothesis):
	ref_words = reference.split()
	hyp_words = hypothesis.split()
	# Counting the number of substitutions, deletions, and insertions
	substitutions = sum(1 for ref, hyp in zip(ref_words, hyp_words) if ref != hyp)
	deletions = len(ref_words) - len(hyp_words)
	insertions = len(hyp_words) - len(ref_words)
	# Total number of words in the reference text
	total_words = len(ref_words)
	# Calculating the Word Error Rate (WER)
	wer = (substitutions + deletions + insertions) / total_words
	return wer

In [93]:
print('BARK ASR WER = ',calculate_wer(sentences[0], asr_bark['text']))
print('RTVC ASR WER = ',calculate_wer(sentences[0], asr_rtvc['text']))
print('Tortoise ASR WER = ',calculate_wer(sentences[0], asr_tortoise['text']))

BARK ASR WER =  0.13043478260869565
RTVC ASR WER =  0.17391304347826086
Tortoise ASR WER =  0.9130434782608695
