## TTS

In [1]:
# pip install speechbrain

In [2]:
import torchaudio
from speechbrain.inference.TTS import Tacotron2
from speechbrain.inference.vocoders import HIFIGAN

# Intialize TTS (tacotron2) and Vocoder (HiFIGAN)
tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech")
hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech")

# Running the TTS
mel_output, mel_length, alignment = tacotron2.encode_text("Did he write stories when he was younger? I think he used to write novels.")

# Running Vocoder (spectrogram-to-waveform)
waveforms = hifi_gan.decode_batch(mel_output)

# Save the waverform
torchaudio.save('example_TTS.wav',waveforms.squeeze(1), 22050)




In [3]:
mel_output.shape

torch.Size([1, 80, 420])

In [4]:
import IPython
IPython.display.Audio("example_TTS.wav")

In [5]:
mel_length

tensor([419], dtype=torch.int32)

## TTS batch infer

## 배치 안함

In [6]:
items = [
       "A quick brown fox jumped over the lazy dog",
       "How much wood would a woodchuck chuck?",
       "Never odd or even"
     ]
mel_outputs, mel_lengths, alignments = tacotron2.encode_batch(items)
for i, mel_output in enumerate(mel_outputs):
    waveforms = hifi_gan.decode_batch(mel_output)
    torchaudio.save(f'example_TTS{i}.wav',waveforms.squeeze(1), 22050)

In [7]:
import IPython
IPython.display.Audio("example_TTS0.wav")

## STT

In [8]:
from speechbrain.inference.ASR import EncoderDecoderASR

asr_model = EncoderDecoderASR.from_hparams(source="speechbrain/asr-crdnn-rnnlm-librispeech")

asr_model.transcribe_file('example_TTS.wav')

'DID HE WRITE STORIES WHEN HE WAS YOUNGER I THINK HE USED TO WRITE NOVELS'

## 평가 파이프라인 만들기
- wav2vec 2를 이용해 1024차원 벡터를 레퍼런스 발음과 사용자 발음을 비교한다.
- librispeech 로 stt후 정답문장과 비교

In [9]:
# pip install Levenshtein

In [22]:
import re
import Levenshtein as lev

def preprocess(text):
    """
    전처리 함수: 문자열에서 !,.,',"와 같은 특수문자를 제거합니다.
    """
    # 정규표현식 패턴을 사용하여 특수문자 제거
    pattern = r'[!,.\'"\?]'
    cleaned_text = re.sub(pattern, '', text)
    return cleaned_text

def compute_cer(r, h):
    """
    CER(Character Error Rate)을 계산하는 함수입니다.
    r: 참조 문자열
    h: 가설 문자열
    """
    r = preprocess(r)
    h = preprocess(h)
    
    # 편집 거리(Levenshtein distance)를 계산합니다.
    distance = lev.distance(r, h)
    
    # CER을 계산합니다.
    cer = distance / len(r)
    
    return cer

def compute_wer(r, h):
    """
    WER(Word Error Rate)을 계산하는 함수입니다.
    r: 참조 문자열
    h: 가설 문자열
    """
    r = preprocess(r).split()
    h = preprocess(h).split()
    
    # 편집 거리(Levenshtein distance)를 계산합니다.
    distance = lev.distance(r, h)
    
    # WER을 계산합니다.
    wer = distance / len(r)
    
    return wer

# 예시 문자열
ref = "▁I▁KNOCKED▁AT▁THE▁DOOR▁ON▁THE▁ANCIENT▁SIDE▁OF▁THE▁BUILDING"
hyp = "▁I▁KNOCK▁AT▁THE▁DOOR▁ON▁THE▁ASIAN▁SIDE▁OF▁THE▁BUILDING"

# CER 계산
cer = compute_cer(ref, hyp)
print(f"Character Error Rate (CER): {cer:.4f}")

# WER 계산
wer = compute_wer(ref, hyp)
print(f"Word Error Rate (WER): {wer:.4f}")

Character Error Rate (CER): 0.1034
Word Error Rate (WER): 1.0000


In [23]:
Test_sample = "Did he write stories when he was younger? I think he used to write novels."

### 사용자 녹음 음성

In [24]:
IPython.display.Audio("audio_sample/Did_he.wav")

### Reference 발음

In [25]:
IPython.display.Audio("audio_sample/Did_he_REF.wav")

### 음성파일 벡터화 및 유사도 비교 함수 정의

In [26]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import librosa
from scipy.spatial.distance import cosine

model_name="facebook/wav2vec2-large-robust-ft-libri-960h",
model = Wav2Vec2ForCTC.from_pretrained(model_name, output_hidden_states=True).to(device)
processor = Wav2Vec2Processor.from_pretrained(model_name)

def get_average_wav2vec2_embedding(audio_file_path,  device="cpu"):
    """facebook/wav2vec2-large-robust-ft-libri-960h
    음성 파일을 입력받아 Wav2Vec2 임베딩의 평균 벡터를 반환합니다.

    Args:
        audio_file_path (str): 음성 파일 경로
        model_name (str): Wav2Vec2 모델 이름 (기본값: "facebook/wav2vec2-base-960h")
        device (str): 사용할 장치 (기본값: "cpu")

    Returns:
        np.ndarray: 평균 Wav2Vec2 임베딩 (1024 차원)
    """

    # Wav2Vec2 모델과 전처리 프로세서 로드

    # 음성 파일 로드 및 전처리
    waveform, sr = librosa.load(audio_file_path, sr=16000)
    input_values = processor(waveform, sampling_rate=16000, return_tensors="pt").input_values

    # Wav2Vec2 모델 실행
    with torch.no_grad():
        outputs = model(input_values.to(device))

    # 마지막 hidden state 추출 및 평균 계산
    last_hidden_state = outputs.hidden_states[-1]
    print("last_hidden_state : ",last_hidden_state.shape)
    average_embedding = torch.mean(last_hidden_state, dim=1).cpu().numpy()[0]

    return average_embedding


In [27]:
def compare_audio_similarity(audio_file_path1, audio_file_path2, model_name="facebook/wav2vec2-large-robust-ft-libri-960h", device="cpu"):
    """
    두 개의 음성 파일을 받아 Wav2Vec2 임베딩을 추출하고 코사인 유사도를 계산합니다.

    Args:
        audio_file_path1 (str): 첫 번째 음성 파일 경로
        audio_file_path2 (str): 두 번째 음성 파일 경로
        model_name (str): Wav2Vec2 모델 이름 (기본값: "facebook/wav2vec2-large-robust-ft-libri-960h")
        device (str): 사용할 장치 (기본값: "cpu")

    Returns:
        float: 코사인 유사도 (0 ~ 1 사이의 값, 1에 가까울수록 유사)
    """

    # 각 음성 파일의 평균 임베딩 벡터 추출
    embedding1 = get_average_wav2vec2_embedding(audio_file_path1, model_name, device)
    embedding2 = get_average_wav2vec2_embedding(audio_file_path2, model_name, device)
    print(embedding1.shape)
    print(embedding2.shape)
    # 코사인 유사도 계산
    similarity = 1 - cosine(embedding1, embedding2)

    return similarity

audio_file_path1 = "audio_sample/Did_he_REF.wav"
audio_file_path2 = "audio_sample/Did_he.wav"

similarity_score = compare_audio_similarity(audio_file_path1, audio_file_path2)
print("Similarity:", similarity_score)

Some weights of the model checkpoint at facebook/wav2vec2-large-robust-ft-libri-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-robust-ft-libri-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You

last_hidden_state :  torch.Size([1, 245, 1024])


Some weights of the model checkpoint at facebook/wav2vec2-large-robust-ft-libri-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-robust-ft-libri-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You

last_hidden_state :  torch.Size([1, 268, 1024])
(1024,)
(1024,)
Similarity: 0.9949031923403918


In [28]:
# model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-robust-ft-libri-960h", output_hidden_states=True)
# processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-robust-ft-libri-960h")
# print(model)
# print(processor)

In [32]:
asr_model = EncoderDecoderASR.from_hparams(source="speechbrain/asr-crdnn-rnnlm-librispeech")

def get_scoring_model_inputs(gt_text, refpath, userpath):
    ground_truth_text = gt_text
    similarity_score = compare_audio_similarity(refpath, userpath)    
    user_text = asr_model.transcribe_file(userpath).lower()
    print("Test_sample : ",Test_sample)
    print("recognized_text : ",user_text)
    cer = compute_cer(gt_text, user_text) 
    wer = compute_wer(gt_text, user_text)

    x_features = similarity_score, cer, wer
    return x_features




ref = "audio_sample/Did_he_REF.wav"
user = "audio_sample/Did_he.wav"
#user = ref
x_features = get_scoring_model_inputs(Test_sample, ref, user)
print(x_features)

Some weights of the model checkpoint at facebook/wav2vec2-large-robust-ft-libri-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-robust-ft-libri-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You

last_hidden_state :  torch.Size([1, 245, 1024])


Some weights of the model checkpoint at facebook/wav2vec2-large-robust-ft-libri-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-robust-ft-libri-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You

last_hidden_state :  torch.Size([1, 268, 1024])
(1024,)
(1024,)
Test_sample :  Did he write stories when he was younger? I think he used to write novels.
recognized_text :  to day wise stories when he was younger at dinner he used to rhyme novels
(0.9949031923403918, 0.2638888888888889, 0.4)
