In [None]:
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Tokenizer,Wav2Vec2Processor, Wav2Vec2ForCTC
from phonemizer import phonemize
import Levenshtein as lev
import librosa
import torch
import os

# Location of the eSpeak NG shared library that phonemizer loads.
# The path below is the author's local Windows install and is only used as a
# default: a PHONEMIZER_ESPEAK_LIBRARY value already present in the environment
# is left untouched, so other machines can configure it without editing code.
os.environ.setdefault('PHONEMIZER_ESPEAK_LIBRARY', 'C:/Program Files/eSpeak NG/libespeak-ng.dll')

# Lookup table keyed by phoneme; each value is a (viseme id, instructions) pair.
# It currently holds a single placeholder entry that shows the intended layout.
phoneme_visemes = {
    'phoneme': ('viseme_id', 'instructions')
}

class Pipeline():
    """
    Evaluates speech and returns feedback to
    target pronunciation points that require further work.

    The pipeline transcribes an audio file into IPA phonemes with a wav2vec2
    CTC model, converts a reference text into IPA phonemes with phonemizer,
    and aligns the two sequences with Levenshtein edit operations.
    """
    def __init__(self):
        """Loads the feature extractor, tokenizer, processor and CTC model."""
        self.model_name = "facebook/wav2vec2-lv-60-espeak-cv-ft"
        self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(self.model_name)
        # NOTE(review): transformers warns that this checkpoint declares
        # 'Wav2Vec2PhonemeCTCTokenizer' rather than 'Wav2Vec2Tokenizer'.
        # The class is kept as-is because switching it may change the format
        # of the decoded string that the alignment step relies on -- confirm
        # before changing.
        self.tokenizer = Wav2Vec2Tokenizer.from_pretrained(self.model_name)
        self.processor = Wav2Vec2Processor.from_pretrained(self.model_name, feature_extractor=self.feature_extractor, tokenizer=self.tokenizer)
        self.model = Wav2Vec2ForCTC.from_pretrained(self.model_name)

    def speech2phonemes(self, audio_path):
        """
        Transforms user audio into IPA phonemes for evaluation.

        Args:
            audio_path: Path of the audio file to transcribe.

        Returns:
            The decoded phoneme string for the recording.
        """
        # Load as mono at 16 kHz and normalize the user's audio
        speech, _ = librosa.load(audio_path, sr=16_000, mono=True)
        speech = librosa.util.normalize(speech)

        # Process input and generate logits (no gradients needed for inference)
        inputs = self.processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
        with torch.no_grad():
            logits = self.model(inputs.input_values).logits

        # Greedy decoding: most likely token per frame, then decode to text.
        # A single clip is processed, so only the first decoded item is kept.
        predicted_ids = torch.argmax(logits, dim=-1)
        phonemes = self.processor.batch_decode(predicted_ids)[0]

        return phonemes

    def text2phonemes(self, text):
        """
        Converts text into IPA phonemes using eSpeak-NG.

        Args:
            text: Reference text (US English) to phonemize.

        Returns:
            The phonemized text with every space removed.
        """
        phonemes = phonemize(text, language='en-us').replace(' ', '')
        return phonemes

    def get_misalignments(self, user_phonemes, target_phonemes):
        """
        Evaluates alignment between two phoneme sequences.

        Args:
            user_phonemes: Phoneme sequence produced by the user (the attempt).
            target_phonemes: Reference phoneme sequence.

        Returns:
            A tuple ``(similarity, matches, substitutions, deletions, insertions)``.
            ``similarity`` is a percentage in [0, 100] where 100 means the two
            sequences are identical. Each of the four lists holds
            ``((ref_start, ref_end), (user_start, user_end))`` index pairs that
            refer to positions in the sequences exactly as they were passed in.
        """
        # Get the Levenshtein alignment codes (target -> user edit script)
        opcodes = lev.opcodes(target_phonemes, user_phonemes)
        distance = lev.distance(target_phonemes, user_phonemes)

        # Turn the distance into a similarity percentage. The distance is
        # normalized by the longer sequence and subtracted from 1 so that a
        # perfect match scores 100 rather than 0. Two empty sequences are
        # identical and would otherwise divide by zero.
        longest = max(len(user_phonemes), len(target_phonemes))
        if longest == 0:
            similarity = 100.0
        else:
            similarity = round((1 - distance / longest) * 100, 2)

        matches = []
        substitutions = []
        deletions = []
        insertions = []

        # Extract matches and various kinds of errors
        for op, ref_start, ref_end, user_start, user_end in opcodes:
            # Encode as reference indices and attempt indices
            indices = ((ref_start, ref_end), (user_start, user_end))

            if op == 'equal':
                matches.append(indices)
            elif op == 'replace':
                substitutions.append(indices)
            elif op == 'delete':
                deletions.append(indices)
            elif op == 'insert':
                insertions.append(indices)

        return similarity, matches, substitutions, deletions, insertions

    def get_accuracy(self, target_phonemes, misalignments):
        """Returns a number based on sequence identity (not implemented yet; currently returns None)"""

    def get_viseme(self, phoneme):
        """Returns a viseme and a description for a phoneme (not implemented yet; currently returns None)"""

    def get_feedback(self, phoneme):
        """Returns the corresponding feedback to help understand a phoneme (not implemented yet; currently returns None)"""

    def __call__(self, audio_path='some_arbitrary_audio_path.wav', reference_text='some_reference_speech'):
        """
        Makes the whole pipeline run from start to finish.

        Args:
            audio_path: Audio file holding the user's attempt. The default is
                the placeholder path that was previously hard-coded.
            reference_text: Text the user was supposed to say. The default is
                the placeholder text that was previously hard-coded.

        Returns:
            The ``(similarity, matches, substitutions, deletions, insertions)``
            tuple produced by ``get_misalignments``.
        """
        # Step 1. Get the user's phonemes and the reference phonemes
        user_phonemes = self.speech2phonemes(audio_path)
        target_phonemes = self.text2phonemes(reference_text)

        # Step 2. Get similarity misalignment indices between attempt and target
        similarity, matches, substitutions, deletions, insertions = self.get_misalignments(user_phonemes, target_phonemes)

        return similarity, matches, substitutions, deletions, insertions

        
        


        

equal (reference: 0, 3) (attempt: 0, 3)
delete (reference: 3, 4) (attempt: 3, 3)
equal (reference: 4, 5) (attempt: 3, 4)


In [1]:
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Tokenizer,Wav2Vec2Processor, Wav2Vec2ForCTC
import torch

model_name = "facebook/wav2vec2-lv-60-espeak-cv-ft"

# Fetch the CTC acoustic model along with its audio front-end and tokenizer
model = Wav2Vec2ForCTC.from_pretrained(model_name)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)

# Bundle the feature extractor and tokenizer into a single processor object
processor = Wav2Vec2Processor.from_pretrained(
    model_name,
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

  from .autonotebook import tqdm as notebook_tqdm
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2PhonemeCTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.


In [2]:
import librosa

import numpy as np

# Read the recording as a mono signal resampled to 16 kHz
audio_path = "harvard.wav"
speech, sr = librosa.load(
    audio_path,
    sr=16_000,
    mono=True,
)

# Pass the waveform through librosa's normalizer before it reaches the model
speech = librosa.util.normalize(speech)

In [3]:
# Convert the raw waveform into padded PyTorch inputs for the model
inputs = processor(
    speech,
    sampling_rate=16_000,
    return_tensors="pt",
    padding=True,
)

# Forward pass with gradient tracking switched off
with torch.no_grad():
    logits = model(inputs.input_values).logits

# Greedy decoding: take the highest-scoring token at every position, then decode
predicted_ids = logits.argmax(dim=-1)
phonemes = processor.batch_decode(predicted_ids)

# A single clip was processed, so the transcription is the first decoded item
ipa_string = phonemes[0]

In [4]:
# Display the decoded IPA transcription as this cell's output
ipa_string

'ðəsteɪlsmɛlʌvoʊldbiːlɪŋɡɚzɪtteɪkshiːttəbɹɪŋaʊtðɪoʊdɚɐkoʊlddɪpɹɪstoːɹzhɛlθændzɛsteɪsaʊltpɪkəlteɪstfaɪnwɪðheɪmtækəlzɑːlpæstɔːɹɹɑːɹmaɪfeɪvɹəteɪzɛstfəlfuːdɪzðɪhɑːtkɹɑːsbʌn'

In [5]:
import os
from phonemizer import phonemize
from phonemizer.separator import Separator

# No custom Separator is defined here; phonemize() runs with its defaults.
# Per the phonemizer documentation the default separator puts a space between
# words and nothing between phones, and espeak output is IPA.

text = "the stale smell of old beer lingers it takes heat to bring out the odor a cold dip restores health and zest a salt pickle tastes fine with ham tacos al pastor are my favorite a zestful food is the hot cross bun"

# Raw string: the backslashes in this Windows path are literal characters.
# Without the r-prefix, "\P", "\e" and "\l" are invalid escape sequences and
# Python emits a SyntaxWarning. The resulting value is unchanged.
os.environ['PHONEMIZER_ESPEAK_LIBRARY'] = r'C:\Program Files\eSpeak NG\libespeak-ng.dll'

# Simplest use case: phonemize as US English, then strip the spaces so the
# result can be compared with the space-free model transcription.
phonemized_text = phonemize(text, language='en-us').replace(' ', '')

print(phonemized_text)

  os.environ['PHONEMIZER_ESPEAK_LIBRARY'] = 'C:\Program Files\eSpeak NG\libespeak-ng.dll'


ðəsteɪlsmɛlʌvoʊldbɪɹlɪŋɡɚzɪtteɪkshiːttəbɹɪŋaʊtðɪoʊdɚɹɐkoʊlddɪpɹᵻstɔːɹzhɛlθændzɛstɐsɔltpɪkəlteɪstsfaɪnwɪðhæmtɑːkoʊzælpæstɚɹɑːɹmaɪfeɪvɚɹᵻtɐzɛstfəlfuːdɪzðəhɑːtkɹɔsbʌn


In [7]:
import nltk

# Fetch the 'averaged_perceptron_tagger_eng' NLTK resource.
# NOTE(review): no visible cell in this notebook uses a tagger -- confirm this download is still needed.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\dante\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [8]:
# Edit distance between the model's transcription and the phonemized reference text.
# Both arguments are plain strings, so the comparison is made character by character.
nltk.edit_distance(ipa_string, phonemized_text)

28

In [18]:
import Levenshtein as lev

reference = ["ˈh", "ɛ", "l", "a", "oʊ"]
attempt = ["ˈh", "ɛ", "l", "oʊ"]

# Align the attempt against the reference and measure how far apart they are
opcodes = lev.opcodes(reference, attempt)
distance = lev.distance(reference, attempt)

# Walk the edit script, build the message for each span, and print it once
for op, ref_start, ref_end, att_start, att_end in opcodes:
    if op == 'equal':
        message = f"Match: {reference[ref_start:ref_end]}"
    elif op == 'replace':
        message = f"Substitution: Ref({reference[ref_start:ref_end]}) -> Usr({attempt[att_start:att_end]})"
    elif op == 'delete':
        message = f"Deletion: Ref({reference[ref_start:ref_end]}) was missing"
    elif op == 'insert':
        message = f"Insertion: Usr({attempt[att_start:att_end]}) was extra"
    else:
        # Any other operation tag produces no output
        continue
    print(message)

Match: ['ˈh', 'ɛ', 'l']
Deletion: Ref(['a']) was missing
Match: ['oʊ']


In [14]:
# Display the current value of `distance` (assigned from lev.distance(reference, attempt) in the preceding cell)
distance

1