In [1]:
from TTS.utils.manage import ModelManager

import sys
sys.path.insert(0, "../Src/")
import generateVoices as gv
from os.path import exists
import pandas as pd
from pydub import AudioSegment
import speech_recognition as sr
import jiwer
import json
import numpy as np

%autosave 5

Autosaving every 5 seconds


Autosaving every 5 seconds


In [2]:
def evaluateWER(generatedAudioFileName, sentence):
    """Transcribe an audio file and score the transcript against ``sentence``.

    The file is read with ``sr.AudioFile``, transcribed with
    ``recognize_google`` and compared to ``sentence`` using ``jiwer.wer``.

    Args:
        generatedAudioFileName: Path of the audio file to transcribe.
        sentence: Reference text the audio is expected to contain.

    Returns:
        The word error rate reported by ``jiwer.wer``. Any failure (audio
        that could not be understood, or any other exception raised while
        reading, transcribing or scoring) is reported on stdout and scored
        as ``1.0``.
    """
    speechRecognizer = sr.Recognizer()
    try:
        with sr.AudioFile(generatedAudioFileName) as audioSource:
            recordedAudio = speechRecognizer.record(audioSource)
            transcript = speechRecognizer.recognize_google(recordedAudio)
        errorRate = jiwer.wer(sentence, transcript)
        print("WER:", errorRate)
        return errorRate
    except sr.UnknownValueError:
        failureMessage = f"Could not understand audio: {generatedAudioFileName}"
    except Exception as err:
        failureMessage = f"Error processing {generatedAudioFileName}: {err}"
    # Both failure paths share one exit: report and count as a full miss.
    print(failureMessage)
    return 1.0

In [3]:
def evaluateWERForModel(model_dir_name, outputCSVFile):
    """Compute the mean word error rate and its standard error for one model.

    Reads ``outputCSVFile`` (columns ``speakerId`` and ``generatedSentence``),
    scores ``../Data/ttsOutputs/{model_dir_name}/{speakerId}.wav`` against the
    sentence recorded for that speaker with ``evaluateWER`` and aggregates the
    per-file scores.

    Args:
        model_dir_name: Sub-directory of ``../Data/ttsOutputs`` that holds the
            generated ``.wav`` files.
        outputCSVFile: Path of the CSV listing speakers and their sentences.

    Returns:
        A ``(average, std_error)`` tuple of plain floats. ``std_error`` is the
        sample standard deviation (``ddof=1``) divided by ``sqrt(n)``. With a
        single score the sample standard deviation is undefined, so ``0.0`` is
        returned instead of NaN. With no rows at all the result is
        ``(1.0, 0.0)``.
    """
    outputDF = pd.read_csv(outputCSVFile)
    speakers = outputDF['speakerId']

    # Build the speaker -> sentence lookup once instead of re-filtering the
    # whole frame for every speaker. The first occurrence wins, which matches
    # the previous per-speaker "first matching row" lookup.
    sentenceBySpeaker = {}
    for speaker, sentence in zip(speakers, outputDF['generatedSentence']):
        sentenceBySpeaker.setdefault(speaker, sentence)

    wordErrorRateArray = []
    for speaker in speakers:
        sentence = sentenceBySpeaker[speaker]
        generatedAudioFileName = f'../Data/ttsOutputs/{model_dir_name}/{speaker}.wav'
        # Evaluate word error rate for generatedAudioFileName vs sentence.
        wer = evaluateWER(generatedAudioFileName, sentence)
        wordErrorRateArray.append(wer)

    if not wordErrorRateArray:
        print(f"No valid WER values for model {model_dir_name}")
        return 1.0, 0.0

    sampleCount = len(wordErrorRateArray)
    average = float(np.mean(wordErrorRateArray))
    if sampleCount < 2:
        # np.std(..., ddof=1) divides by zero for one sample and yields NaN,
        # which json.dump would then emit as a non-standard "NaN" token.
        return average, 0.0
    std_error = float(np.std(wordErrorRateArray, ddof=1) / np.sqrt(sampleCount))
    return average, std_error


In [4]:
def main():
    """Evaluate every English TTS model and write a ranked WER report.

    Results are checkpointed to ``../Data/wordErrorRateResults.json`` after
    each model. On start, any results already in that file are loaded and the
    models they cover are skipped, so re-running resumes an interrupted run
    instead of re-evaluating and appending duplicate rows. Delete the JSON
    file to force a full re-evaluation.

    Models whose ``../Data/ttsOutputs/{model}_generatedSentences.csv`` file
    (with ``/`` in the model name replaced by ``_``) does not exist are
    skipped. The combined results, sorted by ascending ``averageWER``, are
    displayed and written to ``../Data/wordErrorRateResults.csv``.
    """
    resultsJSONFile = '../Data/wordErrorRateResults.json'
    resultsCSVFile = '../Data/wordErrorRateResults.csv'

    wordErrorRateResults = []
    if exists(resultsJSONFile):
        with open(resultsJSONFile, 'r') as f:
            wordErrorRateResults = json.load(f)
    # Models that already have a cached result must not be appended again.
    evaluatedModels = {result["model"] for result in wordErrorRateResults}

    englishModels = gv.getRawEnglishModelNames()
    for model in englishModels:
        if model in evaluatedModels:
            print(f"Skipping already evaluated model: {model}")
            continue
        print(f"Processing model: {model}")
        model_dir_name = model.replace("/", "_")
        outputCSVFile = f'../Data/ttsOutputs/{model_dir_name}_generatedSentences.csv'
        if not exists(outputCSVFile):
            continue
        averageWER, stdErrorWER = evaluateWERForModel(model_dir_name, outputCSVFile)
        wordErrorRateResults.append({
            "model": model,
            "averageWER": averageWER,
            "stdErrorWER": stdErrorWER
        })
        evaluatedModels.add(model)
        # Checkpoint after every model so an interrupted run keeps its work.
        with open(resultsJSONFile, 'w') as f:
            json.dump(wordErrorRateResults, f)

    if not wordErrorRateResults:
        # An empty DataFrame has no 'averageWER' column to sort by.
        print("No WER results to report")
        return

    resultsDF = pd.DataFrame(wordErrorRateResults)
    resultsDF = resultsDF.sort_values(by=['averageWER'], ascending=True)
    display(resultsDF)
    resultsDF.to_csv(resultsCSVFile, index=False)
    print('done')

In [5]:
# Run the evaluation only when this cell/module is executed as the entry point.
if __name__ == "__main__":
    main()


 Name format: type/language/dataset/model
 1: tts_models/multilingual/multi-dataset/xtts_v2
 2: tts_models/multilingual/multi-dataset/xtts_v1.1
 3: tts_models/multilingual/multi-dataset/your_tts
 4: tts_models/multilingual/multi-dataset/bark
 5: tts_models/bg/cv/vits
 6: tts_models/cs/cv/vits
 7: tts_models/da/cv/vits
 8: tts_models/et/cv/vits
 9: tts_models/ga/cv/vits
 10: tts_models/en/ek1/tacotron2 [already downloaded]
 11: tts_models/en/ljspeech/tacotron2-DDC [already downloaded]
 12: tts_models/en/ljspeech/tacotron2-DDC_ph [already downloaded]
 13: tts_models/en/ljspeech/glow-tts [already downloaded]
 14: tts_models/en/ljspeech/speedy-speech [already downloaded]
 15: tts_models/en/ljspeech/tacotron2-DCA [already downloaded]
 16: tts_models/en/ljspeech/vits [already downloaded]
 17: tts_models/en/ljspeech/vits--neon [already downloaded]
 18: tts_models/en/ljspeech/fast_pitch [already downloaded]
 19: tts_models/en/ljspeech/overflow [already downloaded]
 20: tts_models/en/ljspeech/

Unnamed: 0,model,averageWER,stdErrorWER
7,tts_models/en/ljspeech/vits--neon,0.337605,0.007191
6,tts_models/en/ljspeech/vits,0.34863,0.007368
8,tts_models/en/ljspeech/fast_pitch,0.349517,0.006749
14,tts_models/en/jenny/jenny,0.389669,0.008098
9,tts_models/en/ljspeech/overflow,0.3971,0.009315
3,tts_models/en/ljspeech/glow-tts,0.413957,0.008565
10,tts_models/en/ljspeech/neural_hmm,0.551531,0.010792
4,tts_models/en/ljspeech/speedy-speech,0.575512,0.012111
5,tts_models/en/ljspeech/tacotron2-DCA,0.78766,0.034187
2,tts_models/en/ljspeech/tacotron2-DDC_ph,0.792134,0.023136


done
