In [1]:
import jiwer

import pandas as pd

In [2]:
metric_funcs = {
    'cer': jiwer.cer,
    'mer': jiwer.mer,
    'wer': jiwer.wer,
    'wil': jiwer.wil,
    'wip': jiwer.wip,
}

In [3]:
transforms = jiwer.Compose(
    [
        jiwer.ExpandCommonEnglishContractions(),
        jiwer.RemoveEmptyStrings(),
        jiwer.ToLowerCase(),
        jiwer.RemoveMultipleSpaces(),
        jiwer.Strip(),
        jiwer.RemovePunctuation(),
        jiwer.ReduceToListOfListOfWords(),
    ]
)

In [4]:
prompts = pd.read_csv('../my_tests/prompts.csv', index_col=0)

In [5]:
def calc_metrics(row):
    m = {}
    for metric_name, metric_func in metric_funcs.items():
        m[metric_name] = metric_func(
            row['prompt'],
            row['transcription'],
            truth_transform=transforms,
            hypothesis_transform=transforms,
        )
        
    return pd.Series(m)

In [6]:
from transformers import pipeline

model_name = "openai/whisper-medium"

pipe = pipeline(
    "automatic-speech-recognition",
    model=model_name,
    chunk_length_s=30,
    device='cuda',
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# xtts

In [46]:
results = pd.read_csv('../my_tests/whisper_test_output.csv', index_col=0)
results

Unnamed: 0_level_0,prompt,output
number,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,The Hubble Space Telescope has revolutionized ...,The Hubble Space Telescope has revolutionized...
2.0,Photosynthesis is the process by which plants ...,Photosynthesis is the process by which plants...
3.0,The Industrial Revolution began in Britain in ...,The Industrial Revolution began in Britain in...
4.0,Chess is a strategic board game that originate...,Chess is a strategic board game that originat...
5.0,The human brain contains approximately 86 bill...,The human brain contains approximately 86 bil...
...,...,...
106.0,Happy New Year\n,Happy New Year!\n
107.0,I'm fine\n,I'm fine.\n
108.0,Let's go\n,Let's go.\n
109.0,Stop\n,Stop! OI!\n


In [19]:
metrics = {}

for metric, func in metric_funcs.items():
    _metrics = []
    for _, row in results.iterrows():
        _metrics.append(func(
            row['prompt'], 
            row['output'],
            truth_transform=transforms,
            hypothesis_transform=transforms,
        ))
    metrics[metric] = _metrics

In [12]:
metrics = pd.DataFrame(metrics, index=results.index)
metrics

Unnamed: 0_level_0,cer,mer,wer,wil,wip
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,0.000000,0.000000,0.000000,0.000000,1.000000
2.0,0.000000,0.000000,0.000000,0.000000,1.000000
3.0,0.005556,0.035714,0.035714,0.070153,0.929847
4.0,0.000000,0.000000,0.000000,0.000000,1.000000
5.0,0.000000,0.000000,0.000000,0.000000,1.000000
...,...,...,...,...,...
106.0,0.071429,0.333333,0.333333,0.555556,0.444444
107.0,0.125000,0.500000,0.500000,0.750000,0.250000
108.0,0.125000,0.500000,0.500000,0.750000,0.250000
109.0,1.250000,1.000000,2.000000,1.000000,0.000000


In [21]:
results.merge(metrics, left_index=True, right_index=True).to_csv('../my_tests/metrics.csv')

In [20]:
metrics = pd.DataFrame(metrics, index=results.index)
metrics

Unnamed: 0_level_0,cer,mer,wer,wil,wip
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,0.0,0.0,0.0,0.0,1.0
2.0,0.0,0.0,0.0,0.0,1.0
3.0,0.0,0.0,0.0,0.0,1.0
4.0,0.0,0.0,0.0,0.0,1.0
5.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...
106.0,0.0,0.0,0.0,0.0,1.0
107.0,0.0,0.0,0.0,0.0,1.0
108.0,0.0,0.0,0.0,0.0,1.0
109.0,1.0,0.5,1.0,0.5,0.5


# respeecher

In [5]:
import glob
files = glob.glob('..\\my_tests\\respeecher_vs_xtts_2024_07_06\\**\\*.wav', recursive=True)
files = sorted(files)

In [8]:
transcriptions = pipe(files)

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
  attn_output = torch.nn.functional.scaled_dot_product_attention(
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [33]:
metrics = []

for filename, transcription in zip(files, transcriptions):
    pieces = filename.split('\\')
    if len(pieces) > 4:
        speaker_name = pieces[3].split('__')[1]
        model_name = pieces[4]
        file = pieces[6].split('_')
        text_id = int(file[1][4:])
        gen_number = file[2].split('.')[0][3:]
        transcription = transcription['text']
            
        metrics.append(
            {
                'id_': text_id ,
                'model_name': model_name,
                'speaker': speaker_name,
                'gen': gen_number,
                'transcription': transcription,
            }
        )
        

IndexError: list index out of range

In [34]:
len(metrics)

4480

In [35]:
metrics = pd.DataFrame(metrics)
metrics

Unnamed: 0,id_,model_name,speaker,gen,transcription
0,0,respeecher_tts,constance,1,the Hubble Space Telescope has revolutionized...
1,0,respeecher_tts,constance,2,The Hubble Space Telescope has revolutionized...
2,0,respeecher_tts,constance,3,The Hubble Space Telescope has revolutionized...
3,0,respeecher_tts,constance,4,The Hubble Space Telescope has revolutionized...
4,0,respeecher_tts,constance,5,The Hubble Space Telescope has revolutionized...
...,...,...,...,...,...
4475,111,respeecher_tts_multilang,vincent,1,"of those saying, I've listened."
4476,111,respeecher_tts_multilang,vincent,2,Alex is a third. Listen.
4477,111,respeecher_tts_multilang,vincent,3,And this saying of Loss and Noir is it's
4478,111,respeecher_tts_multilang,vincent,4,"of listening of car, the galangal, I can say ..."


In [36]:
metrics = metrics[metrics['id_'] != 80].copy()

In [37]:
metrics.loc[metrics['id_'] > 80, 'id_'] -= 1

In [38]:
metrics = metrics[metrics['id_'] != 90].copy()
metrics.loc[metrics['id_'] > 90, 'id_'] -= 1

In [39]:
merged_df = metrics.merge(prompts, left_on='id_', right_index=True)
merged_df.set_index(['id_', 'gen', 'speaker', 'model_name'], inplace=True)
merged_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,transcription,prompt
id_,gen,speaker,model_name,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,constance,respeecher_tts,the Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...
0,2,constance,respeecher_tts,The Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...
0,3,constance,respeecher_tts,The Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...
0,4,constance,respeecher_tts,The Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...
0,5,constance,respeecher_tts,The Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...
...,...,...,...,...,...
109,1,vincent,respeecher_tts_multilang,"of those saying, I've listened.",Listen
109,2,vincent,respeecher_tts_multilang,Alex is a third. Listen.,Listen
109,3,vincent,respeecher_tts_multilang,And this saying of Loss and Noir is it's,Listen
109,4,vincent,respeecher_tts_multilang,"of listening of car, the galangal, I can say ...",Listen


In [40]:
metrics = merged_df.apply(calc_metrics, axis=1)
metrics

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,cer,mer,wer,wil,wip
id_,gen,speaker,model_name,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1,constance,respeecher_tts,1.272727,0.560000,1.272727,0.560000,0.440000
0,2,constance,respeecher_tts,1.272727,0.560000,1.272727,0.560000,0.440000
0,3,constance,respeecher_tts,1.272727,0.560000,1.272727,0.560000,0.440000
0,4,constance,respeecher_tts,1.272727,0.560000,1.272727,0.560000,0.440000
0,5,constance,respeecher_tts,1.272727,0.560000,1.272727,0.560000,0.440000
...,...,...,...,...,...,...,...,...
109,1,vincent,respeecher_tts_multilang,6.000000,1.000000,6.000000,1.000000,0.000000
109,2,vincent,respeecher_tts_multilang,4.000000,0.800000,4.000000,0.800000,0.200000
109,3,vincent,respeecher_tts_multilang,10.000000,1.000000,10.000000,1.000000,0.000000
109,4,vincent,respeecher_tts_multilang,11.000000,1.000000,11.000000,1.000000,0.000000


In [41]:
df = merged_df.merge(metrics, left_index=True, right_index=True)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,transcription,prompt,cer,mer,wer,wil,wip
id_,gen,speaker,model_name,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1,constance,respeecher_tts,the Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...,1.272727,0.560000,1.272727,0.560000,0.440000
0,2,constance,respeecher_tts,The Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...,1.272727,0.560000,1.272727,0.560000,0.440000
0,3,constance,respeecher_tts,The Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...,1.272727,0.560000,1.272727,0.560000,0.440000
0,4,constance,respeecher_tts,The Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...,1.272727,0.560000,1.272727,0.560000,0.440000
0,5,constance,respeecher_tts,The Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...,1.272727,0.560000,1.272727,0.560000,0.440000
...,...,...,...,...,...,...,...,...,...,...
109,1,vincent,respeecher_tts_multilang,"of those saying, I've listened.",Listen,6.000000,1.000000,6.000000,1.000000,0.000000
109,2,vincent,respeecher_tts_multilang,Alex is a third. Listen.,Listen,4.000000,0.800000,4.000000,0.800000,0.200000
109,3,vincent,respeecher_tts_multilang,And this saying of Loss and Noir is it's,Listen,10.000000,1.000000,10.000000,1.000000,0.000000
109,4,vincent,respeecher_tts_multilang,"of listening of car, the galangal, I can say ...",Listen,11.000000,1.000000,11.000000,1.000000,0.000000


In [42]:
df.groupby(['id_', 'speaker', 'model_name'])['cer'].std()

id_  speaker    model_name              
0    constance  respeecher_tts              0.000000
                respeecher_tts_multilang    0.000000
     samantha   respeecher_tts              0.000000
                respeecher_tts_multilang    0.000000
     spencer    respeecher_tts              0.000000
                                              ...   
109  samantha   respeecher_tts_multilang    4.827007
     spencer    respeecher_tts              1.870829
                respeecher_tts_multilang    2.949576
     vincent    respeecher_tts              1.341641
                respeecher_tts_multilang    2.863564
Name: cer, Length: 880, dtype: float64

In [44]:
df.to_csv('../my_tests/metrics/respeecher_tts_metrics.csv', index=True)

In [7]:
df = pd.read_csv('../my_tests/metrics/respeecher_tts_metrics.csv')
df

Unnamed: 0,id_,gen,speaker,model_name,transcription,prompt,cer,mer,wer,wil,wip
0,0,1,constance,respeecher_tts,the Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...,1.272727,0.560000,1.272727,0.560000,0.440000
1,0,2,constance,respeecher_tts,The Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...,1.272727,0.560000,1.272727,0.560000,0.440000
2,0,3,constance,respeecher_tts,The Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...,1.272727,0.560000,1.272727,0.560000,0.440000
3,0,4,constance,respeecher_tts,The Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...,1.272727,0.560000,1.272727,0.560000,0.440000
4,0,5,constance,respeecher_tts,The Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...,1.272727,0.560000,1.272727,0.560000,0.440000
...,...,...,...,...,...,...,...,...,...,...,...
4395,109,1,vincent,respeecher_tts_multilang,"of those saying, I've listened.",Listen,6.000000,1.000000,6.000000,1.000000,0.000000
4396,109,2,vincent,respeecher_tts_multilang,Alex is a third. Listen.,Listen,4.000000,0.800000,4.000000,0.800000,0.200000
4397,109,3,vincent,respeecher_tts_multilang,And this saying of Loss and Noir is it's,Listen,10.000000,1.000000,10.000000,1.000000,0.000000
4398,109,4,vincent,respeecher_tts_multilang,"of listening of car, the galangal, I can say ...",Listen,11.000000,1.000000,11.000000,1.000000,0.000000


# xtts

In [58]:
import glob

files = glob.glob('..\\my_tests\\audio\\**\\*.wav', recursive=True)
files = sorted(files)
transcriptions = pipe(files)



In [62]:
metrics = []

for filename, transcription in zip(files, transcriptions):
    pieces = filename.split('\\')
    speaker_name = 'tests\data\ljspeech\wavs\LJ001-0003.wav'
    model_name = 'xTTS'
    file = pieces[3].split('_')
    text_id = int(file[0])
    gen_number = file[1].split('.')[0]
    transcription = transcription['text']

    metrics.append(
        {
            'id_': text_id ,
            'model_name': model_name,
            'speaker': speaker_name,
            'gen': gen_number,
            'transcription': transcription,
        }
    )

In [None]:
metrics = pd.DataFrame(metrics)
metrics.set_index('id_', inplace=True)

In [75]:
metrics_ = metrics.reset_index().merge(prompts, left_on='id_', right_index=True).apply(calc_metrics, axis=1)

In [76]:
df = metrics.merge(prompts, left_index=True, right_index=True).merge(metrics_, left_index=True, right_index=True)
df

Unnamed: 0_level_0,model_name,speaker,gen,transcription,prompt,cer,mer,wer,wil,wip
id_,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,xTTS,tests\data\ljspeech\wavs\LJ001-0003.wav,0,The Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...,0.0,0.0,0.0,0.0,1.0
0,xTTS,tests\data\ljspeech\wavs\LJ001-0003.wav,1,The Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...,0.0,0.0,0.0,0.0,1.0
0,xTTS,tests\data\ljspeech\wavs\LJ001-0003.wav,2,The Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...,0.0,0.0,0.0,0.0,1.0
0,xTTS,tests\data\ljspeech\wavs\LJ001-0003.wav,3,The Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...,0.0,0.0,0.0,0.0,1.0
0,xTTS,tests\data\ljspeech\wavs\LJ001-0003.wav,4,The Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
9,xTTS,tests\data\ljspeech\wavs\LJ001-0003.wav,5,William Shakespeare wrote 37 plays and 154 so...,William Shakespeare wrote 37 plays and 154 son...,0.0,0.0,0.0,0.0,1.0
9,xTTS,tests\data\ljspeech\wavs\LJ001-0003.wav,6,William Shakespeare wrote 37 plays and 154 so...,William Shakespeare wrote 37 plays and 154 son...,0.0,0.0,0.0,0.0,1.0
9,xTTS,tests\data\ljspeech\wavs\LJ001-0003.wav,7,William Shakespeare wrote 37 plays and 154 so...,William Shakespeare wrote 37 plays and 154 son...,0.0,0.0,0.0,0.0,1.0
9,xTTS,tests\data\ljspeech\wavs\LJ001-0003.wav,8,William Shakespeare wrote 37 plays and 154 so...,William Shakespeare wrote 37 plays and 154 son...,0.0,0.0,0.0,0.0,1.0


In [90]:
df.groupby('id_')['mer'].std()

id_
0      0.000000
1      0.052705
2      0.000000
3      0.022588
4      0.000000
         ...   
105    0.316228
106    0.126491
107    0.079057
108    0.289875
109    0.498951
Name: mer, Length: 110, dtype: float64

In [91]:
df.to_csv('../my_tests/xtts_metrics.csv')

# respeacher tts

In [6]:
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_name,
    chunk_length_s=30,
    device='cuda',
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
import glob

files = glob.glob('..\\my_tests\\audio\\respeacher_tts\\*.wav')
files = sorted(files)
# transcriptions = pipe(files)

In [11]:
import tqdm
metrics = []

for filename,  in tqdm.tqdm(zip(files, )):
    pieces = filename.split('\\')
    speaker_name = 'tests\data\ljspeech\wavs\LJ001-0003.wav'
    model_name = 'respeacher TTS'
    file = pieces[4].split('_')
    text_id = int(file[0])
    gen_number = file[1].split('.')[0]
    transcription = pipe(filename)['text']

    metrics.append(
        {
            'id_': text_id ,
            'model_name': model_name,
            'speaker': speaker_name,
            'gen': gen_number,
            'transcription': transcription,
        }
    )

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
  attn_output = torch.nn.functional.scaled_dot_product_attention(
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
9it [00:07,  1.37it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
1100it [15:23,  1.19it/s]


In [12]:
metrics = pd.DataFrame(metrics)
metrics.set_index('id_', inplace=True)

In [28]:
merged_df = metrics.reset_index().merge(prompts, left_on='id_', right_index=True)
merged_df.set_index(['id_', 'gen'], inplace=True)

In [29]:
metrics_ = merged_df.apply(calc_metrics, axis=1)

In [31]:
df = merged_df.merge(metrics_, left_index=True, right_index=True)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,model_name,speaker,transcription,prompt,cer,mer,wer,wil,wip
id_,gen,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,respeacher TTS,tests\data\ljspeech\wavs\LJ001-0003.wav,The Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...,0.0,0.0,0.0,0.0,1.0
0,1,respeacher TTS,tests\data\ljspeech\wavs\LJ001-0003.wav,The Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...,0.0,0.0,0.0,0.0,1.0
0,2,respeacher TTS,tests\data\ljspeech\wavs\LJ001-0003.wav,The Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...,0.0,0.0,0.0,0.0,1.0
0,3,respeacher TTS,tests\data\ljspeech\wavs\LJ001-0003.wav,The Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...,0.0,0.0,0.0,0.0,1.0
0,4,respeacher TTS,tests\data\ljspeech\wavs\LJ001-0003.wav,The Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
9,5,respeacher TTS,tests\data\ljspeech\wavs\LJ001-0003.wav,William Shakespeare wrote 37 plays and 154 so...,William Shakespeare wrote 37 plays and 154 son...,0.0,0.0,0.0,0.0,1.0
9,6,respeacher TTS,tests\data\ljspeech\wavs\LJ001-0003.wav,William Shakespeare wrote 37 plays and 154 so...,William Shakespeare wrote 37 plays and 154 son...,0.0,0.0,0.0,0.0,1.0
9,7,respeacher TTS,tests\data\ljspeech\wavs\LJ001-0003.wav,William Shakespeare wrote 37 plays and 154 so...,William Shakespeare wrote 37 plays and 154 son...,0.0,0.0,0.0,0.0,1.0
9,8,respeacher TTS,tests\data\ljspeech\wavs\LJ001-0003.wav,William Shakespeare wrote 37 plays and 154 so...,William Shakespeare wrote 37 plays and 154 son...,0.0,0.0,0.0,0.0,1.0


In [33]:
df.groupby('id_')['mer'].std()

id_
0      0.000000
1      0.120039
2      0.000000
3      0.034503
4      0.084369
         ...   
105    0.414868
106    0.395792
107    0.348675
108    0.450117
109    0.305465
Name: mer, Length: 110, dtype: float64

In [34]:
df.to_csv('../my_tests/metrics/fine_tuned_respeecher_tts_metrics.csv')

In [8]:
df = pd.read_csv('../my_tests/metrics/fine_tuned_respeecher_tts_metrics.csv')

In [12]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,transcription,prompt,cer,mer,wer,wil,wip
id_,gen,speaker,model_name,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,tests\data\ljspeech\wavs\LJ001-0003.wav,respeacher TTS,The Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...,0.0,0.0,0.0,0.0,1.0
0,1,tests\data\ljspeech\wavs\LJ001-0003.wav,respeacher TTS,The Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...,0.0,0.0,0.0,0.0,1.0
0,2,tests\data\ljspeech\wavs\LJ001-0003.wav,respeacher TTS,The Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...,0.0,0.0,0.0,0.0,1.0
0,3,tests\data\ljspeech\wavs\LJ001-0003.wav,respeacher TTS,The Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...,0.0,0.0,0.0,0.0,1.0
0,4,tests\data\ljspeech\wavs\LJ001-0003.wav,respeacher TTS,The Hubble Space Telescope has revolutionized...,The Hubble Space Telescope has revolutionized ...,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
9,5,tests\data\ljspeech\wavs\LJ001-0003.wav,respeacher TTS,William Shakespeare wrote 37 plays and 154 so...,William Shakespeare wrote 37 plays and 154 son...,0.0,0.0,0.0,0.0,1.0
9,6,tests\data\ljspeech\wavs\LJ001-0003.wav,respeacher TTS,William Shakespeare wrote 37 plays and 154 so...,William Shakespeare wrote 37 plays and 154 son...,0.0,0.0,0.0,0.0,1.0
9,7,tests\data\ljspeech\wavs\LJ001-0003.wav,respeacher TTS,William Shakespeare wrote 37 plays and 154 so...,William Shakespeare wrote 37 plays and 154 son...,0.0,0.0,0.0,0.0,1.0
9,8,tests\data\ljspeech\wavs\LJ001-0003.wav,respeacher TTS,William Shakespeare wrote 37 plays and 154 so...,William Shakespeare wrote 37 plays and 154 son...,0.0,0.0,0.0,0.0,1.0


In [10]:
df.set_index(['id_', 'gen', 'speaker', 'model_name'], inplace=True)

In [17]:
df[['cer', 'mer', 'wer', 'wil', 'wip']] = df.apply(calc_metrics, axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,cer,mer,wer,wil,wip
id_,gen,speaker,model_name,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,tests\data\ljspeech\wavs\LJ001-0003.wav,respeacher TTS,0.0,0.0,0.0,0.0,1.0
0,1,tests\data\ljspeech\wavs\LJ001-0003.wav,respeacher TTS,0.0,0.0,0.0,0.0,1.0
0,2,tests\data\ljspeech\wavs\LJ001-0003.wav,respeacher TTS,0.0,0.0,0.0,0.0,1.0
0,3,tests\data\ljspeech\wavs\LJ001-0003.wav,respeacher TTS,0.0,0.0,0.0,0.0,1.0
0,4,tests\data\ljspeech\wavs\LJ001-0003.wav,respeacher TTS,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...
9,5,tests\data\ljspeech\wavs\LJ001-0003.wav,respeacher TTS,0.0,0.0,0.0,0.0,1.0
9,6,tests\data\ljspeech\wavs\LJ001-0003.wav,respeacher TTS,0.0,0.0,0.0,0.0,1.0
9,7,tests\data\ljspeech\wavs\LJ001-0003.wav,respeacher TTS,0.0,0.0,0.0,0.0,1.0
9,8,tests\data\ljspeech\wavs\LJ001-0003.wav,respeacher TTS,0.0,0.0,0.0,0.0,1.0


In [2]:
from TTS.api import TTS

In [3]:
model = TTS(model_path='../checkpoints/respeacher/', config_path='../checkpoints/respeacher/config.json').to('cuda')

 > Using model: xtts


In [4]:
wav = model.tts('Hello, my name is PC.', language='en', speaker_wav=r"D:\Учеба\КПИ\Diploma\TTS\tests\data\ljspeech\wavs\LJ001-0003.wav")

 > Text splitted to sentences.
['Hello, my name is PC.']


The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


 > Processing time: 3.3334999084472656
 > Real-time factor: 0.7594609954255064


In [14]:
from IPython.display import Audio

Audio(wav, rate=24000)