## IMPORT LIBRARIES AND SETUP ENV

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import sys
import string
import time
import glob
import argparse
import librosa
import pyworld
import pysptk
import math
import json
import numpy as np
import torch
import fnmatch
import subprocess
import soundfile as sf
import IPython
import csv

# Make the project checkout importable from this notebook.
sys.path.append("/home/fa578s/thesis-mss-vector-synthesis-TTS-Portuguese-Corpus/") 
# Make the vocoder checkout importable as well.
# NOTE(review): the `env` and `models` imports further down are presumably
# resolved from VOCODER_PATH -- confirm.
VOCODER_PATH = "/home/fa578s/hifi-gan/"
sys.path.append(VOCODER_PATH)

#Set this if TTS is not installed globally or problems occur with TTS
#TTS_PATH = "/home/user/thesis-mss-vector-synthesis"
#sys.path.append(TTS_PATH) 

# Root directory that contains the primary git repository (keep the trailing "/").
HOME_DIRECTORY = "/home/fa578s/"

### SET OUTPUT AUDIO PATH ###
# Directory the synthesized wav files are written to.
OUT_PATH = '/home/fa578s/Desktop/outputs/tristin/'
# Ask PyTorch to release any cached GPU memory before models are loaded.
torch.cuda.empty_cache()

from IPython.display import Audio
from tqdm import tqdm
from scipy.io.wavfile import read as read_wav
from scipy.io.wavfile import write
from env import AttrDict
#from attrdict import AttrDict
from models import Generator
from TTS.tts.utils.generic_utils import setup_model
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
from scipy import spatial
from TTS.utils.io import load_config
from TTS.vocoder.utils.generic_utils import setup_generator

# Audio helper used by the TTS models below (save_wav, sample_rate, ...).
# The former try/except retried the exact same import behind a bare `except:`,
# which could never recover and only obscured the original error, so a single
# plain import is used instead.
from TTS.utils.audio import AudioProcessor

# Expose only GPU 0 to this process, then check whether CUDA is available.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
USE_CUDA = torch.cuda.is_available()
print(USE_CUDA)

#These configs tell the speaker encoders where to find the specific wav files.
INPUT_DATASET_CONFIG = "/home/fa578s/thesis-mss-vector-synthesis/audios/input/audio/../input_dataset_config.json".replace("/audio/..", "")
MSS_DATASET_CONFIG = "/home/fa578s/thesis-mss-vector-synthesis/audios/output/mss_dataset_config.json"


#Original WAV embeddings
# HOME_DIRECTORY already ends with "/", so the parts are joined with
# os.path.join instead of concatenating a second leading "/" (which produced
# "/home/fa578s//thesis-mss-vector-synthesis/...").
mss_pre_embedding_file = os.path.join(HOME_DIRECTORY, "thesis-mss-vector-synthesis/audios/input/audio/speakers.json")

#Synthesized WAV embeddings
#mss_after_embedding_file = HOME_DIRECTORY + "thesis-mss-vector-synthesis/audios/output/audio/speakers.json"


True


## MODEL CHECKPOINTS

### HIFI-GAN CHECKPOINT

In [2]:
### HiFi-GAN Vocoder ###
# Generator weights; the matching config.json is looked up in the same directory.
hifi_gan_checkpoint = '/home/fa578s/miniconda3/envs/My_project/checkpoints/hifi_gan/generator.pt'
hifi_gan_config_file = os.path.join(os.path.dirname(hifi_gan_checkpoint), 'config.json')

### TTS MODEL CHECKPOINTS
Uncomment exactly one checkpoint path for each TTS model you want to load.

In [3]:
### MSS-Vectors Based TTS Model ###
# Exactly one MSS_TTS_MODEL_PATH assignment should be active at a time.

#30k step speaker encoder output vector size 512 
MSS_TTS_MODEL_PATH = '/home/fa578s/miniconda3/envs/My_project/checkpoints/mss_vector/tts/experiment_2/30kmss_vsize_512_checkpoint_280000.pth.tar'

#50k step speaker encoder model
#MSS_TTS_MODEL_PATH = '../checkpoints/mss_vector/tts/experiment_1/50kmss_checkpoint_280000.pth.tar'

#30k step speaker encoder model
#MSS_TTS_MODEL_PATH = '../checkpoints/mss_vector/tts/experiment_1/30kmss_checkpoint_280000.pth.tar'

# The model's config.json is looked up next to the selected checkpoint.
MSS_TTS_CONFIG_PATH = os.path.join(os.path.dirname(MSS_TTS_MODEL_PATH), 'config.json')


# ### S-Vector Based TTS Model ###
# S_VECTOR_TTS_MODEL_PATH = '../checkpoints/s_vector/tts/30ks_checkpoint_280000.pth.tar'
# S_VECTOR_TTS_CONFIG_PATH = os.path.join(os.path.split(S_VECTOR_TTS_MODEL_PATH)[0], 'config.json')
    
    
### GE2E Based TTS Model ###
#50k step speaker encoder model
#GE2E_TTS_MODEL_PATH = '../checkpoints/ge2e/tts/30kge2e_checkpoint_280000.pth.tar'

# #30k step speaker encoder model
# GE2E_TTS_MODEL_PATH = '../checkpoints/ge2e/tts/30kge2e_checkpoint_280000.pth.tar'

# GE2E_TTS_CONFIG_PATH = os.path.join(os.path.split(GE2E_TTS_MODEL_PATH)[0], 'config.json')

### SPEAKER ENCODER CHECKPOINTS
Uncomment exactly one checkpoint path for each speaker encoder. The speaker encoder must correspond to the TTS checkpoint selected in the previous section.

In [4]:
### MSS SPEAKER ENCODER ###
# Exactly one MSS_ENCODER_PATH assignment should be active at a time.

#30k step speaker encoder output vector size 512
MSS_ENCODER_PATH = "/home/fa578s/miniconda3/envs/My_project/checkpoints/mss_vector/se/experiment_2/mss_vsize_512_checkpoint_30000.pth.tar"

#50k step 
#MSS_ENCODER_PATH = "../checkpoints/mss_vector/se/experiment_1/mss_checkpoint_50000.pth.tar"

#30k step
#MSS_ENCODER_PATH = "../checkpoints/mss_vector/se/experiment_1/mss_checkpoint_30000.pth.tar"

# The encoder's config.json is looked up next to the selected checkpoint.
MSS_CONFIG = os.path.join(os.path.dirname(MSS_ENCODER_PATH), 'config.json')


### S-VECTOR SPEAKER ENCODER ###
# S_VECTOR_ENCODER_PATH = "../checkpoints/s_vector/se/sv_checkpoint_30000.pth.tar"
# S_VECTOR_CONFIG = os.path.join(os.path.split(S_VECTOR_ENCODER_PATH)[0], 'config.json')


# ### GE2E SPEAKER ENCODER ###
# #50k step 
# #GE2E_ENCODER_PATH = "../checkpoints/ge2e/se/ge2e_checkpoint_50000.pth.tar"

# #30k step
# GE2E_ENCODER_PATH = "../checkpoints/ge2e/se/ge2e_checkpoint_30000.pth.tar"

# GE2E_CONFIG = os.path.join(os.path.split(GE2E_ENCODER_PATH)[0], 'config.json')



## SPEAKER ENCODER UTILS

In [5]:
def resample_files(input_dir, target_sr=24000):
    """Resample, in place, every ``*.wav`` directly inside ``input_dir`` whose
    sample rate differs from ``target_sr``.

    :param input_dir: directory that is searched (non-recursively) for wav files
    :param target_sr: required sample rate in Hz; defaults to the 24 kHz used by
        the TTS models in this notebook
    :return: None -- files that need it are overwritten on disk

    Note: the audio is reloaded with ``librosa.load`` using its defaults, which
    (per the librosa documentation) also downmixes to mono.
    """
    audio_files = glob.glob(os.path.join(input_dir, "*.wav"))
    print(f"Found {len(audio_files)} total files...")

    # First pass: only inspect the headers' sample rate to decide what to redo.
    need_resampled = []
    for file in audio_files:
        sampling_rate, _ = read_wav(file)
        if sampling_rate != target_sr:
            need_resampled.append(file)
    print(f"Found {len(need_resampled)} files that need resampled...")

    # Second pass: resample and overwrite. `sr` is passed by keyword because
    # newer librosa releases no longer accept it positionally.
    for file in tqdm(need_resampled):
        y, sr = librosa.load(file, sr=target_sr)
        sf.write(file, y, sr)

def compute_embeddings(model_path, config, dataset, input_or_output, model_name,
                       conda_bin="/home/tristin/miniconda3/bin", conda_env="tts_training"):
    """Run the CoquiTTS ``compute_embeddings.py`` script in a conda environment.

    The script is launched through ``/bin/bash`` as
    ``source <conda_bin>/activate <conda_env> && python3 ... && source <conda_bin>/deactivate``
    and writes its results below ``./audios/<input_or_output>/<model_name>``.

    :param model_path: speaker-encoder checkpoint passed to the script
    :param config: speaker-encoder config file passed to the script
    :param dataset: dataset config file passed to the script
    :param input_or_output: sub-directory of ``./audios`` (e.g. "input" / "output")
    :param model_name: sub-directory below that (e.g. "mss")
    :param conda_bin: directory holding conda's ``activate``/``deactivate`` scripts;
        the default is the location that was previously hard-coded
    :param conda_env: name of the conda environment to activate
    :return: None
    """
    import shlex  # local import: only needed here

    # Quote every interpolated value so that paths containing spaces or shell
    # metacharacters cannot break (or inject into) the command line.
    output_dir = "./audios/{}/{}".format(input_or_output, model_name)
    script_args = " ".join(
        shlex.quote(str(arg)) for arg in (model_path, config, dataset, output_dir)
    )
    command = (
        "source {activate} {env} && "
        "python3 ../thesis-mss-vector-training/TTS/bin/compute_embeddings.py {args} && "
        "source {deactivate}"
    ).format(
        activate=shlex.quote(os.path.join(conda_bin, "activate")),
        env=shlex.quote(conda_env),
        args=script_args,
        deactivate=shlex.quote(os.path.join(conda_bin, "deactivate")),
    )

    result = subprocess.run(command, shell=True, executable='/bin/bash')
    # A failing step short-circuits the `&&` chain; report it instead of letting
    # callers continue silently with stale or missing embeddings.
    if result.returncode != 0:
        print(" > WARNING: compute_embeddings command exited with status {}; "
              "embeddings may be missing or stale.".format(result.returncode))
    
def find_speaker_wavs(speaker, src_dir):
    """Return the names of the ``*.wav`` files in ``src_dir`` that belong to ``speaker``.

    Helper for ``load_speaker_embeddings``.

    :param speaker: speaker id; a file belongs to the speaker when its *file name*
        matches the fnmatch pattern ``*<speaker>*``
    :param src_dir: directory to search (with or without a trailing separator)
    :return: sorted list of bare file names (no directory part)
    """
    matches = []
    # sorted() makes the result independent of filesystem listing order.
    for path in sorted(glob.glob(os.path.join(src_dir, "*.wav"))):
        name = os.path.basename(path)
        # Match on the file name only: matching the whole path would select
        # every file whenever a directory name happens to contain the speaker id.
        if fnmatch.fnmatch(name, '*' + speaker + '*'):
            matches.append(name)
    return matches

def load_speaker_embeddings(speakers, embedding_file, src_dir, avg=False):
    """Load speaker embeddings for the given speakers from a speakers.json file.

    :param speakers: iterable of speaker ids
    :param embedding_file: JSON file mapping wav file name -> {"embedding": [...]}
    :param src_dir: directory searched (via ``find_speaker_wavs``) for each
        speaker's wav files
    :param avg: when True and a speaker has more than one wav file, store a single
        entry holding the element-wise mean of that speaker's embeddings
    :return: dict mapping wav file name -> (speaker label, embedding).
        Without averaging the label is the speaker id and the embedding is the
        value read from the JSON file. With averaging the entry is keyed by the
        last of the speaker's wav files, labelled ``<speaker>_avg_<n>``, and the
        embedding is a numpy array.
    :raises KeyError: if a matching wav file has no entry in ``embedding_file``
    """
    # `with` guarantees the file is closed even if parsing fails.
    with open(embedding_file) as f:
        embeddings = json.load(f)

    speakers_info = {}
    for speaker_id in speakers:
        wav_files = find_speaker_wavs(speaker_id, src_dir)
        if not wav_files:
            # Previously this crashed with an IndexError on wav_files[0].
            print(" > WARNING: no wav files found for speaker '{}' in '{}'; skipping.".format(
                speaker_id, src_dir))
            continue

        if avg and len(wav_files) > 1:
            # Stack whatever dimensionality the encoder produced instead of
            # assuming 256-dim vectors (the models here use 512).
            vectors = np.array(
                [embeddings[file_name]["embedding"] for file_name in wav_files],
                dtype=float,
            )
            label = speaker_id + "_avg_" + str(len(wav_files))
            speakers_info[wav_files[-1]] = label, vectors.mean(axis=0)
            print(label)
        else:
            for file_name in wav_files:
                speakers_info[file_name] = speaker_id, embeddings[file_name]["embedding"]

    return speakers_info

## TTS MODEL UTILS

In [6]:
def tts(model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None, gst_style=None, silent=False):
    """Synthesize ``text`` with the TTS model and vocode the result with HiFi-GAN.

    :param model: TTS model handed to ``synthesis``
    :param text: sentence to synthesize
    :param CONFIG: model config (``model`` and ``enable_eos_bos_chars`` are read here)
    :param use_cuda: run synthesis on the GPU and move the spectrogram there
    :param ap: audio processor (``sample_rate`` and ``out_linear_to_mel`` are used here)
    :param use_gl: forwarded to ``synthesis`` (Griffin-Lim switch)
    :param speaker_fileid: forwarded to ``synthesis``
    :param speaker_embedding: optional speaker embedding forwarded to ``synthesis``
    :param gst_style: forwarded to ``synthesis``
    :param silent: suppress the timing printout
    :return: ``(waveform, waveform_voc)`` -- the waveform returned by ``synthesis``
        and the flattened int16 waveform produced by ``hifi_predict``
    """
    t_start = time.time()
    waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, gst_style, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding)
    synthesis_time = time.time() - t_start

    if not silent:
        print("CONFIG.model: ", CONFIG.model)
    if CONFIG.model == "Tacotron" and not use_gl:
        mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T

    # Only move the spectrogram to the GPU when asked to; the previous
    # unconditional .cuda() failed on CPU-only machines.
    vocoder_input = torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0)
    if use_cuda:
        vocoder_input = vocoder_input.cuda()

    waveform_voc = hifi_predict(vocoder_input, silent)
    waveform_voc = waveform_voc.flatten()

    if not silent:
        run_time = time.time() - t_start
        num_samples = len(waveform_voc)
        print(" > Run-time: {}".format(run_time))
        if num_samples > 0:
            audio_seconds = num_samples / ap.sample_rate
            print(" > Real-time factor without vocoder: {}".format(synthesis_time / audio_seconds))
            print(" > Real-time factor: {}".format(run_time / audio_seconds))
            print(" > Time per step: {}".format(run_time / num_samples))
        else:
            # Avoid a division by zero when the vocoder produced no samples.
            print(" > No audio samples were produced; real-time factors unavailable.")
    return waveform, waveform_voc

# Module-level defaults for the synthesis cells.
# NOTE(review): `speaker_embedding` and `speaker_embedding_dim` are not read by the
# code visible in this notebook (the loader call passes a literal) -- confirm
# whether they are still needed.
speaker_embedding = None
# Passed to tts() as `use_gl`, which forwards it to synthesis().
use_griffin_lim = False
speaker_embedding_dim = 512
def load_tts_model(CONFIG_PATH, MODEL_PATH, speaker_embedding_dim):
    """Build a TTS model from its config and load the checkpoint weights.

    :param CONFIG_PATH: path to the model's config.json
    :param MODEL_PATH: path to the checkpoint; its ``'model'`` entry holds the state dict
    :param speaker_embedding_dim: size of the external speaker-embedding vector
    :return: ``(model, ap, C)`` -- the model in eval mode (on the GPU when
        ``USE_CUDA`` is set), its AudioProcessor and the loaded config
    """
    # load the config
    C = load_config(CONFIG_PATH)

    # load the audio processor
    ap = AudioProcessor(**C.audio)

    # Start from the library's default vocabulary and replace it only if the
    # config defines one. Separate local names are required: assigning to
    # `symbols`/`phonemes` inside this function made them locals, so a config
    # without 'characters' raised UnboundLocalError.
    model_symbols, model_phonemes = symbols, phonemes
    if 'characters' in C.keys():
        model_symbols, model_phonemes = make_symbols(**C.characters)

    # load the tts model
    num_chars = len(model_phonemes) if C.use_phonemes else len(model_symbols)
    # NOTE(review): the literal 20 is presumably the speaker count expected by
    # setup_model -- confirm against the training setup.
    model = setup_model(num_chars, 20, C, speaker_embedding_dim)
    # Load onto the CPU first so the checkpoint opens regardless of where it was saved.
    cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))
    model.load_state_dict(cp['model'])
    model.eval()

    if USE_CUDA:
        model = model.cuda()

    model.length_scale = 1 # set speed of the speech. 
    model.noise_scale = 0.0 # set speech variation
    return model, ap, C

## LOAD TTS MODELS

In [7]:
# The last argument is the speaker-embedding vector size; change it here when
# switching to an MSS encoder/TTS checkpoint pair with a different vector dimension.
mss_model, mss_ap, mss_c = load_tts_model(MSS_TTS_CONFIG_PATH, MSS_TTS_MODEL_PATH, 512)
# s_vector_model, s_vector_ap, s_vector_c = load_tts_model(S_VECTOR_TTS_CONFIG_PATH, S_VECTOR_TTS_MODEL_PATH, 512)
# ge2e_model, ge2e_ap, ge2e_c = load_tts_model(GE2E_TTS_CONFIG_PATH, GE2E_TTS_MODEL_PATH, 256) 

 > Setting up Audio Processor...
 | > sample_rate:24000
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.98
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > stats_path:None
 | > hop_length:256
 | > win_length:1024
 > Using model: glow_tts


  return librosa.filters.mel(
The boolean parameter 'some' has been replaced with a string parameter 'mode'.
Q, R = torch.qr(A, some)
should be replaced with
Q, R = torch.linalg.qr(A, 'reduced' if some else 'complete') (Triggered internally at ../aten/src/ATen/native/BatchLinearAlgebra.cpp:2426.)
  w_init = torch.qr(


## HIFI-GAN UTILS

In [8]:
# Placeholders; both are assigned in the "LOAD HIFI-GAN MODEL" cell
# (h: vocoder config as an AttrDict, device: torch device of the generator).
h = None
device = None

# Scale factor applied to the generator output before it is cast to int16
# in hifi_predict().
MAX_WAV_VALUE = 32767.5

def load_checkpoint(filepath, device):
    """Read a torch checkpoint from ``filepath`` and map its tensors to ``device``.

    Fails with an AssertionError when ``filepath`` is not an existing file.
    Returns whatever object ``torch.load`` yields for the file.
    """
    assert os.path.isfile(filepath)
    print(f"Loading '{filepath}'")
    loaded = torch.load(filepath, map_location=device)
    print("Complete.")
    return loaded


def scan_checkpoint(cp_dir, prefix):
    """Return the lexicographically last path in ``cp_dir`` whose name starts
    with ``prefix``, or an empty string when nothing matches."""
    candidates = glob.glob(os.path.join(cp_dir, prefix + '*'))
    return max(candidates) if candidates else ''

def hifi_predict(spectrogram, silent=False):
    """Vocode a spectrogram with the module-level HiFi-GAN ``generator``.

    :param spectrogram: tensor fed to ``generator``
        (assumes shape (1, n_mels, frames) as built in tts() -- TODO confirm)
    :param silent: suppress the timing printout
    :return: numpy int16 array -- the generator output scaled by MAX_WAV_VALUE
    """
    start = time.time()
    # Pure inference: disable autograd so no graph is built or kept in memory.
    with torch.no_grad():
        # Run on whatever device the generator lives on, so a CPU tensor and a
        # GPU model (or vice versa) cannot be mixed up.
        spectrogram = spectrogram.to(next(generator.parameters()).device)
        y_g_hat = generator(spectrogram)
    audio = y_g_hat.squeeze()
    audio = audio * MAX_WAV_VALUE
    audio = audio.detach().cpu().numpy().astype('int16')
    if not silent:
        print('HiFi-GAN Time', time.time()-start)
    return audio

## LOAD HIFI-GAN MODEL

In [9]:
# Read the vocoder hyper-parameters that sit next to the checkpoint.
with open(hifi_gan_config_file) as f:
    data = f.read()
json_config = json.loads(data)
h = AttrDict(json_config)

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

generator = Generator(h).to(device)
# The generator is only used for inference in this notebook, so put it in eval
# mode (it was previously left in training mode).
generator.eval()

state_dict_g = load_checkpoint(hifi_gan_checkpoint, device)
# Kept as the last expression so the cell still displays the key-matching result.
generator.load_state_dict(state_dict_g['generator'])

Loading '/home/fa578s/miniconda3/envs/My_project/checkpoints/hifi_gan/generator.pt'
Complete.


<All keys matched successfully>

## MCD & COSINE SIMILARITY UTILS

In [10]:
# Sample rate (Hz) at which wav files are loaded for MCEP extraction.
SAMPLING_RATE = 24000
# Passed to pyworld.wav2world as `frame_period`
# (presumably milliseconds per analysis frame -- confirm against pyworld docs).
FRAME_PERIOD = 5.0

def cos_sim(a, b):
    """Cosine similarity of two vectors: dot(a, b) / (|a| * |b|)."""
    vec_a = np.asarray(a)
    vec_b = np.asarray(b)
    return np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))

def load_wav(wav_file, sr):
    """
    Read an audio file through librosa as a mono signal.
    :param wav_file: path to wav file
    :param sr: sampling rate requested from librosa
    :return: audio time series numpy array (the rate librosa reports is dropped)
    """
    return librosa.load(wav_file, sr=sr, mono=True)[0]

def log_spec_dB_dist(x, y):
    """Distance between two vectors: the Euclidean norm of ``x - y`` scaled by
    the constant ``10 / ln(10) * sqrt(2)``."""
    delta = x - y
    scale = 10.0 / math.log(10.0) * math.sqrt(2.0)
    return scale * math.sqrt(np.inner(delta, delta))

def wav2mcep_numpy(wavfile, target_directory, alpha=0.65, fft_size=512, mcep_size=34):
    """Extract mel-cepstral coefficients from a wav file and save them as ``.npy``.

    The file is loaded at ``SAMPLING_RATE``, analysed with ``pyworld.wav2world``
    (only the spectral envelope is kept) and passed to ``pysptk.sptk.mcep``.
    The result is written to ``<target_directory>/<wav name without extension>.npy``.

    :param wavfile: path to the input wav file
    :param target_directory: output directory; created if missing
    :param alpha: forwarded to ``pysptk.sptk.mcep`` as ``alpha``
    :param fft_size: forwarded to ``pyworld.wav2world`` as ``fft_size``
    :param mcep_size: forwarded to ``pysptk.sptk.mcep`` as ``order``
    :return: None
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(target_directory, exist_ok=True)

    loaded_wav = load_wav(wavfile, sr=SAMPLING_RATE)

    # Use WORLD vocoder to get the spectral envelope
    _, sp, _ = pyworld.wav2world(loaded_wav.astype(np.double), fs=SAMPLING_RATE,
                                   frame_period=FRAME_PERIOD, fft_size=fft_size)

    # Extract MCEP features
    mgc = pysptk.sptk.mcep(sp, order=mcep_size, alpha=alpha, maxiter=0,
                           etype=1, eps=1.0E-8, min_det=0.0, itype=3)

    # Strip only the final extension: split('.')[0] cut names such as
    # "spk.v2_001.wav" down to "spk", so different files overwrote each other.
    fname = os.path.splitext(os.path.basename(wavfile))[0]

    #Save mceps to numpy file
    np.save(os.path.join(target_directory, fname + '.npy'),
            mgc,
            allow_pickle=False)

def average_mcd(ref_mcep_files, synth_mcep_files, cost_function):
    """
    Calculate the average MCD over all matching reference/synthesized pairs.

    File names are split on '_': a reference file contributes its first field as
    the speaker and its last field as the sample id. For a synthesized file the
    speaker is field 2 when the name has more than four fields, otherwise
    field 1; the sample id is again the last field. Only pairs whose speaker and
    sample id both agree are compared.

    :param ref_mcep_files: list of strings, paths to MCEP target reference files
    :param synth_mcep_files: list of strings, paths to MCEP converted synthesised files
    :param cost_function: distance metric handed to ``librosa.sequence.dtw``
    :returns: (average MCD, total frames processed); ``(nan, 0)`` when no pair
        matched, instead of failing with a division by zero
    """
    min_cost_tot = 0.0
    frames_tot = 0

    for ref in ref_mcep_files:
        # The reference name does not depend on the inner loop, so parse it once.
        ref_fsplit = os.path.basename(ref).split('_')
        ref_spk, ref_id = ref_fsplit[0], ref_fsplit[-1]

        for synth in synth_mcep_files:
            synth_fsplit = os.path.basename(synth).split('_')
            if len(synth_fsplit) < 2:
                # No '<model>_<speaker>...' structure: cannot match any reference
                # (indexing field 1 used to raise IndexError here).
                continue
            spk_field = 2 if len(synth_fsplit) > 4 else 1
            synth_spk, synth_id = synth_fsplit[spk_field], synth_fsplit[-1]

            # only compare the same speaker and the same sample id
            if ref_spk != synth_spk or ref_id != synth_id:
                continue

            # load MCEP vectors
            ref_vec = np.load(ref)
            synth_vec = np.load(synth)

            # dynamic time warping using librosa (coefficient 0 is excluded)
            # NOTE(review): the mean of the matrix returned by dtw is accumulated
            # here, exactly as before -- confirm this is the intended MCD definition.
            min_cost, _ = librosa.sequence.dtw(ref_vec[:, 1:].T, synth_vec[:, 1:].T,
                                               metric=cost_function)

            min_cost_tot += np.mean(min_cost)
            frames_tot += len(ref_vec)

    if frames_tot == 0:
        return float('nan'), 0

    return min_cost_tot / frames_tot, frames_tot

## Synthesizing Test Voices

In [11]:
#Example Prompt
#hello world. I am a synthetic voice produced by a zero-shot text-to-speech synthesis system.

# AVG is passed as `avg` to load_speaker_embeddings() and adds an "avg_" prefix
# to the output file names in the synthesis loop.
AVG = False
# NOTE(review): FIRST_COMPUTE is not read by any code visible in this notebook.
FIRST_COMPUTE = True

# Speaker ids; each id is matched against the wav file names in the input directory.
unseen_speakers = ["christina_arctic", "tristin_arctic", "r1"]

# Create the output path if it does not exist.
os.makedirs(OUT_PATH, exist_ok=True)

# Resample the reference files to 24 kHz if needed (files are overwritten in place).
resample_files("/home/fa578s/thesis-mss-vector-synthesis/audios/input/audio/")

# Initialize the reference file list and the comparison counters.
reference_file_list = []

mss_more_similar = 0
other_mss_more_similar = 0
i = 0
mss_mcd_lower = 0


        # s_vector_out_path = os.path.join(OUT_PATH, "s_vector/" + ("avg_" if AVG else "") + "s_vector_" + wav_file)
        # s_vector_ap.save_wav(s_vector_wav, s_vector_out_path) 
        
        # ge2e_out_path = os.path.join(OUT_PATH, "ge2e/" + ("avg_" if AVG else "") + "ge2e_" + wav_file)
        # ge2e_ap.save_wav(ge2e_wav, ge2e_out_path) 
    
    # #Compute embeddings of synthesized samples
    # compute_embeddings(MSS_ENCODER_PATH, MSS_CONFIG, MSS_DATASET_CONFIG, "output", "mss")
    # # compute_embeddings(S_VECTOR_ENCODER_PATH, S_VECTOR_CONFIG, S_VECTOR_DATASET_CONFIG, "output",  "s_vector")
    # # compute_embeddings(GE2E_ENCODER_PATH, GE2E_CONFIG, GE2E_DATASET_CONFIG, "output", "ge2e")
    
    # #load the synthesized embeddings
    # mss_unseen_speaker_after_embeddings = load_speaker_embeddings(unseen_speakers, mss_after_embedding_file, src_dir="./audios/output/audio/mss/", avg=AVG)
    # s_vector_unseen_speaker_after_embeddings = load_speaker_embeddings(unseen_speakers, s_vector_after_embedding_file, src_dir="./audios/output/audio/s_vector/", avg=AVG)
    # ge2e_unseen_speaker_after_embeddings = load_speaker_embeddings(unseen_speakers, ge2e_after_embedding_file, src_dir="./audios/output/audio/ge2e/", avg=AVG)
    
    # for file_name in reference_file_list:
    #     i = i + 1
    #     #speaker_name = mss_unseen_speaker_preembeddings[file_name.split('/')[-1]][0]
    #     speaker_name = file_name.split('/')[-1].split('.')[0]#.split('_')[0]
    #     wav_file =file_name.split('/')[-1]
        
    #     #Aquire proper embedding from each model the for given reference file
    #     s_vector_synthesized_speaker_embedding = s_vector_unseen_speaker_after_embeddings["s_vector_" + wav_file][1]
    #     mss_synthesized_speaker_embedding = mss_unseen_speaker_after_embeddings["mss_" + wav_file][1]
    #     ge2e_synthesized_speaker_embedding = ge2e_unseen_speaker_after_embeddings["ge2e_" + wav_file][1]
        
    #     #Calculate cosine similarity
    #     mss_similarity = cos_sim(mss_speaker_embedding, mss_synthesized_speaker_embedding)
    #     s_vector_similarity = cos_sim(s_vector_speaker_embedding, s_vector_synthesized_speaker_embedding)
    #     ge2e_similarity = cos_sim(ge2e_speaker_embedding, ge2e_synthesized_speaker_embedding)
    #     if mss_similarity > s_vector_similarity:
    #         mss_more_similar = mss_more_similar + 1
        
    #     print("\n########################################################################")
    #     print("Synthesize sentence with Speaker: ", speaker_name)
    #     _, mss_wav = tts(mss_model, TEXT, mss_c, USE_CUDA, mss_ap, use_griffin_lim, None, speaker_embedding=mss_speaker_embedding, silent=True)
    #     _, s_vector_wav = tts(s_vector_model, TEXT, s_vector_c, USE_CUDA, s_vector_ap, use_griffin_lim, None, speaker_embedding=s_vector_speaker_embedding, silent=True)
    #     _, ge2e_wav = tts(ge2e_model, TEXT, ge2e_c, USE_CUDA, ge2e_ap, use_griffin_lim, None, speaker_embedding=ge2e_speaker_embedding, silent=True)
        
    #     print("\nSynthesized: MSS Vector\nCosine Similarity: {}".format(mss_similarity))
    #     IPython.display.display(Audio(mss_wav, rate=mss_ap.sample_rate))
        
    #     print("Synthesized: S Vector\nCosine Similarity: {}".format(s_vector_similarity))
    #     IPython.display.display(Audio(s_vector_wav, rate=s_vector_ap.sample_rate))
        
    #     print("Synthesized: GE2E\nCosine Similarity: {}".format(ge2e_similarity))
    #     IPython.display.display(Audio(ge2e_wav, rate=ge2e_ap.sample_rate))
        
    #     print("Original")
    #     IPython.display.display(Audio(file_name, rate=s_vector_ap.sample_rate))

    #     #Set path to wave file for mcd calculation
    #     mss_out_path = os.path.join(OUT_PATH, "mss/" + "mss_" + wav_file)
    #     s_vector_out_path = os.path.join(OUT_PATH, "s_vector/" + "s_vector_" + wav_file)
    #     ge2e_out_path = os.path.join(OUT_PATH, "ge2e/" + "ge2e_" + wav_file)
        
    #     #MCD calculation variables
    #     alpha = 0.65  # commonly used at 22050 Hz
    #     fft_size = 512
    #     mcep_size = 34
        
    #     #Directories of where mcep files are stored
    #     mss_mcep_dir = "./audios/mceps_numpy/mss/"
    #     s_vector_mcep_dir = "./audios/mceps_numpy/s_vector/"
    #     ge2e_mcep_dir = "./audios/mceps_numpy/ge2e/"
    #     ref_mcep_dir = "./audios/mceps_numpy/ref/"
        
    #     #Generate mcep files
    #     wav2mcep_numpy(file_name, ref_mcep_dir, fft_size=fft_size, mcep_size=mcep_size)
    #     wav2mcep_numpy(mss_out_path, mss_mcep_dir, fft_size=fft_size, mcep_size=mcep_size)
    #     wav2mcep_numpy(s_vector_out_path, s_vector_mcep_dir, fft_size=fft_size, mcep_size=mcep_size)
    #     wav2mcep_numpy(ge2e_out_path, ge2e_mcep_dir, fft_size=fft_size, mcep_size=mcep_size)
        
    #     #Find mcep files
    #     ref_mceps = glob.glob("./audios/mceps_numpy/ref/*")
    #     mss_mceps = glob.glob("./audios/mceps_numpy/mss/*")
    #     s_vector_mceps = glob.glob("./audios/mceps_numpy/s_vector/*")
    #     ge2e_mceps = glob.glob("./audios/mceps_numpy/ge2e/*")
        
    #     cost_function = log_spec_dB_dist
        
    #     #Calculate MCD
    #     mss_mcd, mss_tot_frames_used = average_mcd(ref_mceps, mss_mceps, cost_function)
    #     s_vector_mcd, s_vector_tot_frames_used = average_mcd(ref_mceps, s_vector_mceps, cost_function)
    #     ge2e_mcd, ge2e_tot_frames_used = average_mcd(ref_mceps, ge2e_mceps, cost_function)

    #     print(f'MSS MCD = {mss_mcd} dB, calculated over a total of {mss_tot_frames_used} frames')
    #     print(f'Common MCD = {s_vector_mcd} dB, calculated over a total of {s_vector_tot_frames_used} frames')
    #     print(f'GE2E MCD = {ge2e_mcd} dB, calculated over a total of {ge2e_tot_frames_used} frames')
        
    #     #remove mcep files
    #     os.system("rm ./audios/mceps_numpy/*/*.npy")

Found 9 total files...
Found 0 files that need resampled...


0it [00:00, ?it/s]


In [13]:
# Interactive synthesis: every entered sentence is synthesized once per reference
# wav file and written to OUT_PATH; enter "q" to stop.
INPUT_AUDIO_DIR = "/home/fa578s/thesis-mss-vector-synthesis/audios/input/audio/"

# The speaker embeddings do not depend on the sentence, so they are computed and
# loaded once, on the first sentence, instead of on every loop iteration.
mss_unseen_speaker_preembeddings = None

while True:
    TEXT =  input("Enter sentence: ")
    if TEXT == 'q':
        break
    print(" > Text: {}".format(TEXT))

    if mss_unseen_speaker_preembeddings is None:
        #Compute initial speaker embeddings
        compute_embeddings(MSS_ENCODER_PATH, MSS_CONFIG, INPUT_DATASET_CONFIG, "input", "mss")
        # compute_embeddings(S_VECTOR_ENCODER_PATH, S_VECTOR_CONFIG, INPUT_DATASET_CONFIG, "input",  "s_vector")
        # compute_embeddings(GE2E_ENCODER_PATH, GE2E_CONFIG, INPUT_DATASET_CONFIG, "input", "ge2e")

        #load initial speaker embeddings
        mss_unseen_speaker_preembeddings = load_speaker_embeddings(unseen_speakers, mss_pre_embedding_file, src_dir=INPUT_AUDIO_DIR, avg=AVG)
        # s_vector_unseen_speaker_preembeddings = load_speaker_embeddings(unseen_speakers, s_vector_pre_embedding_file, src_dir="./audios/input/audio/", avg=AVG)
        # ge2e_unseen_speaker_preembeddings = load_speaker_embeddings(unseen_speakers, ge2e_pre_embedding_file, src_dir="./audios/input/audio/", avg=AVG)

        # Rebuild the list rather than appending to it: appending on every
        # sentence duplicated the entries, so each file was synthesized again
        # and again for later sentences.
        reference_file_list = [INPUT_AUDIO_DIR + file_name for file_name in mss_unseen_speaker_preembeddings]

    for file_name in reference_file_list:
        i = i + 1
        wav_file = os.path.basename(file_name)

        #get the speaker label and the proper presynthesis speaker embedding
        speaker_name, mss_speaker_embedding = mss_unseen_speaker_preembeddings[wav_file]
        # s_vector_speaker_embedding = s_vector_unseen_speaker_preembeddings[wav_file][1]
        # ge2e_speaker_embedding = ge2e_unseen_speaker_preembeddings[wav_file][1]

        #Synthesize wave files
        _, mss_wav = tts(mss_model, TEXT, mss_c, USE_CUDA, mss_ap, use_griffin_lim, None, speaker_embedding=mss_speaker_embedding, silent=True)
        # _, s_vector_wav = tts(s_vector_model, TEXT, s_vector_c, USE_CUDA, s_vector_ap, use_griffin_lim, None, speaker_embedding=s_vector_speaker_embedding, silent=True)
        # _, ge2e_wav = tts(ge2e_model, TEXT, ge2e_c, USE_CUDA, ge2e_ap, use_griffin_lim, None, speaker_embedding=ge2e_speaker_embedding, silent=True)

        #Save files to appropriate directory
        # NOTE: the name depends only on the reference file, so a later sentence
        # overwrites the audio written for an earlier one.
        mss_out_path = os.path.join(OUT_PATH, ("avg_" if AVG else "") + "mss_" + wav_file)
        mss_ap.save_wav(mss_wav, mss_out_path) 
        

Enter sentence:  hello this is a test audio


 > Text: hello this is a test audio


/bin/bash: line 1: /home/tristin/miniconda3/bin/activate: No such file or directory


TypeError: bad operand type for unary +: 'str'