In [4]:
import os
import torch
import torchaudio
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer
from types import NoneType

In [5]:
print("Loading model...")

# Checkpoint directory, plus the config and vocabulary files stored inside it.
model_path = "/teamspace/studios/this_studio/coqui-TTS/train_moore/models"
config_path = "/teamspace/studios/this_studio/coqui-TTS/train_moore/models/config.json"
vocab_path = "/teamspace/studios/this_studio/coqui-TTS/train_moore/models/vocab.json"

# Read the saved configuration and instantiate the model from it.
config = XttsConfig()
config.load_json(config_path)
model = Xtts.init_from_config(config)

# Attach a tokenizer built from the vocabulary file, then initialize the
# sub-models and load the checkpoint found in `model_path`.
model.tokenizer = VoiceBpeTokenizer(vocab_file=vocab_path)
model.init_models()
model.load_checkpoint(config, checkpoint_dir=model_path)

# Moving the model to the GPU is intentionally left disabled here.
# model.cuda()

Loading model...


In [6]:
# Step 1: compute the conditioning inputs from one reference recording.
print("Computing speaker latents...")
ref_path = "/teamspace/studios/this_studio/coqui-TTS/train_moore/reference_speaker_male_1.wav"
reference_clips = [ref_path]
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=reference_clips)

# Step 2: synthesize a sample sentence with language code "mos".
print("Inference...")
sample_text = "Sʋy sã n waoogẽ, b rɩta taaba.(Yel-bũndi: Nebã sẽn yaa wʋsgã wata ne zu-loees wʋsgo, tɩ bũmb a ye sẽn tõe n wa ne yaa tɩ b pa tõe n paam ligd n maan b sẽn datã ye, tɩ rẽ tõe n wa ne zaba.)"
out = model.inference(
    sample_text,
    "mos",
    gpt_cond_latent,
    speaker_embedding,
    # Extra generation parameters can be appended after this one.
    temperature=1,
)

# Step 3: add a leading dimension to the waveform and write it to disk at 24000 Hz.
sample_wav = torch.tensor(out["wav"]).unsqueeze(0)
torchaudio.save("reference.wav", sample_wav, 24000)

Computing speaker latents...
Inference...


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [7]:
import IPython


def to_speech(text, temperature=0.7, speaker_reference=ref_path):
    """Synthesize `text` with the loaded model and return an inline audio player.

    Args:
        text: The text to synthesize; it is passed to `model.inference`
            with the language code "mos".
        temperature: Sampling temperature forwarded to `model.inference`.
        speaker_reference: Reference audio forwarded as `audio_path` to
            `model.get_conditioning_latents`. Defaults to the notebook-level
            `ref_path` as it was when this function was defined.

    Returns:
        An `IPython.display.Audio` object wrapping the generated waveform
        at a 24000 Hz sample rate.
    """
    print("Computing speaker latents...")
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_reference)

    print("Inference...")
    out = model.inference(
        text,
        "mos",
        gpt_cond_latent,
        speaker_embedding,
        # Forward the caller's value. This was previously hard-coded to 1,
        # which silently ignored the `temperature` argument and its default.
        temperature=temperature,
    )

    # Add a leading dimension to the waveform before handing it to the player.
    return IPython.display.Audio(torch.tensor(out["wav"]).unsqueeze(0).numpy(), rate=24000)

In [8]:
# Show the resolved absolute path of the tokenizer file recorded in the loaded config.
# NOTE(review): the recorded output points at a training-run directory, not at the
# `vocab_path` used to build the tokenizer above — confirm both hold the same vocabulary.
os.path.realpath(config.model_args.tokenizer_file)

'/teamspace/studios/this_studio/coqui-TTS/train_moore/run/training/CHECKPOINT_GPT_XTTS_v2.0_MOS_FT_2/vocab.json'

In [9]:
# Try the helper on the same sentence used for the direct inference call earlier.
to_speech("Sʋy sã n waoogẽ, b rɩta taaba.(Yel-bũndi: Nebã sẽn yaa wʋsgã wata ne zu-loees wʋsgo, tɩ bũmb a ye sẽn tõe n wa ne yaa tɩ b pa tõe n paam ligd n maan b sẽn datã ye, tɩ rẽ tõe n wa ne zaba.)")

Computing speaker latents...


Inference...


In [13]:
#model.push_to_hub("ArissBandoss/coqui-tts-moore-V1")

In [14]:
# The upload code below is wrapped in a string literal, so running this cell
# does not execute it. NOTE(review): the progress bars and CommitInfo shown
# under this cell presumably come from an earlier run with the code enabled.
"""from huggingface_hub import HfApi
api = HfApi()

api.upload_folder(
    folder_path="/teamspace/studios/this_studio/coqui-TTS/train_moore/run/training/RUN_GPT_XTTS_v2.0_MOS_FT_1-September-18-2024_09+20PM-f661a1b8",
    repo_id="ArissBandoss/coqui-tts-moore-V1",
    repo_type="model",
)"""

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

best_model_4312.pth:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

best_model_4312.pth:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

events.out.tfevents.1726694446.ip-10-192-10-161.5061.0:   0%|          | 0.00/6.71M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ArissBandoss/coqui-tts-moore-V1/commit/78c844265b5ea4e08383e90d6073bea628c1a290', commit_message='Upload folder using huggingface_hub', commit_description='', oid='78c844265b5ea4e08383e90d6073bea628c1a290', pr_url=None, pr_revision=None, pr_num=None)