In [11]:
import torch
import torchaudio
from tqdm import tqdm
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer
import re

# Pick the first CUDA GPU when torch reports one is available; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Locations of the model artifacts: the checkpoint to load, plus the config
# and vocabulary JSON files stored with the original XTTS v2.0 model files.
xtts_checkpoint = "/teamspace/studios/this_studio/mosXTTS/checkpoints/GPT_XTTS_FT-September-27-2024_11+12AM-8526b4f/best_model_51560.pth"
xtts_config = "/teamspace/studios/this_studio/mosXTTS/checkpoints/XTTS_v2.0_original_model_files/config.json"
xtts_vocab = "/teamspace/studios/this_studio/mosXTTS/checkpoints/XTTS_v2.0_original_model_files/vocab.json"


def split_moore_sentences(text):
    """Split ``text`` into a list of sentences.

    A sentence boundary is any run of whitespace that directly follows one of
    ``.``, ``!`` or ``?``; the punctuation mark stays attached to the sentence
    it ends.

    Args:
        text: The string to split.

    Returns:
        A list of sentences with surrounding whitespace removed. Pieces that
        are empty after stripping are dropped, so an empty or whitespace-only
        input yields an empty list.
    """
    # The lookbehind keeps the terminator on the left-hand piece; only the
    # whitespace after it is consumed by the split.
    pieces = re.split(r'(?<=[.!?])\s+', text)

    result = []
    for piece in pieces:
        trimmed = piece.strip()
        if trimmed:
            result.append(trimmed)
    return result

In [12]:
# Sample sentence; later cells split and synthesize this same text.
tts_text = """sull kãnga rat n sõnga kom-biisa la pagba"""

# Build the BPE tokenizer from the vocab file and encode the sample using
# the "mos" language code.
tokenizer = VoiceBpeTokenizer(vocab_file=xtts_vocab)
ids = tokenizer.encode(tts_text, "mos")

# Print how many tokens were produced, then show the ids as the cell result.
token_count = len(ids)
print(token_count)
ids

24


[6681,
 1500,
 25,
 2,
 6893,
 6705,
 2,
 31,
 48,
 2,
 27,
 2,
 7225,
 6705,
 2,
 2772,
 8,
 799,
 1779,
 2,
 494,
 2,
 7060,
 1159]

In [13]:
# Round-trip check: wrap the token ids in a tensor and decode them back to text.
ids_tensor = torch.tensor(ids)
tokenizer.decode(ids_tensor)

'[mos]sull kãnga rat n sõnga kom-biisa la pagba'

In [14]:
# Load model: read the JSON config, build the model from it, then restore the
# checkpoint together with its vocabulary file.
config = XttsConfig()
config.load_json(xtts_config)

checkpoint_options = dict(
    checkpoint_path=xtts_checkpoint,
    vocab_path=xtts_vocab,
    use_deepspeed=False,
)
XTTS_MODEL = Xtts.init_from_config(config)
XTTS_MODEL.load_checkpoint(config, **checkpoint_options)

# Move the model onto the device selected in the configuration cell.
XTTS_MODEL.to(device)

print("Model loaded successfully!")

Model loaded successfully!


In [15]:
# Inference inputs: the reference recording of the target speaker and the
# language code passed to the model.
speaker_audio_file = "/teamspace/studios/this_studio/mosXTTS/reference_1_speaker_male_17.wav"
lang = "mos"

# Compute the conditioning latent and the speaker embedding from the reference
# audio, using the length/normalisation settings stored on the model config.
model_cfg = XTTS_MODEL.config
conditioning = XTTS_MODEL.get_conditioning_latents(
    audio_path=speaker_audio_file,
    gpt_cond_len=model_cfg.gpt_cond_len,
    max_ref_length=model_cfg.max_ref_len,
    sound_norm_refs=model_cfg.sound_norm_refs,
)
gpt_cond_latent, speaker_embedding = conditioning

# Break the input text into sentences and display them.
tts_texts = split_moore_sentences(tts_text)
tts_texts

['sull kãnga rat n sõnga kom-biisa la pagba']

In [16]:
# Sampling settings applied identically to every sentence.
sampling_options = dict(
    temperature=0.1,
    length_penalty=1.0,
    repetition_penalty=10.0,
    top_k=10,
    top_p=0.3,
)

# Synthesize each sentence separately and keep its waveform as a tensor.
wav_chunks = []
for text in tqdm(tts_texts):
    wav_chunk = XTTS_MODEL.inference(
        text=text,
        language=lang,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        **sampling_options,
    )
    wav_chunks.append(torch.tensor(wav_chunk["wav"]))

# Join the per-sentence waveforms end to end, add a leading dimension and
# make sure the result lives on the CPU.
joined_wav = torch.cat(wav_chunks, dim=0)
out_wav = joined_wav.unsqueeze(0).cpu()
out_wav

  0%|          | 0/1 [00:00<?, ?it/s]

tensor([[1006, 1000, 1011, 1012,  996,  710,  855,  985, 1002,  132,   96,   75,
          690,  161,  387,  297,   52,  389,  464,   80,  230,  326,   50,  252,
          173,  587,  410,  489,  619,   10,  718,  646,  906,  209,  210,  276,
          371,  839,  135,  674,  417,   55,  781,    6,  426,  350,   35,   89,
          310,   86,  570,  341,  445,  502, 1016,  648,  612,  785,  896,  959,
          976,  465,  989,  950,  990,  747,  178,  501,   18,   12,   83,  623,
          193,  725,  176,  345,    7,  626,  357,  478, 1014,  607,  490,  237,
          811,  972,  991,  964,  821,  993,  987, 1025]])


100%|██████████| 1/1 [00:15<00:00, 15.12s/it]


tensor([[4.2487e-04, 4.4983e-05, 9.7052e-05,  ..., 2.8408e-04, 2.6621e-04,
         6.3133e-05]])

In [17]:
# Write the synthesized waveform to disk with a sample rate of 24000.
output_wav_path = "/teamspace/studios/this_studio/mosXTTS/audio_tests/test_33.wav"
output_sample_rate = 24000
torchaudio.save(output_wav_path, out_wav, output_sample_rate)

In [18]:
# Disabled playback snippet: the code below is wrapped in a string literal, so
# it is NOT executed; the cell only evaluates the string and echoes it back as
# its result. Remove the surrounding triple quotes to actually run it.
"""from IPython.display import Audio

Audio(out_wav.numpy(), rate=24000)"""

'from IPython.display import Audio\n\nAudio(out_wav.numpy(), rate=24000)'

In [1]:
import os

from huggingface_hub import HfApi

# Read the access token from the environment instead of embedding it in the
# notebook. When HF_TOKEN is unset, None is passed and huggingface_hub falls
# back to the locally saved login (e.g. from `huggingface-cli login`).
# SECURITY: the token that was previously hardcoded in this cell has been
# exposed and should be revoked in the Hugging Face account settings.
api = HfApi(token=os.environ.get("HF_TOKEN"))

# Upload the local TTS source folder into the model repository.
api.upload_folder(
    folder_path="/teamspace/studios/this_studio/mosXTTS/TTS",
    path_in_repo="/TTS",
    repo_id="ArissBandoss/coqui-tts-moore-V1",
    repo_type="model")

CommitInfo(commit_url='https://huggingface.co/ArissBandoss/coqui-tts-moore-V1/commit/ed15407329529589d5a83fe360ba41152563e3ea', commit_message='Upload folder using huggingface_hub', commit_description='', oid='ed15407329529589d5a83fe360ba41152563e3ea', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ArissBandoss/coqui-tts-moore-V1', endpoint='https://huggingface.co', repo_type='model', repo_id='ArissBandoss/coqui-tts-moore-V1'), pr_revision=None, pr_num=None)

In [1]:
import os

from huggingface_hub import HfApi

# Read the access token from the environment instead of embedding it in the
# notebook. When HF_TOKEN is unset, None is passed and huggingface_hub falls
# back to the locally saved login (e.g. from `huggingface-cli login`).
# SECURITY: the token that was previously hardcoded in this cell has been
# exposed and should be revoked in the Hugging Face account settings.
api = HfApi(token=os.environ.get("HF_TOKEN"))

# Upload the packaged model archive to the root of the model repository.
api.upload_file(
    path_or_fileobj="/teamspace/studios/this_studio/mosXTTS/deploy_folder/model.tar.gz",
    path_in_repo="model.tar.gz",
    repo_id="ArissBandoss/coqui-tts-moore-V1",
    repo_type="model",
)

model.tar.gz:   0%|          | 0.00/1.96G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ArissBandoss/coqui-tts-moore-V1/commit/a0bd70b0fd59f27f993341ba7c30c58bb085e3a0', commit_message='Upload model.tar.gz with huggingface_hub', commit_description='', oid='a0bd70b0fd59f27f993341ba7c30c58bb085e3a0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ArissBandoss/coqui-tts-moore-V1', endpoint='https://huggingface.co', repo_type='model', repo_id='ArissBandoss/coqui-tts-moore-V1'), pr_revision=None, pr_num=None)

In [4]:

import os

from huggingface_hub import HfApi

# Read the access token from the environment instead of embedding it in the
# notebook. When HF_TOKEN is unset, None is passed and huggingface_hub falls
# back to the locally saved login (e.g. from `huggingface-cli login`).
# SECURITY: the token that was previously hardcoded in this cell has been
# exposed and should be revoked in the Hugging Face account settings.
api = HfApi(token=os.environ.get("HF_TOKEN"))

# Upload the converted DVAE safetensors file to the root of the model repository.
api.upload_file(
    path_or_fileobj="/teamspace/studios/this_studio/mosXTTS/models/converted_safetensors/dvae.safetensors",
    path_in_repo="dvae.safetensors",
    repo_id="ArissBandoss/coqui-tts-moore-V1",
    repo_type="model",
)

dvae.safetensors:   0%|          | 0.00/211M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ArissBandoss/coqui-tts-moore-V1/commit/be8d002b3ef86c2f9d80ceadd5031b538603b589', commit_message='Upload dvae.safetensors with huggingface_hub', commit_description='', oid='be8d002b3ef86c2f9d80ceadd5031b538603b589', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ArissBandoss/coqui-tts-moore-V1', endpoint='https://huggingface.co', repo_type='model', repo_id='ArissBandoss/coqui-tts-moore-V1'), pr_revision=None, pr_num=None)