# Short code

## Create dataset

In [3]:
!pip show faster_whisper

Name: faster-whisper
Version: 0.9.0
Summary: Faster Whisper transcription with CTranslate2
Home-page: https://github.com/guillaumekln/faster-whisper
Author: Guillaume Klein
Author-email: 
License: MIT
Location: c:\users\tibed\anaconda3\envs\researchenv\lib\site-packages
Requires: av, ctranslate2, huggingface-hub, onnxruntime, tokenizers
Required-by: 


In [1]:
from TTS.tts.layers.xtts.tokenizer import multilingual_cleaners
import gc
from tqdm import tqdm
import torch
import torchaudio
import pandas
from faster_whisper import WhisperModel
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0.2, eval_percentage=0.15, speaker_name="coqui", gradio_progress=None):
    audio_total_size = 0
    # make sure that ooutput file exists
    os.makedirs(out_path, exist_ok=True)

    # print lenght of audio files
    print(f"Found {len(audio_files)} audio files!")

    # Loading Whisper
    device = "cuda" if torch.cuda.is_available() else "cpu" 
    # device = "cpu"
    print(f"Using {device} device")

    print("Loading Whisper Model!")
    asr_model = WhisperModel("large-v2", device=device, compute_type="float32")

    metadata = {"audio_file": [], "text": [], "speaker_name": []}

    if gradio_progress is not None:
        tqdm_object = gradio_progress.tqdm(audio_files, desc="Formatting...")
    else:
        tqdm_object = tqdm(audio_files)

    for audio_path in tqdm_object:
        wav, sr = torchaudio.load(audio_path)
        # stereo to mono if needed
        if wav.size(0) != 1:
            wav = torch.mean(wav, dim=0, keepdim=True)

        wav = wav.squeeze()
        audio_total_size += (wav.size(-1) / sr)

        print(f"Transcribing {audio_path}...")
        print(torch.cuda.memory_allocated())
        print(torch.cuda.memory_reserved())
        segments, _ = asr_model.transcribe(audio_path, word_timestamps=True, language=target_language,)
        segments = list(segments)
        print(f"Found {len(segments)} segments!")
        i = 0
        sentence = ""
        sentence_start = None
        first_word = True
        # added all segments words in a unique list
        words_list = []
        for _, segment in enumerate(segments):
            words = list(segment.words)
            words_list.extend(words)

        # process each word
        for word_idx, word in enumerate(words_list):
            if first_word:
                sentence_start = word.start
                # If it is the first sentence, add buffer or get the begining of the file
                if word_idx == 0:
                    sentence_start = max(sentence_start - buffer, 0)  # Add buffer to the sentence start
                else:
                    # get previous sentence end
                    previous_word_end = words_list[word_idx - 1].end
                    # add buffer or get the silence midle between the previous sentence and the current one
                    sentence_start = max(sentence_start - buffer, (previous_word_end + sentence_start)/2)

                sentence = word.word
                first_word = False
            else:
                sentence += word.word

            if word.word[-1] in ["!", ".", "?"]:
                sentence = sentence[1:]
                # Expand number and abbreviations plus normalization
                sentence = multilingual_cleaners(sentence, target_language)
                audio_file_name, _ = os.path.splitext(os.path.basename(audio_path))

                audio_file = f"wavs/{audio_file_name}_{str(i).zfill(8)}.wav"

                # Check for the next word's existence
                if word_idx + 1 < len(words_list):
                    next_word_start = words_list[word_idx + 1].start
                else:
                    # If don't have more words it means that it is the last sentence then use the audio len as next word start
                    next_word_start = (wav.shape[0] - 1) / sr

                # Average the current word end and next word start
                word_end = min((word.end + next_word_start) / 2, word.end + buffer)
                
                absoulte_path = os.path.join(out_path, audio_file)
                os.makedirs(os.path.dirname(absoulte_path), exist_ok=True)
                i += 1
                first_word = True

                audio = wav[int(sr*sentence_start):int(sr*word_end)].unsqueeze(0)
                # if the audio is too short ignore it (i.e < 0.33 seconds)
                if audio.size(-1) >= sr/3:
                    torchaudio.save(absoulte_path,
                        audio,
                        sr
                    )
                else:
                    continue

                metadata["audio_file"].append(audio_file)
                metadata["text"].append(sentence)
                metadata["speaker_name"].append(speaker_name)

    df = pandas.DataFrame(metadata)
    df = df.sample(frac=1)
    num_val_samples = int(len(df)*eval_percentage)

    df_eval = df[:num_val_samples]
    df_train = df[num_val_samples:]

    df_train = df_train.sort_values('audio_file')
    train_metadata_path = os.path.join(out_path, "metadata_train.csv")
    df_train.to_csv(train_metadata_path, sep="|", index=False)

    eval_metadata_path = os.path.join(out_path, "metadata_eval.csv")
    df_eval = df_eval.sort_values('audio_file')
    df_eval.to_csv(eval_metadata_path, sep="|", index=False)

    # deallocate VRAM and RAM
    del asr_model, df_train, df_eval, df, metadata
    gc.collect()

    return train_metadata_path, eval_metadata_path, audio_total_size

def clear_gpu_cache():
    # clear the GPU cache
    if torch.cuda.is_available():
        print("Clearing GPU Cache...")
        torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
outPath = "G:\\rp-tibedm\\tts_models\\coqui_xtss"
dataOutPath = os.path.join(outPath, "dataset")
audio_file = "G:\\rp-tibedm\\audio\\Alan_Wake_Voice_Short.wav"
# audio_file1 = "G:\\rp-tibedm\\audio\\koning_filip_toespraak1.wav"
# audio_file2 = "G:\\rp-tibedm\\audio\\koning_filip_toespraak2.wav"
language = "en"
train_metadata_path, eval_metadata_path, audio_total_size = format_audio_list([audio_file], out_path=dataOutPath,
                                                                               gradio_progress=None,speaker_name="alanWake", target_language=language)
print("################ Dataset Processed! ################")


Found 1 audio files!
Using cuda device
Loading Whisper Model!


  0%|          | 0/1 [00:00<?, ?it/s]

Transcribing G:\rp-tibedm\tts_models\audio\Alan_Wake_Voice_Short.wav...
0
0
Found 72 segments!


100%|██████████| 1/1 [00:19<00:00, 19.28s/it]

################ Dataset Processed! ################





## Train model

In [4]:
from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
from TTS.utils.manage import ModelManager
from TTS.tts.datasets import load_tts_samples
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from trainer import Trainer, TrainerArgs
import gc
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
import torch


def create_train_model(output_path, train_csv, eval_csv, num_epochs, batch_size, grad_acum, language, speaker_name ,max_audio_length=255995):
    """
    Creates a new XTTS model and trains it with the given parameters.
    output_path: path to save the model
    train_csv: path to the train csv file
    eval_csv: path to the eval csv file
    num_epochs: number of epochs to train
    batch_size: batch size
    grad_acum: grad accumulation steps
    language: language of the speaker
    speaker_name: name of the speaker
    max_audio_length: max audio length
    Returns: XTTS_CONFIG_FILE, XTTS_CHECKPOINT, TOKENIZER_FILE, trainer_out_path, speaker_ref
    """
    RUN_NAME = speaker_name + "_run"
    PROJECT_NAME = speaker_name + "_project"
    DASHBOARD_LOGGER = "tensorboard"
    LOGGER_URI = None

    OUT_PATH = os.path.join(output_path, "runs/")
    # OUT_PATH = output_path

    OPTIMIZER_WD_ONLY_ON_WEIGHTS = True
    START_WITH_EVAL = False  # if True it will star with evaluation
    BATCH_SIZE = batch_size  # set here the batch size
    GRAD_ACUMM_STEPS = grad_acum  # set here the grad accumulation steps

    dataset_config = BaseDatasetConfig(
        formatter="coqui",
        dataset_name=speaker_name,
        path=os.path.dirname(train_csv),
        meta_file_train=train_csv,
        meta_file_val=eval_csv,
        language=language,
    )
    CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "XTTS_v2.0_original_model_files/")
    os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)

    # DVAE files
    DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
    MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"

    DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
    MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))

    # download the files if they don't exist
    if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
        print(" > Downloading DVAE files!")
        ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)

    # XTTS 2.0 checkpoint files
    TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
    XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"
    XTTS_CONFIG_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json"


    TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK))  # vocab.json file
    XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK))  # model.pth file
    XTTS_CONFIG_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CONFIG_LINK))  # config.json file

    # download the files if they don't exist
    if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
        print(" > Downloading XTTS v2.0 files!")
        ModelManager._download_model_files(
            [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK, XTTS_CONFIG_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
        )


    model_args = GPTArgs(
        max_conditioning_length=132300,  # 6 secs
        min_conditioning_length=66150,  # 3 secs
        debug_loading_failures=False,
        max_wav_length=max_audio_length,  # ~11.6 seconds
        max_text_length=200,
        mel_norm_file=MEL_NORM_FILE,
        dvae_checkpoint=DVAE_CHECKPOINT,
        xtts_checkpoint=XTTS_CHECKPOINT,  # checkpoint path of the model that you want to fine-tune
        tokenizer_file=TOKENIZER_FILE,
        gpt_num_audio_tokens=1026,
        gpt_start_audio_token=1024,
        gpt_stop_audio_token=1025,
        gpt_use_masking_gt_prompt_approach=True,
        gpt_use_perceiver_resampler=True,
    )

    audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)


    config = GPTTrainerConfig(
        epochs=num_epochs,
        output_path=OUT_PATH,
        model_args=model_args,
        run_name=RUN_NAME,
        project_name=PROJECT_NAME,
        run_description=f"""
            {speaker_name} XTTS training
            """,
        # dashboard_logger=DASHBOARD_LOGGER,
        logger_uri=LOGGER_URI,
        audio=audio_config,
        batch_size=BATCH_SIZE,
        batch_group_size=48,
        eval_batch_size=BATCH_SIZE,
        num_loader_workers=8,
        eval_split_max_size=256,
        print_step=50,
        plot_step=100,
        log_model_step=100,
        save_step=1000,
        save_n_checkpoints=1,
        save_checkpoints=True,
        # target_loss="loss",
        print_eval=False,
        # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters.
        optimizer="AdamW",
        optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
        optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
        lr=5e-06,  # learning rate
        lr_scheduler="MultiStepLR",
        # it was adjusted accordly for the new step scheme
        lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
        test_sentences=[],
    )

    train_samples, eval_samples = load_tts_samples(
        dataset_config,
        eval_split=True,
        eval_split_max_size=config.eval_split_max_size,
        eval_split_size=config.eval_split_size,
    )
    print(" > Loaded samples!")
    print(train_samples)
    print(eval_samples)

    model = GPTTrainer.init_from_config(config)


    # INITIALIZE THE TRAINER
    trainer = Trainer(
        TrainerArgs(
            restore_path=None,
            skip_train_epoch=False,
            start_with_eval=START_WITH_EVAL,
            grad_accum_steps=GRAD_ACUMM_STEPS,
        ), 
        config, 
        output_path=OUT_PATH, 
        model=model, 
        train_samples=train_samples, 
        eval_samples=eval_samples
    )

    # START TRAINING
    trainer.fit()

    samples_len = [len(item["text"].split(" ")) for item in train_samples]
    longest_text_idx =  samples_len.index(max(samples_len))
    speaker_ref = train_samples[longest_text_idx]["audio_file"]

    trainer_out_path = trainer.output_path

    del model, trainer, train_samples, eval_samples
    gc.collect()

    return XTTS_CONFIG_FILE, XTTS_CHECKPOINT, TOKENIZER_FILE, trainer_out_path, speaker_ref

def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acumm, output_path, max_audio_length, speaker_name):
    """
    Creates a new XTTS model and trains it with the given parameters.
    language: language of the speaker
    train_csv: path to the train csv file
    eval_csv: path to the eval csv file
    num_epochs: number of epochs to train
    batch_size: batch size
    grad_acumm: grad accumulation steps
    output_path: path to save the model
    max_audio_length: max audio length in seconds
    speaker_name: name of the speaker
    Returns: XTTS_CONFIG_FILE, XTTS_CHECKPOINT, TOKENIZER_FILE, trainer_out_path, speaker_ref
    """
    clear_gpu_cache()
    # check if train and eval csv files exist

    print(max_audio_length)
    max_audio_length = int(max_audio_length * 22050)
    print(max_audio_length)
    config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_ref = create_train_model(output_path, train_csv, eval_csv, num_epochs, batch_size, grad_acumm, language, speaker_name, max_audio_length)

    os.system('copy "{}" "{}"'.format(os.path.dirname(vocab_file)+'\\'+os.path.basename(vocab_file), os.path.join(exp_path, "original_vocab.json")))
    os.system('copy "{}" "{}"'.format(os.path.dirname(vocab_file)+'\\'+os.path.basename(config_path), os.path.join(exp_path, "original_config.json")))

    # os.system(f"copy {config_path} {exp_path}")
    # os.system(f"copy {vocab_file} {exp_path}")

    ft_xtts_checkpoint = os.path.join(exp_path, "best_model.pth")
    print(" Done Training!")
    clear_gpu_cache()
    return config_path, vocab_file, ft_xtts_checkpoint, speaker_ref

def clear_gpu_cache():
    """Clears the GPU cache."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


In [5]:
# audio_file = "AlanWakeTest\Alan_Wake_Voice_Short.wav"
# audio_file = os.path.join("C:\\Users\\tibed\\Documents\\Python Scripts", audio_file)
outPath = "G:\\rp-tibedm\\tts_models\\coqui_xtss"
# print(outPath,"\n",audio_file)

dataOutPath = os.path.join(outPath, "dataset")

train_csv = os.path.join(outPath,"dataset","metadata_train.csv")
eval_csv = os.path.join(outPath,"dataset","metadata_eval.csv")
language = "en"
batch_size = 8
epochs = 10
# epochs = 1
grad_acumm = 1
speaker_name = "alanWake"
max_audio_length = 11.6

config_path, vocab_file, ft_xtts_checkpoint, speaker_ref = train_model(language, train_csv, eval_csv, epochs, batch_size, grad_acumm, outPath, max_audio_length, speaker_name)

11.6
255780
 | > Found 60 files in G:\rp-tibedm\tts_models\coqui_xtss\dataset
 > Loaded samples!
[{'text': 'i could feel him as a growing pressure in my head, stronger by the minute.', 'audio_file': 'G:\\rp-tibedm\\tts_models\\coqui_xtss\\dataset\\wavs/Alan_Wake_Voice_Short_00000001.wav', 'speaker_name': 'alanWake', 'emotion_name': 'neutral', 'root_path': 'G:\\rp-tibedm\\tts_models\\coqui_xtss\\dataset', 'language': 'en', 'audio_unique_name': 'alanWake#wavs\\Alan_Wake_Voice_Short_00000001'}, {'text': "why didn't he kill me with the rest?", 'audio_file': 'G:\\rp-tibedm\\tts_models\\coqui_xtss\\dataset\\wavs/Alan_Wake_Voice_Short_00000002.wav', 'speaker_name': 'alanWake', 'emotion_name': 'neutral', 'root_path': 'G:\\rp-tibedm\\tts_models\\coqui_xtss\\dataset', 'language': 'en', 'audio_unique_name': 'alanWake#wavs\\Alan_Wake_Voice_Short_00000002'}, {'text': 'what did he want?', 'audio_file': 'G:\\rp-tibedm\\tts_models\\coqui_xtss\\dataset\\wavs/Alan_Wake_Voice_Short_00000003.wav', 'speake

 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: False
 | > Precision: float32
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 20
 | > Num. of Torch Threads: 1
 | > Torch seed: 1
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=G:\rp-tibedm\tts_models\coqui_xtss\runs/alanWake_run-January-11-2024_05+20PM-efdefc7


>> DVAE weights restored from: G:\rp-tibedm\tts_models\coqui_xtss\runs/XTTS_v2.0_original_model_files/dvae.pth
Skipping logging for now



 > Model has 518442047 parameters

[4m[1m > EPOCH: 0/10[0m
 --> G:\rp-tibedm\tts_models\coqui_xtss\runs/alanWake_run-January-11-2024_05+20PM-efdefc7

[1m > TRAINING (2024-01-11 17:21:00) [0m


 > Sampling by language: dict_keys(['en'])



[1m   --> TIME: 2024-01-11 17:21:47 -- STEP: 0/8 -- GLOBAL_STEP: 0[0m
     | > loss_text_ce: 0.021239206194877625  (0.021239206194877625)
     | > loss_mel_ce: 4.385165691375732  (4.385165691375732)
     | > loss: 4.406404972076416  (4.406404972076416)
     | > grad_norm: 0  (0)
     | > current_lr: 5e-06 
     | > step_time: 1.6947  (1.6946632862091064)
     | > loader_time: 46.0135  (46.01351261138916)


[1m > EVALUATION [0m



 > Filtering invalid eval samples!!
 > Total eval samples after filtering: 10



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time: 0.023373842239379883 [0m(+0)
     | > avg_loss_text_ce: 0.018926315009593964 [0m(+0)
     | > avg_loss_mel_ce: 4.724788188934326 [0m(+0)
     | > avg_loss: 4.743714332580566 [0m(+0)

 > BEST MODEL : G:\rp-tibedm\tts_models\coqui_xtss\runs/alanWake_run-January-11-2024_05+20PM-efdefc7\best_model_8.pth

[4m[1m > EPOCH: 1/10[0m
 --> G:\rp-tibedm\tts_models\coqui_xtss\runs/alanWake_run-January-11-2024_05+20PM-efdefc7

[1m > TRAINING (2024-01-11 17:24:24) [0m

[1m > EVALUATION [0m


  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.016785621643066406 [0m(-0.0065882205963134766)
     | > avg_loss_text_ce:[92m 0.018738877028226852 [0m(-0.0001874379813671112)
     | > avg_loss_mel_ce:[92m 4.275575637817383 [0m(-0.44921255111694336)
     | > avg_loss:[92m 4.294314384460449 [0m(-0.4493999481201172)

 > BEST MODEL : G:\rp-tibedm\tts_models\coqui_xtss\runs/alanWake_run-January-11-2024_05+20PM-efdefc7\best_mode

 Done Training!


In [11]:
print(config_path)
print(vocab_file)
print(ft_xtts_checkpoint)
print(speaker_ref)

G:\rp-tibedm\tts_models\coqui_xtss\runs/XTTS_v2.0_original_model_files/config.json
G:\rp-tibedm\tts_models\coqui_xtss\runs/XTTS_v2.0_original_model_files/vocab.json
G:\rp-tibedm\tts_models\coqui_xtss\runs/alanWake_run-January-11-2024_05+20PM-efdefc7\best_model.pth
G:\rp-tibedm\tts_models\coqui_xtss\dataset\wavs/Alan_Wake_Voice_Short_00000029.wav


## Load model

In [None]:
!pip show tts

In [2]:
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
import torch

def clear_gpu_cache():
    """Clears the GPU cache."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        
def load_model(ft_xtts_checkpoint,config_path, vocab_file):
    """
    Loads a XTTS model from the given files.
    config_path: path to the config file
    vocab_file: path to the vocab file
    ft_xtts_checkpoint: path to the fine-tuned checkpoint
    Returns: XTTS model
    """
    clear_gpu_cache()
    # load the model
    # model = GPTTrainer.load_from_checkpoint(
    #     checkpoint_path=ft_xtts_checkpoint,
    #     config_path=config_path,
    #     vocab_file=vocab_file,
    #     map_location="cpu",
    # )
    # model.eval()

    config = XttsConfig()
    config.load_json(config_path)
    print("config loaded", config)
    model = Xtts.init_from_config(config)
    # print("model loaded", model)
    model.load_checkpoint(config, checkpoint_path=ft_xtts_checkpoint, vocab_path=vocab_file, use_deepspeed=False)
    if torch.cuda.is_available():
        model.cuda()
    print("model loaded")

    return model

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# model = load_model("G:\\rp-tibedm\\coqui_xtss\\runs\\ObamaTest_run-January-08-2024_12+19PM-0000000\\best_model.pth",
#                    "G:\\rp-tibedm\coqui_xtss\\runs\XTTS_v2.0_original_model_files\config.json",
#                    "G:\\rp-tibedm\coqui_xtss\\runs\XTTS_v2.0_original_model_files\\vocab.json")
model = load_model("G:\\rp-tibedm\\tts_models\\coqui_xtss\\runs/koning_filip_run-January-09-2024_05+47PM-580be11\\best_model.pth",
                   "G:\\rp-tibedm\\tts_models\\coqui_xtss\\runs\XTTS_v2.0_original_model_files\config.json",
                   "G:\\rp-tibedm\\tts_models\\coqui_xtss\\runs\XTTS_v2.0_original_model_files\\vocab.json")


config loaded XttsConfig(output_path='output', logger_uri=None, run_name='run', project_name=None, run_description='🐸Coqui trainer run.', print_step=25, plot_step=100, model_param_stats=False, wandb_entity=None, dashboard_logger='tensorboard', save_on_interrupt=True, log_model_step=None, save_step=10000, save_n_checkpoints=5, save_checkpoints=True, save_all_best=False, save_best_after=10000, target_loss=None, print_eval=False, test_delay_epochs=0, run_eval=True, run_eval_steps=None, distributed_backend='nccl', distributed_url='tcp://localhost:54321', mixed_precision=False, precision='fp16', epochs=1000, batch_size=32, eval_batch_size=16, grad_clip=0.0, scheduler_after_epoch=True, lr=0.001, optimizer='radam', optimizer_params=None, lr_scheduler=None, lr_scheduler_params={}, use_grad_scaler=False, allow_tf32=False, cudnn_enable=True, cudnn_deterministic=False, cudnn_benchmark=False, training_seed=54321, model='xtts', num_loader_workers=0, num_eval_loader_workers=0, use_noise_augment=Fals

In [16]:
!pip show deepspeed



In [8]:
model = load_model(ft_xtts_checkpoint,config_path, vocab_file)

config loaded XttsConfig(output_path='output', logger_uri=None, run_name='run', project_name=None, run_description='🐸Coqui trainer run.', print_step=25, plot_step=100, model_param_stats=False, wandb_entity=None, dashboard_logger='tensorboard', save_on_interrupt=True, log_model_step=None, save_step=10000, save_n_checkpoints=5, save_checkpoints=True, save_all_best=False, save_best_after=10000, target_loss=None, print_eval=False, test_delay_epochs=0, run_eval=True, run_eval_steps=None, distributed_backend='nccl', distributed_url='tcp://localhost:54321', mixed_precision=False, precision='fp16', epochs=1000, batch_size=32, eval_batch_size=16, grad_clip=0.0, scheduler_after_epoch=True, lr=0.001, optimizer='radam', optimizer_params=None, lr_scheduler=None, lr_scheduler_params={}, use_grad_scaler=False, allow_tf32=False, cudnn_enable=True, cudnn_deterministic=False, cudnn_benchmark=False, training_seed=54321, model='xtts', num_loader_workers=0, num_eval_loader_workers=0, use_noise_augment=Fals

In [18]:
import torchaudio


def run_tts(model,lang,text,speaker_ref):
    """
    Runs the given text through the given model.
    model: XTTS model
    lang: language of the speaker
    text: text to synthesize
    Returns: audio
    """

    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_ref, gpt_cond_len=model.config.gpt_cond_len, max_ref_length=model.config.max_ref_len, sound_norm_refs=model.config.sound_norm_refs)
    
    out = model.inference(
        text=text,
        language=lang,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        temperature=model.config.temperature, # Add custom parameters here
        length_penalty=model.config.length_penalty,
        repetition_penalty=model.config.repetition_penalty,
        top_k=model.config.top_k,
        top_p=model.config.top_p,
    )
    # save it as a wav file
    out["wav"] = torch.tensor(out["wav"]).unsqueeze(0)
    return "created wav", out["wav"]

In [10]:
print(speaker_ref)

G:\rp-tibedm\tts_models\coqui_xtss\dataset\wavs/Alan_Wake_Voice_Short_00000029.wav


In [22]:
# text ="Dit is een test met de XTSS 2.0 model om een stem te klonen en spraak te genereren."
# text ="Dag Belgen, bij deze verklaar ik, koning Filip, dat JBB een grote scam is.  Bjorn Patrick wordt zo snel mogelijk het land uit gezet samen met zijn vieze sekspeeltjes. Anouk mag blijven zolang ze op Tiemen zijn penis verblijft."
# language = "nl"
# speaker_ref = "G:\\rp-tibedm\\tts_models\\coqui_xtss\dataset\wavs/koning_filip_toespraak2_00000032.wav"
text ="This is a test with the XTSS 2.0 model to clone a voice and generate voice"
language = "en"
speaker_ref = "G:\\rp-tibedm\coqui_xtss\dataset\wavs/Obama_speech_short_00000005.wav"
speaker_ref = "G:\\rp-tibedm\\tts_models\\bark\speakers\obama\obama.wav"
check , audio = run_tts(model,language,text,speaker_ref)
print(check)

created wav


In [23]:
# show audio file
import IPython.display as ipd
ipd.Audio(audio, rate=24000)

In [14]:
# show the reference audio file
ipd.Audio(speaker_ref, rate=24000)

In [18]:
torchaudio.save("alan_wake_xtss.wav",audio,24000)
# torchaudio.save("koning-filip-xtss.wav",audio,24000)