# Voice cloning with Coqui XTTS-V2 in Google Colab.

Before running this notebook, enable GPU acceleration (Runtime → Change runtime type) for the best results and performance.

## Install libraries

In [None]:
# Coqui TTS provides the `TTS` API and the XTTS-v2 model used for cloning below.
!pip install TTS


In [None]:
# spaCy is used to split the input text into sentences before synthesis.
%pip install spacy


In [None]:
# Spanish spaCy pipeline that process_large_text loads for sentence splitting.
!python -m spacy download es_core_news_sm

In [None]:
from google.colab import drive

# Mount Google Drive at /content/gdrive. The speaker audio and output folder
# used later are given relative to ./gdrive (assumes the working directory is
# /content — confirm if the notebook changes directory).
drive.mount("/content/gdrive")

In [None]:
#

## Clone!

In [13]:
# @title
import torch
import uuid
from TTS.api import TTS
import numpy as np
from scipy.io.wavfile import write, read
import spacy

def _group_sentences(sentences, max_chars):
    """Greedily pack consecutive sentences into space-joined segments.

    A sentence is added to the current segment while the running length
    (including the separating spaces) stays within ``max_chars``; otherwise
    the current segment is closed and a new one is started. A single sentence
    longer than ``max_chars`` is kept whole as its own segment. Segments that
    are empty after stripping are dropped.
    """
    segments = []
    current = ""
    for sentence in sentences:
        if len(current) + len(sentence) <= max_chars:
            current += sentence + " "
        else:
            if current:
                segments.append(current.strip())
            current = sentence + " "
    if current:
        segments.append(current.strip())
    return [segment for segment in segments if segment]


def process_large_text(text, speaker_audio_path, model_name, language, output_file,
                       *, max_chars=60, spacy_model="es_core_news_sm"):
    """Synthesize ``text`` in a cloned voice and save it as one WAV file.

    The text is split into sentences with spaCy, the sentences are grouped
    into short segments, each segment is synthesized separately with the TTS
    model, and the resulting audio is concatenated and written to
    ``output_file``.

    Args:
        text: Text to synthesize.
        speaker_audio_path: Path to the reference audio of the voice to clone.
        model_name: Name of the TTS model to load.
        language: Language code forwarded to the TTS model.
        output_file: Path of the WAV file to write. Its parent directory is
            created if it does not exist.
        max_chars: Target maximum number of characters per synthesized
            segment (a single longer sentence is not split).
        spacy_model: Name of the spaCy pipeline used for sentence splitting.
            The default is the Spanish model; pass a pipeline matching
            ``language`` when synthesizing other languages.

    Raises:
        ValueError: If ``text`` yields no non-empty segment to synthesize.
    """
    # Imported locally so the function does not depend on edits to the
    # notebook's import cell.
    import os
    import tempfile

    # Segment the text first so that bad input fails before the (slow) TTS
    # model is loaded.
    nlp = spacy.load(spacy_model)
    segments = _group_sentences((sent.text for sent in nlp(text).sents), max_chars)
    if not segments:
        raise ValueError("text contains no sentences to synthesize")

    # Make sure the destination exists up front rather than failing on the
    # final write after all segments have been synthesized.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    tts = TTS(model_name=model_name).to(device)

    sample_rate = None
    chunks = []
    # A private temporary directory avoids clashing with other runs over a
    # fixed /tmp path and removes the scratch file afterwards.
    with tempfile.TemporaryDirectory() as tmp_dir:
        segment_path = os.path.join(tmp_dir, "output_segment.wav")
        for segment in segments:
            tts.tts_to_file(segment, speaker_wav=speaker_audio_path, language=language, file_path=segment_path)
            sample_rate, data = read(segment_path)
            chunks.append(data)

    # Concatenate once and write once, instead of re-writing the whole output
    # file after every segment. The result is stored as int16 samples, as before.
    combined_samples = np.concatenate(chunks)
    write(output_file, sample_rate, combined_samples.astype(np.int16))
# @markdown ### Usage
# @markdown **text**: The text to synthesize, written with correct grammar and spelling. <br>
# @markdown **speaker_name**: The name of the cloned person (used only to build the output file name). <br>
# @markdown **speaker_audio_path**: The full path to the audio that will be used for cloning.<br>
# @markdown **output_folder**: The folder in which the resulting audio will be saved.<br>
# @markdown **language**: The language of the speaker.<br>

# Form inputs: edit them in the Colab form or directly in the code.
text = "Hola mi amor hermoso " # @param {"type":"string"}
speaker_name = "alexa" # @param {"type":"string"}
speaker_audio_path = "./gdrive/MyDrive/alexa_3.mp3" # @param {"type" : "string"}
output_folder = "./gdrive/MyDrive/test" # @param {"type" : "string"}
# A random UUID suffix gives every run its own output file name.
output_file = f"{output_folder}/{speaker_name}_{uuid.uuid4()}.wav"
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
language = "es" # @param ["es","en","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn","ja","hu","ko","hi"]


# Synthesize `text` using the reference speaker audio and save it to `output_file`.
# NOTE: this call relies on process_large_text's Spanish spaCy model
# (es_core_news_sm) for sentence splitting, regardless of `language` above.
process_large_text(text, speaker_audio_path, model_name, language, output_file)


 > tts_models/multilingual/multi-dataset/xtts_v2 is already downloaded.
 > Using model: xtts


  self.speakers = torch.load(speaker_file_path)
  return torch.load(f, map_location=map_location, **kwargs)


 > Text splitted to sentences.
['Hola mi amor hermoso']
 > Processing time: 0.8842065334320068
 > Real-time factor: 0.45047953008724007
