# Comparison of Text to Speech Models on Replicate

In [11]:
pip install replicate


Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import getpass

# Ask for the Replicate API token without echoing it. The prompt is skipped
# when the environment already provides a non-empty token, so re-running the
# notebook does not require typing it again.
if not os.environ.get("REPLICATE_API_TOKEN"):
    os.environ["REPLICATE_API_TOKEN"] = getpass.getpass("Replicate API token: ")

 ········


In [2]:
import replicate

In [11]:
# One sample sentence per language, keyed by a two-letter language code.
# The sentences are used as prompts for the models compared below.
inputs = dict(
    en="The stars are shining brightly tonight, casting a magical glow over the quiet world. Can you feel the peace in the air?",
    es="Las estrellas brillan intensamente esta noche, llenando el mundo silencioso con un resplandor mágico. ¿Puedes sentir la paz en el aire?",
    fr="Les étoiles brillent de mille feux ce soir, illuminant le monde tranquille d'une lueur magique. Sens-tu la paix dans l'air?",
    de="Die Sterne leuchten heute Nacht hell und tauchen die stille Welt in einen magischen Glanz. Spürst du die Ruhe in der Luft?",
    hi="आज रात तारें चमक रही हैं, जो शांत दुनिया पर एक जादुई चमक बिखेर रही हैं। क्या आपको हवा में शांति महसूस हो रही है?",
    zh="今晚星星闪烁，给宁静的世界增添了神秘的光芒。你能感受到空气中的宁静吗？",
    ar="النجوم تلمع بشدة الليلة، مما يضفي بريقًا سحريًا على العالم الهادئ. هل تشعر بالسلام في الهواء؟",
    ru="Звезды ярко сияют этой ночью, окутывая мир волшебным светом. Ты чувствуешь покой в воздухе?",
    ja="今夜、星々が輝き、静かな世界に魔法の光を投げかけています。空気中に平和を感じますか？",
    it="Le stelle brillano luminose stanotte, diffondendo una luce magica sul mondo tranquillo. Riesci a sentire la pace nell'aria?",
)

# Reference voice recordings, keyed by speaker type; generate_tts() looks the
# speaker up here. Further speaker types can be added as extra entries.
speaker_samples = dict(
    male="https://replicate.delivery/pbxt/Jt79w0xsT64R1JsiJ0LQRL8UcWspg5J4RFrU6YwEKpOT1ukS/male.wav",
    # Placeholder only: replace with the URL of a real female voice sample.
    female="https://example.com/path/to/female_voice.wav",
)


# xtts-v2
Supported languages (name: code): English: en, French: fr, German: de, Spanish: es, Italian: it, Portuguese: pt, Czech: cs, Polish: pl, Russian: ru, Dutch: nl, Turkish: tr, Arabic: ar, Mandarin Chinese: zh-cn, Hindi: hi

https://replicate.com/lucataco/xtts-v2

In [4]:
# Function to generate TTS based on language and speaker type
def generate_tts(text, link, language="en", speaker_type="male", speed=1.0, pitch=1.0):
    """Generate speech for ``text`` with a voice-cloning model on Replicate.

    Args:
        text: Text to synthesize.
        link: Replicate model reference, passed as the first argument to
            ``replicate.run``.
        language: Language code sent to the model as ``language``.
        speaker_type: Key into the module-level ``speaker_samples`` mapping;
            the matching URL is sent to the model as ``speaker``.
        speed: Sent to the model as ``speed``.
        pitch: Sent to the model as ``pitch``.

    Returns:
        The value returned by ``replicate.run``, unchanged.

    Raises:
        ValueError: If ``speaker_type`` has no non-empty entry in
            ``speaker_samples``.
    """
    speaker_url = speaker_samples.get(speaker_type)

    if not speaker_url:
        # List the speakers that are actually configured instead of a fixed
        # pair, because speaker_samples can hold any number of entries.
        available = ", ".join(repr(name) for name in speaker_samples) or "<none configured>"
        raise ValueError(
            f"Invalid speaker type {speaker_type!r}. Choose one of: {available}."
        )

    # Input data for TTS generation
    # NOTE(review): speed and pitch are forwarded as-is; confirm that the
    # target model's input schema actually accepts them.
    input_data = {
        "speaker": speaker_url,
        "text": text,
        "language": language,
        "speed": speed,
        "pitch": pitch,
    }

    # Run the model on Replicate and hand its output back to the caller
    return replicate.run(link, input=input_data)

In [5]:
# xtts-v2 model reference, including an explicit version hash.
link = "lucataco/xtts-v2:684bc3855b37866c0c65add2ff39c78f3dea3f4ff103a436465326e0f438d55e"

# Synthesize the Hindi sample sentence with the male reference voice.
output_url = generate_tts(
    inputs["hi"],
    link,
    language="hi",
    speaker_type="male",
)
print(output_url)

https://replicate.delivery/yhqm/NyaQ6wBXR9qPJhktYrzaaqliq4uIZEGkIkRKJUZMUUOzoX4E/output.wav


# styletts2
Not multilingual

https://replicate.com/adirik/styletts2

In [6]:
def generate_style_tts(text, embedding_scale=1.0, model_link="adirik/styletts2:989cb5ea6d2401314eb30685740cb9f6fd1c9001b8940659b406f952837ab5ac"):
    """Run the StyleTTS model on Replicate for the given text.

    Args:
        text: Text to synthesize; sent to the model as ``text``.
        embedding_scale: Sent to the model as ``embedding_scale``.
        model_link: Replicate model reference passed to ``replicate.run``.

    Returns:
        The value returned by ``replicate.run``, unchanged.
    """
    return replicate.run(
        model_link,
        input={"text": text, "embedding_scale": embedding_scale},
    )

# Demo input: one English sentence and a non-default embedding scale.
text_input = (
    "StyleTTS 2 is a text-to-speech model that leverages style diffusion and "
    "adversarial training with large speech language models to achieve "
    "human-level text-to-speech synthesis."
)
embedding_scale_input = 1.5

# Run the model; any failure is reported as a message instead of a traceback.
try:
    output_url = generate_style_tts(text_input, embedding_scale_input)
    print(f"Generated audio URL: {output_url}")
except Exception as err:
    print(f"An error occurred: {err}")


Generated audio URL: https://replicate.delivery/czjl/XDW0R1RdZvpSNdYi8MP5oy1WYeNlqzP9lYd7HL5NZnPjwtwJA/out.mp3


# Bark
English, German, Spanish, French, Hindi, Italian, Japanese, Korean, Polish, Portuguese, Russian, Turkish, Chinese (simplified)

https://replicate.com/suno-ai/bark

In [15]:
import replicate

# Bark TTS helper: builds the model input from the arguments and runs it.
def generate_bark_audio(prompt, text_temp=0.7, waveform_temp=0.7, history_prompt="en_speaker_0", custom_history_prompt=None, output_full=False):
    """
    Run the Bark TTS model on Replicate with the given settings.

    Args:
    prompt (str): Text prompt for speech synthesis.
    text_temp (float): Temperature for text generation.
    waveform_temp (float): Temperature for waveform generation.
    history_prompt (str): Preset speaker choice (e.g., en_speaker_0, hi_speaker_1).
    custom_history_prompt (str): URL to a custom .npz file for history cloning (optional).
        When truthy it is sent in addition to history_prompt.
    output_full (bool): Sent to the model as ``output_full``.

    Returns:
    The value returned by ``replicate.run``, unchanged. In the run recorded in
    this notebook that was a dict holding the audio URL under 'audio_out'.
    """
    payload = dict(
        prompt=prompt,
        text_temp=text_temp,
        waveform_temp=waveform_temp,
        history_prompt=history_prompt,
        output_full=output_full,
    )

    # The custom prompt is only included when one was actually supplied.
    if custom_history_prompt:
        payload["custom_history_prompt"] = custom_history_prompt

    return replicate.run(
        "suno-ai/bark:b76242b40d67c76ab6742e987628a2a9ac019e11d56ab96c4e91ce03b79b2787",
        input=payload,
    )

# Demo: speak the Hindi sample sentence with a preset Hindi speaker.
# Pick another preset (en_speaker_0, hi_speaker_1, etc.) to change the voice.
prompt, history_prompt = inputs["hi"], "hi_speaker_3"
output_url = generate_bark_audio(prompt=prompt, history_prompt=history_prompt)

print(f"Generated Audio URL: {output_url}")


Generated Audio URL: {'audio_out': 'https://replicate.delivery/czjl/DIteiwHazCSAT6ncMKToNkMuXllVQd0ueRmEKvkR7zavf7CnA/audio.wav'}


# tortoise-tts
English only. The custom voice argument can be used to generate other languages, but the results are not very good.

https://replicate.com/afiaka87/tortoise-tts

In [6]:
import replicate

def generate_custom_tts(text, voice_a="random", voice_b="disabled", voice_c="disabled", preset="fast", seed=0, cvvp_amount=0):
    """Run the Tortoise TTS model on Replicate.

    Every argument is forwarded to the model under the same name.

    Args:
        text: Text to synthesize.
        voice_a: Sent as ``voice_a``.
        voice_b: Sent as ``voice_b``.
        voice_c: Sent as ``voice_c``.
        preset: Sent as ``preset``.
        seed: Sent as ``seed``.
        cvvp_amount: Sent as ``cvvp_amount``.

    Returns:
        The value returned by ``replicate.run``, unchanged.
    """
    # Model reference, including an explicit version hash.
    model_ref = "afiaka87/tortoise-tts:e9658de4b325863c4fcdc12d94bb7c9b54cbfe351b7ca1b36860008172b91c71"

    payload = dict(
        text=text,
        voice_a=voice_a,
        voice_b=voice_b,
        voice_c=voice_c,
        preset=preset,
        seed=seed,
        cvvp_amount=cvvp_amount,
    )
    return replicate.run(model_ref, input=payload)

# Demo: one English sentence with a random voice and the "fast" preset.
output_url = generate_custom_tts(
    "The expressiveness of autoregressive transformers is literally nuts! I absolutely adore them.",
    voice_a="random",
    preset="fast",
)
print(output_url)


https://replicate.delivery/czjl/vf8pAtixEjVfEkU50iiR7ev0ma7yKIjxacYLPDgZVEdda9CnA/tortoise.mp3


# neon tts
English, Spanish, French, German, Italian, Polish, Ukrainian, Dutch, Romanian, Hungarian, Greek, Czech, Swedish, Portuguese, Croatian, Bulgarian, Danish, Slovak, Finnish, Lithuanian, Slovenian, Latvian, Estonian, Irish, Maltese

https://replicate.com/awerks/neon-tts

In [9]:
import replicate

def text_to_speech(text: str, language: str = "en") -> str:
    """
    Run the Neon TTS model on Replicate for the given text.

    Parameters:
    text (str): The text to be converted to speech.
    language (str): Language code of the text. Defaults to "en".

    Returns:
    The value returned by ``replicate.run``, unchanged.

    Raises:
    ValueError: If ``language`` is not one of the supported language codes.
    """
    # Language codes accepted by this helper, in the order used by the error message.
    supported_languages = (
        "en es fr de it pl uk nl ro hu el "
        "cs sv pt bg hr da sk fi lt sl lv "
        "et ga mt"
    ).split()

    if language in supported_languages:
        return replicate.run(
            "awerks/neon-tts:139606fe1536f85a9f07d87982400b8140c9a9673733d47913af96738894128f",
            input={"text": text, "language": language},
        )

    # Unsupported code: fail before any request is made.
    raise ValueError(f"Language '{language}' is not supported. Please choose from: {', '.join(supported_languages)}.")

# Demo: synthesize a short English self-introduction and show the model output.
inp = "hi! my name is digvijay singh bisht. i am am gen AI developer and i am passionate about this feild"
result = text_to_speech(text=inp, language="en")
print(result)


https://replicate.delivery/czjl/D3OBnM6XIrJAOpi9fqThjk5kZJoARnNRDlBBO2PsmN6rovwJA/tmphzqxq0ud.wav


# seamless_communication
Bengali, Catalan, Czech, Danish, Dutch, English, Estonian, Finnish, French, German, Hindi, Indonesian, Italian, Japanese, Korean, Maltese, Mandarin Chinese, Modern Standard Arabic, Northern Uzbek, Polish, Portuguese, Romanian, Russian, Slovak, Spanish, Swahili, Swedish, Tagalog, Telugu, Thai, Turkish, Ukrainian, Urdu, Vietnamese, Welsh, Western Persian.

https://replicate.com/cjwbw/seamless_communication

In [19]:
# Run seamless_communication in text-to-speech-translation mode on the Hindi
# sample sentence, with Hindi as both the input and the spoken target language.
output = replicate.run(
    "cjwbw/seamless_communication:668a4fec05a887143e5fe8d45df25ec4c794dd43169b9a11562309b2d45873b0",
    input=dict(
        task_name="T2ST (Text to Speech translation)",
        input_text=inputs["hi"],
        input_text_language="Hindi",
        max_input_audio_length=60,
        target_language_text_only="Norwegian Nynorsk",
        target_language_with_speech="Hindi",
    ),
)
print(output)

{'audio_output': 'https://replicate.delivery/pbxt/uWufWfx2v7kGWkFE7jMzrpDOlsMcig16Ki4Ad7mk8iDQofCnA/out.wav', 'text_output': 'आज रात तारे चमक रहे हैं, जो शांत दुनिया पर एक जादुई चमक बिखेर रही हैं। क्या आपको हवा में शांति महसूस हो रही है?'}


# realistic-voice-cloning
Used for music audio dubbing/cloning.
https://replicate.com/zsxkib/realistic-voice-cloning

# tango
Used for generating sound effects: https://replicate.com/declare-lab/tango