In [None]:
# the scipy version packaged with colab is not tolerant of misformated WAV files.
# install the latest version.
!pip3 install -U scipy

!git clone https://github.com/jnordberg/tortoise-tts.git
%cd tortoise-tts
!pip3 install -r requirements.txt
!pip3 install transformers==4.19.0 einops==0.5.0 rotary_embedding_torch==0.1.5 unidecode==1.3.5
!python3 setup.py install

In [None]:
import os
import streamlit as st
import torch
import torchaudio
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice

# --- Tunable settings -------------------------------------------------------
# Sentence that will be synthesised.
text = "Thanks for reading this article. I hope you learned something."

# Quality preset handed to Tortoise.
# Options: {"ultra_fast", "fast" (default), "standard", "high_quality"}. See docs in api.py
preset = "fast"

# Name of the custom voice and the folder where its reference clips are stored.
CUSTOM_VOICE_NAME = "martin"
custom_voice_folder = f"tortoise/voices/{CUSTOM_VOICE_NAME}"

# --- Page chrome ------------------------------------------------------------
st.title("Custom Voice Text-to-Speech Generator")
st.sidebar.title("Upload Custom Voice")

# --- Model ------------------------------------------------------------------
# Instantiating TextToSpeech downloads all the models used by Tortoise from the
# HuggingFace hub.
tts = TextToSpeech()

# --- Voice folder and upload widget -----------------------------------------
# Make sure the voice folder exists (no error if it is already there).
os.makedirs(custom_voice_folder, exist_ok=True)

# Sidebar uploader restricted to WAV files; the returned value is consumed later.
uploaded_file = st.sidebar.file_uploader(
    "Upload a WAV file for custom voice",
    type=["wav"],
)

# Sample rate passed to torchaudio.save for the generated clip.
OUTPUT_SAMPLE_RATE = 24000

# Persist an uploaded reference clip into the voice folder *before* the voice is
# loaded, so the new sample can take part in this run's conditioning.
if uploaded_file is not None:
    st.sidebar.text("Uploading file...")
    # The file name comes from the client: keep only its final path component
    # so it cannot point outside the voice folder.
    safe_name = os.path.basename(uploaded_file.name)
    file_path = os.path.join(custom_voice_folder, safe_name)
    with open(file_path, "wb") as f:
        # getvalue() returns the whole buffer regardless of the current read
        # position, unlike read().
        f.write(uploaded_file.getvalue())
    st.sidebar.text("File uploaded successfully!")

# Only synthesise when the voice folder actually holds reference material;
# on a first run (before any upload) it has just been created and is empty.
# NOTE(review): this assumes load_voice() resolves CUSTOM_VOICE_NAME to
# custom_voice_folder, i.e. the working directory is the repository checkout.
if not os.listdir(custom_voice_folder):
    st.warning("Upload at least one WAV file to generate speech with the custom voice.")
else:
    # Load voice samples and conditioning latents, then generate speech with
    # the custom voice.
    voice_samples, conditioning_latents = load_voice(CUSTOM_VOICE_NAME)
    gen = tts.tts_with_preset(text, voice_samples=voice_samples, conditioning_latents=conditioning_latents,
                              preset=preset)
    output_path = f'generated-{CUSTOM_VOICE_NAME}.wav'
    torchaudio.save(output_path, gen.squeeze(0).cpu(), OUTPUT_SAMPLE_RATE)

    # Display audio playback in the main app area. The encoded WAV bytes are
    # passed rather than the raw tensor, because st.audio cannot infer a sample
    # rate from a tensor. (The previous IPython.display.Audio call was removed:
    # IPython was never imported, so it raised NameError before reaching here.)
    with open(output_path, "rb") as audio_file:
        st.audio(audio_file.read(), format="audio/wav")

# Add a disclaimer about using the custom voice
st.sidebar.markdown("**Note:** Using a custom voice may require multiple uploaded audio files and proper conditioning for best results.")
