<a href="https://colab.research.google.com/github/CoderFalconX/Tortoise-tts-gradio-ui/blob/main/Tortoise%20TTS_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Remove transformers and tokenizers first; pinned versions are installed below.
!pip uninstall -y transformers tokenizers

# Install exact versions of the numeric stack (numpy, numba, scipy).
!pip install --no-cache-dir numpy==1.26.4 numba==0.59.1 scipy==1.11.4

# Install pinned transformers/tokenizers together with the remaining packages.
# resampy, soundfile, progressbar2 and ffmpeg-python are left unpinned.
!pip install --no-cache-dir transformers==4.41.2 tokenizers==0.19.1 \
  einops==0.4.1 rotary-embedding-torch==0.3.6 \
  Unidecode==1.4.0 librosa==0.10.2.post1 resampy soundfile \
  progressbar2 ffmpeg-python accelerate==0.26.0

In [None]:
# Clone the official Tortoise TTS repo
!git clone https://github.com/neonbjb/tortoise-tts.git

In [None]:
import sys
# Put the cloned repo first on the import path so `tortoise` resolves to it.
sys.path.insert(0, "/content/tortoise-tts")

# These imports double as a smoke test: the message below is only printed
# when both of them succeed.
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice

print("✅ Tortoise import successful")


In [None]:
import os, glob
base = "/content/tortoise-tts/tortoise/voices"
print("Voices base exists:", os.path.isdir(base), base)
print("Existing voices:", [d for d in os.listdir(base)] if os.path.isdir(base) else [])

# also show any audio files you might have anywhere under /content
cands = glob.glob("/content/**/*.*", recursive=True)
cand_audio = [p for p in cands if p.lower().endswith((".wav",".mp3",".m4a",".flac",".ogg"))]
print("Found audio files:", len(cand_audio))
for p in cand_audio[:20]:
    print(" -", p)

!rm -rf /content/tortoise-tts/tortoise/voices/myself


In [None]:
# Install the ffmpeg package via apt (its standard output is discarded),
# then install pydub quietly.
!apt -y install ffmpeg >/dev/null
!pip -q install pydub

from google.colab import files
from pydub import AudioSegment, effects
from pydub.utils import make_chunks
from io import BytesIO
from pathlib import Path
import os, glob

# Folder that receives the prepared reference clips.
voice_dir = "/content/tortoise-tts/tortoise/voices/myvoice"
os.makedirs(voice_dir, exist_ok=True)

# Ask for a recording. If nothing was uploaded, stop with a clear message
# instead of the bare StopIteration that next(iter(...)) would raise.
uploaded = files.upload()
if not uploaded:
    raise RuntimeError("No file was uploaded. Re-run this cell and choose an audio file.")

# Only the first uploaded file is used.
fname, data = next(iter(uploaded.items()))
audio = AudioSegment.from_file(BytesIO(data))
audio = effects.normalize(audio)

# Length of each clip in milliseconds.
chunk_ms = 15_000

chunks = make_chunks(audio, chunk_ms)

count = 0
for i, ch in enumerate(chunks, start=1):
    # Skip chunks quieter than -35 dBFS.
    if ch.dBFS < -35:
        continue
    # Convert to 22050 Hz, one channel, 2-byte samples before exporting.
    prepared = ch.set_frame_rate(22050).set_channels(1).set_sample_width(2)
    out = Path(voice_dir) / f"clip_{i:03d}.wav"
    prepared.export(out.as_posix(), format="wav")
    print(f"Saved: {out} | duration: {len(prepared)/1000:.2f}s")
    count += 1

print(f"\nToplam kaydedilen klip: {count}")
print("\nKayıtlı klipler:")
# Sorted so the listing order is deterministic.
for p in sorted(glob.glob(voice_dir + "/*.wav")):
    print(" -", p)


In [None]:
import os, uuid, tempfile, traceback
import torch, torchaudio, gradio as gr
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio

# Directory whose sub-folders are offered as selectable voices.
VOICES_ROOT = "/content/tortoise-tts/tortoise/voices"
# Sample rate (Hz) passed to load_audio for reference clips; also the default
# `sr` argument of save_wav_tensor.
SAMPLE_RATE = 22050
# One TextToSpeech instance, created when this cell runs and reused by generate().
TTS = TextToSpeech()

# --- SAVE helper (robust) ---
def _to_1xN(t: torch.Tensor) -> torch.Tensor:
    """Return *t* as a detached float32 CPU tensor with exactly two dimensions.

    Two-dimensional input keeps its shape. Any other rank -- a 0-d scalar,
    a 1-d vector, or a tensor with three or more dimensions -- is flattened
    into a single row of shape ``(1, N)``.
    """
    t = t.detach().cpu()
    if t.ndim != 2:
        # reshape(1, -1) matches unsqueeze(0) for 1-d input and also covers
        # 0-d scalars, which would otherwise be returned with no dimensions.
        t = t.reshape(1, -1)
    return t.to(torch.float32)

def save_wav_tensor(wav, path, sr=SAMPLE_RATE):
    """Write audio to *path* with ``torchaudio.save``.

    Args:
        wav: A tensor, a list/tuple of tensors (``None`` entries are skipped
            and the rest are concatenated along dim 1), or any value that
            ``torch.tensor`` accepts.
        path: Destination file path.
        sr: Sample rate in Hz passed to ``torchaudio.save``.

    Raises:
        ValueError: If there is no audio data to write (empty sequence,
            sequence of only ``None``, or a tensor with zero elements).
    """
    if isinstance(wav, (list, tuple)):
        parts = [_to_1xN(w) for w in wav if w is not None]
        if not parts:
            # torch.cat([]) would fail with an opaque RuntimeError.
            raise ValueError("No audio to save: received an empty sequence.")
        wav = torch.cat(parts, dim=1)
    elif isinstance(wav, torch.Tensor):
        wav = _to_1xN(wav)
    else:
        wav = _to_1xN(torch.tensor(wav))

    if wav.numel() == 0:
        # max() over zero elements is undefined and raises inside torch.
        raise ValueError("No audio to save: tensor is empty.")

    # Scale down only when the peak exceeds 1.0; quieter audio is untouched.
    peak = wav.abs().max()
    if peak > 1.0:
        wav = wav / peak
    torchaudio.save(path, wav, sr)

# --- VOICE UTILS ---
def list_voice_dirs():
    """Return the sorted names of the sub-directories of ``VOICES_ROOT``.

    An empty list is returned when ``VOICES_ROOT`` is not a directory.
    """
    if os.path.isdir(VOICES_ROOT):
        def is_voice_dir(name):
            return os.path.isdir(os.path.join(VOICES_ROOT, name))

        names = list(filter(is_voice_dir, os.listdir(VOICES_ROOT)))
        names.sort()
        return names
    return []

def load_voice_samples(folder):
    """Load the reference clips stored in ``VOICES_ROOT/<folder>``.

    Files whose names end in ``.wav`` or ``.mp3`` (case-insensitive) are read
    in sorted filename order with ``load_audio`` at ``SAMPLE_RATE``. A clip
    that comes back one-dimensional gets a leading dimension added.

    Args:
        folder: Name of the voice sub-folder under ``VOICES_ROOT``.

    Returns:
        A ``(samples, cond_latents)`` tuple. ``samples`` is a list of loaded
        clips, empty when the folder is missing or holds no matching files.
        ``cond_latents`` is always ``None``: no latents are computed here.
    """
    path = os.path.join(VOICES_ROOT, folder)
    if not os.path.isdir(path):
        # Same "no latents" value as the normal return path below.
        return [], None

    clip_names = sorted(
        name for name in os.listdir(path)
        if name.lower().endswith((".wav", ".mp3"))
    )
    samples = []
    for name in clip_names:
        clip = load_audio(os.path.join(path, name), SAMPLE_RATE)
        if clip.ndim == 1:
            clip = clip.unsqueeze(0)
        samples.append(clip)
    return samples, None

# --- CORE GEN ---
# Sample rate (Hz) of the audio Tortoise generates. Its API documentation
# states 24 kHz, which is not the 22050 Hz used when loading reference clips;
# writing the output at 22050 Hz makes it play back slower and lower-pitched.
TORTOISE_OUTPUT_SAMPLE_RATE = 24000


def generate(text, folder, preset):
    """Synthesize *text* in the voice from *folder* and save it to a temp WAV.

    Args:
        text: Text to speak.
        folder: Name of the voice sub-folder holding the reference clips.
        preset: Preset name forwarded to ``TTS.tts_with_preset``.

    Returns:
        ``(path, message)``. ``path`` is the generated WAV file, or ``None``
        on failure; ``message`` is a status line for the log box (including
        the traceback when an exception was caught).
    """
    # Reject unusable input up front with a short message rather than letting
    # it surface as a traceback from deeper in the pipeline.
    if not text or not text.strip():
        return None, "❌ Please enter some text."
    if not folder:
        return None, "❌ Please select a voice folder."

    try:
        voice_samples, cond_latents = load_voice_samples(folder)
        if not voice_samples:
            return None, f"❌ No audio in voices/{folder}"

        wav = TTS.tts_with_preset(
            text=text,
            voice_samples=voice_samples,
            conditioning_latents=cond_latents,
            preset=preset
        )
        if wav is None or (isinstance(wav, (list, tuple)) and len(wav) == 0):
            return None, "❌ Model returned no audio. Try shorter text or lower quality."

        out_path = os.path.join(tempfile.gettempdir(), f"tts_{uuid.uuid4().hex}.wav")
        # Save at the model's output rate, not the reference-clip rate.
        save_wav_tensor(wav, out_path, TORTOISE_OUTPUT_SAMPLE_RATE)
        return out_path, f"✅ Generated: {out_path}"
    except Exception as e:
        # UI boundary: report the failure in the log box instead of raising.
        return None, f"❌ Error: {e}\n\n{traceback.format_exc()}"

# --- APPLY PITCH AFTER ---
def apply_pitch(file_path, semitones):
    """Pitch-shift an audio file and save the result to a new temp WAV.

    Args:
        file_path: Path of the audio file to shift; a falsy value means no
            audio has been generated yet.
        semitones: Shift amount; converted with ``int()`` before use.

    Returns:
        ``(path, message)``. ``path`` is the shifted file, or ``None`` when
        there was no input or an exception was caught; ``message`` is the
        status text for the pitch log box.
    """
    try:
        if file_path:
            waveform, rate = torchaudio.load(file_path)
            shifted = torchaudio.functional.pitch_shift(
                waveform, rate, n_steps=int(semitones)
            )
            file_name = f"pitch_{uuid.uuid4().hex}.wav"
            target = os.path.join(tempfile.gettempdir(), file_name)
            # Keep the rate the file was loaded with.
            save_wav_tensor(shifted, target, rate)
            return target, f"✅ Pitch applied ({semitones:+} semitones): {target}"
        return None, "⚠️ Generate audio first."
    except Exception as err:
        return None, f"❌ Pitch error: {err}\n\n{traceback.format_exc()}"
# --- UI ---
# Build the Gradio interface: text + preset + voice folder feed generate(),
# and the generated file can then be passed through apply_pitch().
with gr.Blocks(title="Tortoise TTS") as demo:
    gr.Markdown("## 🎙️ Generate with Tortoise TTS, then adjust pitch separately")

    text = gr.Textbox(lines=6, label="Text", value="Hello world!")
    preset = gr.Dropdown(["ultra_fast","fast","standard","high_quality"],
                         value="fast", label="Quality")
    # Scan the voices directory once so the choices and the default value
    # come from the same listing.
    voice_dirs = list_voice_dirs()
    folder = gr.Dropdown(voice_dirs, label="Voice Folder",
                         value=(voice_dirs[0] if voice_dirs else None))

    gen_btn = gr.Button("🚀 Generate Audio")
    audio_out = gr.Audio(type="filepath", label="Generated Audio", show_download_button=True)
    log_out = gr.Textbox(lines=6, label="Logs", interactive=False)

    # Pitch section
    pitch_slider = gr.Slider(minimum=-12, maximum=12, step=1, value=0, label="Pitch shift (semitones)")
    pitch_btn = gr.Button("🎚️ Apply Pitch to Generated Audio")
    audio_pitch = gr.Audio(type="filepath", label="Pitch-shifted Audio", show_download_button=True)
    log_pitch = gr.Textbox(lines=4, label="Pitch Logs", interactive=False)

    # The pitch button reads the file path currently held by audio_out.
    gen_btn.click(generate, [text, folder, preset], [audio_out, log_out])
    pitch_btn.click(apply_pitch, [audio_out, pitch_slider], [audio_pitch, log_pitch])

demo.launch()
