# In [3]:  (notebook cell prompt left over from the export)
"""
Multilingual Text-to-Speech (Hindi/English/Gujarati) + Player Controls + Sweet Girl Preset
----------------------------------------------------------------------------------------
Features:
  • Reads text from raw text, PDF, DOCX, PPTX
  • Detects language (Hindi/English/Gujarati) and synthesizes speech
  • Default voice preset: “sweet girl” voice (pitch +5 semitones, tone center 3000 Hz, soft gain)
  • UI controls with scrollbars:
      - Play/Pause
      - Rewind / Fast-forward 10s
      - Speed control (0.25x–2.0x + custom)
      - Loudness (Gain −30 … +12 dB)
      - Pitch shift (−12 … +12 semitones)
      - Frequency slider (20–20,000 Hz)
      - Wavelength slider (0.017–17 m, linked to frequency)
  • Displays current wavelength for chosen frequency
  • Saves output as WAV for playback/download
"""
# Install dependencies (run in a notebook cell):
# ------------------------------------------------
!pip install --quiet gradio==4.44.0 langdetect PyPDF2 python-docx python-pptx pydub librosa soundfile scipy numpy
!pip install --quiet TTS gTTS pyttsx3


import os
import io
import math
import tempfile
from pathlib import Path

import numpy as np
import soundfile as sf
from langdetect import detect

# File readers
import PyPDF2
from docx import Document as Docx
from pptx import Presentation

# Audio processing
import librosa
from scipy.signal import butter, sosfilt
from pydub import AudioSegment

# UI
import gradio as gr

# ---------------------------
# Utility: Text Extraction
# ---------------------------

def _pdf_page_text(reader, index: int) -> str:
    """Return the text of one PDF page, or "" if the page cannot be read."""
    try:
        return reader.pages[index].extract_text() or ""
    except Exception:
        return ""


def extract_text(path: str) -> str:
    """Return the plain text found in the document at ``path``.

    The reader is picked from the lower-cased file suffix: ``.pdf`` uses
    PyPDF2, ``.doc``/``.docx`` use python-docx, ``.ppt``/``.pptx`` use
    python-pptx. Any other suffix is read as UTF-8 text, silently dropping
    bytes that do not decode.
    """
    suffix = Path(path).suffix.lower()

    if suffix == ".pdf":
        with open(path, "rb") as handle:
            pdf = PyPDF2.PdfReader(handle)
            page_texts = [_pdf_page_text(pdf, idx) for idx in range(len(pdf.pages))]
            return "\n\n".join(page_texts)

    if suffix in {".doc", ".docx"}:
        paragraphs = [para.text for para in Docx(path).paragraphs]
        return "\n".join(paragraphs)

    if suffix in {".ppt", ".pptx"}:
        deck = Presentation(path)
        shape_texts = [
            shape.text
            for slide in deck.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        ]
        return "\n\n".join(shape_texts)

    # Fallback: treat everything else as a plain-text file.
    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()

# ---------------------------
# Utility: Chunking & Lang Detect
# ---------------------------

def chunk_text(text: str, max_chars: int = 400) -> list[str]:
    """Split ``text`` into chunks suitable for one TTS request each.

    The text is split on line breaks, blank lines are dropped, and the
    remaining lines are packed greedily into chunks joined by single spaces.
    A line is added to the current chunk only while the running total
    (line length + 1 per line) stays within ``max_chars``.

    A single line longer than ``max_chars`` is wrapped on word boundaries
    (over-long words are hard-split) so that no chunk exceeds ``max_chars``.

    Args:
        text: Input text; ``None`` or blank text yields an empty list.
        max_chars: Upper bound on chunk length. Non-positive values disable
            the wrapping of long lines; every line then becomes its own chunk.

    Returns:
        The list of non-empty chunks, in reading order.
    """
    import textwrap  # local import keeps the file's import block untouched

    if not text:
        return []

    lines = [ln.strip() for ln in text.replace("\r", "\n").split("\n") if ln.strip()]

    # Break up lines that could never fit in a chunk on their own.
    pieces: list[str] = []
    for line in lines:
        if max_chars > 0 and len(line) > max_chars:
            pieces.extend(
                textwrap.wrap(
                    line,
                    width=max_chars,
                    break_long_words=True,
                    break_on_hyphens=False,
                )
            )
        else:
            pieces.append(line)

    chunks: list[str] = []
    buf: list[str] = []
    total = 0
    for piece in pieces:
        # Flush before the chunk would overflow; the "+ 1" accounts for the joining space.
        if buf and total + len(piece) + 1 > max_chars:
            chunks.append(" ".join(buf))
            buf, total = [], 0
        buf.append(piece)
        total += len(piece) + 1
    if buf:
        chunks.append(" ".join(buf))
    return chunks


def detect_lang_safe(text: str) -> str:
    """Return the language code reported by ``detect`` for ``text``.

    Any failure inside the detector is absorbed and reported as ``"en"``.
    """
    fallback = "en"
    try:
        code = detect(text)
    except Exception:
        return fallback
    return code

# ---------------------------
# TTS Backends
# ---------------------------
# Probe each optional TTS backend. A flag records whether its import worked,
# so the synthesis code can skip backends that are not installed.
try:
    from TTS.api import TTS as COQUI_TTS
except Exception:
    _HAS_COQUI = False
else:
    _HAS_COQUI = True

try:
    from gtts import gTTS
except Exception:
    _HAS_GTTS = False
else:
    _HAS_GTTS = True

try:
    import pyttsx3
except Exception:
    _HAS_PYTT = False
else:
    _HAS_PYTT = True


_COQUI_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
# Holds the Coqui model once it has been constructed, so it is built at most
# once per process instead of once per text chunk.
_COQUI_CACHE: dict = {}


def _remove_quietly(path: str) -> None:
    """Delete a temporary file, ignoring the case where it is gone or locked."""
    try:
        os.remove(path)
    except OSError:
        pass


def _log_backend_failure(name: str, exc: Exception) -> None:
    """Report a failed backend without interrupting the fallback chain."""
    import logging  # local import keeps the file's import block untouched

    logging.getLogger(__name__).warning("TTS backend %s failed: %s", name, exc)


def _synth_coqui(text: str, lang: str, sr: int) -> np.ndarray:
    """Synthesize with Coqui; languages other than en/hi are spoken as English."""
    if "model" not in _COQUI_CACHE:
        _COQUI_CACHE["model"] = COQUI_TTS(_COQUI_MODEL_NAME)
    tts = _COQUI_CACHE["model"]
    wav = tts.tts(text=text, language=lang if lang in {"en", "hi"} else "en")
    y = np.array(wav, dtype=np.float32)
    native_sr = tts.synthesizer.output_sample_rate
    if sr != native_sr:
        y = librosa.resample(y, orig_sr=native_sr, target_sr=sr)
    return y.astype(np.float32)


def _synth_gtts(text: str, lang: str, sr: int) -> np.ndarray:
    """Synthesize with gTTS via a temporary MP3 and return mono samples in [-1, 1)."""
    tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    # Close our handle first so the file can be reopened by name for writing.
    tmp.close()
    try:
        gTTS(text=text, lang=lang).save(tmp.name)
        seg = AudioSegment.from_file(tmp.name)
    finally:
        _remove_quietly(tmp.name)
    seg = seg.set_channels(1).set_frame_rate(sr)
    samples = np.array(seg.get_array_of_samples()).astype(np.float32)
    # Scale signed integer PCM to floats using the sample width in bytes.
    return samples / (2 ** (8 * seg.sample_width - 1))


def _synth_pyttsx3(text: str, lang: str, sr: int) -> np.ndarray:
    """Synthesize with pyttsx3 via a temporary WAV file."""
    engine = pyttsx3.init()
    for voice in engine.getProperty('voices'):
        tag = voice.languages[0] if voice.languages else voice.id
        # The language tag may be bytes or str; accept both.
        if isinstance(tag, bytes):
            tag = tag.decode('utf-8', errors='ignore')
        if lang in str(tag):
            engine.setProperty('voice', voice.id)
            break
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name
    try:
        engine.save_to_file(text, out_path)
        engine.runAndWait()
        y, _ = librosa.load(out_path, sr=sr, mono=True)
    finally:
        _remove_quietly(out_path)
    return y.astype(np.float32)


def synthesize_chunk(text: str, lang_hint: str, sr: int = 24000) -> np.ndarray:
    """Synthesize one chunk of text, trying each available backend in turn.

    Order: Coqui (if importable), gTTS (if importable and the language is
    en/hi/gu), then pyttsx3 (if importable). A backend that raises is logged
    and skipped.

    Args:
        text: Text to speak.
        lang_hint: Language code for the chunk, e.g. ``"en"``, ``"hi"``, ``"gu"``.
        sr: Target sample rate of the returned audio.

    Returns:
        Mono float32 samples at ``sr``. If every backend fails or none is
        available, half a second of silence is returned.
    """
    backends = []
    if _HAS_COQUI:
        backends.append(("coqui", _synth_coqui))
    if _HAS_GTTS and lang_hint in {"en", "hi", "gu"}:
        backends.append(("gtts", _synth_gtts))
    if _HAS_PYTT:
        backends.append(("pyttsx3", _synth_pyttsx3))

    for name, synth in backends:
        try:
            return synth(text, lang_hint, sr)
        except Exception as exc:  # best-effort: fall through to the next backend
            _log_backend_failure(name, exc)

    return np.zeros(int(sr * 0.5), dtype=np.float32)


def synthesize_text(text: str, sr: int = 24000) -> tuple[np.ndarray, int]:
    """Synthesize ``text`` chunk by chunk and return peak-normalized audio.

    Each chunk from ``chunk_text`` gets its own language detection and
    synthesis call; the results are concatenated and scaled so the peak
    absolute sample is 0.95.

    Args:
        text: Text to speak.
        sr: Sample rate requested from the synthesis backends.

    Returns:
        ``(samples, sr)`` with mono float32 samples. Half a second of silence
        is returned when there is nothing to speak or no audio was produced.
    """
    parts = [
        synthesize_chunk(chunk, detect_lang_safe(chunk), sr=sr)
        for chunk in chunk_text(text)
    ]
    y = np.concatenate(parts) if parts else np.zeros(0, dtype=np.float32)
    if y.size == 0:
        # np.max raises on an empty array, so substitute the same silence
        # that is used when there are no chunks at all.
        y = np.zeros(int(sr * 0.5), dtype=np.float32)
    # An all-zero signal has peak 0; fall back to 1.0 to avoid dividing by zero.
    peak = np.max(np.abs(y)) or 1.0
    y = y / peak * 0.95
    return y.astype(np.float32), sr

# ---------------------------
# Audio FX
# ---------------------------
def apply_gain_db(y: np.ndarray, db: float) -> np.ndarray:
    """Scale ``y`` by ``db`` decibels, rescaling if the result would clip.

    Args:
        y: Audio samples.
        db: Gain in dB (amplitude factor ``10 ** (db / 20)``).

    Returns:
        A new float32 array. If the scaled peak exceeds 1.0 the whole signal
        is rescaled so its peak is 0.99. Empty input yields an empty array.
    """
    scaled = y * 10 ** (db / 20.0)
    if scaled.size == 0:
        # np.max raises on an empty array; there is nothing to rescale anyway.
        return scaled.astype(np.float32)
    peak = np.max(np.abs(scaled))
    if peak > 1.0:
        scaled = scaled / peak * 0.99
    return scaled.astype(np.float32)


def apply_speed(y: np.ndarray, sr: int, speed: float) -> np.ndarray:
    """Time-stretch ``y`` by ``speed``, limited to the range 0.25x to 2.0x.

    ``sr`` is accepted for signature symmetry with the other effects but is
    not used here. The result is returned as float32.
    """
    capped = min(2.0, float(speed))
    rate = max(0.25, capped)
    stretched = librosa.effects.time_stretch(y, rate=rate)
    return stretched.astype(np.float32)


def apply_pitch(y: np.ndarray, sr: int, semitones: float) -> np.ndarray:
    """Shift the pitch of ``y`` by ``semitones`` steps and return float32 audio."""
    shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=semitones)
    return shifted.astype(np.float32)


def bandpass(y: np.ndarray, sr: int, center_hz: float, q: float = 1.0) -> np.ndarray:
    """Band-pass ``y`` around ``center_hz`` with an order-4 Butterworth design.

    The centre is clipped to 20 Hz .. min(20 kHz, Nyquist - 100 Hz). The band
    width is ``center_hz / q`` (``q`` floored at 0.1), and the band edges are
    kept between 10 Hz and Nyquist - 10 Hz. When the edges leave no valid
    band, ``y`` is returned unchanged; otherwise the result is float32.
    """
    nyquist = sr / 2
    ceiling = min(20000.0, nyquist - 100)
    center_hz = float(np.clip(center_hz, 20.0, ceiling))

    bandwidth = center_hz / max(q, 0.1)
    lower_edge = max(10.0, center_hz - bandwidth / 2)
    upper_edge = min(nyquist - 10.0, center_hz + bandwidth / 2)
    if lower_edge >= upper_edge:
        return y

    # butter() expects the edges as fractions of the Nyquist frequency here.
    normalized_band = [lower_edge / nyquist, upper_edge / nyquist]
    sections = butter(4, normalized_band, btype='bandpass', output='sos')
    filtered = sosfilt(sections, y)
    return filtered.astype(np.float32)

# ---------------------------
# Session State
# ---------------------------
class Session:
    """Holds the synthesized audio and playback position for the UI.

    Attributes are plain and public:
      y_full          -- unprocessed samples from the last load, or None
      sr              -- sample rate of ``y_full`` (24000 until something is loaded)
      offset          -- playback start position in seconds
      last_processed  -- ``(samples, sr)`` from the last ``process`` call, or None
    """

    def __init__(self):
        self.y_full = None
        self.sr = 24000
        self.offset = 0.0
        self.last_processed = None

    def load_text(self, text: str):
        """Synthesize ``text`` and reset the offset and the processed cache."""
        y, sr = synthesize_text(text)
        self.y_full, self.sr = y, sr
        self.offset = 0.0
        self.last_processed = None

    def load_file(self, path: str):
        """Extract the text of the document at ``path`` and load it."""
        text = extract_text(path)
        self.load_text(text)

    def process(self, speed=1.0, gain_db=0.0, pitch_semitones=0.0, center_hz=1000.0, q=1.0):
        """Apply the effects chain to a copy of the loaded audio.

        Order: pitch shift, time stretch, band-pass, gain. A stage is skipped
        when its parameter is falsy; the time stretch is also skipped when
        ``speed`` is within 1e-3 of 1.0.

        Returns:
            ``(samples, sr)``, also stored in ``last_processed``; or
            ``(None, None)`` when nothing has been loaded yet.
        """
        if self.y_full is None:
            return None, None
        y = self.y_full.copy()
        if pitch_semitones:
            y = apply_pitch(y, self.sr, pitch_semitones)
        if speed and abs(speed - 1.0) > 1e-3:
            y = apply_speed(y, self.sr, speed)
        if center_hz:
            y = bandpass(y, self.sr, center_hz, q)
        if gain_db:
            y = apply_gain_db(y, gain_db)
        self.last_processed = (y, self.sr)
        return y, self.sr

    def slice_from_offset(self, y: np.ndarray, sr: int) -> np.ndarray:
        """Return ``y`` from the current offset onward.

        The start index is clamped to ``[0, len(y)]``, so an offset past the
        end yields an empty array rather than an error.
        """
        start = int(self.offset * sr)
        start = max(0, min(start, len(y)))
        return y[start:]

    def save_wav(self, y: np.ndarray, sr: int) -> str:
        """Write ``y`` to a new temporary WAV file and return its path.

        The file is not deleted here; the caller owns it.
        """
        # Reserve a unique name, then close our handle before writing so the
        # handle is not leaked and the file is not written while still open.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            wav_path = fp.name
        sf.write(wav_path, y, sr)
        return wav_path


SESSION = Session()

# ---------------------------
# Gradio Callbacks
# ---------------------------

def ui_load_text(text):
    """Synthesize the pasted text into the shared session.

    Returns a status update plus an empty string; the click handler routes
    these to the loader status box and the text box respectively.
    """
    SESSION.load_text(text)
    status_update = gr.update(value="Loaded text.")
    return status_update, ""


def ui_load_file(file):
    """Load an uploaded document into the shared session.

    Args:
        file: The upload as delivered by the file component. Accepted forms
            are an object with a ``.name`` path attribute, a plain path
            string, an ``os.PathLike``, or ``None`` when nothing was uploaded.

    Returns:
        A status update plus an empty string; the click handler routes these
        to the loader status box and the text box respectively.
    """
    if file is None:
        return gr.update(), ""
    if isinstance(file, os.PathLike):
        # Path objects expose ``.name`` as the basename only, so convert explicitly.
        path = os.fspath(file)
    else:
        path = getattr(file, "name", file)
    SESSION.load_file(path)
    return gr.update(value="Loaded file."), ""


# Slider values seen on the previous ui_process call, seeded with the sliders'
# initial values, plus which slider currently determines the tone centre.
# If the seed ever disagrees with the UI defaults, the first call simply
# treats the frequency slider as authoritative.
_TONE_INPUTS = {"center_hz": 3000.0, "wavelength": 0.114, "driver": "center"}


def _resolve_center_hz(center_hz, wavelength):
    """Pick the tone centre from whichever slider the user moved last.

    The frequency slider wins when both changed since the previous call.
    With no change, the previous choice stays in force. The wavelength is
    converted with a speed of sound of 343 m/s after clamping it to
    0.017 .. 17.15 m.
    """
    center_moved = center_hz != _TONE_INPUTS["center_hz"]
    wavelength_moved = wavelength != _TONE_INPUTS["wavelength"]
    _TONE_INPUTS["center_hz"] = center_hz
    _TONE_INPUTS["wavelength"] = wavelength

    if center_moved:
        _TONE_INPUTS["driver"] = "center"
    elif wavelength_moved:
        _TONE_INPUTS["driver"] = "wavelength"

    use_wavelength = _TONE_INPUTS["driver"] == "wavelength" or not center_hz
    if wavelength and use_wavelength:
        return 343.0 / max(0.017, min(17.15, float(wavelength)))
    return float(center_hz or 0.0)


def ui_process(speed, gain_db, pitch_semitones, center_hz, wavelength, q):
    """Run the effects chain and return a WAV of the audio from the current offset.

    Previously any non-zero wavelength overrode ``center_hz``; because the
    wavelength slider always has a value, the frequency slider had no effect.
    The tone centre is now taken from the slider that was moved last.

    Returns:
        ``(wav_path, status)``, or ``(None, "No audio yet.")`` when nothing
        has been loaded.
    """
    center_hz = _resolve_center_hz(center_hz, wavelength)
    y, sr = SESSION.process(speed, gain_db, pitch_semitones, center_hz, q)
    if y is None:
        return None, "No audio yet."
    sliced = SESSION.slice_from_offset(y, sr)
    wav_path = SESSION.save_wav(sliced, sr)
    # Wavelength shown in the status line; the frequency is floored at 1 Hz
    # so the division is always defined.
    lam = 343.0 / max(1.0, center_hz)
    return wav_path, f"Offset: {SESSION.offset:.1f}s, Length: {len(sliced)/sr:.1f}s | λ≈{lam:.3f}m"


def ui_seek(delta):
    """Move the playback offset by ``delta`` seconds and re-export the audio.

    The offset never goes below zero. Once processed audio exists it is also
    capped at that audio's duration, so repeated forward seeks cannot push
    the offset arbitrarily far past the end.

    Returns:
        ``(wav_path, status)``; ``wav_path`` is ``None`` when no audio has
        been processed yet.
    """
    target = max(0.0, SESSION.offset + float(delta))
    if SESSION.last_processed is None:
        SESSION.offset = target
        return None, f"Offset: {SESSION.offset:.1f}s"
    y, sr = SESSION.last_processed
    duration = len(y) / sr
    SESSION.offset = min(target, duration)
    sliced = SESSION.slice_from_offset(y, sr)
    wav_path = SESSION.save_wav(sliced, sr)
    return wav_path, f"Offset: {SESSION.offset:.1f}s, Length: {len(sliced)/sr:.1f}s"


def ui_reset_offset():
    """Rewind playback to the start and return the refreshed audio and status."""
    SESSION.offset = 0.0
    audio_path, message = ui_seek(0)
    return audio_path, message

# ---------------------------
# Build Gradio Interface with Sweet Girl Defaults
# ---------------------------
with gr.Blocks(title="Multilingual TTS Reader + Player") as demo:
    gr.Markdown("# Multilingual TTS Reader + Player\nDefault voice preset is a sweet girl's voice (Pitch +5, Tone 3000 Hz, Gain −2 dB). Adjust any sliders to override.")

    # --- Input: pasted text or an uploaded document ---
    with gr.Row():
        txt = gr.Textbox(label="Paste Text", lines=6)
        file = gr.File(label="Or upload: PDF/DOCX/PPTX/TXT")
    with gr.Row():
        load_text_btn = gr.Button("Load Text")
        load_file_btn = gr.Button("Load File")
        load_status = gr.Textbox(label="Loader Status", interactive=False)

    # --- Effect controls; the defaults form the voice preset described above ---
    with gr.Row():
        speed = gr.Slider(0.25, 2.0, value=1.0, step=0.05, label="Speed (x)")
        gain = gr.Slider(-30, 12, value=-2, step=1, label="Loudness / Gain (dB)")
        pitch = gr.Slider(-12, 12, value=5, step=0.5, label="Pitch Shift (semitones)")
    with gr.Row():
        center = gr.Slider(20, 20000, value=3000, step=1, label="Tone Center Frequency (Hz)")
        wavelength = gr.Slider(0.017, 17.15, value=0.114, step=0.001, label="Wavelength (m)")
        q = gr.Slider(0.2, 10.0, value=1.0, step=0.1, label="Tone Q (bandwidth)")

    process_btn = gr.Button("Process + Play from Offset")
    audio = gr.Audio(label="Audio Output", interactive=False)
    status = gr.Textbox(label="Playback Status", interactive=False)

    # --- Transport controls ---
    with gr.Row():
        back10 = gr.Button("⏪ Rewind 10s")
        ahead10 = gr.Button("⏩ Forward 10s")
        reset = gr.Button("⏮️ Restart")

    # Events
    load_text_btn.click(ui_load_text, inputs=[txt], outputs=[load_status, txt])
    load_file_btn.click(ui_load_file, inputs=[file], outputs=[load_status, txt])
    process_btn.click(ui_process, inputs=[speed, gain, pitch, center, wavelength, q], outputs=[audio, status])
    back10.click(lambda: ui_seek(-10), outputs=[audio, status])
    ahead10.click(lambda: ui_seek(10), outputs=[audio, status])
    reset.click(ui_reset_offset, outputs=[audio, status])

    # Keep the frequency and wavelength sliders showing the same tone
    # (wavelength = 343 / frequency). `.input` fires only on user interaction,
    # so the two listeners do not trigger each other in a loop.
    center.input(
        lambda hz: min(17.15, max(0.017, round(343.0 / float(hz), 3))),
        inputs=[center],
        outputs=[wavelength],
    )
    wavelength.input(
        lambda lam: min(20000, max(20, round(343.0 / float(lam)))),
        inputs=[wavelength],
        outputs=[center],
    )

# In Jupyter, launch with:
demo.launch(debug=False, share=False)

ERROR: Ignored the following versions that require a different python version: 0.0.10.2 Requires-Python >=3.6.0, <3.9; 0.0.10.3 Requires-Python >=3.6.0, <3.9; 0.0.11 Requires-Python >=3.6.0, <3.9; 0.0.12 Requires-Python >=3.6.0, <3.9; 0.0.13.1 Requires-Python >=3.6.0, <3.9; 0.0.13.2 Requires-Python >=3.6.0, <3.9; 0.0.14.1 Requires-Python >=3.6.0, <3.9; 0.0.15 Requires-Python >=3.6.0, <3.9; 0.0.15.1 Requires-Python >=3.6.0, <3.9; 0.0.9 Requires-Python >=3.6.0, <3.9; 0.0.9.1 Requires-Python >=3.6.0, <3.9; 0.0.9.2 Requires-Python >=3.6.0, <3.9; 0.0.9a10 Requires-Python >=3.6.0, <3.9; 0.0.9a9 Requires-Python >=3.6.0, <3.9; 0.1.0 Requires-Python >=3.6.0, <3.10; 0.1.1 Requires-Python >=3.6.0, <3.10; 0.1.2 Requires-Python >=3.6.0, <3.10; 0.1.3 Requires-Python >=3.6.0, <3.10; 0.10.0 Requires-Python >=3.7.0, <3.11; 0.10.1 Requires-Python >=3.7.0, <3.11; 0.10.2 Requires-Python >=3.7.0, <3.11; 0.11.0 Requires-Python >=3.7.0, <3.11; 0.11.1 Requires-Python >=3.7.0, <3.11; 0.12.0 Requires-Python >=3