In [None]:
# Core + UI
!pip install --quiet "gradio==4.44.0" langdetect PyPDF2 python-docx python-pptx

# Audio stack (versions pinned for stability with Jupyter)
!pip install --quiet "numpy==1.26.4" "librosa==0.10.2.post1" soundfile scipy pydub imageio-ffmpeg

# TTS backends
!pip install --quiet "gTTS==2.5.3" "pyttsx3==2.90"

# OPTIONAL: Coqui XTTS v2 (very large download; comment out if you don't want it)
!pip install --quiet TTS


In [None]:
import os
import io
import math
import tempfile
from pathlib import Path

import numpy as np
import soundfile as sf
from langdetect import detect

# File readers
import PyPDF2
from docx import Document as Docx
from pptx import Presentation

# Audio processing
import librosa
from scipy.signal import butter, sosfilt
from pydub import AudioSegment
import imageio_ffmpeg

# UI
import gradio as gr

# Configure pydub to use the bundled ffmpeg from imageio-ffmpeg:
# pydub's converter path is set to whatever imageio_ffmpeg.get_ffmpeg_exe() returns.
AudioSegment.converter = imageio_ffmpeg.get_ffmpeg_exe()

# ---------------------------
# TTS Backends (fail-soft)
# ---------------------------
# Each backend import is attempted independently and recorded in a module-level
# flag. Any exception during import (not only ImportError) marks the backend as
# unavailable instead of aborting the cell; the matching _tts_* function checks
# its flag and returns None when the backend is missing.
_HAS_COQUI = False
try:
    # Only attempt to import Coqui if installed (it's optional & heavy)
    from TTS.api import TTS as COQUI_TTS
    _HAS_COQUI = True
except Exception:
    _HAS_COQUI = False

# gTTS backend flag (read by _tts_gtts).
try:
    from gtts import gTTS
    _HAS_GTTS = True
except Exception:
    _HAS_GTTS = False

# pyttsx3 backend flag (read by _tts_pyttsx3).
try:
    import pyttsx3
    _HAS_PYTT = True
except Exception:
    _HAS_PYTT = False

# ---------------------------
# Utility: Text Extraction
# ---------------------------

def extract_text(path: str) -> str:
    """Return the plain text contained in the document at *path*.

    The reader is chosen from the file extension (case-insensitive):

    * ``.pdf``: text of every page via PyPDF2, pages joined by a blank line.
      A page whose extraction raises contributes an empty string.
    * ``.doc`` / ``.docx``: paragraph text via python-docx, one per line.
    * ``.ppt`` / ``.pptx``: the ``text`` of every shape that has one, via
      python-pptx, joined by a blank line.
    * anything else: the file is read as UTF-8 text; undecodable bytes are
      dropped and a leading byte-order mark is stripped.

    Raises:
        ValueError: if a ``.doc`` or ``.ppt`` file cannot be opened. The
            readers used here target the XML-based ``.docx`` / ``.pptx``
            formats, so the error tells the user to convert the file.
        Exception: any other reader error propagates unchanged.
    """
    ext = Path(path).suffix.lower()

    if ext == ".pdf":
        with open(path, "rb") as f:
            pages = []
            for page in PyPDF2.PdfReader(f).pages:
                try:
                    pages.append(page.extract_text() or "")
                except Exception:
                    # Best effort: one unreadable page must not lose the rest.
                    pages.append("")
            return "\n\n".join(pages)

    if ext in {".doc", ".docx"}:
        try:
            doc = Docx(path)
        except Exception as err:
            if ext == ".doc":
                raise ValueError(
                    "Could not read legacy .doc file; please convert it to .docx"
                ) from err
            raise
        return "\n".join(p.text for p in doc.paragraphs)

    if ext in {".ppt", ".pptx"}:
        try:
            prs = Presentation(path)
        except Exception as err:
            if ext == ".ppt":
                raise ValueError(
                    "Could not read legacy .ppt file; please convert it to .pptx"
                ) from err
            raise
        texts = [
            shape.text
            for slide in prs.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        ]
        return "\n\n".join(texts)

    # "utf-8-sig" behaves like "utf-8" but also removes a leading BOM, which
    # would otherwise end up as an invisible first character of the text.
    with open(path, "r", encoding="utf-8-sig", errors="ignore") as f:
        return f.read()

# ---------------------------
# Utility: Chunking & Lang Detect
# ---------------------------

def _split_oversized_part(part: str, max_chars: int) -> list[str]:
    """Break one over-long line into pieces of at most *max_chars* characters.

    Words are packed greedily and separated by single spaces; a single word
    longer than *max_chars* is cut at fixed width as a last resort.
    """
    pieces: list[str] = []
    current = ""
    for word in part.split():
        while len(word) > max_chars:
            if current:
                pieces.append(current)
                current = ""
            pieces.append(word[:max_chars])
            word = word[max_chars:]
        if current and len(current) + 1 + len(word) > max_chars:
            pieces.append(current)
            current = word
        else:
            current = f"{current} {word}" if current else word
    if current:
        pieces.append(current)
    return pieces


def chunk_text(text: str, max_chars: int = 400) -> list[str]:
    """Split *text* into chunks of at most *max_chars* characters for TTS.

    The text is split on line breaks (``\\r`` and ``\\n``); blank lines are
    dropped and each line is stripped. Consecutive lines are joined with a
    single space into one chunk while the running budget (line lengths plus
    one separator per line) stays within *max_chars*.

    A single line longer than *max_chars* is broken at whitespace first, so
    no returned chunk exceeds the limit (text without any line breaks used to
    come back as one arbitrarily large chunk).

    Args:
        text: Input text; may be empty.
        max_chars: Upper bound on the length of every chunk; must be >= 1.

    Returns:
        The chunks in reading order; an empty list when *text* contains no
        non-whitespace characters.

    Raises:
        ValueError: if *max_chars* is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be at least 1")

    lines = [p.strip() for p in text.replace("\r", "\n").split("\n") if p.strip()]

    # Expand over-long lines so that every piece fits into a chunk on its own.
    pieces: list[str] = []
    for line in lines:
        if len(line) > max_chars:
            pieces.extend(_split_oversized_part(line, max_chars))
        else:
            pieces.append(line)

    chunks: list[str] = []
    buf: list[str] = []
    total = 0
    for piece in pieces:
        # The "+ 1" accounts for the joining space after each piece.
        if total + len(piece) + 1 > max_chars and buf:
            chunks.append(" ".join(buf))
            buf, total = [], 0
        buf.append(piece)
        total += len(piece) + 1
    if buf:
        chunks.append(" ".join(buf))
    return chunks

def detect_lang_safe(text: str) -> str:
    """Return the language code that ``detect`` reports for *text*.

    Detection is best effort: if ``detect`` raises for any reason, the code
    ``"en"`` is returned instead of propagating the error.
    """
    fallback = "en"
    try:
        code = detect(text)
    except Exception:
        return fallback
    return code

# ---------------------------
# Audio FX
# ---------------------------
def apply_gain_db(y: np.ndarray, db: float) -> np.ndarray:
    """Scale *y* by *db* decibels and return the result as float32.

    The linear factor is ``10 ** (db / 20)``. If the scaled signal's absolute
    peak exceeds 1.0, the whole signal is rescaled so the peak becomes 0.99.
    Empty input is returned empty.
    """
    scaled = y * 10 ** (db / 20.0)
    peak = np.abs(scaled).max() if scaled.size else 0
    if peak > 1.0:
        # Normalise instead of hard-clipping the samples.
        scaled = scaled / peak * 0.99
    return scaled.astype(np.float32)

def apply_speed(y: np.ndarray, sr: int, speed: float) -> np.ndarray:
    """Time-stretch *y* by the factor *speed* and return float32 samples.

    *speed* is limited to the range 0.25 to 2.0. A factor of exactly 1.0 or
    an empty signal skips the stretch and only converts the dtype. *sr* is
    accepted for signature symmetry with the other effects but is not used.
    """
    capped = min(2.0, float(speed))
    rate = max(0.25, capped)
    if rate == 1.0 or y.size == 0:
        return y.astype(np.float32)
    stretched = librosa.effects.time_stretch(y, rate=rate)
    return stretched.astype(np.float32)

def apply_pitch(y: np.ndarray, sr: int, semitones: float) -> np.ndarray:
    """Shift the pitch of *y* by *semitones* and return float32 samples.

    A shift of 0 or an empty signal bypasses librosa and only converts the
    dtype.
    """
    nothing_to_do = semitones == 0 or y.size == 0
    if nothing_to_do:
        return y.astype(np.float32)
    shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=semitones)
    return shifted.astype(np.float32)

def bandpass(y: np.ndarray, sr: int, center_hz: float, q: float = 1.0) -> np.ndarray:
    """Band-pass *y* around *center_hz* and return float32 samples.

    A falsy *center_hz* is replaced by 1000 Hz; the center is then clipped to
    20 Hz .. min(20 kHz, Nyquist - 100 Hz). The pass band is ``center / q``
    wide (with *q* floored at 0.1), its edges kept at least 10 Hz away from
    0 Hz and from Nyquist. If no valid band remains, or *y* is empty, the
    signal is returned unfiltered.
    """
    if y.size == 0:
        return y.astype(np.float32)

    nyquist = sr / 2
    requested = center_hz if center_hz else 1000.0
    center = float(np.clip(requested, 20.0, min(20000.0, nyquist - 100)))
    quality = max(0.1, float(q))
    half_band = (center / quality) / 2

    lo_edge = max(10.0, center - half_band)
    hi_edge = min(nyquist - 10.0, center + half_band)
    if lo_edge >= hi_edge:
        return y.astype(np.float32)

    # butter() takes the band edges as fractions of the Nyquist frequency.
    band = [lo_edge / nyquist, hi_edge / nyquist]
    sections = butter(4, band, btype='bandpass', output='sos')
    filtered = sosfilt(sections, y)
    return filtered.astype(np.float32)

# ---------------------------
# TTS: synthesize with graceful fallback
# ---------------------------

def _resample_if_needed(y: np.ndarray, sr_in: int, sr_out: int) -> np.ndarray:
    """Return *y* as float32 at *sr_out* Hz, resampling only when required.

    Args:
        y: Mono audio samples.
        sr_in: Sample rate of *y*.
        sr_out: Desired sample rate.

    Returns:
        A float32 array; when the rates already match, or *y* is empty, the
        samples are only converted to float32.
    """
    if sr_in == sr_out or y.size == 0:
        return y.astype(np.float32)
    # "soxr_hq" is librosa's soxr-backed high-quality mode. The previous
    # "kaiser_best" mode is backed by the separate resampy package, which the
    # notebook's pip cell does not install, so it would fail at call time.
    resampled = librosa.resample(y, orig_sr=sr_in, target_sr=sr_out, res_type="soxr_hq")
    return resampled.astype(np.float32)

# Multilingual XTTS v2 (heavy model; skipped entirely if Coqui is not installed)
_COQUI_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
_COQUI_MODEL = None         # lazily created COQUI_TTS instance shared by all chunks
_COQUI_UNAVAILABLE = False  # set once loading failed, so later chunks do not retry


def _get_coqui_model():
    """Return the shared Coqui model, loading it on first use.

    Returns None when the model could not be loaded; the failure is remembered
    so that every following chunk falls through to the next backend at once
    instead of attempting the expensive load again.
    """
    global _COQUI_MODEL, _COQUI_UNAVAILABLE
    if _COQUI_MODEL is None and not _COQUI_UNAVAILABLE:
        try:
            _COQUI_MODEL = COQUI_TTS(_COQUI_MODEL_NAME)
        except Exception:
            _COQUI_UNAVAILABLE = True
    return _COQUI_MODEL


def _tts_coqui(text: str, lang_hint: str, sr: int) -> np.ndarray | None:
    """Synthesize *text* with Coqui XTTS v2, or return None on any failure.

    Args:
        text: Text of one chunk.
        lang_hint: Detected language code; only "en" and "hi" are passed
            through, everything else is spoken as "en".
        sr: Sample rate of the returned audio.

    Returns:
        Float32 mono samples at *sr*, or None when Coqui is not installed,
        the model cannot be loaded, or synthesis raises.
    """
    if not _HAS_COQUI:
        return None
    # The model is loaded once and reused: constructing it per chunk repeated
    # the full model initialisation for every few hundred characters.
    tts = _get_coqui_model()
    if tts is None:
        return None
    try:
        lang = lang_hint if lang_hint in {"en", "hi"} else "en"
        kwargs = {"text": text, "language": lang}
        # NOTE(review): XTTS v2 is a multi-speaker model and the Coqui API is
        # expected to reject calls without a speaker; pick the first built-in
        # one when the model reports any - confirm against the installed TTS
        # version. Without reported speakers the call is unchanged.
        speakers = getattr(tts, "speakers", None)
        if speakers:
            kwargs["speaker"] = list(speakers)[0]
        wav = tts.tts(**kwargs)
        y = np.array(wav, dtype=np.float32)
        synth = getattr(tts, "synthesizer", None)
        # Fall back to 24 kHz when the synthesizer is not exposed.
        sr_in = synth.output_sample_rate if synth else 24000
        return _resample_if_needed(y, sr_in, sr)
    except Exception:
        # Fail-soft: the caller tries the next backend.
        return None

def _tts_gtts(text: str, lang_hint: str, sr: int) -> np.ndarray | None:
    """Synthesize *text* with gTTS, or return None on any failure.

    The MP3 produced by gTTS is written to a temporary file, decoded with
    pydub, down-mixed to mono at *sr* and scaled to the range [-1, 1]. The
    temporary file is always removed again.

    Args:
        text: Text of one chunk.
        lang_hint: Detected language code; "en", "hi" and "gu" are used as
            is, anything else falls back to "en".
        sr: Sample rate of the returned audio.

    Returns:
        Float32 mono samples, or None when gTTS is not installed or the
        request/decoding fails.
    """
    if not _HAS_GTTS:
        return None
    lang_map = {"en": "en", "hi": "hi", "gu": "gu"}
    lang = lang_map.get(lang_hint, "en")
    mp3_path = None
    try:
        # Only reserve a unique path here and close the handle right away, so
        # gTTS and ffmpeg can open the file by name.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
            mp3_path = tmp.name
        gTTS(text=text, lang=lang).save(mp3_path)
        seg = AudioSegment.from_file(mp3_path)
        seg = seg.set_channels(1).set_frame_rate(sr)
        samples = np.array(seg.get_array_of_samples()).astype(np.float32)
        # Scale integer PCM to [-1,1]
        y = samples / (2 ** (8 * seg.sample_width - 1))
        return y.astype(np.float32)
    except Exception:
        # Fail-soft: the caller tries the next backend.
        return None
    finally:
        # delete=False means nobody else cleans up; without this every chunk
        # left one MP3 behind in the temp directory.
        if mp3_path is not None:
            try:
                os.remove(mp3_path)
            except OSError:
                pass  # Already gone or still locked; nothing useful to do.

def _tts_pyttsx3(text: str, lang_hint: str, sr: int) -> np.ndarray | None:
    """Synthesize *text* offline with pyttsx3, or return None on any failure.

    A voice whose language list or id contains *lang_hint* is selected when
    one exists; otherwise the engine's default voice is used. The engine
    renders into a temporary WAV file, which is loaded as mono at *sr* and
    then removed.

    Args:
        text: Text of one chunk.
        lang_hint: Detected language code used for voice selection.
        sr: Sample rate of the returned audio.

    Returns:
        Float32 mono samples, or None when pyttsx3 is not installed or
        synthesis/loading fails.
    """
    if not _HAS_PYTT:
        return None
    out_path = None
    try:
        engine = pyttsx3.init()
        # Try to pick a voice matching the language; if not found, leave default
        try:
            for v in engine.getProperty('voices'):
                try:
                    # Entries may be bytes or str; previously a str entry made
                    # .decode() raise and discarded the whole language list.
                    langs = [
                        x.decode('utf-8', 'ignore') if isinstance(x, bytes) else str(x)
                        for x in (v.languages or [])
                    ]
                except Exception:
                    langs = []
                if lang_hint in "".join(langs) or lang_hint in str(v.id).lower():
                    engine.setProperty('voice', v.id)
                    break
        except Exception:
            pass  # Voice selection is optional; keep the default voice.
        # Reserve a unique path and close the handle before the engine writes.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            out_path = tmp.name
        engine.save_to_file(text, out_path)
        engine.runAndWait()
        y, _ = librosa.load(out_path, sr=sr, mono=True)
        return y.astype(np.float32)
    except Exception:
        # Fail-soft: the caller falls back to silence.
        return None
    finally:
        # delete=False means nobody else cleans up the rendered WAV.
        if out_path is not None:
            try:
                os.remove(out_path)
            except OSError:
                pass  # Already gone or still locked; nothing useful to do.

def synthesize_chunk(text: str, lang_hint: str, sr: int = 24000) -> np.ndarray:
    """Render one chunk of text with the first backend that yields audio.

    Backends are tried in the order Coqui, gTTS, pyttsx3. A backend that
    returns None or an empty array is skipped. If none produces audio, 0.4 s
    of silence is returned so the pipeline keeps running.
    """
    backends = (_tts_coqui, _tts_gtts, _tts_pyttsx3)
    for backend in backends:
        audio = backend(text, lang_hint, sr)
        if audio is None or audio.size <= 0:
            continue
        return audio.astype(np.float32)
    silence_len = int(sr * 0.4)
    return np.zeros(silence_len, dtype=np.float32)

def synthesize_text(text: str, sr: int = 24000) -> tuple[np.ndarray, int]:
    """Turn *text* into one float32 waveform and return it with its rate.

    The text is chunked, each chunk gets its own language detection and is
    synthesized separately, and the pieces are concatenated. A non-silent
    result is peak-normalised to 0.95. When there are no chunks at all, half
    a second of silence is returned.
    """
    rendered = [
        synthesize_chunk(piece, detect_lang_safe(piece), sr=sr)
        for piece in chunk_text(text)
    ]
    if not rendered:
        return np.zeros(int(sr * 0.5), dtype=np.float32), sr

    audio = np.concatenate(rendered).astype(np.float32)
    peak = float(np.max(np.abs(audio))) if audio.size else 0.0
    if peak > 0:
        audio = audio / peak * 0.95
    return audio.astype(np.float32), sr

# ---------------------------
# Session State
# ---------------------------
class Session:
    """Holds the synthesized audio and the playback position of the app."""

    def __init__(self):
        # Unprocessed synthesized audio and its sample rate.
        self.y_full = np.zeros(0, dtype=np.float32)
        self.sr = 24000
        # Playback start position in seconds.
        self.offset = 0.0
        # (samples, sample_rate) of the most recent load or process() result.
        self.last_processed = (np.zeros(0, dtype=np.float32), 24000)

    def load_text(self, text: str):
        """Synthesize *text*, store it as the current audio and rewind to 0."""
        y, sr = synthesize_text(text)
        self.y_full, self.sr = y, sr
        self.offset = 0.0
        self.last_processed = (y, sr)

    def load_file(self, path: str):
        """Extract the text of the document at *path* and load it."""
        text = extract_text(path)
        self.load_text(text)

    def process(self, speed=1.0, gain_db=-2.0, pitch_semitones=5.0, center_hz=3000.0, q=1.0):
        """Apply the effect chain to a copy of the loaded audio.

        Effects run in the order pitch, speed, band-pass, gain; an effect
        whose parameter is falsy (or a speed of 1.0) is skipped. The result
        is remembered in ``last_processed``.

        Returns:
            ``(samples, sample_rate)``; empty samples when nothing is loaded
            (``last_processed`` is left untouched in that case).
        """
        y = self.y_full.copy()
        if y.size == 0:
            return np.zeros(0, dtype=np.float32), self.sr
        # Sweet-girl preset baked in, but all sliders override it
        if pitch_semitones:
            y = apply_pitch(y, self.sr, float(pitch_semitones))
        if speed and abs(float(speed) - 1.0) > 1e-6:
            y = apply_speed(y, self.sr, float(speed))
        if center_hz:
            y = bandpass(y, self.sr, float(center_hz), float(q))
        if gain_db:
            y = apply_gain_db(y, float(gain_db))
        self.last_processed = (y, self.sr)
        return y, self.sr

    def slice_from_offset(self, y: np.ndarray, sr: int) -> np.ndarray:
        """Return the part of *y* that starts at the current offset.

        The offset is clamped to the range 0 .. duration of *y* for the
        slice only; ``self.offset`` itself is not modified.
        """
        start = int(max(0.0, min(float(self.offset), len(y)/sr)) * sr)
        return y[start:]

    def save_wav(self, y: np.ndarray, sr: int) -> str:
        """Write *y* to a new temporary WAV file and return its path.

        Empty audio is written as a single zero sample. The file is not
        deleted here; removing it is left to the caller.
        """
        # Reserve a unique path and close the handle immediately: the handle
        # used to stay open for the rest of the process, leaking one file
        # descriptor per call.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            wav_path = fp.name
        sf.write(wav_path, y if y.size else np.zeros(1, dtype=np.float32), sr)
        return wav_path

# Single module-level Session instance that all ui_* callbacks below read and modify.
SESSION = Session()

# ---------------------------
# Gradio Callbacks
# ---------------------------

def ui_load_text(text):
    """Synthesize the pasted text into the shared session.

    A missing value is treated as an empty string. Returns the loader status
    message and an empty string as the second output value.
    """
    SESSION.load_text(text if text else "")
    return ("Loaded text.", "")

def ui_load_file(file_path):
    """Extract the uploaded document's text and synthesize it.

    Args:
        file_path: Path of the uploaded file, or a falsy value when nothing
            was uploaded.

    Returns:
        A pair of output values (status, text box). Without a file the status
        is left unchanged. When loading fails, the error is reported in the
        status and the text box is left unchanged.
    """
    if not file_path:
        return gr.update(), ""
    try:
        SESSION.load_file(file_path)
    except Exception as err:
        # UI boundary: show the reason in the status box instead of letting
        # the callback die while the status still shows an earlier success.
        return f"Could not load file: {err}", gr.update()
    return "Loaded file.", ""

# Tone slider values as last seen by ui_process, plus which of the two
# currently defines the filter center. Seeded with the UI defaults
# (3000 Hz and the matching wavelength); shared by all callers of ui_process.
_TONE_STATE = {"center_hz": 3000.0, "wavelength": 343.0 / 3000.0, "source": "wavelength"}


def _as_float(value):
    """Return ``float(value)``, or None when the value cannot be converted."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def ui_process(speed, gain_db, pitch_semitones, center_hz, wavelength, q):
    """Run the effect chain and return the audio from the current offset.

    The tone filter can be set either as a center frequency or as a
    wavelength (frequency = 343 / wavelength). The wavelength slider always
    carries a value, so it used to override the frequency slider on every
    call. Now the slider that changed since the previous call wins:

    * wavelength changed (alone or together with the frequency): use the
      wavelength, clamped to 0.017 .. 17.15 m;
    * only the frequency changed: use the frequency;
    * neither changed: keep whichever was used last time.

    Returns:
        ``(wav_path, status_text)`` for the audio player and status box.
    """
    c_now = _as_float(center_hz)
    wl_now = _as_float(wavelength)

    c_moved = c_now is not None and not math.isclose(
        c_now, _TONE_STATE["center_hz"], rel_tol=1e-9
    )
    wl_moved = wl_now is not None and not math.isclose(
        wl_now, _TONE_STATE["wavelength"], rel_tol=1e-9
    )

    if wl_now is None:
        # No usable wavelength: fall back to the frequency as given.
        source = "center"
    elif wl_moved:
        source = "wavelength"
    elif c_moved:
        source = "center"
    else:
        source = _TONE_STATE["source"]

    if c_now is not None:
        _TONE_STATE["center_hz"] = c_now
    if wl_now is not None:
        _TONE_STATE["wavelength"] = wl_now
    _TONE_STATE["source"] = source

    if source == "wavelength":
        clamped = min(17.15, max(0.017, wl_now))
        center_hz = 343.0 / clamped

    y, sr = SESSION.process(speed, gain_db, pitch_semitones, center_hz, q)
    sliced = SESSION.slice_from_offset(y, sr)
    wav_path = SESSION.save_wav(sliced, sr)
    # Wavelength shown in the status is derived from the frequency in use.
    lam = 343.0 / max(1.0, float(center_hz or 1000.0))
    return wav_path, f"Offset: {SESSION.offset:.1f}s, Length: {len(sliced)/sr:.1f}s | λ≈{lam:.3f} m"

def ui_seek(delta):
    """Move the playback offset by *delta* seconds and return the new slice.

    The offset is kept within 0 .. duration of the last processed audio.
    Previously it could grow without bound past the end, so the status showed
    an offset beyond the audio and an equal amount of rewinding was needed
    before playback moved again.

    Returns:
        ``(wav_path, status_text)`` for the audio player and status box.
    """
    y, sr = SESSION.last_processed
    duration = len(y) / sr if sr else 0.0
    target = float(SESSION.offset) + float(delta)
    SESSION.offset = min(max(0.0, target), duration)
    sliced = SESSION.slice_from_offset(y, sr)
    wav_path = SESSION.save_wav(sliced, sr)
    return wav_path, f"Offset: {SESSION.offset:.1f}s, Length: {len(sliced)/sr:.1f}s"

def ui_reset_offset():
    """Jump back to the start and return what a zero-second seek returns."""
    SESSION.offset = 0.0
    restarted = ui_seek(0)
    return restarted

# ---------------------------
# Build Gradio Interface (Notebook-safe)
# ---------------------------
# Layout: input row, loader row, two rows of effect sliders, the process
# button with its audio/status outputs, and a row of seek buttons.
with gr.Blocks(title="Multilingual TTS Reader + Player") as demo:
    gr.Markdown(
        "# Multilingual TTS Reader + Player\n"
        "Default voice preset is a sweet girl's voice (Pitch +5, Tone 3000 Hz, Gain −2 dB). "
        "Adjust any sliders to override."
    )

    # Inputs: pasted text or a single uploaded document (passed on as a file path).
    with gr.Row():
        txt = gr.Textbox(label="Paste Text", lines=6, placeholder="Paste text in Hindi / English / Gujarati...")
        file = gr.File(label="Or upload: PDF/DOCX/PPTX/TXT", file_count="single", type="filepath")
    with gr.Row():
        load_text_btn = gr.Button("Load Text")
        load_file_btn = gr.Button("Load File")
        load_status = gr.Textbox(label="Loader Status", interactive=False)

    # Effect sliders; the initial values match the preset named in the header text.
    with gr.Row():
        speed = gr.Slider(0.25, 2.0, value=1.0, step=0.05, label="Speed (x)")
        gain = gr.Slider(-30, 12, value=-2, step=1, label="Loudness / Gain (dB)")
        pitch = gr.Slider(-12, 12, value=5, step=0.5, label="Pitch Shift (semitones)")
    with gr.Row():
        center = gr.Slider(20, 20000, value=3000, step=1, label="Tone Center Frequency (Hz)")
        # Initial wavelength is 343.0 / 3000.0, i.e. it corresponds to the 3000 Hz
        # default above; ui_process converts a wavelength to Hz as 343.0 / wavelength.
        wavelength = gr.Slider(0.017, 17.15, value=343.0/3000.0, step=0.001, label="Wavelength (m)")
        q = gr.Slider(0.2, 10.0, value=1.0, step=0.1, label="Tone Q (bandwidth)")

    process_btn = gr.Button("Process + Play from Offset")
    # The callbacks hand back a WAV file path; autoplay is enabled on the player.
    audio = gr.Audio(label="Audio Output", interactive=False, type="filepath", autoplay=True)
    status = gr.Textbox(label="Playback Status", interactive=False)

    with gr.Row():
        back10 = gr.Button("⏪ Rewind 10s")
        ahead10 = gr.Button("⏩ Forward 10s")
        reset = gr.Button("⏮️ Restart")

    # Events
    # Both loaders write to the loader status box and to the text box.
    load_text_btn.click(ui_load_text, inputs=[txt], outputs=[load_status, txt])
    load_file_btn.click(ui_load_file, inputs=[file], outputs=[load_status, txt])
    # The order of `inputs` must match ui_process's parameter order.
    process_btn.click(ui_process, inputs=[speed, gain, pitch, center, wavelength, q], outputs=[audio, status])
    # Seek buttons take no inputs; they shift the shared offset by -10 / +10 seconds.
    back10.click(lambda: ui_seek(-10), outputs=[audio, status])
    ahead10.click(lambda: ui_seek(10), outputs=[audio, status])
    reset.click(ui_reset_offset, outputs=[audio, status])

# In Jupyter, launch with:
demo.launch(debug=False, share=False)