# In [None]:
"""
Multilingual Text-to-Speech (Hindi/English/Gujarati) + Player + Sweet Girl Preset + Logging
------------------------------------------------------------------------------------------
What this notebook/app does
  • Reads text from: raw text, PDF, DOCX, PPTX, TXT (best-effort extraction)
  • Detects language (Hindi/English/Gujarati) and synthesizes speech
  • Default voice preset: “sweet girl” (pitch +5 semitones, tone center 3000 Hz, gain −2 dB)
  • Player controls (via Gradio): Play, Rewind 10s, Forward 10s, Restart, Speed, Gain, Pitch, Frequency, Wavelength
  • Wavelength <-> frequency linkage (λ = 343 / f)
  • Saves output **MP3** by default (falls back to WAV if MP3 export fails) and logs each run
  • Logging/Archival with auto-increment ID per item:
      Textual Data/
        ├── Hindi/
        ├── English/
        └── Gujarati/
      Audio/
        ├── Hindi/
        ├── English/
        └── Gujarati/
      logs/session_log.csv


#Installation (run in a Jupyter cell)
#------------------------------------
!pip install --upgrade pip

# Core libraries
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
!pip install transformers
!pip install gtts
!pip install pydub
!pip install librosa
!pip install soundfile
!pip install gradio
!pip install langdetect
!pip install python-docx
!pip install python-pptx
!pip install PyPDF2
!pip install pandas
!pip install scipy

# For MP3 export, you need FFmpeg available on your system PATH.
# Linux (Debian/Ubuntu):  !apt-get update && apt-get install -y ffmpeg
# Windows: install FFmpeg and add to PATH (https://ffmpeg.org/)
# Mac:     brew install ffmpeg
"""

import os
import io
import csv
import math
import time
import tempfile
from datetime import datetime
from pathlib import Path

import numpy as np
import soundfile as sf
from langdetect import detect

# File readers
import PyPDF2
from docx import Document as Docx
from pptx import Presentation

# Audio processing
import librosa
from scipy.signal import butter, sosfilt
from pydub import AudioSegment

# UI
import gradio as gr

# Optional: logs via pandas for convenience (not required for core)
try:
    import pandas as pd
except Exception:
    pd = None

# ---------------------------
# Config & Paths
# ---------------------------
# Allowed range for every user-adjustable playback / FX parameter.
SPEED_MIN = 0.25
SPEED_MAX = 2.0
GAIN_MIN_DB = -30
GAIN_MAX_DB = 12
PITCH_MIN = -12  # semitones
PITCH_MAX = 12  # semitones
FREQ_MIN_HZ = 20
FREQ_MAX_HZ = 20000
# Wavelength bounds in metres; they sit close to AIR_C / FREQ_MAX_HZ and
# AIR_C / FREQ_MIN_HZ respectively.
WAVEL_MIN_M = 0.017
WAVEL_MAX_M = 17.15
AIR_C = 343.0  # m/s

# Start-up values for the controls ("sweet girl" preset) plus the sample rate
# used for synthesis.
DEFAULTS = dict(
    speed=1.0,
    gain_db=-2.0,
    pitch_semitones=5.0,  # sweet girl preset
    center_hz=3000.0,
    wavelength_m=AIR_C / 3000.0,
    q=1.0,
    sr=24000,
)

# Archive layout: one sub-folder per language under each base directory.
BASE_TEXT_DIR = Path("Textual Data")
BASE_AUDIO_DIR = Path("Audio")
LOGS_DIR = Path("logs")
LOG_CSV = LOGS_DIR / "session_log.csv"

# Language code (as returned by the detector) -> archive folder name.
LANG_MAP = dict(hi="Hindi", en="English", gu="Gujarati")

# Ensure directories
for root in (BASE_TEXT_DIR, BASE_AUDIO_DIR, LOGS_DIR):
    root.mkdir(parents=True, exist_ok=True)
for lang_name in ("Hindi", "English", "Gujarati"):
    Path(BASE_TEXT_DIR, lang_name).mkdir(parents=True, exist_ok=True)
    Path(BASE_AUDIO_DIR, lang_name).mkdir(parents=True, exist_ok=True)

# ---------------------------
# Utility: Text Extraction
# ---------------------------

def extract_text(path: str) -> str:
    """Return the text content of the file at *path* (best effort).

    The reader is picked from the file extension (case-insensitive):

    * ``.pdf``  - PyPDF2; pages are joined with blank lines and a page whose
      text cannot be extracted contributes an empty string.
    * ``.doc`` / ``.docx`` - paragraph text via python-docx.
    * ``.ppt`` / ``.pptx`` - text of every shape exposing ``.text`` via
      python-pptx.
    * anything else - read as UTF-8 text, dropping undecodable bytes.

    Errors from opening the file or from the DOCX/PPTX readers propagate to
    the caller.
    """
    ext = Path(path).suffix.lower()
    if ext == ".pdf":
        with open(path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            pages = []
            # Index-based on purpose: the page lookup stays inside the try so
            # a page that cannot be loaded is skipped like one that cannot be
            # parsed.
            for i in range(len(reader.pages)):
                try:
                    pages.append(reader.pages[i].extract_text() or "")
                except Exception:
                    pages.append("")
            return "\n\n".join(pages)
    elif ext in {".doc", ".docx"}:
        # NOTE(review): python-docx targets the .docx format; a legacy binary
        # .doc file will presumably raise here - confirm and reject earlier if so.
        doc = Docx(path)
        return "\n".join(p.text for p in doc.paragraphs)
    elif ext in {".ppt", ".pptx"}:
        # NOTE(review): same caveat as above for legacy binary .ppt files.
        prs = Presentation(path)
        texts = []
        for slide in prs.slides:
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    texts.append(shape.text)
        return "\n\n".join(texts)
    else:
        # "utf-8-sig" decodes plain UTF-8 identically but strips a leading
        # byte-order mark, which would otherwise reach language detection and
        # the TTS engine as a stray U+FEFF character.
        with open(path, "r", encoding="utf-8-sig", errors="ignore") as f:
            return f.read()

# ---------------------------
# Utility: Chunking & Lang Detect
# ---------------------------

def _split_long_part(part: str, limit: int) -> list[str]:
    """Break one over-long line into pieces of at most *limit* characters.

    Pieces are cut at whitespace; a single word longer than *limit* is sliced
    hard because there is no better boundary.
    """
    pieces = []
    current = ""
    for word in part.split():
        while len(word) > limit:
            if current:
                pieces.append(current)
                current = ""
            pieces.append(word[:limit])
            word = word[limit:]
        candidate = f"{current} {word}" if current else word
        if len(candidate) > limit:
            pieces.append(current)
            current = word
        else:
            current = candidate
    if current:
        pieces.append(current)
    return pieces


def chunk_text(text: str, max_chars: int = 400) -> list[str]:
    """Split *text* into chunks of at most *max_chars* characters.

    Non-blank lines are stripped and packed greedily, joined by single
    spaces. A line that is longer than *max_chars* on its own is first split
    at word boundaries so that no chunk exceeds the limit.

    Returns an empty list when *text* is empty or contains only whitespace.
    """
    limit = max(1, int(max_chars))

    parts = []
    for line in text.replace("\r", "\n").split("\n"):
        line = line.strip()
        if not line:
            continue
        if len(line) <= limit:
            parts.append(line)
        else:
            parts.extend(_split_long_part(line, limit))

    chunks = []
    buf = []
    total = 0  # length of the joined buffer plus one trailing separator
    for part in parts:
        if buf and total + len(part) + 1 > limit:
            chunks.append(" ".join(buf))
            buf, total = [], 0
        buf.append(part)
        total += len(part) + 1
    if buf:
        chunks.append(" ".join(buf))
    return chunks


def detect_lang_safe(text: str) -> str:
    """Return the detected language code for *text*, or ``"en"`` on any failure."""
    fallback = "en"
    try:
        code = detect(text)
    except Exception:
        # Detection is best effort: any error falls back to English.
        return fallback
    return code

# ---------------------------
# TTS Backends
# ---------------------------
# Each speech backend is optional: try to import it and record the outcome in
# a module-level flag that the synthesis code checks before using it.
try:
    from TTS.api import TTS as COQUI_TTS
except Exception:
    _HAS_COQUI = False
else:
    _HAS_COQUI = True

try:
    from gtts import gTTS
except Exception:
    _HAS_GTTS = False
else:
    _HAS_GTTS = True

try:
    import pyttsx3
except Exception:
    _HAS_PYTT = False
else:
    _HAS_PYTT = True


# Languages each backend is asked to speak natively in this file.
_COQUI_LANGS = frozenset({"en", "hi"})
_GTTS_LANGS = frozenset({"en", "hi", "gu"})

# Loaded Coqui models keyed by model name, so the model is built once per
# process instead of once per text chunk.
_COQUI_MODELS: dict = {}


def _discard_file(path: str) -> None:
    """Delete *path*, ignoring the case where it is missing or still locked."""
    try:
        os.unlink(path)
    except OSError:
        pass


def _synth_coqui(text: str, lang: str, sr: int) -> np.ndarray:
    """Synthesize with Coqui XTTS v2, using English for unsupported languages."""
    model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
    tts = _COQUI_MODELS.get(model_name)
    if tts is None:
        tts = COQUI_TTS(model_name)
        _COQUI_MODELS[model_name] = tts
    wav = tts.tts(text=text, language=lang if lang in _COQUI_LANGS else "en")
    y = np.array(wav, dtype=np.float32)
    native_sr = tts.synthesizer.output_sample_rate
    if sr != native_sr:
        y = librosa.resample(y, orig_sr=native_sr, target_sr=sr)
    return y.astype(np.float32)


def _synth_gtts(text: str, lang: str, sr: int) -> np.ndarray:
    """Synthesize with gTTS via a temporary MP3 that is removed afterwards."""
    # Reserve a path and close the handle before gTTS writes to it.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
        mp3_path = tmp.name
    try:
        gTTS(text=text, lang=lang).save(mp3_path)
        seg = AudioSegment.from_file(mp3_path)
        seg = seg.set_channels(1).set_frame_rate(sr)
        samples = np.array(seg.get_array_of_samples()).astype(np.float32)
        # Scale integer PCM to the [-1, 1) float range.
        y = samples / (2 ** (8 * seg.sample_width - 1))
        return y.astype(np.float32)
    finally:
        _discard_file(mp3_path)


def _synth_pyttsx3(text: str, lang: str, sr: int) -> np.ndarray:
    """Synthesize with pyttsx3 via a temporary WAV that is removed afterwards."""
    engine = pyttsx3.init()
    for voice in engine.getProperty('voices'):
        tag = voice.languages[0] if voice.languages else voice.id
        # The tag may arrive as bytes or as str; normalise before matching.
        if isinstance(tag, bytes):
            tag = tag.decode('utf-8', errors='ignore')
        if lang in str(tag):
            engine.setProperty('voice', voice.id)
            break
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    try:
        engine.save_to_file(text, wav_path)
        engine.runAndWait()
        y, _ = librosa.load(wav_path, sr=sr, mono=True)
        return y.astype(np.float32)
    finally:
        _discard_file(wav_path)


def synthesize_chunk(text: str, lang_hint: str, sr: int = DEFAULTS["sr"]) -> np.ndarray:
    """Synthesize one text chunk to mono float32 audio at sample rate *sr*.

    Backends are tried in order and the first one that succeeds wins:

    1. Coqui, when installed and *lang_hint* is one it is asked to speak.
    2. gTTS, when installed and *lang_hint* is one it is asked to speak.
    3. Coqui with its English voice, for any other language.
    4. pyttsx3, when installed.

    A backend that raises is skipped (best effort). If every backend fails or
    none is installed, half a second of silence is returned.
    """
    lang = lang_hint

    backends = []
    if _HAS_COQUI and lang in _COQUI_LANGS:
        backends.append(_synth_coqui)
    if _HAS_GTTS and lang in _GTTS_LANGS:
        backends.append(_synth_gtts)
    if _HAS_COQUI and lang not in _COQUI_LANGS:
        # Only reached after gTTS so that, e.g., Gujarati is not read with the
        # English voice while a backend that speaks it is available.
        backends.append(_synth_coqui)
    if _HAS_PYTT:
        backends.append(_synth_pyttsx3)

    for backend in backends:
        try:
            return backend(text, lang, sr)
        except Exception:
            # Deliberate best effort: fall through to the next backend.
            continue
    return np.zeros(int(sr * 0.5), dtype=np.float32)


def synthesize_text(text: str, sr: int = DEFAULTS["sr"]) -> tuple[np.ndarray, int]:
    """Synthesize *text* chunk by chunk and return ``(audio, sr)``.

    Each chunk gets its own language detection before synthesis. The joined
    audio is peak-normalised to 0.95; with no chunks at all, half a second of
    silence is produced instead.
    """
    rendered = [
        synthesize_chunk(piece, detect_lang_safe(piece), sr=sr)
        for piece in chunk_text(text)
    ]
    if rendered:
        audio = np.concatenate(rendered)
    else:
        audio = np.zeros(int(sr * 0.5), dtype=np.float32)
    # `or 1.0` keeps an all-zero signal from dividing by zero.
    loudest = np.max(np.abs(audio)) or 1.0
    audio = audio / loudest * 0.95
    return audio.astype(np.float32), sr

# ---------------------------
# Audio FX
# ---------------------------

def apply_gain_db(y: np.ndarray, db: float) -> np.ndarray:
    """Scale *y* by *db* decibels and return float32 audio.

    If the scaled signal would exceed full scale, it is renormalised so its
    peak sits at 0.99.
    """
    scaled = y * (10 ** (db / 20.0))
    peak = np.max(np.abs(scaled))
    if peak > 1.0:
        scaled = scaled / peak * 0.99
    return scaled.astype(np.float32)


def apply_speed(y: np.ndarray, sr: int, speed: float) -> np.ndarray:
    """Time-stretch *y* by *speed*, clamped to [SPEED_MIN, SPEED_MAX].

    *sr* is accepted for a uniform effect signature but is not used here.
    """
    requested = float(speed)
    rate = max(SPEED_MIN, min(SPEED_MAX, requested))
    stretched = librosa.effects.time_stretch(y, rate=rate)
    return stretched.astype(np.float32)


def apply_pitch(y: np.ndarray, sr: int, semitones: float) -> np.ndarray:
    """Pitch-shift *y* by *semitones*, clamped to [PITCH_MIN, PITCH_MAX]."""
    requested = float(semitones)
    steps = max(PITCH_MIN, min(PITCH_MAX, requested))
    shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=steps)
    return shifted.astype(np.float32)


def bandpass(y: np.ndarray, sr: int, center_hz: float, q: float = 1.0) -> np.ndarray:
    """Band-pass *y* around *center_hz* with a 4th-order Butterworth filter.

    The bandwidth is ``center_hz / q`` (with *q* floored at 0.1). The centre
    and both edges are kept inside the range the sample rate allows. If the
    resulting band is empty, *y* is returned unchanged.
    """
    nyquist = sr / 2
    highest_center = min(FREQ_MAX_HZ, nyquist - 100)
    center_hz = float(np.clip(center_hz, FREQ_MIN_HZ, highest_center))

    half_band = (center_hz / max(q, 0.1)) / 2
    low = max(10.0, center_hz - half_band)
    high = min(nyquist - 10.0, center_hz + half_band)
    if low >= high:
        return y

    # butter() takes the band edges normalised to the Nyquist frequency.
    edges = [low / nyquist, high / nyquist]
    sos = butter(4, edges, btype='bandpass', output='sos')
    return sosfilt(sos, y).astype(np.float32)

# ---------------------------
# Logging & ID helpers
# ---------------------------

def next_id() -> str:
    """Compute the next zero-padded ID (6 digits).

    The highest ID is taken from both the ``ID`` column of session_log.csv
    and the numeric file names already present in the text and audio archive
    folders. Looking at the archive as well means a missing, empty or
    unreadable log can never hand out an ID whose files already exist and
    would be overwritten.
    """
    LOGS_DIR.mkdir(parents=True, exist_ok=True)
    max_id = 0

    if LOG_CSV.exists():
        try:
            with open(LOG_CSV, newline='', encoding='utf-8') as f:
                for row in csv.DictReader(f):
                    try:
                        max_id = max(max_id, int(row.get('ID', 0)))
                    except (TypeError, ValueError):
                        # Blank or non-numeric ID cell: ignore that row.
                        pass
        except (OSError, UnicodeError, csv.Error):
            # Unreadable log: rely on the archive scan below.
            pass

    for base in (BASE_TEXT_DIR, BASE_AUDIO_DIR):
        if not base.is_dir():
            continue
        # Archive files live one level down, in the per-language folders.
        for entry in base.glob("*/*"):
            try:
                max_id = max(max_id, int(entry.stem))
            except ValueError:
                # Not an archive file named after an ID.
                pass

    return f"{max_id+1:06d}"


def write_log_row(row: dict):
    """Append one run record to the session log CSV.

    The header row is written first whenever the log file is missing or
    empty, so the file always starts with column names. The log directory is
    created if needed.

    Raises:
        ValueError: if *row* has a key that is not one of the log columns
            (default ``csv.DictWriter`` behaviour).
    """
    headers = ["ID", "Language", "InputSource", "Timestamp", "TextFilePath", "AudioFilePath", "Speed", "GainDB", "PitchSemitones", "CenterHz", "WavelengthM", "Q"]
    LOG_CSV.parent.mkdir(parents=True, exist_ok=True)
    # An existing but empty file (e.g. left by an earlier failed write) still
    # needs its header.
    needs_header = not LOG_CSV.exists() or LOG_CSV.stat().st_size == 0
    with open(LOG_CSV, "a", newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=headers)
        if needs_header:
            writer.writeheader()
        writer.writerow(row)


def save_text_and_audio(id_str: str, language_name: str, text: str, y: np.ndarray, sr: int, prefer_mp3: bool = True) -> tuple[Path, Path, str]:
    """Archive *text* and audio *y* under the given language and ID.

    The text goes to ``<BASE_TEXT_DIR>/<language_name>/<id_str>.txt``. The
    audio goes to the matching folder under BASE_AUDIO_DIR, as MP3 when
    *prefer_mp3* is true and the export succeeds, otherwise as WAV.

    Returns:
        ``(text_path, audio_path, used_format)`` where *used_format* is
        ``"mp3"`` or ``"wav"``.
    """
    text_dir = BASE_TEXT_DIR / language_name
    audio_dir = BASE_AUDIO_DIR / language_name
    text_dir.mkdir(parents=True, exist_ok=True)
    audio_dir.mkdir(parents=True, exist_ok=True)

    txt_path = text_dir / f"{id_str}.txt"
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(text)

    if prefer_mp3:
        mp3_path = audio_dir / f"{id_str}.mp3"
        # The MP3 is encoded from an intermediate WAV. Reserve its path and
        # close the handle straight away so nothing holds the file open while
        # it is written, read and finally deleted.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            wav_tmp = tmp.name
        try:
            sf.write(wav_tmp, y, sr)
            seg = AudioSegment.from_file(wav_tmp)
            seg.export(str(mp3_path), format="mp3", bitrate="192k")
            return txt_path, mp3_path, "mp3"
        except Exception:
            # Export failed: drop any partial MP3 and fall back to WAV below.
            try:
                mp3_path.unlink(missing_ok=True)
            except OSError:
                pass
        finally:
            try:
                os.unlink(wav_tmp)
            except OSError:
                pass

    wav_path = audio_dir / f"{id_str}.wav"
    sf.write(str(wav_path), y, sr)
    return txt_path, wav_path, "wav"

# ---------------------------
# Session State
# ---------------------------
class Session:
    """Mutable state for one reader session.

    Holds the synthesized audio, the playback offset, the most recent
    processed result and details of the last archived run.
    """

    def __init__(self):
        self.y_full = None  # unprocessed synthesized audio, or None before loading
        self.sr = DEFAULTS["sr"]
        self.offset = 0.0  # playback start position in seconds
        self.last_processed = None  # (audio, sr) from the latest process() call
        self.last_text = ""
        self.last_language = "English"
        self.last_id = None
        self.last_saved_audio_path = None
        self.last_saved_text_path = None

    def detect_language_name(self, text: str) -> str:
        """Return the archive folder name for *text*'s language (default English)."""
        code = detect_lang_safe(text or "")
        return LANG_MAP.get(code, "English")

    def load_text(self, text: str):
        """Synthesize *text* and reset the playback state."""
        self.last_text = text or ""
        # The language label is taken from the first 400 characters only.
        self.last_language = self.detect_language_name(self.last_text[:400])
        y, sr = synthesize_text(self.last_text, sr=self.sr)
        self.y_full, self.sr = y, sr
        self.offset = 0.0
        self.last_processed = None

    def load_file(self, path: str):
        """Extract the text of the file at *path* and load it."""
        text = extract_text(path)
        self.load_text(text)

    def process(self, speed=DEFAULTS["speed"], gain_db=DEFAULTS["gain_db"], pitch_semitones=DEFAULTS["pitch_semitones"], center_hz=DEFAULTS["center_hz"], q=DEFAULTS["q"]):
        """Apply the effect chain to the loaded audio.

        Returns ``(audio, sr)``, or ``(None, None)`` when nothing is loaded.
        An effect whose parameter is falsy (or a speed of ~1.0) is skipped.
        """
        if self.y_full is None:
            return None, None
        y = self.y_full.copy()
        # Order: pitch -> speed -> tone -> gain
        if pitch_semitones:
            y = apply_pitch(y, self.sr, pitch_semitones)
        if speed and abs(speed - 1.0) > 1e-3:
            y = apply_speed(y, self.sr, speed)
        if center_hz:
            y = bandpass(y, self.sr, center_hz, q)
        if gain_db:
            y = apply_gain_db(y, gain_db)
        self.last_processed = (y, self.sr)
        return y, self.sr

    def slice_from_offset(self, y: np.ndarray, sr: int) -> np.ndarray:
        """Return the part of *y* that starts at the current offset."""
        start = int(self.offset * sr)
        start = max(0, min(start, len(y)))
        return y[start:]

    def save_run(self, y: np.ndarray, sr: int, freq_hz: float, wavelength_m: float, speed: float, gain_db: float, pitch_semitones: float, q: float, input_source: str = "typed", prefer_mp3: bool = True) -> tuple[str, Path, Path, str]:
        """Archive the current text and audio *y*, then log the run.

        Returns ``(id, text_path, audio_path, format)``.
        """
        # Local import: only `datetime` itself is imported at file level.
        from datetime import timezone

        id_str = next_id()
        txt_path, audio_path, fmt = save_text_and_audio(id_str, self.last_language, self.last_text, y, sr, prefer_mp3=prefer_mp3)
        # datetime.utcnow() is deprecated; take an aware UTC time and drop the
        # tzinfo so the logged string keeps the same format as before.
        timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
        write_log_row({
            "ID": id_str,
            "Language": self.last_language,
            "InputSource": input_source,
            "Timestamp": timestamp,
            "TextFilePath": str(txt_path),
            "AudioFilePath": str(audio_path),
            "Speed": speed,
            "GainDB": gain_db,
            "PitchSemitones": pitch_semitones,
            "CenterHz": round(freq_hz, 3),
            "WavelengthM": round(wavelength_m, 5),
            "Q": q,
        })
        self.last_id = id_str
        self.last_saved_audio_path = audio_path
        self.last_saved_text_path = txt_path
        return id_str, txt_path, audio_path, fmt

SESSION = Session()

# ---------------------------
# Gradio Callbacks
# ---------------------------

def ui_load_text(text):
    """Load pasted text into the session.

    Returns a status update naming the detected language, followed by an
    empty string for the second output.
    """
    SESSION.load_text(text)
    message = f"Loaded text. Language: {SESSION.last_language}"
    return gr.update(value=message), ""


def ui_load_file(file):
    """Load an uploaded file into the session.

    Returns a status update naming the detected language, followed by the
    first 500 characters of the extracted text. With no file, returns a
    no-op update and an empty string.
    """
    if file is not None:
        SESSION.load_file(file.name)
        message = f"Loaded file. Language: {SESSION.last_language}"
        return gr.update(value=message), SESSION.last_text[:500]
    return gr.update(), ""


# Slider values seen by the previous ui_process call and the centre frequency
# they resolved to. Comparing against them shows which of the two linked
# sliders (frequency / wavelength) was moved since then.
_TONE_LINK = {
    "center_hz": float(DEFAULTS["center_hz"]),
    "wavelength_m": float(DEFAULTS["wavelength_m"]),
    "resolved_hz": float(DEFAULTS["center_hz"]),
}


def _resolve_center_hz(center_hz, wavelength) -> float:
    """Return the tone centre in Hz from whichever linked slider changed.

    A changed frequency wins; otherwise a changed wavelength is converted
    with f = AIR_C / wavelength; if neither changed, the previous result is
    reused. The result is clamped to [FREQ_MIN_HZ, FREQ_MAX_HZ].
    """
    center_in = None if center_hz is None else float(center_hz)
    wavel_in = None if wavelength is None else float(wavelength)

    center_moved = center_in is not None and not math.isclose(
        center_in, _TONE_LINK["center_hz"], rel_tol=1e-9, abs_tol=1e-9)
    wavel_moved = wavel_in is not None and not math.isclose(
        wavel_in, _TONE_LINK["wavelength_m"], rel_tol=1e-9, abs_tol=1e-9)

    if center_moved:
        resolved = center_in
    elif wavel_moved:
        clamped = max(WAVEL_MIN_M, min(WAVEL_MAX_M, wavel_in))
        resolved = AIR_C / clamped
    else:
        resolved = _TONE_LINK["resolved_hz"]
    resolved = float(max(FREQ_MIN_HZ, min(FREQ_MAX_HZ, resolved)))

    if center_in is not None:
        _TONE_LINK["center_hz"] = center_in
    if wavel_in is not None:
        _TONE_LINK["wavelength_m"] = wavel_in
    _TONE_LINK["resolved_hz"] = resolved
    return resolved


def ui_process(speed, gain_db, pitch_semitones, center_hz, wavelength, q):
    """Process the loaded audio, archive the result and return a playable file.

    Returns ``(play_wav_path, status_text, saved_audio_path)``, or
    ``(None, "No audio yet.", "")`` when nothing has been loaded.
    """
    # Wavelength <-> Frequency synchronization. The wavelength value is never
    # None, so applying it unconditionally would make the frequency control
    # ineffective; use whichever of the two actually changed.
    center_hz = _resolve_center_hz(center_hz, wavelength)

    y, sr = SESSION.process(speed, gain_db, pitch_semitones, center_hz, q)
    if y is None:
        return None, "No audio yet.", ""

    # Playback slice from current offset. The file is kept (delete=False) so
    # it can still be read after this function returns; only the handle is
    # closed here.
    sliced = SESSION.slice_from_offset(y, sr)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        play_wav_path = tmp.name
    sf.write(play_wav_path, sliced, sr)

    # Save FULL processed audio + text with auto-increment ID (archive/log)
    # NOTE(review): input_source is always "typed", even for text that came
    # from an uploaded file - confirm whether the log should distinguish them.
    full_id, txt_path, audio_path, fmt = SESSION.save_run(y, sr, center_hz, AIR_C / center_hz, speed, gain_db, pitch_semitones, q, input_source="typed", prefer_mp3=True)

    status = (f"Saved ID #{full_id} | Lang: {SESSION.last_language} | File: {audio_path.name} ({fmt.upper()})\n"
              f"Text → {txt_path}\nAudio → {audio_path}\n"
              f"Offset: {SESSION.offset:.1f}s | Play length: {len(sliced)/sr:.1f}s | λ≈{AIR_C/center_hz:.3f} m")
    return play_wav_path, status, str(audio_path)


def ui_seek(delta):
    """Move the playback offset by *delta* seconds and re-slice the audio.

    The offset never goes below zero and, once processed audio exists, never
    past the end of the clip, so repeated forward seeks cannot push it
    arbitrarily far beyond the audio.

    Returns ``(wav_path, status_text)``; *wav_path* is None when no audio has
    been processed yet.
    """
    target = max(0.0, SESSION.offset + float(delta))
    if SESSION.last_processed is None:
        SESSION.offset = target
        return None, f"Offset: {SESSION.offset:.1f}s"
    y, sr = SESSION.last_processed
    SESSION.offset = min(target, len(y) / sr)
    sliced = SESSION.slice_from_offset(y, sr)
    # Keep the file (delete=False) so it outlives this call; close the handle.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    sf.write(wav_path, sliced, sr)
    return wav_path, f"Offset: {SESSION.offset:.1f}s, Play length: {len(sliced)/sr:.1f}s"


def ui_reset_offset():
    """Rewind playback to the start and return the refreshed seek result."""
    SESSION.offset = 0.0
    refreshed = ui_seek(0)
    return refreshed

# ---------------------------
# Build Gradio Interface
# ---------------------------
# Declare the whole UI inside one Blocks context; components appear in the
# order they are created here.
with gr.Blocks(title="Multilingual TTS Reader + Player + Logging") as demo:
    gr.Markdown("# Multilingual TTS Reader + Player + Logging\nDefault preset: **sweet girl's voice** (Pitch +5, Tone 3 kHz, Gain −2 dB). Adjust any slider to override.\n\n**Note:** Audible/safe ranges enforced. For MP3 export, FFmpeg must be installed.")

    # Input row: pasted text on one side, file upload on the other.
    with gr.Row():
        txt = gr.Textbox(label="Paste Text", lines=6, placeholder="Type or paste Hindi / English / Gujarati / mixed text…")
        file = gr.File(file_types=[".pdf", ".docx", ".pptx", ".txt"], label="Or upload: PDF / DOCX / PPTX / TXT")
    # Loader buttons and the read-only status line they report to.
    with gr.Row():
        load_text_btn = gr.Button("Load Text")
        load_file_btn = gr.Button("Load File")
        load_status = gr.Textbox(label="Loader Status", interactive=False)

    # Effect controls; ranges and start values come from the config constants.
    with gr.Row():
        speed = gr.Slider(SPEED_MIN, SPEED_MAX, value=DEFAULTS["speed"], step=0.05, label="Speed (x)")
        gain = gr.Slider(GAIN_MIN_DB, GAIN_MAX_DB, value=DEFAULTS["gain_db"], step=1, label="Loudness / Gain (dB)")
        pitch = gr.Slider(PITCH_MIN, PITCH_MAX, value=DEFAULTS["pitch_semitones"], step=0.5, label="Pitch Shift (semitones)")
    with gr.Row():
        center = gr.Slider(FREQ_MIN_HZ, FREQ_MAX_HZ, value=DEFAULTS["center_hz"], step=1, label="Tone Center Frequency (Hz)")
        wavelength = gr.Slider(WAVEL_MIN_M, WAVEL_MAX_M, value=DEFAULTS["wavelength_m"], step=0.001, label="Wavelength (m)")
        q = gr.Slider(0.2, 10.0, value=DEFAULTS["q"], step=0.1, label="Tone Q (bandwidth)")

    # Processing trigger and its three outputs.
    process_btn = gr.Button("Process + Save + Play from Offset")
    audio = gr.Audio(label="Audio Output", interactive=False)
    status = gr.Textbox(label="Status & Saved Paths", interactive=False)
    last_audio_path = gr.Textbox(label="Last Saved Audio Path", interactive=False)

    # Transport buttons; each one moves the session offset and re-slices.
    with gr.Row():
        back10 = gr.Button("⏪ Rewind 10s")
        ahead10 = gr.Button("⏩ Forward 10s")
        reset = gr.Button("⏮️ Restart")

    # Events
    # Both loaders write to the status line and to the text box.
    load_text_btn.click(ui_load_text, inputs=[txt], outputs=[load_status, txt])
    load_file_btn.click(ui_load_file, inputs=[file], outputs=[load_status, txt])
    process_btn.click(ui_process, inputs=[speed, gain, pitch, center, wavelength, q], outputs=[audio, status, last_audio_path])
    # Seek by a fixed number of seconds (negative = rewind).
    back10.click(lambda: ui_seek(-10), outputs=[audio, status])
    ahead10.click(lambda: ui_seek(10), outputs=[audio, status])
    reset.click(ui_reset_offset, outputs=[audio, status])

# In Jupyter, run:
# NOTE: this starts the app as soon as the module/cell is executed.
demo.launch(debug=False, share=False)
