In [1]:
"""
Offline TTS + Audio Caching System
Demo – VisionAssist

"""

import os
import time
import hashlib
import threading
import queue
from pathlib import Path
from typing import Optional, Dict

import numpy as np
import soundfile as sf
from pydub import AudioSegment
from pydub.playback import play

# Root folders: generated WAV files go into "tts_cache" under the working directory.
ROOT_DIR = Path(".")
AUDIO_CACHE = ROOT_DIR.joinpath("tts_cache")
AUDIO_CACHE.mkdir(exist_ok=True)  # no error when the folder already exists

print("Environment initialized.")


Environment initialized.


  m = re.match('([su]([0-9]{1,2})p?) \(([0-9]{1,2}) bit\)$', token)
  m2 = re.match('([su]([0-9]{1,2})p?)( \(default\))?$', token)
  elif re.match('(flt)p?( \(default\))?$', token):
  elif re.match('(dbl)p?( \(default\))?$', token):


In [2]:
class Timer:
    """Simple latency measurement utility.

    Call ``start()`` to begin timing and ``stop()`` to read the elapsed time
    in milliseconds. Readings come from ``time.perf_counter()``, a monotonic
    high-resolution clock, so a wall-clock adjustment between ``start()`` and
    ``stop()`` cannot produce a negative or distorted duration.
    """
    def __init__(self):
        # None means start() has not been called yet.
        self.start_time = None

    def start(self):
        """Record the starting clock reading (restarts the timer if already started)."""
        self.start_time = time.perf_counter()

    def stop(self) -> float:
        """Return the milliseconds elapsed since ``start()``.

        Returns 0.0 when the timer was never started. The start reading is
        kept, so calling ``stop()`` again reports the time since the same
        ``start()``.
        """
        if self.start_time is None:
            return 0.0
        return (time.perf_counter() - self.start_time) * 1000  # ms


In [3]:
def normalize_alert_text(text: str) -> str:
    """
    Canonicalize an alert phrase so equivalent phrasings hash to the same key.

    The text is lower-cased, leading/trailing whitespace is removed and every
    internal run of whitespace is collapsed to a single space.
    """
    # split() with no arguments discards leading/trailing whitespace and
    # treats any run of whitespace as one separator.
    words = text.lower().split()
    return " ".join(words)


In [4]:
def generate_tone_wav(path: Path, duration: float = 0.35, freq: float = 440,
                      samplerate: int = 22050, amplitude: float = 0.5) -> None:
    """
    Generate a placeholder WAV that mimics offline TTS output.

    Writes a sine tone to ``path`` via ``soundfile.write``.

    Args:
        path: Destination file for the generated audio.
        duration: Tone length in seconds.
        freq: Tone frequency in Hz.
        samplerate: Samples per second of the written file (default 22050,
            the value previously hard-coded here).
        amplitude: Multiplier applied to the unit sine wave (default 0.5,
            the value previously hard-coded here).
    """
    n_samples = int(samplerate * duration)
    # endpoint=False keeps the sample spacing at exactly 1 / samplerate.
    t = np.linspace(0, duration, n_samples, False)
    tone = amplitude * np.sin(freq * 2 * np.pi * t)
    sf.write(str(path), tone, samplerate)


In [5]:
class TTSCache:
    """
    Offline TTS caching + generation manager.

    Each alert is stored as ``tts_<md5>.wav`` inside ``cache_dir``. The digest
    is always computed over the normalized alert text, so ``get_path``,
    ``exists``, ``generate`` and ``get_or_create`` all resolve differently
    cased or spaced variants of a phrase to the same file.
    """
    def __init__(self, cache_dir=AUDIO_CACHE):
        self.cache_dir = Path(cache_dir)
        # parents=True lets a nested cache location be used without
        # pre-creating the intermediate folders.
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _hash(self, text: str) -> str:
        """Return the hex MD5 digest of ``text``; used only as a file-name key."""
        return hashlib.md5(text.encode("utf-8")).hexdigest()

    def get_path(self, text: str) -> Path:
        """Return the cache file path for ``text`` (normalized before hashing)."""
        h = self._hash(normalize_alert_text(text))
        return self.cache_dir / f"tts_{h}.wav"

    def exists(self, text: str) -> bool:
        """Return True when a cached file for ``text`` is present on disk."""
        return self.get_path(text).exists()

    def generate(self, text: str) -> Path:
        """
        Simulate offline TTS by generating a tone WAV.

        The file is written to the cache path for ``text`` and that path is
        returned.
        """
        path = self.get_path(text)
        generate_tone_wav(path)
        return path

    def get_or_create(self, text: str) -> Path:
        """
        Return the cached WAV path for ``text``, generating the file if missing.

        Prints whether the audio was generated or served from the cache.
        """
        # Normalize here as well so the log lines show the canonical phrase.
        text = normalize_alert_text(text)
        path = self.get_path(text)
        if not path.exists():
            print(f"[TTS] Generating new audio: '{text}'")
            self.generate(text)
        else:
            print(f"[TTS] Using cached audio: '{text}'")
        return path


In [6]:
# Shared cache instance; with no argument it uses the default AUDIO_CACHE directory.
tts_cache = TTSCache()


In [7]:
class AudioPlayer:
    """Handles queued async audio playback.

    ``play()`` enqueues a WAV path and returns immediately; a daemon worker
    thread takes paths off the queue one at a time, in order, and plays them.
    ``stop()`` enqueues a sentinel behind any pending clips and waits for the
    worker to exit.
    """

    def __init__(self):
        self.queue = queue.Queue()
        # Unique sentinel compared by identity, so it can never equal a real path.
        self.stop_signal = object()
        self.worker = threading.Thread(target=self._worker_fn, daemon=True)
        self.worker.start()

    def _worker_fn(self):
        """Worker loop: play queued files until the stop sentinel arrives."""
        while True:
            item = self.queue.get()
            try:
                if item is self.stop_signal:
                    break

                wav_path: Path = item
                try:
                    audio = AudioSegment.from_wav(wav_path)
                    play(audio)
                except Exception as e:
                    # Best effort: one bad clip must not kill the worker.
                    print("[Audio Error]", e)
            finally:
                # Acknowledge every get(), including the sentinel, so that
                # queue.join() cannot block forever after shutdown.
                self.queue.task_done()

    def play(self, wav_path: Path):
        """Queue ``wav_path`` for playback without blocking the caller."""
        self.queue.put(wav_path)

    def stop(self):
        """Stop the worker after it has processed everything queued so far.

        Safe to call more than once: when the worker has already exited, no
        further sentinel is enqueued.
        """
        if self.worker.is_alive():
            self.queue.put(self.stop_signal)
            self.worker.join()
        print("Audio thread stopped.")

# Shared player instance; constructing it starts the background playback thread.
audio_player = AudioPlayer()


In [8]:
class AlertSystem:
    """
    End-to-end offline alert pipeline.

    Stages: text → normalization → cache lookup → wav → queued playback
    """

    def __init__(self, cache_manager: TTSCache, player: AudioPlayer):
        self.player = player
        self.cache = cache_manager

    def speak(self, text: str):
        """
        Queue the audio for ``text`` and report how long the hand-off took.

        The measured span covers the cache lookup (or generation) and the
        enqueue call; it ends before the clip is actually played. The
        latency in milliseconds is printed and returned.
        """
        normalized = normalize_alert_text(text)

        stopwatch = Timer()
        stopwatch.start()

        # Resolve the clip, then hand it to the player's queue.
        audio_path = self.cache.get_or_create(normalized)
        self.player.play(audio_path)

        elapsed_ms = stopwatch.stop()
        print(f"[Latency] Alert pipeline = {elapsed_ms:.2f} ms")
        return elapsed_ms

# Wire the shared cache and player instances into one alert pipeline.
alert_system = AlertSystem(tts_cache, audio_player)


In [9]:
# Stand-in detector output; only the "cls" field is used to build the alert phrase.
fake_detections = [
    {"cls": "person", "dist": 2.5},
    {"cls": "bicycle", "dist": 5.1},
    {"cls": "person", "dist": 2.6},   # Cached alert!
]

# Build every phrase first, then speak them in detection order.
alert_texts = [f"{det['cls']} ahead" for det in fake_detections]
for alert_text in alert_texts:
    alert_system.speak(alert_text)


[TTS] Generating new audio: 'person ahead'
[Latency] Alert pipeline = 10.11 ms
[TTS] Generating new audio: 'bicycle ahead'
[Latency] Alert pipeline = 8.33 ms
[TTS] Using cached audio: 'person ahead'
[Latency] Alert pipeline = 0.10 ms


In [10]:
# Speak a second set of phrases, keeping the latency returned by each call.
latency_log = [
    alert_system.speak(f"{det['cls']} detected ahead")
    for det in fake_detections
]

print("\nLatency Summary (ms):")
for position, latency_ms in enumerate(latency_log, start=1):
    print(f"{position}. {latency_ms:.2f} ms")

print(f"\nAverage latency: {np.mean(latency_log):.2f} ms")


[TTS] Generating new audio: 'person detected ahead'
[Latency] Alert pipeline = 10.06 ms
[TTS] Generating new audio: 'bicycle detected ahead'
[Latency] Alert pipeline = 8.06 ms
[TTS] Using cached audio: 'person detected ahead'
[Latency] Alert pipeline = 0.10 ms

Latency Summary (ms):
1. 10.06 ms
2. 8.06 ms
3. 0.10 ms

Average latency: 6.07 ms


In [11]:
# Shut down the playback worker: stop() enqueues the stop sentinel behind any
# clips already queued and then joins the worker thread.
audio_player.stop()
print("Demo finished successfully.")


Audio thread stopped.
Demo finished successfully.
