In [2]:
# requirements (install via pip):
# faster-whisper==1.*  sounddevice webrtcvad numpy scipy torch
# sentence-transformers wordfreq pronouncing nltk
# (for Windows CPU-only whisper you may want: pip install torch --index-url https://download.pytorch.org/whl/cpu)

import queue, threading, time, sys, re, json, os
import numpy as np
import sounddevice as sd
import webrtcvad

from faster_whisper import WhisperModel
from sentence_transformers import SentenceTransformer
from wordfreq import zipf_frequency
import pronouncing

In [3]:
# ---------------------------
# Config
# ---------------------------
ASR_MODEL = "medium"        # Whisper model name passed to WhisperModel; try "small" or "medium" depending on your hardware
DEVICE = "cpu"              # 'cuda' if you have a GPU; 'cpu' if not
VAD_FRAME_MS = 20           # 10/20/30 ms frames allowed by webrtcvad; also determines the mic stream block size
SAMPLE_RATE = 16000         # Hz; shared by the input stream, the VAD call, and PCM frame sizing
CHANNELS = 1                # channels requested from the input device (mic_callback reads channel 0 only)
CONTEXT_SECONDS = 30        # window for suggestions: seconds of recent transcript used as context
TOP_K = 8                   # number of suggestions requested per button press
PERSONAL_VOCAB_PATH = "./personal_vocab.txt"  # one word/phrase per line

# Optional: letter / sound hint controls (connect to UI fields)
HINT_FIRST_LETTER = None    # e.g., 'b'; None disables the first-letter hint
HINT_SYLLABLES = None       # e.g., 2; None disables the syllable-count hint

In [4]:
# ---------------------------
# Audio + VAD
# ---------------------------
# Hand-off queue of 16-bit PCM byte blocks: filled by mic_callback, drained by asr_worker.
audio_q = queue.Queue()
# Voice-activity detector used by asr_worker to classify each frame as speech / non-speech.
vad = webrtcvad.Vad(2)  # 0-3 (3 is most aggressive)

def mic_callback(indata, frames, time_info, status):
    """Input-stream callback: enqueue one audio block as 16-bit mono PCM bytes.

    Args:
        indata: 2-D float array indexed as [frame, channel]; only channel 0 is used.
        frames: number of frames in ``indata`` (unused here).
        time_info: timing information supplied by the stream (unused here).
        status: stream status flags; printed to stderr when truthy.
    """
    if status:
        print(status, file=sys.stderr)
    # Clip before scaling: a sample outside [-1.0, 1.0] would exceed the int16
    # range after multiplication and produce a garbage value on the cast.
    mono = np.clip(indata[:, 0], -1.0, 1.0)
    # convert to 16-bit mono samples as bytes
    pcm16 = (mono * 32767).astype(np.int16).tobytes()
    audio_q.put(pcm16)

def audio_stream():
    """Open the microphone input stream and keep it open indefinitely.

    The stream's block size is one VAD frame worth of samples, and every block
    is handed to ``mic_callback``. This function never returns; it only idles
    so that the ``with`` block stays active.
    """
    samples_per_block = int(SAMPLE_RATE * VAD_FRAME_MS / 1000)
    stream_options = dict(
        samplerate=SAMPLE_RATE,
        channels=CHANNELS,
        dtype='float32',
        blocksize=samples_per_block,
        callback=mic_callback,
    )
    with sd.InputStream(**stream_options):
        # All audio work happens in the callback; just keep this thread alive.
        while True:
            time.sleep(0.1)

In [5]:
# ---------------------------
# Rolling transcript store
# ---------------------------
class RollingTranscript:
    """Time-stamped buffer of text snippets that discards entries past a maximum age."""

    def __init__(self, max_seconds=120):
        """Create an empty buffer that retains entries for ``max_seconds`` seconds."""
        self.max_seconds = max_seconds
        self.segments = []  # list of (t, text)

    def add(self, text):
        """Store ``text`` stamped with the current time, then drop expired entries."""
        stamp = time.time()
        self.segments.append((stamp, text))
        self.trim()

    def trim(self):
        """Rebuild ``segments`` keeping only entries no older than ``max_seconds``."""
        oldest_allowed = time.time() - self.max_seconds
        kept = []
        for (stamp, snippet) in self.segments:
            if stamp >= oldest_allowed:
                kept.append((stamp, snippet))
        self.segments = kept

    def get_context(self, seconds):
        """Return the text added within the last ``seconds`` seconds, joined by spaces."""
        oldest_allowed = time.time() - seconds
        recent = [snippet for (stamp, snippet) in self.segments if stamp >= oldest_allowed]
        joined = " ".join(recent)
        return joined.strip()

# Shared transcript buffer: asr_worker adds recognized text, on_button_press reads it.
transcript = RollingTranscript(max_seconds=300)


In [6]:
# ---------------------------
# ASR worker (chunked)
# ---------------------------
def asr_worker(silence_tail_s=0.35):
    """Consume microphone frames, group them into utterances with VAD, and transcribe.

    Runs forever (used as a thread target). Frames the VAD classifies as speech
    are accumulated; once no speech frame has been seen for more than
    ``silence_tail_s`` seconds, the accumulated audio is transcribed and each
    non-empty segment is added to ``transcript`` and printed with an "[ASR]" prefix.

    Args:
        silence_tail_s: seconds without a speech frame that end an utterance.
    """
    model = WhisperModel(ASR_MODEL, device=DEVICE, compute_type="int8")
    # Integer arithmetic, matching the stream blocksize in audio_stream; the float
    # form int(rate * (ms/1000)) can truncate one sample short for some frame sizes.
    bytes_per_frame = (SAMPLE_RATE * VAD_FRAME_MS // 1000) * 2  # 16-bit mono
    silence_bytes = b"\x00" * bytes_per_frame
    # bytearray appends in place; `bytes += frame` re-copied the whole utterance per frame.
    speech_chunk = bytearray()
    # Monotonic clock: the silence timer must not react to wall-clock adjustments.
    last_speech_time = time.monotonic()
    speaking = False

    while True:
        try:
            frame = audio_q.get(timeout=1.0)
        except queue.Empty:
            # No audio arrived; feed a silent frame so a pending utterance can still flush.
            frame = silence_bytes

        if vad.is_speech(frame, SAMPLE_RATE):
            speaking = True
            last_speech_time = time.monotonic()
            speech_chunk += frame
            continue

        # Non-speech frame: flush only if we were speaking and the tail has elapsed.
        if not speaking or (time.monotonic() - last_speech_time) <= silence_tail_s:
            continue

        # decode this speech_chunk (int16 PCM -> float32 scaled by 1/32768)
        audio_np = np.frombuffer(bytes(speech_chunk), dtype=np.int16).astype(np.float32) / 32768.0
        segments, _ = model.transcribe(audio_np, language="en", beam_size=1, vad_filter=False)
        for seg in segments:
            piece = seg.text.strip()
            if piece:
                transcript.add(piece)
                print("[ASR]", piece)
        speech_chunk = bytearray()
        speaking = False

In [7]:
# ---------------------------
# Suggestion Engine
# ---------------------------
class SuggestionEngine:
    """Rank candidate words against recent conversation context.

    Candidates are the personal vocabulary (loaded from a file) followed by a
    built-in seed list. Each candidate's score is a weighted blend of semantic
    similarity to the context, a word-frequency prior, and optional
    first-letter / syllable-count hints.
    """

    def __init__(self, personal_vocab_path):
        """Load the embedding model and vocabularies, and pre-embed the personal vocab.

        Args:
            personal_vocab_path: text file with one word/phrase per line; if the
                path does not exist the personal vocabulary is empty.
        """
        self.embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        self.personal_vocab = self._load_personal_vocab(personal_vocab_path)
        self.core_vocab = self._load_core_vocab()
        self.candidate_cache = {}  # word -> embedding

        # build embeddings for personal vocab (do once)
        self._embed_list(self.personal_vocab)

    def _load_personal_vocab(self, path):
        """Return the non-blank, stripped lines of ``path``, de-duplicated in file order."""
        if not os.path.exists(path):
            return []
        with open(path, "r", encoding="utf-8") as f:
            words = [w.strip() for w in f if w.strip()]
        return list(dict.fromkeys(words))  # unique, preserve order

    def _load_core_vocab(self, top_n=5000):
        """Return at most ``top_n`` words from the built-in seed list."""
        # you can replace this with a curated list; for demo, a small seed:
        seed = [
            "appointment","breakfast","coffee","doctor","nurse","medication",
            "glasses","television","bathroom","kitchen","remote","charger",
            "daughter","son","grandson","granddaughter","neighbor","friend",
            "church","pharmacy","grocery","walker","wheelchair","keys","phone",
            "wallet","jacket","mailbox","bus","taxi","ride","library","park"
        ]
        # Honor the limit (previously the parameter was accepted but ignored).
        return seed[:top_n]

    def _embed_list(self, words):
        """Return an array with one embedding per entry of ``words``.

        Embeddings are computed only for words missing from ``candidate_cache``
        and stored there for reuse.
        """
        to_compute = [w for w in words if w not in self.candidate_cache]
        if to_compute:
            embs = self.embedder.encode(to_compute, normalize_embeddings=True)
            for w, e in zip(to_compute, embs):
                self.candidate_cache[w] = e
        return np.array([self.candidate_cache[w] for w in words])

    def _semantic_scores(self, context, candidates):
        """Return the similarity of each candidate to ``context`` (zeros if nothing to compare)."""
        # An empty candidate list would produce a 1-D empty array that cannot be
        # matrix-multiplied with the context vector, so short-circuit it as well.
        if not candidates or not context.strip():
            return np.zeros(len(candidates))
        ctx_emb = self.embedder.encode([context], normalize_embeddings=True)[0]
        cand_embs = self._embed_list(candidates)
        return (cand_embs @ ctx_emb)  # cosine since normalized

    def _freq_prior(self, word):
        """Map the word's Zipf frequency to a prior in [0, 1].

        Zipf 3 or lower maps to 0 and Zipf 7 or higher maps to 1; the upper clamp
        keeps very common words from exceeding the intended range.
        """
        scaled = (zipf_frequency(word, "en") - 3.0) / 4.0
        return min(1.0, max(0.0, scaled))

    def _phonetic_score(self, word, hint_first_letter=None, hint_syllables=None):
        """Return a bonus for matching the optional hints.

        Adds 0.5 when ``word`` starts with ``hint_first_letter`` (case-insensitive)
        and 0.3 when any known pronunciation of ``word`` has exactly
        ``hint_syllables`` syllables. Words without a pronunciation entry get no
        syllable bonus.
        """
        score = 0.0
        if hint_first_letter:
            if word.lower().startswith(hint_first_letter.lower()):
                score += 0.5
        if hint_syllables is not None:
            # crude syllable match via CMU dict:
            phones = pronouncing.phones_for_word(word.lower())
            if any(pronouncing.syllable_count(p) == hint_syllables for p in phones):
                score += 0.3
        return score

    def suggest(self, context_text, top_k=TOP_K, hint_first_letter=None, hint_syllables=None):
        """Return the ``top_k`` best candidates as ``(word, score)`` pairs, best first.

        Args:
            context_text: recent transcript text to compare candidates against.
            top_k: maximum number of results; negative values are treated as 0.
            hint_first_letter: optional first-letter hint.
            hint_syllables: optional syllable-count hint.
        """
        # build candidate pool
        candidates = list(dict.fromkeys(self.personal_vocab + self.core_vocab))

        # rank components
        sem = self._semantic_scores(context_text, candidates)
        pri = np.array([self._freq_prior(w) for w in candidates], dtype=float)
        pho = np.array(
            [self._phonetic_score(w, hint_first_letter, hint_syllables) for w in candidates],
            dtype=float,
        )

        # weighted blend (tune these)
        scores = 0.70*sem + 0.20*pri + 0.10*pho

        # Stable sort: tied scores keep candidate order (personal vocab first)
        # instead of depending on the sort algorithm's internals.
        idx = np.argsort(-scores, kind="stable")
        # A negative slice bound would silently drop entries from the end instead.
        if top_k is not None and top_k < 0:
            top_k = 0
        ranked = [(candidates[i], float(scores[i])) for i in idx[:top_k]]
        return ranked

# Module-level engine used by on_button_press; constructing it loads the embedding model.
engine = SuggestionEngine(PERSONAL_VOCAB_PATH)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
# ---------------------------
# UI stub: button → suggestions
# ---------------------------
def on_button_press():
    """Print ranked word suggestions for the most recent stretch of transcript.

    Reads the last ``CONTEXT_SECONDS`` seconds from ``transcript``, asks
    ``engine`` for ``TOP_K`` suggestions using the configured hints, and prints
    one "word  score" row per suggestion between header and footer lines.
    """
    recent_text = transcript.get_context(CONTEXT_SECONDS)
    hint_kwargs = {
        "hint_first_letter": HINT_FIRST_LETTER,
        "hint_syllables": HINT_SYLLABLES,
    }
    ranked = engine.suggest(recent_text, top_k=TOP_K, **hint_kwargs)

    print("\n--- SUGGESTIONS ---")
    for word, score in ranked:
        print(f"{word:20s}  {score:.3f}")
    print("-------------------\n")


In [None]:
# ---------------------------
# Launch threads
# ---------------------------
if __name__ == "__main__":
    # Both workers loop forever; daemon threads do not keep the process alive on exit.
    t_audio = threading.Thread(target=audio_stream, daemon=True)
    t_asr = threading.Thread(target=asr_worker, daemon=True)
    t_audio.start()
    t_asr.start()

    print("Live. Press Enter to show suggestions. Ctrl+C to exit.")
    try:
        while True:
            input()  # simulate the big on-screen button
            on_button_press()
    except (KeyboardInterrupt, EOFError):
        # EOFError: stdin was closed, so no further presses can arrive;
        # leave the loop as quietly as on Ctrl+C instead of crashing with a traceback.
        pass

Live. Press Enter to show suggestions. Ctrl+C to exit.


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json: 0.00B [00:00, ?B/s]

vocabulary.txt: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.bin:   0%|          | 0.00/1.53G [00:00<?, ?B/s]


--- SUGGESTIONS ---
friend                0.119
phone                 0.115
son                   0.114
park                  0.108
church                0.108
daughter              0.104
doctor                0.095
coffee                0.093
-------------------


--- SUGGESTIONS ---
friend                0.119
phone                 0.115
son                   0.114
park                  0.108
church                0.108
daughter              0.104
doctor                0.095
coffee                0.093
-------------------


--- SUGGESTIONS ---
friend                0.119
phone                 0.115
son                   0.114
park                  0.108
church                0.108
daughter              0.104
doctor                0.095
coffee                0.093
-------------------


--- SUGGESTIONS ---
friend                0.119
phone                 0.115
son                   0.114
park                  0.108
church                0.108
daughter              0.104
doctor       