In [14]:
import sounddevice as sd
import numpy as np
from faster_whisper import WhisperModel

# Smoke test: record a short clip from the default microphone and transcribe
# it with faster-whisper.

# 1. Set up the model (weights are downloaded automatically on first run).
# Use "tiny" for speed, "base" for accuracy.
# device="cuda" for your GTX 1060, or "cpu" for Mac (CTranslate2 has specific Mac support too)
model_size = "tiny"
model = WhisperModel(model_size, device="cpu", compute_type="int8")

# 2. Record audio
fs = 16000  # Whisper expects 16kHz audio
duration = 5  # seconds

# Build the message from `duration` so it cannot drift from the actual
# recording length when the constant above is changed.
print(f"Recording for {duration} seconds...")
recording = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype='float32')
sd.wait()  # Block until the recording is finished

print("Transcribing...")

# 3. Transcribe
# Whisper expects raw audio data; flatten the single-channel recording to 1-D.
segments, info = model.transcribe(recording.flatten(), beam_size=5)

# Print each transcribed segment with its start/end time in seconds.
for segment in segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")

Recording for 5 seconds...
Transcribing...
[0.00s -> 3.00s]  Testing the tiny model.


In [17]:
import openwakeword
# Show the file paths of the pre-trained wake-word models that ship with the
# installed openwakeword package.
pretrained_model_paths = openwakeword.get_pretrained_model_paths()
print(pretrained_model_paths)

['/opt/homebrew/lib/python3.11/site-packages/openwakeword/resources/models/alexa_v0.1.tflite', '/opt/homebrew/lib/python3.11/site-packages/openwakeword/resources/models/hey_mycroft_v0.1.tflite', '/opt/homebrew/lib/python3.11/site-packages/openwakeword/resources/models/hey_jarvis_v0.1.tflite', '/opt/homebrew/lib/python3.11/site-packages/openwakeword/resources/models/hey_rhasspy_v0.1.tflite', '/opt/homebrew/lib/python3.11/site-packages/openwakeword/resources/models/timer_v0.1.tflite', '/opt/homebrew/lib/python3.11/site-packages/openwakeword/resources/models/weather_v0.1.tflite']


In [32]:
import pyaudio
import numpy as np
import openwakeword
from openwakeword.model import Model
import torch
import collections
from faster_whisper import WhisperModel

# Voice pipeline: wait for a wake word (openWakeWord), record the spoken
# command until Silero VAD reports the end of speech, then transcribe the
# captured audio with Whisper and go back to waiting. Runs until interrupted.

# --- CONFIGURATION ---
# Both models work best at 16000 Hz
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
# OpenWakeWord prefers chunks of 1280 samples (80ms) for efficiency, but also
# works with smaller chunks. 512 samples is 32 ms of audio at 16 kHz.
CHUNK = 512
# Name of the pre-trained wake-word model; also the key in the prediction dict.
WAKE_WORD = "hey_jarvis"
WAKE_WORD_THRESHOLD = 0.5
# Upper bound on how long to keep recording after the wake word. Without it,
# a missed "end of speech" event would keep the loop awake forever and let
# the audio buffer grow without limit.
MAX_COMMAND_SECONDS = 15
MAX_COMMAND_CHUNKS = (MAX_COMMAND_SECONDS * RATE) // CHUNK

# Initialize Whisper model
whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8")

# --- INITIALIZE MODELS ---
print("Loading Wake Word Model...")
# Using the pre-trained 'hey_jarvis' model. You can swap this later.
oww_model = Model(wakeword_models=[WAKE_WORD], inference_framework="onnx")

print("Loading Silero VAD...")
# Load Silero VAD from Torch Hub (downloads automatically)
vad_model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                  model='silero_vad',
                                  force_reload=False)
(get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils
vad_iterator = VADIterator(vad_model)

# --- STATE MANAGEMENT ---
# Raw int16 byte chunks captured while the assistant is awake
audio_buffer = collections.deque()
is_awake = False
# True once the VAD has reported the start of speech for the current command
speech_started = False

# --- AUDIO STREAM SETUP ---
audio = pyaudio.PyAudio()
stream = None

try:
    # Open the stream inside the try block so PyAudio is still terminated if
    # opening the input device fails.
    stream = audio.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)

    print("\n--- SYSTEM READY ---")
    print("Say 'Hey Jarvis' to trigger me.")

    while True:
        # 1. Get Audio Chunk
        data = stream.read(CHUNK, exception_on_overflow=False)
        # Convert raw bytes to numpy array (int16)
        audio_int16 = np.frombuffer(data, dtype=np.int16)

        # ---------------------------------------------------------
        # STATE 1: PASSIVE LISTENING (Waiting for Wake Word)
        # ---------------------------------------------------------
        if not is_awake:
            # Feed audio to OpenWakeWord
            prediction = oww_model.predict(audio_int16)

            # Check if the wake-word score is high enough
            if prediction[WAKE_WORD] > WAKE_WORD_THRESHOLD:
                print("\n🤖 WAKE WORD DETECTED! (Listening for command...)")
                is_awake = True
                speech_started = False
                vad_iterator.reset_states()  # Reset VAD logic
                audio_buffer.clear()  # Clear old audio
                # The wake-word model is not fed while we are awake, so clear
                # its buffers now; otherwise stale audio from this detection
                # is still there when passive listening resumes.
                oww_model.reset()
            continue

        # ---------------------------------------------------------
        # STATE 2: ACTIVE LISTENING (VAD / Recording)
        # ---------------------------------------------------------
        # Silero expects float32 tensor between -1 and 1
        audio_float32 = torch.from_numpy(audio_int16.astype(np.float32) / 32768.0)

        # Feed to VAD Iterator
        # This function returns a dict if speech starts or ends
        speech_dict = vad_iterator(audio_float32, return_seconds=True)

        # Always save audio while awake (so we don't miss words)
        audio_buffer.append(data)

        if speech_dict and "start" in speech_dict:
            speech_started = True
        speech_ended = bool(speech_dict) and "end" in speech_dict
        timed_out = len(audio_buffer) >= MAX_COMMAND_CHUNKS

        if not speech_ended and not timed_out:
            continue

        if speech_ended:
            print("✅ End of speech detected. Processing...")
        elif speech_started:
            print(f"Command longer than {MAX_COMMAND_SECONDS}s. Processing what was captured...")
        else:
            print(f"No speech within {MAX_COMMAND_SECONDS}s of the wake word.")

        # Only run Whisper when the VAD actually heard speech.
        if speech_ended or speech_started:
            # Transcribe the captured audio using Whisper
            command_int16 = np.frombuffer(b''.join(audio_buffer), dtype=np.int16)
            command_float32 = command_int16.astype(np.float32) / 32768.0
            segments, info = whisper_model.transcribe(command_float32, beam_size=5)
            transcription = " ".join([segment.text for segment in segments])
            print(f"Transcribed: {transcription}")

        print("Returning to sleep...")
        is_awake = False
        audio_buffer.clear()

except KeyboardInterrupt:
    print("\nStopping...")
finally:
    # Release the audio device on every exit path, not only on Ctrl+C.
    if stream is not None:
        stream.stop_stream()
        stream.close()
    audio.terminate()

Loading Wake Word Model...
Loading Silero VAD...


Using cache found in /Users/felipesilverio/.cache/torch/hub/snakers4_silero-vad_master



--- SYSTEM READY ---
Say 'Hey Jarvis' to trigger me.

🤖 WAKE WORD DETECTED! (Listening for command...)
✅ End of speech detected. Processing...
Transcribed:  That's it, I think I can correctly detect what I'm saying.
Returning to sleep...

🤖 WAKE WORD DETECTED! (Listening for command...)
✅ End of speech detected. Processing...
Transcribed:  A job is
Returning to sleep...

🤖 WAKE WORD DETECTED! (Listening for command...)
✅ End of speech detected. Processing...
Transcribed:  I want to test.
Returning to sleep...

🤖 WAKE WORD DETECTED! (Listening for command...)
✅ End of speech detected. Processing...
Transcribed:  Proceed.
Returning to sleep...

🤖 WAKE WORD DETECTED! (Listening for command...)
✅ End of speech detected. Processing...
Transcribed:  Hey, Jarvis.
Returning to sleep...

🤖 WAKE WORD DETECTED! (Listening for command...)
✅ End of speech detected. Processing...
Transcribed:  I want to test the accuracy.
Returning to sleep...

🤖 WAKE WORD DETECTED!

In [19]:
pip install tflite-runtime

ERROR: Could not find a version that satisfies the requirement tflite-runtime (from versions: none)
ERROR: No matching distribution found for tflite-runtime
Note: you may need to restart the kernel to use updated packages.
