In [None]:
import whisper
import queue,threading,time
import numpy as np
from scipy.io.wavfile import write as wv
import tempfile,os
import sounddevice as sd
import pyttsx3
from datetime import datetime 
import pickle 
import face_recognition
import cv2

In [2]:
# Load the Whisper "base.en" checkpoint once; the transcription loop in a later
# cell calls model.transcribe() on every recorded chunk.
model=whisper.load_model("base.en")


In [3]:
# Settings and shared state for the script-style prototype cells below.
SAMPLE_RATE=16000  # samples per second; used for both the input stream and the WAV files
CHUNK_SECONDS=1  # length of each audio block delivered by the input stream
COMMAND_ON="guard my room"  # substring (lower-case) that switches guard mode on
COMMAND_OFF="stop guarding"  # substring (lower-case) that switches guard mode off
audio_queue=queue.Queue()  # the mic callback puts copies of audio blocks here; the main loop consumes them
guard_mode=False  # toggled by the transcription loop when a command is heard
stop_flag=False  # set to True to make mic_stream() leave its wait loop


In [4]:

def mic_stream():
    """Capture microphone audio into ``audio_queue`` until ``stop_flag`` is set.

    Opens a mono, int16 ``sd.InputStream`` at ``SAMPLE_RATE`` whose block size
    is ``CHUNK_SECONDS`` worth of samples. Each delivered block is copied onto
    ``audio_queue``. The function blocks (polling ``stop_flag``) so it is meant
    to be run on a background thread.
    """
    def callback(indata, frames, t, status):
        if status:
            # Report stream status flags instead of silently discarding them,
            # so dropped or garbled audio can be diagnosed.
            print(f"Audio status: {status}")
        # Copy before queuing: the queued data must not alias the stream's buffer.
        audio_queue.put(indata.copy())

    with sd.InputStream(
        samplerate=SAMPLE_RATE,
        channels=1,
        dtype='int16',
        callback=callback,
        blocksize=int(SAMPLE_RATE * CHUNK_SECONDS),
    ):
        # Keep the stream open; the callback does the actual work.
        while not stop_flag:
            time.sleep(0.01)


In [5]:
# Run mic_stream() on a daemon thread so audio capture keeps filling the queue
# while later cells execute.
threading.Thread(target=mic_stream, daemon=True).start()


In [6]:
# Transcribe queued microphone chunks and toggle guard mode on spoken commands.
# Runs until interrupted (Kernel -> Interrupt), which also stops the mic thread.
try:
    while True:
        chunk = audio_queue.get()
        # The chunk is handed to Whisper as a WAV file path, so reserve a
        # temporary file name first (delete=False: it is written and removed below).
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            path = tmp.name
        try:
            wv(path, SAMPLE_RATE, chunk)
            result = model.transcribe(path, language="en", fp16=False, condition_on_previous_text=False)
        finally:
            # Always remove the temp file, even if writing or transcription
            # fails, so an error or interrupt does not leak files.
            if os.path.exists(path):
                os.remove(path)
        text = (result.get("text") or "").strip().lower()
        if text:
            print("Heard:", text)
            if COMMAND_ON in text and not guard_mode:
                guard_mode = True
                print("Guard mode: ON")
            elif COMMAND_OFF in text and guard_mode:
                guard_mode = False
                print("Guard mode: OFF")
except KeyboardInterrupt:
    # Signal mic_stream() to leave its wait loop and close the input stream.
    stop_flag = True

Heard: all right,mate.
Heard: all right, please.
Heard: if you want to take some of your
Heard: okay.
Heard: okay, this is the...
Heard: this is bad.
Heard: if you can't talk.
Heard: one thing that i want.
Heard: to the west.
Heard: soon!
Heard: thank you.
Heard: thank you.


In [1]:
class AIGuardAgent:
    """Voice-controlled guard agent with a pickle-backed list of trusted faces.

    Microphone audio is captured in fixed-length chunks, transcribed with
    Whisper and matched against activation, deactivation and enrollment
    phrases. Face encodings for trusted people are captured from the webcam
    and persisted at ``face_db_path``.
    """

    def __init__(self):
        """Load the Whisper model, configure text-to-speech and load trusted faces."""
        self.model = whisper.load_model("base.en")
        self.SAMPLE_RATE = 16000
        self.CHUNK_SECONDS = 2
        self.audio_queue = queue.Queue()

        self.tts_engine = pyttsx3.init()
        self.tts_engine.setProperty('rate', 150)

        self.guard_mode = False
        self.listening = False
        self.stop_flag = False

        # Defined up front so stop_listening() is safe even when no enrollment
        # has opened the webcam, and so activation can tell whether a
        # monitoring thread already exists.
        self.camera = None
        self._face_thread = None

        self.face_db_path = "FACE_DB_PATH"  # placeholder; fill this in later
        self.load_trusted_faces()

        # All phrases are lower-case because transcripts are lower-cased
        # before matching.
        self.activation_phrases = [
            "guard my room",
            "protect my room",
            "secure my room",
            "start guard mode",
            "activate guard"
        ]
        self.deactivation_phrases = [
            "stop guard mode",
            "deactivate guard",
            "stand down",
            "stop monitoring",
            "goodbye guard"
        ]
        self.enrollment_phrases = [
            "enroll face",
            "register face",
            "add trusted person"
        ]

    def speak(self, text):
        """Say ``text`` through the text-to-speech engine and wait for it to finish."""
        self.tts_engine.say(text)
        self.tts_engine.runAndWait()

    def load_trusted_faces(self):
        """Populate ``known_face_encodings`` / ``known_face_names`` from ``face_db_path``.

        Starts with empty lists when the database file does not exist.
        """
        if os.path.exists(self.face_db_path):
            # SECURITY: pickle.load can execute arbitrary code from the file.
            # Only point face_db_path at a file this program wrote itself.
            with open(self.face_db_path, 'rb') as f:
                data = pickle.load(f)
            self.known_face_encodings = data['encodings']
            self.known_face_names = data['names']
            print(f"Loaded {len(self.known_face_names)} trusted faces.")
        else:
            print("No existing faces database found. Starting fresh.")
            self.known_face_encodings = []
            self.known_face_names = []

    def save_trusted_faces(self):
        """Write the current encodings and names to ``face_db_path`` with pickle."""
        with open(self.face_db_path, 'wb') as f:
            pickle.dump(
                {'encodings': self.known_face_encodings, 'names': self.known_face_names},
                f,
            )
        print("Saved face to db")

    def enroll_using_webcam(self, name="unknown"):
        """Capture face encodings from the webcam and store their mean as ``name``.

        Frames containing exactly one face contribute an encoding, up to 30
        encodings. Pressing 'q' in the preview window stops early. The camera
        is always released before returning.

        Returns:
            True if at least one encoding was captured and saved, else False.
        """
        self.camera = cv2.VideoCapture(0)
        if not self.camera.isOpened():
            # Without this check every read() fails and the loop never ends.
            print(" Failed to open camera")
            self.camera.release()
            self.camera = None
            self.speak("Failed to capture face. Please try again.")
            return False

        self.speak(f"Please look at the camera for face enrollment as {name}")

        enrollment_frames = []
        frames_captured = 0
        max_frames = 30
        failed_reads = 0
        max_failed_reads = 50  # give up rather than spin forever on a dead camera

        try:
            while frames_captured < max_frames:
                ret, frame = self.camera.read()
                if not ret:
                    print(" Failed to capture frame")
                    failed_reads += 1
                    if failed_reads >= max_failed_reads:
                        break
                    time.sleep(0.1)  # avoid a tight busy loop while the camera recovers
                    continue
                failed_reads = 0

                # face_recognition expects RGB; OpenCV delivers BGR.
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                # Find all face locations and encodings in the current frame
                face_locations = face_recognition.face_locations(rgb_frame)
                face_encodings = face_recognition.face_encodings(rgb_frame, face_locations)

                # Only accept frames with exactly one face so the stored
                # encoding is not mixed with someone else's.
                if len(face_encodings) == 1:
                    enrollment_frames.append(face_encodings[0])
                    frames_captured += 1
                    print(f" Captured face frame {frames_captured}/{max_frames}")

                    # Show progress on the preview
                    cv2.putText(frame, f"Enrolling: {frames_captured}/{max_frames}",
                                (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

                cv2.imshow("Face Enrollment - Press 'q' to cancel", frame)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break

                time.sleep(0.5)  # Wait between captures
        finally:
            # Release the device on every exit path so later captures can
            # open it again.
            self.camera.release()
            self.camera = None
            cv2.destroyAllWindows()

        if enrollment_frames:
            # Average the encodings for better accuracy
            avg_encoding = np.mean(enrollment_frames, axis=0)
            self.known_face_encodings.append(avg_encoding)
            self.known_face_names.append(name)
            self.save_trusted_faces()
            self.speak(f"Successfully enrolled {name} as a trusted person")
            return True

        self.speak("Failed to capture face. Please try again.")
        return False

    def mic_stream(self):
        """Capture microphone audio in chunks onto ``audio_queue`` until ``stop_flag`` is set."""
        # The third callback argument is named time_info (not `time`) so it
        # does not shadow the `time` module.
        def callback(indata, frames, time_info, status):
            if status:
                print(f"Audio status: {status}")
            # Copy before queuing: the queued data must not alias the stream's buffer.
            self.audio_queue.put(indata.copy())

        with sd.InputStream(
            samplerate=self.SAMPLE_RATE,
            channels=1,
            dtype='int16',
            callback=callback,
            blocksize=int(self.SAMPLE_RATE * self.CHUNK_SECONDS)
        ):
            while not self.stop_flag:
                time.sleep(0.1)

    def check_activation_command(self, text):
        """Check if text contains any activation phrase"""
        return any(phrase in text for phrase in self.activation_phrases)

    def check_deactivation_command(self, text):
        """Check if text contains any deactivation phrase"""
        return any(phrase in text for phrase in self.deactivation_phrases)

    def check_enrollment_command(self, text):
        """Check if text contains face enrollment phrase"""
        return any(phrase in text for phrase in self.enrollment_phrases)

    def _identify_face(self, encoding, tolerance=0.6):
        """Return the trusted name closest to ``encoding``, or None if none is within ``tolerance``."""
        if not self.known_face_encodings:
            return None
        distances = face_recognition.face_distance(self.known_face_encodings, encoding)
        best = int(np.argmin(distances))
        if distances[best] <= tolerance:
            return self.known_face_names[best]
        return None

    def face_monitoring_loop(self):
        """Watch the webcam while guard mode is on and report who is in view.

        Runs until ``guard_mode`` is cleared or ``stop_flag`` is set. Reports
        are printed only when the set of visible faces changes. Output is
        printed rather than spoken because this runs on a background thread.

        NOTE(review): this opens camera 0 independently of
        enroll_using_webcam(); enrolling while guard mode is active may
        contend for the device - confirm on the target platform.
        """
        camera = cv2.VideoCapture(0)
        if not camera.isOpened():
            camera.release()
            print("Face monitoring unavailable: could not open camera")
            return

        last_report = None
        try:
            while self.guard_mode and not self.stop_flag:
                ret, frame = camera.read()
                if not ret:
                    time.sleep(0.5)
                    continue

                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                face_locations = face_recognition.face_locations(rgb_frame)
                face_encodings = face_recognition.face_encodings(rgb_frame, face_locations)

                names = [self._identify_face(encoding) for encoding in face_encodings]
                trusted = sorted(n for n in names if n is not None)
                intruders = names.count(None)
                report = (trusted, intruders)

                if report != last_report:
                    stamp = datetime.now().strftime('%H:%M:%S')
                    if intruders:
                        print(f"ALERT: {intruders} unrecognized face(s) detected at {stamp}")
                    elif trusted:
                        print(f"Trusted face(s) in view at {stamp}: {', '.join(trusted)}")
                    last_report = report

                time.sleep(0.5)
        finally:
            camera.release()

    def activate_guard_mode(self):
        """Activate guard mode with voice confirmation and start face monitoring."""
        self.guard_mode = True
        self.speak("Guard mode activated! Starting face monitoring.")
        print(f"Guard mode ACTIVATED at {datetime.now().strftime('%H:%M:%S')}")

        # A monitor left over from a previous activation keeps running once
        # guard_mode is True again; do not start a second one on the camera.
        if self._face_thread is not None and self._face_thread.is_alive():
            return

        # Start face monitoring in a separate thread
        self._face_thread = threading.Thread(target=self.face_monitoring_loop, daemon=True)
        self._face_thread.start()

    def deactivate_guard_mode(self):
        """Deactivate guard mode with voice confirmation"""
        self.guard_mode = False
        self.speak("Guard mode deactivated. Goodbye!")
        print(f"Guard mode DEACTIVATED at {datetime.now().strftime('%H:%M:%S')}")

    def process_audio_chunk(self, chunk):
        """Transcribe one audio chunk with Whisper and act on any recognised command.

        The chunk is written to a temporary WAV file that is removed again
        whether or not transcription succeeds. Errors are printed, not raised.
        """
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            path = tmp.name

        try:
            wv(path, self.SAMPLE_RATE, chunk)
            result = self.model.transcribe(
                path,
                language="en",
                fp16=False,
                condition_on_previous_text=False
            )

            text = (result.get("text") or "").strip().lower()

            if text:
                print(f"Heard: {text}")

                # Check for various commands
                if self.check_activation_command(text) and not self.guard_mode:
                    self.activate_guard_mode()
                elif self.check_deactivation_command(text) and self.guard_mode:
                    self.deactivate_guard_mode()
                elif self.check_enrollment_command(text):
                    self.speak("Starting face enrollment process.")
                    self.enroll_using_webcam()
                elif "how many trusted" in text or "list trusted" in text:
                    count = len(self.known_face_names)
                    if count == 0:
                        self.speak("No trusted faces enrolled yet.")
                    else:
                        self.speak(f"I have {count} trusted faces enrolled.")
                        print(f"Trusted faces: {', '.join(self.known_face_names)}")
                elif self.guard_mode:
                    print(f" In guard mode, heard: {text}")

        except Exception as e:
            print(f"Error processing audio: {e}")
        finally:
            if os.path.exists(path):
                os.remove(path)

    def start_listening(self):
        """Start the microphone thread and process chunks until stopped or interrupted."""
        self.listening = True
        self.stop_flag = False

        # Start microphone stream in background thread
        audio_thread = threading.Thread(target=self.mic_stream, daemon=True)
        audio_thread.start()

        self.speak("AI Guard system ready. Say 'Guard my room' to activate or 'Enroll face' to add trusted persons.")
        print(" Listening for commands...")
        print(f" {len(self.known_face_names)} trusted faces loaded")

        try:
            while self.listening and not self.stop_flag:
                # Block briefly for the next chunk; the timeout lets the loop
                # notice listening/stop_flag changes.
                try:
                    chunk = self.audio_queue.get(timeout=0.1)
                except queue.Empty:
                    continue
                self.process_audio_chunk(chunk)

        except KeyboardInterrupt:
            print("\nShutting down AI Guard system...")
            self.stop_flag = True

    def stop_listening(self):
        """Stop the listening loop and release the webcam and any preview windows."""
        self.listening = False
        self.stop_flag = True
        if self.camera is not None:
            self.camera.release()
            self.camera = None
        cv2.destroyAllWindows()

    

In [None]:
# Build the agent (its constructor loads the Whisper model and any saved
# trusted faces), then block in the listening loop until it is stopped or
# the kernel is interrupted.
guard=AIGuardAgent()
guard.start_listening()