# In [None]:
import logging
import os
import sys
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Optional
import pyaudio
import struct
import requests
from PIL import Image
from io import BytesIO
import tempfile
from pydub import AudioSegment
from pydub.playback import play
import pvporcupine
from ollama import generate
import whisper
import sounddevice as sd
import numpy as np
import wave
import torch
import cv2
from ultralytics import YOLO
import pyttsx3
import threading



# Constants
#
# SECURITY NOTE(review): the credential defaults below are committed in plain
# text. They are kept only so existing setups keep working unchanged; rotate
# them and provide the real values through the environment variables instead.
# Every string setting can be overridden without editing this file.
ACCESS_KEY = os.environ.get(  # passed to pvporcupine.create() as access_key
    "VISION_ACCESS_KEY",
    "O6Jn/TyI+Rcl7jYPPB6QmEk3as4RzWcFuQv6k1dCICtl7BGyUHeYsA==",
)
# API_KEY and VOICE_ID are not referenced in the visible part of this file.
# NOTE(review): presumably credentials for a hosted text-to-speech service - confirm.
API_KEY = os.environ.get(
    "VISION_API_KEY",
    "sk_f810c561808f533447a6d19093bc1cff201352a1a880acfb",
)
VOICE_ID = os.environ.get("VISION_VOICE_ID", "nPczCjzI2devNBz1zQrb")
# Important: this must point at the camera's current IP address.
CAMERA_URL = os.environ.get("VISION_CAMERA_URL", "http://192.168.105.196/")

MIC_INDEX = 0  # input device index used for both PyAudio and sounddevice
TIMEOUT = 5  # default recording duration, in seconds
SAMPLE_RATE = 16000  # Could be reduced to 8000 if Whisper performs well

# Logging setup: INFO and above, timestamped, written to standard output.
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger used by everything in this file.
logger = logging.getLogger(__name__)

class VisionAssistant:
    def __init__(self, whisper_model_size: str = "base"):
        """Set up audio input, wake-word detection, models, and mode flags.

        Args:
            whisper_model_size: Model name handed to ``whisper.load_model``.
        """
        # Microphone and wake-word engine. The input stream is opened with the
        # sample rate and frame length that Porcupine reports.
        self.pa = pyaudio.PyAudio()
        wake_word_files = ["Hey-Vision_en_windows_v3_0_0.ppn"]
        self.porcupine = pvporcupine.create(
            access_key=ACCESS_KEY,
            keyword_paths=wake_word_files,
        )
        stream_options = dict(
            rate=self.porcupine.sample_rate,
            channels=1,
            format=pyaudio.paInt16,
            input=True,
            frames_per_buffer=self.porcupine.frame_length,
            input_device_index=MIC_INDEX,
        )
        self.audio_stream = self.pa.open(**stream_options)

        # Speech-to-text model and the worker pool used for background tasks.
        self.whisper_model = whisper.load_model(whisper_model_size)
        self.executor = ThreadPoolExecutor(max_workers=2)

        # State flags read and written by run().
        self.shutdown_flag = False
        self.is_active = False  # General mode
        self.is_assistance_mode = False  # Assistance mode
        self.is_currency_mode = False  # Currency mode
        self.is_call_mode = False  # Call mode
        self.is_hazards_mode = False  # Hazards mode

        # Object-detection models, loaded from weight files by name.
        self.currency_model = YOLO("currency.pt")
        self.hazard_model = YOLO("hazard.pt")

        # Offline text-to-speech engine.
        engine = pyttsx3.init()
        engine.setProperty('rate', 175)  # Speed of speech
        engine.setProperty('volume', 1.0)  # Max volume
        self.tts_engine = engine


    def record_audio(self, duration: int = TIMEOUT, max_size_mb: float = 10.0) -> Optional[str]:
        """Record mono 16-bit audio from the microphone into a temporary WAV file.

        Args:
            duration: Recording length in seconds.
            max_size_mb: Largest WAV file, in megabytes, that is accepted.

        Returns:
            Path of the WAV file, which the caller must delete, or ``None``
            when recording fails or the file exceeds ``max_size_mb``.
        """
        wav_path = None
        try:
            logger.info("Recording audio for %d seconds...", duration)
            audio = sd.rec(
                int(duration * SAMPLE_RATE),
                samplerate=SAMPLE_RATE,
                channels=1,
                dtype='int16',
                device=MIC_INDEX
            )
            sd.wait()  # block until the recording buffer is full

            # Reserve a unique file name, then close the handle straight away.
            # Deleting a file that is still open fails on Windows, so nothing
            # below may run while this handle is held.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
                wav_path = temp_file.name

            with wave.open(wav_path, 'wb') as wf:
                wf.setnchannels(1)
                wf.setsampwidth(2)  # 2 bytes per sample, matching dtype='int16'
                wf.setframerate(SAMPLE_RATE)
                wf.writeframes(audio.tobytes())

            if os.path.getsize(wav_path) > max_size_mb * 1024 * 1024:
                logger.warning("Audio file exceeds maximum size")
                os.unlink(wav_path)
                wav_path = None  # already removed; nothing left to clean up
                return None

            logger.info("Audio recorded to %s", wav_path)
            return wav_path
        except Exception as e:
            logger.error(f"Audio recording failed: {str(e)}")
            # Do not leave a half-written temp file behind on failure.
            if wav_path is not None:
                try:
                    os.unlink(wav_path)
                except OSError as cleanup_error:
                    logger.debug("Could not remove %s: %s", wav_path, cleanup_error)
            return None

    def recognize_speech(self, audio_path: str) -> Optional[str]:
        """Transcribe an audio file with Whisper and delete the file afterwards.

        Args:
            audio_path: Path of the audio file to transcribe. The file is
                removed whether or not transcription succeeds.

        Returns:
            The transcribed text, or ``None`` if transcription failed.
        """
        if not audio_path:
            logger.error("Speech recognition failed: no audio file provided")
            return None

        try:
            logger.info("Transcribing audio from %s", audio_path)
            result = self.whisper_model.transcribe(audio_path)
            logger.info("Transcription result: %s", result["text"])
            return result["text"]
        except Exception as e:
            logger.error(f"Speech recognition failed: {str(e)}")
            return None
        finally:
            # Always remove the temporary recording, including when Whisper
            # raises. A failed delete must not discard a good transcript.
            try:
                os.unlink(audio_path)
            except OSError as e:
                logger.warning("Could not remove audio file %s: %s", audio_path, e)

    def process_image(self, img: Image.Image, command: str) -> str:
        """Ask the ollama vision model about an image and return its answer.

        The image is shrunk in place to fit within 96x96 pixels, encoded as
        PNG, and sent with ``command`` as the prompt. The streamed response
        fragments are concatenated into one string.

        Args:
            img: Image to describe; it is resized in place.
            command: Prompt text sent to the model.

        Returns:
            The model's full response, or ``"Error processing image"`` if
            any step raises.
        """
        try:
            img.thumbnail((96, 96), Image.Resampling.LANCZOS)

            with BytesIO() as png_buffer:
                img.save(png_buffer, format='PNG')
                encoded = png_buffer.getvalue()

            fragments = generate(
                model='llava:7b-v1.6',
                prompt=command,
                images=[encoded],
                stream=True
            )
            return ''.join(fragment['response'] for fragment in fragments)
        except Exception as exc:
            logger.error("Image processing failed: %s", exc)
            return "Error processing image"

    def speak_text(self, text: str) -> None:
        """Say ``text`` aloud through the local pyttsx3 engine.

        Blocks until the engine has finished speaking. Failures are logged
        and swallowed, so a speech error never propagates to the caller.
        """
        try:
            logger.info("Speaking (local): %s", text)
            engine = self.tts_engine
            engine.say(text)
            engine.runAndWait()
        except Exception as exc:
            logger.error("Local TTS failed: %s", exc)
    
    def capture_image(self, max_attempts: int = 2) -> Optional[Image.Image]:
        """Grab one JPEG frame from the camera's MJPEG stream.

        The stream is read in small chunks until the buffer holds a JPEG
        start-of-image marker (``FF D8``) followed by an end-of-image marker
        (``FF D9``). The bytes between them are opened as an image.

        Args:
            max_attempts: Number of connection attempts before giving up.

        Returns:
            The captured frame, or ``None`` if every attempt failed.
        """
        for attempt in range(max_attempts):
            try:
                with requests.get(CAMERA_URL, stream=True, timeout=3) as response:
                    if response.status_code != 200:
                        raise RuntimeError(f"Bad response code: {response.status_code}")

                    buffer = b""
                    for chunk in response.iter_content(chunk_size=512):
                        buffer += chunk

                        start = buffer.find(b"\xff\xd8")
                        if start == -1:
                            continue

                        # Look for the end marker only after the start marker.
                        # An earlier stray end marker would otherwise hide the
                        # real one and no frame would ever be returned.
                        end = buffer.find(b"\xff\xd9", start + 2)
                        if end != -1:
                            jpeg_data = buffer[start:end + 2]
                            return Image.open(BytesIO(jpeg_data))

                    # The stream ended without a full frame; report it like
                    # any other failed attempt instead of failing silently.
                    raise RuntimeError("Stream ended before a complete JPEG frame arrived")

            except Exception as e:
                logger.warning(f"Attempt {attempt + 1} failed: {e}")
                time.sleep(1)

        logger.error("Failed to capture image from MJPEG stream.")
        return None
    
    def process_command(self, command: str) -> None:
        """Confirm a spoken command, then answer it using a camera frame.

        Flow: read the command back and record a short reply. Unless the
        reply contains "yes", stop. Otherwise capture a frame on a helper
        thread (waiting at most 3 seconds), send it with the command to
        ``process_image``, and speak the result. Every outcome is spoken and
        logged; the method always returns ``None``.

        Args:
            command: The transcribed user request, used as the model prompt.
        """
        self.speak_text(f"Did you mean: {command}? Please say 'yes' or 'no'")

        reply_path = self.record_audio(duration=2)
        if not reply_path:
            self.speak_text("Failed to record your confirmation. Command cancelled")
            logger.warning("Failed to record confirmation")
            return

        confirmation = self.recognize_speech(reply_path)
        if not confirmation:
            self.speak_text("I didn't hear your confirmation")
            logger.warning("No confirmation received")
            return

        logger.info(f"Confirmation received: {confirmation}")
        if "yes" not in confirmation.lower():
            self.speak_text("Command cancelled")
            logger.info("Command cancelled by user")
            return

        self.speak_text("Processing your command")

        # Capture on a separate thread so the wait can be capped at 3 seconds.
        holder = {}

        def grab_frame():
            holder['image'] = self.capture_image()

        worker = threading.Thread(target=grab_frame)
        worker.start()
        worker.join(timeout=3)

        image = holder.get('image')
        if not image:
            self.speak_text("Failed to capture an image from the camera")
            logger.error("Image capture failed")
            return

        try:
            caption = self.process_image(image, command)
            if caption and caption != "Error processing image":
                self.speak_text(caption)
                logger.info(f"Command processed successfully. Result: {caption}")
            else:
                self.speak_text("Sorry, I couldn't process the image")
                logger.warning("Image processing failed or returned no result")
        except Exception as e:
            self.speak_text("An error occurred while processing your command")
            logger.error(f"Command processing error: {str(e)}")

    
    def detect_currency(self) -> None:
        """Announce currency seen on the camera stream until the user says 'stop'.

        JPEG frames are cut out of the MJPEG stream and shown in a window.
        Every 20th decoded frame is run through ``self.currency_model``.
        Detections with confidence above 0.6 are spoken, at most once per
        class every ``announcement_cooldown`` seconds. A background task on
        ``self.executor`` records 4-second clips and ends the session when a
        clip contains "stop".

        The listener is signalled and the HTTP connection closed on every
        exit path, including stream errors and unexpected exceptions.
        """
        try:
            logger.info("Starting currency detection...")
            self.speak_text("Say 'stop' when you want to end the session.")

            if self.currency_model is None:
                logger.error("YOLO model not initialized")
                self.speak_text("Currency detection not available. YOLO model not initialized.")
                return

            last_announcement_time = {}
            announcement_cooldown = 3  # seconds between repeated announcements
            # An Event, not a bare bool: the flag is shared with the listener thread.
            stop_event = threading.Event()
            window_open = False
            # The timeout keeps an unreachable camera from blocking indefinitely.
            stream = requests.get(CAMERA_URL, stream=True, timeout=TIMEOUT)

            try:
                if stream.status_code != 200:
                    self.speak_text("Failed to connect to the camera")
                    logger.error("Camera connection failed")
                    return

                bytes_stream = b""
                frame_counter = 0

                def listen_for_stop():
                    """Record short clips until one of them contains 'stop'."""
                    while not stop_event.is_set():
                        audio_path = self.record_audio(duration=4)
                        if audio_path:
                            command = self.recognize_speech(audio_path)
                            if command and "stop" in command.lower():
                                logger.info("Stop command detected")
                                stop_event.set()
                                break
                        time.sleep(0.1)

                self.executor.submit(listen_for_stop)
                self.speak_text("Iam Looking for money.")

                while not stop_event.is_set():
                    try:
                        for chunk in stream.iter_content(chunk_size=1024):
                            if stop_event.is_set():
                                break

                            bytes_stream += chunk
                            a = bytes_stream.find(b'\xff\xd8')  # Start of JPEG
                            if a == -1:
                                continue
                            # Only an end marker after the start marker closes
                            # the frame; an earlier one would give an empty slice.
                            b = bytes_stream.find(b'\xff\xd9', a + 2)
                            if b == -1:
                                continue

                            jpg = bytes_stream[a:b + 2]
                            bytes_stream = bytes_stream[b + 2:]
                            frame = cv2.imdecode(np.frombuffer(jpg, dtype=np.uint8), cv2.IMREAD_COLOR)
                            if frame is None:
                                continue

                            cv2.imshow("Camera Stream", frame)
                            cv2.waitKey(1)
                            window_open = True

                            # Run the detector on every 20th frame only.
                            frame_counter += 1
                            if frame_counter % 20 != 0:
                                continue

                            # Calling an ultralytics model returns a list of
                            # Results objects, one per input image.
                            results = self.currency_model(frame)[0]

                            current_time = time.time()
                            for box in results.boxes:
                                conf = box.conf.item()
                                if conf <= 0.6:
                                    continue
                                cls_name = self.currency_model.names[int(box.cls.item())]
                                last_time = last_announcement_time.get(cls_name)
                                if last_time is None or (current_time - last_time) > announcement_cooldown:
                                    logger.info(f"Detected currency: {cls_name} with confidence {conf:.2f}")
                                    self.speak_text(f"{cls_name} detected")
                                    last_announcement_time[cls_name] = current_time

                    except requests.exceptions.Timeout:
                        continue
                    except requests.exceptions.RequestException as e:
                        logger.error(f"Stream error: {str(e)}")
                        self.speak_text("Camera stream interrupted")
                        break
            finally:
                # Stop the listener even when the session ends on an error;
                # otherwise it keeps recording and blocks executor shutdown.
                stop_event.set()
                stream.close()
                if window_open:
                    cv2.destroyAllWindows()

            self.speak_text("Currency detection session complete")
            logger.info("Currency detection completed")

        except Exception as e:
            logger.error(f"Unexpected error: {str(e)}")
            self.speak_text("An unexpected error occurred during currency detection.")



    def make_call(self, name: str) -> None:
        """Announce a call to ``name``; no real call is placed (placeholder).

        Speaks a start message, waits one second in place of real call logic,
        then speaks a completion message. Any exception is logged and
        reported to the user by voice.
        """
        try:
            logger.info("Attempting to call %s...", name)
            self.speak_text(f"Calling {name}")

            # Stand-in for real call logic.
            simulated_delay = 1
            time.sleep(simulated_delay)

            self.speak_text(f"Call to {name} completed (placeholder)")
            logger.info("Call to %s processed", name)
        except Exception as exc:
            logger.error("Call failed: %s", exc)
            self.speak_text("An error occurred while making the call")

    def detect_hazards(self) -> None:
        """Announce hazards seen on the camera stream until the user says 'stop'.

        JPEG frames are cut out of the MJPEG stream. Every 15th decoded frame
        is run through ``self.hazard_model`` and shown with its bounding
        boxes; the other frames are shown unannotated. Detections with
        confidence above 0.5 are spoken, at most once per class every
        ``announcement_cooldown`` seconds. A background task on
        ``self.executor`` records 4-second clips and ends the session when a
        clip contains "stop".

        The listener is signalled and the HTTP connection closed on every
        exit path, including stream errors and unexpected exceptions.
        """
        try:
            logger.info("Starting hazards detection...")
            self.speak_text("Say 'stop' when you want to end the session.")

            # Check if YOLO model is loaded
            if self.hazard_model is None:
                logger.error("YOLO model not initialized")
                self.speak_text("Hazard detection not available. YOLO model not initialized.")
                return

            # Set up time tracking for detection frequency
            last_announcement_time = {}
            announcement_cooldown = 3  # seconds between repeated announcements
            # An Event, not a bare bool: the flag is shared with the listener thread.
            stop_event = threading.Event()
            window_open = False

            # Start streaming from camera. The timeout keeps an unreachable
            # camera from blocking indefinitely.
            stream = requests.get(CAMERA_URL, stream=True, timeout=TIMEOUT)

            try:
                if stream.status_code != 200:
                    self.speak_text("Failed to connect to the camera")
                    logger.error("Camera connection failed")
                    return

                bytes_stream = b""
                frame_counter = 0

                def listen_for_stop():
                    """Record short clips until one of them contains 'stop'."""
                    while not stop_event.is_set():
                        audio_path = self.record_audio(duration=4)
                        if audio_path:
                            command = self.recognize_speech(audio_path)
                            if command and "stop" in command.lower():
                                logger.info("Stop command detected")
                                stop_event.set()
                                break
                        time.sleep(0.1)

                # Start the listening task
                self.executor.submit(listen_for_stop)

                self.speak_text("Looking for objects.")

                while not stop_event.is_set():
                    try:
                        for chunk in stream.iter_content(chunk_size=1024):
                            if stop_event.is_set():
                                break

                            bytes_stream += chunk
                            a = bytes_stream.find(b'\xff\xd8')  # Start of JPEG
                            if a == -1:
                                continue
                            # Only an end marker after the start marker closes
                            # the frame; an earlier one would give an empty slice.
                            b = bytes_stream.find(b'\xff\xd9', a + 2)  # End of JPEG
                            if b == -1:
                                continue

                            jpg = bytes_stream[a:b + 2]
                            bytes_stream = bytes_stream[b + 2:]

                            frame = cv2.imdecode(np.frombuffer(jpg, dtype=np.uint8), cv2.IMREAD_COLOR)
                            if frame is None:
                                continue

                            # Run the detector on every 15th frame only; the
                            # frames in between are just displayed.
                            frame_counter += 1
                            if frame_counter % 15 != 0:
                                cv2.imshow("Camera Stream", frame)
                                cv2.waitKey(1)
                                window_open = True
                                continue

                            # Detect objects using YOLO
                            results = self.hazard_model(frame)[0]
                            annotated_frame = results.plot()

                            current_time = time.time()

                            # Process and announce detected objects
                            for box in results.boxes:
                                conf = box.conf.item()
                                # Only announce if confidence is high enough
                                if conf <= 0.5:
                                    continue
                                cls_name = self.hazard_model.names[int(box.cls.item())]
                                # Avoid repeating the same object too frequently
                                last_time = last_announcement_time.get(cls_name)
                                if last_time is None or (current_time - last_time) > announcement_cooldown:
                                    logger.info(f"Detected hazard: {cls_name} with confidence {conf:.2f}")
                                    self.speak_text(f"{cls_name} ahead")
                                    logger.info(f"{cls_name} ahead")
                                    last_announcement_time[cls_name] = current_time

                            # Display the frame with bounding boxes
                            cv2.imshow("Camera Stream", annotated_frame)
                            cv2.waitKey(1)
                            window_open = True

                    except requests.exceptions.Timeout:
                        # Continue if timeout occurs in the request
                        continue
                    except requests.exceptions.RequestException as e:
                        logger.error(f"Stream error: {str(e)}")
                        self.speak_text("Camera stream interrupted")
                        break
            finally:
                # Stop the listener even when the session ends on an error;
                # otherwise it keeps recording and blocks executor shutdown.
                stop_event.set()
                stream.close()
                if window_open:
                    cv2.destroyAllWindows()

            self.speak_text("Hazard detection session complete")
            logger.info("Hazard detection completed")

        except Exception as e:
            logger.error(f"Hazards detection failed: {str(e)}")
            self.speak_text("An error occurred during hazards detection")

    def run(self) -> None:
        """Main loop: wait for the wake word, take a mode command, run that mode.

        Each pass of the outer loop reads one Porcupine frame from the
        microphone. When the wake word is detected, the general (hub) mode
        listens for "assistance", "money"/"currency", "call",
        "hazard"/"dangerous", or "close". The selected mode then runs until
        the user says "back" (mode flag cleared) or "close" (shutdown).
        ``cleanup`` is always called on exit, including on Ctrl+C.
        """
        logger.info("Starting Vision Assistant. Listening for 'Hey Vision!'...")
        try:
            while not self.shutdown_flag:
                pcm = self.audio_stream.read(self.porcupine.frame_length, exception_on_overflow=False)
                pcm = struct.unpack_from("h" * self.porcupine.frame_length, pcm)

                if self.porcupine.process(pcm) >= 0:
                    logger.info("Wake word 'Hey Vision' detected!")
                    self.speak_text("Hello friend how can i help you today")

                    self.is_active = True  # Enter general mode directly
                    # General mode (hub mode): runs until a mode is chosen.
                    while self.is_active and not (
                        self.is_assistance_mode
                        or self.is_currency_mode
                        or self.is_call_mode
                        or self.is_hazards_mode
                    ):
                        time.sleep(0.1)

                        audio_path = self.record_audio(duration=3)
                        if not audio_path:
                            continue

                        command = self.recognize_speech(audio_path)
                        if not command:
                            logger.warning("No command recognized")
                            self.speak_text("You can say 'assistance', 'money', 'call', or 'hazards' to proceed, or 'close' to exit")
                            continue

                        command_lower = command.lower().strip()
                        logger.info(f"Recognized command: {command}")
                        if "close" in command_lower:
                            logger.info("Close command detected! Shutting down...")
                            self.speak_text("Goodbye, shutting down now")
                            self.shutdown_flag = True
                            break
                        elif "assistance" in command_lower or "assistant" in command_lower:
                            logger.info("Switching to assistance mode!")
                            self.speak_text("Switching to assistance mode")
                            self.is_assistance_mode = True
                            break
                        elif "money" in command_lower or "currency" in command_lower:
                            logger.info("Switching to currency detection mode!")
                            self.speak_text("Switching to currency detection mode")
                            self.is_currency_mode = True
                            break
                        elif "call" in command_lower:
                            logger.info("Switching to call mode!")
                            self.speak_text("Switching to call mode")
                            self.is_call_mode = True
                            break
                        elif "hazard" in command_lower or "dangerous" in command_lower:
                            logger.info("Switching to hazards mode!")
                            self.speak_text("Switching to hazards mode")
                            self.is_hazards_mode = True
                            break
                        else:
                            logger.info("I didn't catch that, Please try again or say 'close' to exit")
                            self.speak_text("I didn't catch that, Please try again or say 'close' to exit")

                # Assistance mode loop
                while self.is_assistance_mode and not self.shutdown_flag:
                    audio_path = self.record_audio(duration=3)
                    if not audio_path:
                        logger.warning("Failed to record audio in assistance mode")
                        self.speak_text("Failed to record your command. Please try again")
                        continue

                    command = self.recognize_speech(audio_path)
                    if not command:
                        logger.warning("No command recognized in assistance mode")
                        self.speak_text("Please try again or say 'back' to return")
                        continue

                    command_lower = command.lower().strip()
                    logger.info(f"Recognized command in assistance mode: {command}")

                    if "close" in command_lower:
                        logger.info("Close command detected! Shutting down...")
                        self.speak_text("Goodbye, shutting down now")
                        self.shutdown_flag = True
                        break
                    elif "back" in command_lower:
                        logger.info("Returning to general mode")
                        self.speak_text("Returning to general mode")
                        self.is_assistance_mode = False
                        break
                    else:
                        # NOTE(review): process_command is annotated -> None
                        # and has no value-returning path, so the failure
                        # branch below cannot fire unless that changes.
                        success = self.process_command(command)
                        if success is False:
                            logger.warning("Command processing failed in assistance mode")
                            self.speak_text("Command could not be processed. Please try again")
                        else:
                            self.speak_text("Say 'back' to return or 'close' to exit")

                # Currency detection mode
                if self.is_currency_mode:
                    # Streams from the camera until the user says 'stop'.
                    self.detect_currency()
                    self._await_back_or_close("is_currency_mode", "currency")

                # Call mode: one prompt per pass of the outer loop. The flag
                # stays set after a call, so the prompt repeats until the user
                # says 'back' or 'close'.
                if self.is_call_mode:
                    self.speak_text("Call mode active. Who do you want to call?")
                    time.sleep(0.1)

                    audio_path = self.record_audio(duration=3)
                    if audio_path:
                        name = self.recognize_speech(audio_path)
                        if name:
                            name_lower = name.lower().strip()
                            logger.info(f"Recognized name in call mode: {name}")
                            if "close" in name_lower:
                                logger.info("Close command detected! Shutting down...")
                                self.speak_text("Goodbye, shutting down now")
                                self.shutdown_flag = True
                                break  # leaves the outer loop
                            elif "back" in name_lower:
                                logger.info("Returning to general mode")
                                self.speak_text("Returning to general mode")
                                self.is_call_mode = False
                            else:
                                self.make_call(name)
                                self.speak_text("Say 'back' to return or 'close' to exit")
                        else:
                            logger.warning("No name recognized in call mode")
                            self.speak_text("Please try again or say 'back' to return")
                    else:
                        logger.warning("Failed to record audio in call mode")
                        self.speak_text("Failed to record your command. Please try again")

                # Hazards mode
                if self.is_hazards_mode:
                    # Streams from the camera until the user says 'stop'.
                    self.detect_hazards()
                    self._await_back_or_close("is_hazards_mode", "hazards")

        except KeyboardInterrupt:
            logger.info("Initiating graceful shutdown via keyboard interrupt...")
            self.shutdown_flag = True

        finally:
            logger.info("Performing cleanup...")
            self.cleanup()

    def _await_back_or_close(self, mode_attr: str, mode_label: str) -> None:
        """Prompt until the user says 'back' (leave the mode) or 'close' (shut down).

        Args:
            mode_attr: Name of the boolean mode flag on ``self`` that is
                cleared on 'back'.
            mode_label: Mode name used in the log message.
        """
        # 'and', not 'or': stop once shutdown is requested or the mode ends.
        while not self.shutdown_flag and getattr(self, mode_attr):
            self.speak_text("Say 'back' to return or 'close' to exit")
            audio_path = self.record_audio(duration=3)
            if not audio_path:
                continue

            command = self.recognize_speech(audio_path)
            if not command:
                continue

            command_lower = command.lower().strip()
            logger.info(f"Recognized command in {mode_label} mode: {command}")
            if "close" in command_lower:
                logger.info("Close command detected! Shutting down...")
                self.speak_text("Goodbye, shutting down now")
                self.shutdown_flag = True
                break
            elif "back" in command_lower:
                logger.info("Returning to general mode")
                self.speak_text("Returning to general mode")
                setattr(self, mode_attr, False)
                break

    def cleanup(self) -> None:
        """Release the audio stream, PyAudio, Porcupine, and the thread pool.

        Each step is attempted on its own, so a failure in one is logged and
        the remaining resources are still released.
        """
        release_steps = (
            ("audio stream", lambda: self.audio_stream.close()),
            ("PyAudio", lambda: self.pa.terminate()),
            ("Porcupine", lambda: self.porcupine.delete()),
            # Waits for background tasks that are still running to finish.
            ("executor", lambda: self.executor.shutdown(wait=True)),
        )
        for label, release in release_steps:
            try:
                release()
            except Exception as e:
                logger.error(f"Cleanup of {label} failed: {str(e)}")

# Script entry point: build the assistant with its default Whisper model size
# and run its main loop (run() calls cleanup() itself when it exits).
if __name__ == "__main__":
    assistant = VisionAssistant()
    assistant.run()