In [None]:
# ✅ All final updates: start prompt, image+prompt input, short response control, direction mode, and strict model-only output

import requests
import json
import os
import time
import speech_recognition as sr
from gtts import gTTS
import playsound
import base64

# JSON file holding the conversation history; read by load_chat_history(),
# written by save_chat_history() and removed by the "clear history" commands.
CHAT_HISTORY_FILE = "chat_history.json"
# MP3 file that text_to_speech() writes the synthesized reply to before playing it.
AUDIO_FILE = "bot_output.mp3"
# HTTP endpoint the prompts are POSTed to (presumably a local Ollama server — confirm).
BOT_URL = "http://localhost:11434/api/generate"
# URL that fetch_image() requests with GET to obtain a camera frame.
ESP32_CAM_URL = "http://192.168.217.184/camera"
# Local path where fetch_image() stores the most recently downloaded frame.
IMAGE_FILE = "latest.jpg"

# Utterances that activate the assistant; main() compares the stripped,
# lower-cased transcript for exact membership in this list.
start_prompts = ["start", "let's start", "lets start", "suru karo", "started"]
# Utterances that end the main loop; matched the same exact way as start_prompts.
exit_prompts = [
    "bye", "exit", "quit", "shutdown", "stop", "close", "turn off",
    "band karo", "band kar do", "niklo", "khatam", "goodbye", "disconnect", "off", "okay bye", "ok bye"
]
# Phrases that make handle_prompt() capture a camera image; matched as
# substrings of the lower-cased prompt.
image_prompts = [
    "what is in front of me", "take a photo", "capture photo",
    "show me what's ahead", "read it", "explain the surrounding", "read the book",
    "which currency", "read currency", "mujhe kya dikh raha hai", "photo le lo",
    "mere saamne kya hai", "kitni currency hai", "currency padho", "kya likha hai"
]
# Substring that switches handle_prompt() into the continuous direction loop.
direction_mode_trigger = ["direction mode"]
# Substring that ends the direction loop. NOTE: this phrase itself contains
# the "direction mode" trigger above.
stop_direction_trigger = ["stop direction mode"]

def load_chat_history():
    """Load the saved conversation history from CHAT_HISTORY_FILE.

    Returns:
        The stored list of exchanges, or an empty list when the file is
        missing, unreadable, not valid JSON, or does not hold a JSON list.
    """
    try:
        with open(CHAT_HISTORY_FILE, 'r', encoding='utf-8') as file:
            history = json.load(file)
    except FileNotFoundError:
        # No history yet is the normal first-run case; stay quiet.
        return []
    except (OSError, ValueError) as err:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"⚠️ Could not read chat history: {err}")
        return []

    # Callers append to the result, so anything but a list is unusable.
    return history if isinstance(history, list) else []

def save_chat_history(history):
    """Write the conversation history to CHAT_HISTORY_FILE as indented JSON.

    The history is serialised before the file is opened, so a value that
    cannot be encoded never truncates the previously saved history.
    Failures are reported on stdout and otherwise ignored (best effort).

    Args:
        history: JSON-serialisable list of exchanges to persist.
    """
    try:
        # ensure_ascii=False keeps non-English transcripts readable on disk.
        payload = json.dumps(history, indent=4, ensure_ascii=False)
    except (TypeError, ValueError) as err:
        print(f"Error saving chat history: {err}")
        return

    try:
        with open(CHAT_HISTORY_FILE, 'w', encoding='utf-8') as file:
            file.write(payload)
    except (OSError, ValueError) as err:
        # ValueError covers text that cannot be encoded as UTF-8.
        print(f"Error saving chat history: {err}")

def speech_to_text(prompt_msg="🎤 Listening..."):
    """Record one phrase from the microphone and transcribe it.

    Args:
        prompt_msg: Message printed just before listening starts.

    Returns:
        The transcript returned by the Google recogniser, or None when
        nothing was heard, the audio was unintelligible, or the recognition
        request failed.
    """
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        print(prompt_msg)
        recognizer.adjust_for_ambient_noise(source, duration=0.2)
        try:
            audio = recognizer.listen(source, timeout=3, phrase_time_limit=5)
            print("🔎 Recognizing speech...")
            return recognizer.recognize_google(audio)
        except sr.WaitTimeoutError:
            # Nobody started speaking within the listen timeout.
            print("⌛ No speech detected.")
        except sr.UnknownValueError:
            print("❌ Speech not understood.")
        except sr.RequestError as err:
            print(f"⚠️ Speech recognition service error: {err}")
        except Exception as err:
            # Stay best-effort for unexpected audio errors, but unlike a bare
            # `except:` this lets KeyboardInterrupt stop the assistant.
            print(f"❌ Speech recognition failed: {err}")
        return None

def text_to_speech(text):
    """Speak *text* aloud by synthesising it to AUDIO_FILE and playing it.

    Empty or whitespace-only text is skipped, since there is nothing to
    synthesise. Synthesis and playback failures are reported on stdout and
    otherwise ignored so the assistant keeps running.

    Args:
        text: The sentence(s) to speak.
    """
    if not text or not text.strip():
        return

    try:
        tts = gTTS(text, lang='en')
        tts.save(AUDIO_FILE)
        playsound.playsound(AUDIO_FILE)
    except Exception as err:
        # Best-effort boundary: report the cause instead of hiding it, and
        # let KeyboardInterrupt through (a bare `except:` would swallow it).
        print(f"❌ TTS error: {err}")

def fetch_image():
    """Download the current camera frame and store it in IMAGE_FILE.

    Returns:
        IMAGE_FILE when the camera answered with HTTP 200 and the bytes were
        written; None when the request failed, returned another status, or
        any error occurred while fetching or saving.
    """
    try:
        print("📸 Fetching image from ESP32-CAM...")
        reply = requests.get(ESP32_CAM_URL, timeout=5)

        # Guard clause: anything other than 200 is reported and treated as a miss.
        if reply.status_code != 200:
            print(f"❌ Failed to get image: {reply.status_code}")
            return None

        with open(IMAGE_FILE, "wb") as out_file:
            out_file.write(reply.content)
        print("✅ Image saved as latest.jpg")
        return IMAGE_FILE
    except Exception as err:
        print(f"⚠️ Error fetching image: {err}")
    return None

def send_image_and_prompt_to_bot(image_path, prompt_text):
    """Send an image plus a text prompt to the bot and return its reply.

    The image is base64-encoded into the request payload. The reply is
    streamed line by line, echoed to stdout as it arrives, and returned as
    one string.

    Args:
        image_path: Path of the image file to attach.
        prompt_text: The user's question about the image. Unless it contains
            "explain", the bot is asked to answer within 13 words.

    Returns:
        The bot's full reply, or a fixed fallback sentence when the image
        cannot be read or the request fails.
    """
    fallback = "I'm not sure what's in front of you."
    print("📡 Sending image + prompt to Bot...")

    # Reading the file can fail (missing or unreadable); degrade to the
    # fallback reply instead of crashing the assistant.
    try:
        with open(image_path, "rb") as img:
            b64_image = base64.b64encode(img.read()).decode('utf-8')
    except (OSError, TypeError) as e:
        print(f"❌ Error reading image: {e}")
        return fallback

    if "explain" in prompt_text.lower():
        prompt_to_send = f"{prompt_text}. Respond in English only."
    else:
        prompt_to_send = f"{prompt_text}. Respond in English only. Keep the answer within 13 words."

    data = {
        "model": "llava",
        "prompt": prompt_to_send,
        "images": [b64_image]
    }

    try:
        # (connect, read) timeout: without one a stalled server blocks forever.
        with requests.post(
            BOT_URL,
            headers={"Content-Type": "application/json"},
            json=data,
            stream=True,
            timeout=(5, 180),
        ) as response:
            response.raise_for_status()
            parts = []
            for line in response.iter_lines():
                if line:
                    result = json.loads(line)
                    text = result.get("response", "")
                    print(text, end='', flush=True)
                    parts.append(text)
            return "".join(parts)
    except Exception as e:
        print(f"❌ Error talking to Bot: {e}")
        return fallback

def send_text_prompt_to_bot(prompt_text):
    """Send a text-only prompt to the bot and return its reply.

    The reply is streamed line by line, echoed to stdout as it arrives, and
    returned as one string.

    Args:
        prompt_text: The user's question. Unless it contains "explain", the
            bot is asked to answer within 13 words.

    Returns:
        The bot's full reply, or "Something went wrong." when the request
        fails or the stream cannot be parsed.
    """
    if "explain" in prompt_text.lower():
        prompt_to_send = f"{prompt_text}. Respond in English only."
    else:
        prompt_to_send = f"{prompt_text}. Respond in English only. Keep the answer within 13 words."

    data = {
        "model": "llava",
        "prompt": prompt_to_send,
        "images": []
    }

    try:
        # (connect, read) timeout: without one a stalled server blocks forever.
        with requests.post(
            BOT_URL,
            headers={"Content-Type": "application/json"},
            json=data,
            stream=True,
            timeout=(5, 180),
        ) as response:
            response.raise_for_status()
            parts = []
            for line in response.iter_lines():
                if line:
                    result = json.loads(line)
                    text = result.get("response", "")
                    print(text, end='', flush=True)
                    parts.append(text)
            return "".join(parts)
    except Exception as e:
        print(f"❌ Error querying Bot: {e}")
        return "Something went wrong."

def handle_prompt(prompt):
    """Dispatch one recognised utterance to the matching assistant action.

    Checked in order, using substring matches on the lower-cased prompt:
      1. clear-history phrases: delete the chat history file;
      2. a stop-direction phrase outside direction mode: report that the
         mode is not active (the phrase contains the start trigger, so it
         must be tested first);
      3. direction-mode trigger: repeatedly capture an image and speak a
         movement suggestion until a stop phrase is heard or capture fails;
      4. image phrases: capture an image, ask what to describe, and speak
         the bot's answer;
      5. anything else: send the prompt to the bot as plain text.
    Exchanges from steps 4 and 5 are appended to the chat history.

    Args:
        prompt: The transcript of what the user said.
    """
    clear_triggers = [
        "clear history", "delete chat", "reset conversation", "delete history",
        "clear chat", "chat clear kar do", "history hata do", "clear karo"
    ]
    # Lower-case once instead of once per trigger check.
    lowered = prompt.lower()

    if any(phrase in lowered for phrase in clear_triggers):
        # Try the removal directly rather than exists-then-remove, which is racy.
        try:
            os.remove(CHAT_HISTORY_FILE)
        except FileNotFoundError:
            print("⚠️ No chat history found.")
            text_to_speech("There was no history to delete.")
        else:
            print("🗑️ Chat history deleted.")
            text_to_speech("Chat history cleared.")
        return

    # "stop direction mode" contains "direction mode"; without this check,
    # saying it outside direction mode would start the mode.
    if any(phrase in lowered for phrase in stop_direction_trigger):
        print("⚠️ Direction mode is not active.")
        text_to_speech("Direction mode is not active.")
        return

    if any(phrase in lowered for phrase in direction_mode_trigger):
        text_to_speech("Direction mode activated. Say 'stop direction mode' to exit.")
        while True:
            image_path = fetch_image()
            if not image_path:
                text_to_speech("Could not capture image.")
                break
            direction_response = send_image_and_prompt_to_bot(image_path, "Give movement suggestion without saying to look")
            print(f"\n🤖 Bot: {direction_response}")
            text_to_speech(direction_response)
            print("🎤 Say 'stop direction mode' to exit or wait for next direction...")
            try:
                command = speech_to_text("🎧 Listening for stop command...")
            except Exception as err:
                # Keep giving directions if listening fails, but do not
                # swallow KeyboardInterrupt the way a bare `except:` would.
                print(f"⚠️ Could not listen for stop command: {err}")
                continue
            if command and any(phrase in command.lower() for phrase in stop_direction_trigger):
                text_to_speech("Exiting direction mode.")
                break
        return

    if any(img_trigger in lowered for img_trigger in image_prompts):
        image_path = fetch_image()
        if not image_path:
            text_to_speech("Failed to capture image.")
            return
        spoken_prompt = speech_to_text("🎧 What should I describe in this image?")
        if not spoken_prompt:
            text_to_speech("I didn't catch that.")
            return
        response = send_image_and_prompt_to_bot(image_path, spoken_prompt)
        print(f"\n🤖 Bot: {response}")
        text_to_speech(response)
        chat_history = load_chat_history()
        chat_history.append({"user": spoken_prompt, "llava": response})
        save_chat_history(chat_history)
        return

    response = send_text_prompt_to_bot(prompt)
    print(f"\n🤖 Bot: {response}")
    text_to_speech(response)
    chat_history = load_chat_history()
    chat_history.append({"user": prompt, "llava": response})
    save_chat_history(chat_history)

def main():
    """Run the voice assistant: wait for a start phrase, then serve prompts.

    Phase one listens until the stripped, lower-cased transcript exactly
    equals one of start_prompts. Phase two passes every recognised utterance
    to handle_prompt() until one exactly equals one of exit_prompts.
    """
    print("🧠 Say 'start' to activate Bot Voice Assistant...")

    # Phase 1: block until an activation phrase is heard.
    activated = False
    while not activated:
        heard = speech_to_text()
        activated = bool(heard) and heard.strip().lower() in start_prompts
    print("✅ Voice Assistant Activated.")

    # Phase 2: serve prompts until an exit phrase is heard.
    while True:
        prompt = speech_to_text()
        if not prompt:
            continue
        print(f"👤 You: {prompt}")
        if prompt.strip().lower() in exit_prompts:
            print("👋 Exiting.")
            return
        handle_prompt(prompt)

# Start the assistant only when this file is executed as a script, so that
# importing it does not begin listening on the microphone.
if __name__ == "__main__":
    main()


🧠 Say 'start' to activate Bot Voice Assistant...
🎤 Listening...
🔎 Recognizing speech...
✅ Voice Assistant Activated.
🎤 Listening...
🔎 Recognizing speech...
❌ Speech not understood.
🎤 Listening...
🔎 Recognizing speech...
👤 You: direction
 I'm sorry, but I cannot give a direct answer without more context or information. Can you please provide more details or clarify your question? 
🤖 Bot:  I'm sorry, but I cannot give a direct answer without more context or information. Can you please provide more details or clarify your question? 
🎤 Listening...
🔎 Recognizing speech...
👤 You: direction mod
 Direction modulation is a method of modulating the carrier wave to transmit information. It allows the receiver to determine the direction from which the signal is coming. 
🤖 Bot:  Direction modulation is a method of modulating the carrier wave to transmit information. It allows the receiver to determine the direction from which the signal is coming. 
🎤 Listening...
🔎 Recognizing speech...
❌ Speech n