In [1]:
import cv2
import mediapipe as mp
import numpy as np
import tensorflow as tf
import json
import time
import requests
import tkinter as tk
from tkinter import simpledialog
from tkinter import messagebox # Import messagebox explicitly for the popup
import re # Added for clean string manipulation

# --- Configuration ---
# Trained classifier and its index -> label mapping.
MODEL_PATH = "isl_model_2hand.h5"
LABELS_PATH = "labels_2hand.json"

# Feature layout: every hand contributes 21 landmarks with 2 coords each.
NUM_HANDS = 2
FEATURES_PER_HAND = 21 * 2  # 42
TOTAL_FEATURES = FEATURES_PER_HAND * NUM_HANDS  # 84

# Ollama endpoint and model used to interpret the signed letters.
OLLAMA_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL = "llama3.2:3b"  # CHANGE THIS if you use a different model

# --- Buffer and Timing Configuration ---
MIN_BUFFER_SIZE = 3            # Fewest buffered characters that may be sent on timeout
TIMEOUT_SECONDS = 5.0          # Seconds without a new character before the buffer is sent
SKIP_CHAR = "Q"                # Character to skip at start/end of buffer
REQUIRED_STABILITY_FRAMES = 5  # Frames a character must be held to be accepted

# --- Global State for Buffer and Timing ---
char_buffer = ""               # Characters accepted so far
last_char_time = time.time()   # Timestamp of the most recent buffer change
camera_is_running = True       # State variable for the camera loop

# --- Global State for Stability Tracking ---
current_stable_char = ""       # Candidate character currently being held
stability_counter = 0          # Consecutive frames the candidate has been seen

# --- Load Model and Labels ---
# Both artefacts are mandatory. On failure we print a hint and terminate with
# a non-zero status via SystemExit. The interactive `exit()` helper is avoided
# because it reports success (status 0) and is not meant for use in programs.
print("Loading 2-Handed model...")
try:
    model = tf.keras.models.load_model(MODEL_PATH)
except Exception as e:
    print(f"Error loading model: {e}")
    print(f"Please make sure '{MODEL_PATH}' is in the same folder.")
    raise SystemExit(1)

print("Loading labels...")
try:
    with open(LABELS_PATH, 'r', encoding='utf-8') as f:
        label_map_str = json.load(f)
    # JSON object keys are always strings; convert '0' -> 0 so the map can be
    # indexed with the model's predicted class index.
    label_map = {int(k): v for k, v in label_map_str.items()}
except FileNotFoundError:
    print(f"Error: '{LABELS_PATH}' not found.")
    print("Please make sure it is in the same folder.")
    raise SystemExit(1)
except (OSError, ValueError, AttributeError) as e:
    # OSError: file unreadable; ValueError: malformed JSON or a non-integer
    # key; AttributeError: top-level JSON value is not an object.
    print(f"Error: could not read labels from '{LABELS_PATH}': {e}")
    raise SystemExit(1)

print("Model and labels loaded. Starting camera...")

# --- MediaPipe and OpenCV Setup ---
# Hand tracker configured for a video stream (not independent still images),
# tracking up to NUM_HANDS hands per frame.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=NUM_HANDS,
    min_detection_confidence=0.6,
    min_tracking_confidence=0.5
)
mp_drawing = mp.solutions.drawing_utils

# Initialize video capture from the default camera (index 0).
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    # The main loop is guarded by cap.isOpened(), so without this message a
    # missing or busy camera would make the program end silently.
    print("Error: could not open camera 0. Is it connected and not in use by another program?")

# --- Normalization Function (Must be IDENTICAL to training) ---
def normalize_landmarks(landmarks_raw):
    """Normalize one hand's landmarks relative to its wrist.

    Each point is translated so that the first landmark (the wrist) becomes
    the origin, then every coordinate is divided by the largest absolute
    coordinate, so all values end up in [-1, 1].

    Args:
        landmarks_raw: Non-empty sequence of (x, y) pairs; the first entry is
            treated as the wrist.

    Returns:
        A flat 1-D ``numpy.ndarray`` ``[x0, y0, x1, y1, ...]`` holding two
        values per input landmark. If every landmark coincides with the
        wrist, an all-zero array of that same length is returned, so the
        return type no longer depends on the input.
    """
    points = np.asarray(landmarks_raw, dtype=float)

    # Translate so the wrist sits at the origin.
    relative = points - points[0]

    # Scale by the largest absolute coordinate; guard the degenerate case
    # where every point equals the wrist to avoid dividing by zero.
    max_val = np.abs(relative).max()
    if max_val == 0:
        return np.zeros(relative.size)

    return (relative / max_val).flatten()

# -----------------------------------------------
# --- Ollama and Tkinter Popup Functions (MODIFIED) ---
# -----------------------------------------------

def call_ollama(prompt_text, word_length):
    """Ask the local Ollama model which word the signed characters spell.

    Args:
        prompt_text: The raw sequence of recognised characters.
        word_length: Number of characters in ``prompt_text``; quoted in the
            prompt and used to cap the length of the generated answer.

    Returns:
        The model's answer reduced to uppercase ASCII letters. On failure a
        human-readable string is returned instead: it starts with ``"Error:"``
        for request problems, or ``"[Unable to parse]"`` when the answer
        contains no letters. This function does not raise for network errors.
    """
    print(f"Sending to LLM: '{prompt_text}' (Length: {word_length})")

    # --- REQUIRED LLM PROMPT ---
    llm_prompt = (
        f"The user signed a sequence of {word_length} characters: '{prompt_text}'. "
        "Assume that one or two of these characters might be misread due to signing errors "
        "or model uncertainty. "
        f"What is the single, most likely English word of exactly {word_length} letters "
        "that the user was trying to spell? "
        "Only return the predicted word, nothing else."
    )
    # ---------------------------

    data = {
        "model": OLLAMA_MODEL,
        "prompt": llm_prompt,
        "stream": False,
        "options": {
            "temperature": 0.3,
            "top_p": 0.9,
            "num_predict": word_length + 5,  # Limit output length slightly above word length
            # Stop sequences match anywhere in the generated text, so they
            # must not be substrings of plausible answers. "word" was removed
            # because it cut answers such as "Sword" down to "S".
            "stop": ["\n", "Input:", "Output:", "Explanation:"]
        }
    }

    try:
        response = requests.post(OLLAMA_URL, json=data, timeout=30)
        response.raise_for_status()
        # `or ''` guards against an explicit null "response" field.
        raw_result = (response.json().get('response') or '').strip()
    except requests.exceptions.ConnectionError:
        return f"Error: Could not connect to Ollama at {OLLAMA_URL}. Is it running?"
    except requests.exceptions.Timeout:
        return "Error: LLM request timed out"
    except requests.exceptions.RequestException as e:
        return f"Error: {str(e)}"
    except ValueError as e:
        # Older `requests` releases raise a plain ValueError for a body that
        # is not valid JSON; report it instead of crashing the camera loop.
        return f"Error: Invalid response from Ollama: {e}"

    print(f"Raw LLM response: '{raw_result}'")

    # Aggressively clean the output to extract a single word/sequence
    # 1. Take the first line
    cleaned = raw_result.split('\n')[0].strip()
    # 2. Remove all non-alphabetic characters and convert to uppercase
    cleaned = re.sub(r'[^A-Za-z]', '', cleaned).upper()

    if not cleaned:
        return f"[Unable to parse] Raw: {raw_result}"

    print(f"Cleaned output: '{cleaned}'")
    return cleaned


def show_llm_popup(input_chars):
    """Send the buffered characters to the LLM and show the result in a dialog.

    The camera loop is paused while this runs: the ``camera_is_running`` flag
    is cleared, the OpenCV windows are closed, and the call blocks on both the
    LLM request and the modal message box. Afterwards the character buffer and
    stability tracking are reset and the flag is set again.

    Args:
        input_chars: The character sequence to interpret.
    """
    global char_buffer, camera_is_running, current_stable_char, stability_counter

    # 1. STOP CAMERA
    camera_is_running = False
    cv2.destroyAllWindows()

    # 2. Prepare LLM parameters
    word_length = len(input_chars)

    # 3. CALL LLM
    llm_output = call_ollama(input_chars, word_length)

    # 4. SHOW POPUP
    result_message = (
        f"Detected Characters: {input_chars}\n\n"
        f"LLM Interpretation:\n"
        f"'{llm_output}'"
    )

    # A hidden root window is required to host the message box. It is
    # destroyed afterwards so that Tk roots do not accumulate, one per popup,
    # for the lifetime of the program.
    root = tk.Tk()
    root.withdraw()
    try:
        messagebox.showinfo("Sign Language Interpretation Result", result_message)
    finally:
        root.destroy()

    # 5. RESET STATE AND RESTART CAMERA
    char_buffer = ""
    current_stable_char = ""
    stability_counter = 0
    camera_is_running = True
    print("\n--- Awaiting new input ---\n")

# -----------------------------------------------
# --- Real-Time Loop ---
# -----------------------------------------------

print("\n--- Awaiting new input ---\n")
while cap.isOpened():
    # Only run the capture/prediction body while the camera is marked as
    # running; show_llm_popup() clears and restores this flag around its dialog.
    if camera_is_running:
        # Defensive re-check: the loop condition has normally just confirmed
        # that the capture is open.
        if not cap.isOpened():
            print("Camera was closed unexpectedly. Attempting to reopen...")
            cap = cv2.VideoCapture(0)
            if not cap.isOpened():
                print("Failed to reopen camera. Exiting.")
                break
            time.sleep(1)  # Give camera time to initialize
            continue

        success, frame = cap.read()
        if not success:
            # Nothing to process, but fall through so the buffer timeout
            # below is still evaluated.
            print("Ignoring empty camera frame.")
        else:
            # Flip horizontally, then convert BGR -> RGB for MediaPipe.
            frame = cv2.flip(frame, 1)
            img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = hands.process(img_rgb)

            all_hands_features = []

            # Draw landmarks and collect features
            if results.multi_hand_landmarks:
                num_hands_detected = len(results.multi_hand_landmarks)

                for hand_landmarks in results.multi_hand_landmarks:
                    mp_drawing.draw_landmarks(
                        frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                    landmarks_raw = [(lm.x, lm.y) for lm in hand_landmarks.landmark]
                    landmarks_norm = normalize_landmarks(landmarks_raw)
                    all_hands_features.extend(landmarks_norm)

                # Zero-pad the slots of hands that were not detected so the
                # vector always has TOTAL_FEATURES entries.
                if num_hands_detected < NUM_HANDS:
                    padding = [0.0] * FEATURES_PER_HAND * (NUM_HANDS - num_hands_detected)
                    all_hands_features.extend(padding)
            else:
                all_hands_features = [0.0] * TOTAL_FEATURES

            # --- Prediction and Stability Logic ---
            if len(all_hands_features) == TOTAL_FEATURES:
                data_array = np.array([all_hands_features])
                prediction = model.predict(data_array, verbose=0)

                class_index = np.argmax(prediction[0])
                confidence = np.max(prediction[0]) * 100

                # NOTE: no `global` statement is used here. This loop already
                # runs at module scope, and declaring `global` for names that
                # were assigned earlier in the module is a SyntaxError when
                # the file is executed as a plain script.

                # Check for the minimum required confidence
                if confidence > 80:  # Using the hardcoded 80 from original logic
                    predicted_char = label_map[class_index]
                    current_char = predicted_char
                    text = f"{predicted_char} ({confidence:.2f}%)"

                    # --- Stability Check ---
                    # Count consecutive confident frames showing the same character.
                    if current_char == current_stable_char:
                        stability_counter += 1
                    else:
                        current_stable_char = current_char
                        stability_counter = 1

                    # --- Buffer Management (Triggered by Stability) ---
                    if stability_counter >= REQUIRED_STABILITY_FRAMES:

                        # Check if the stable character is different from the last one in the buffer
                        is_new_char_for_buffer = not char_buffer or current_stable_char != char_buffer[-1]

                        if is_new_char_for_buffer:

                            # 1. Skip character 'Q' while the buffer is still empty
                            if not char_buffer and current_stable_char == SKIP_CHAR:
                                print(f"Ignored first stable char: {current_stable_char}")
                                # Reset stability to keep waiting for a valid start
                                current_stable_char = ""
                                stability_counter = 0

                            # 2. Add stable, new character to the main buffer
                            else:
                                char_buffer += current_stable_char
                                last_char_time = time.time()  # Reset timer on new character
                                print(f"Buffer updated: '{char_buffer}'")
                                # Reset stability counter after acceptance
                                stability_counter = 0
                                current_stable_char = ""  # Clear tracking char to look for the next distinct sign

                else:
                    text = "..."
                    # If confidence is low, reset stability tracking completely
                    stability_counter = 0
                    current_stable_char = ""

                # Display the result on a filled black box in the top-left corner
                (text_width, text_height), baseline = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 1, 2)
                cv2.rectangle(frame, (10, 10), (10 + text_width + 10, 10 + text_height + baseline + 10), (0,0,0), -1)
                cv2.putText(frame, text, (20, 20 + text_height), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

            # Display the current buffer and stability count
            buffer_text = f"Buffer: {char_buffer}"
            stability_text = f"Frames: {stability_counter}/{REQUIRED_STABILITY_FRAMES}"

            # Draw Buffer
            cv2.putText(frame, buffer_text, (frame.shape[1] - 300, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
            # Draw Stability Status
            cv2.putText(frame, stability_text, (frame.shape[1] - 300, 70), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 255), 2, cv2.LINE_AA)

            # Show the frame
            cv2.imshow('ISL Landmark Detection (2-Handed)', frame)

    # --- Timeout Check (Runs even if frame is empty or camera is paused) ---
    current_time = time.time()
    if char_buffer and (current_time - last_char_time) > TIMEOUT_SECONDS and len(char_buffer) >= MIN_BUFFER_SIZE:

        final_chars = char_buffer

        # 3. Skip 'Q' on the last character (if it's the last one)
        if final_chars[-1] == SKIP_CHAR:
            print(f"Removed final character '{SKIP_CHAR}' before processing.")
            final_chars = final_chars[:-1]

            # Check if the buffer is still long enough after removing 'Q'
            if len(final_chars) < MIN_BUFFER_SIZE:
                print(f"Buffer too short ({len(final_chars)}) after removing 'Q'. Keeping the shortened buffer and resetting timer.")
                char_buffer = final_chars  # Update buffer to exclude the final 'Q'
                last_char_time = time.time()  # Reset timer to wait for new input
                continue  # Skip popup this cycle and wait for new input

        # Perform LLM call and popup, which resets state and restarts the camera
        show_llm_popup(final_chars)

    # Handle user quit command
    if cv2.waitKey(5) & 0xFF == 27:  # Press 'ESC' to quit
        break

# --- Cleanup ---
# Release, in order: the camera handle, any OpenCV windows, the MediaPipe tracker.
for _shutdown_step in (cap.release, cv2.destroyAllWindows, hands.close):
    _shutdown_step()

Loading 2-Handed model...




Loading labels...
Model and labels loaded. Starting camera...

--- Awaiting new input ---





Buffer updated: 'E'
Buffer updated: 'EG'
Buffer updated: 'EGM'
Buffer updated: 'EGMF'
Sending to LLM: 'EGMF' (Length: 4)
Raw LLM response: 'MPEG'
Cleaned output: 'MPEG'

--- Awaiting new input ---

Buffer updated: 'F'
Buffer updated: 'FI'
Buffer updated: 'FI7'
Buffer updated: 'FI7Z'
Buffer updated: 'FI7ZM'
Buffer updated: 'FI7ZMA'
Buffer updated: 'FI7ZMAY'
Buffer updated: 'FI7ZMAYL'
Sending to LLM: 'FI7ZMAYL' (Length: 8)
Raw LLM response: 'FIZZY'
Cleaned output: 'FIZZY'

--- Awaiting new input ---

Buffer updated: 'F'
Buffer updated: 'FI'
Buffer updated: 'FIM'
Buffer updated: 'FIMA'
Buffer updated: 'FIMAL'
Sending to LLM: 'FIMAL' (Length: 5)
Raw LLM response: 'FIMAL'
Cleaned output: 'FIMAL'

--- Awaiting new input ---

Buffer updated: 'G'
Buffer updated: 'GZ'
Buffer updated: 'GZO'
Buffer updated: 'GZOS'
Buffer updated: 'GZOSY'
Sending to LLM: 'GZOSY' (Length: 5)
Raw LLM response: 'Glossy'
Cleaned output: 'GLOSSY'

--- Awaiting new input ---

