In [None]:
# Implementation 1:

import cv2
import mediapipe as mp
import pyautogui
import time
import numpy as np
import torch
from PIL import Image  
from transformers import ViTFeatureExtractor, ViTForImageClassification

# Capture source: webcam at device index 0.
cam = cv2.VideoCapture(0)

# Face-mesh tracker. The tracking loop reads landmark indices 474-477, so
# refine_landmarks stays enabled (presumably those are the refined iris points
# -- confirm against the MediaPipe landmark map).
face_mesh = mp.solutions.face_mesh.FaceMesh(refine_landmarks=True)

# Screen resolution, used to scale normalized landmark coordinates to pixels.
screen_w, screen_h = pyautogui.size()

# ViT classifier and its matching pre-processor, both loaded from one checkpoint.
VIT_CHECKPOINT = 'google/vit-base-patch16-224'
feature_extractor = ViTFeatureExtractor.from_pretrained(VIT_CHECKPOINT)
model = ViTForImageClassification.from_pretrained(VIT_CHECKPOINT)

# Function to detect objects using ViT:
def detect_objects(frame, min_confidence=0.5):
    """Classify an image with the ViT model and annotate it in place.

    Relies on the module-level ``feature_extractor`` and ``model`` objects.

    Args:
        frame: Image array in OpenCV's BGR channel order (it is converted with
            ``cv2.COLOR_BGR2RGB`` before classification). The array is
            modified in place: a green border is drawn around the whole image
            and a "<label> (<confidence>)" caption is written at the top-left.
        min_confidence: The top-1 softmax probability must be strictly greater
            than this value for the label to be returned. Defaults to 0.5.

    Returns:
        A ``(frame, label)`` tuple. ``frame`` is the same (now annotated)
        array that was passed in. ``label`` is the first comma-separated term
        of the predicted class name, or ``None`` when the confidence does not
        exceed ``min_confidence``.
    """
    rgb_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    # Preprocess the image into the tensor format the model expects.
    inputs = feature_extractor(images=rgb_image, return_tensors="pt")

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    predicted_class_idx = logits.argmax(-1).item()
    predicted_class = model.config.id2label[predicted_class_idx]
    confidence = torch.softmax(logits, dim=1)[0, predicted_class_idx].item()

    # Class names may list several comma-separated synonyms; keep the first.
    first_term = predicted_class.split(',')[0].strip()

    # The prediction is logged and drawn regardless of the threshold; only the
    # returned label is gated by min_confidence.
    print(f"Detected {first_term} with confidence {confidence}")

    frame_height, frame_width = frame.shape[0], frame.shape[1]
    cv2.rectangle(frame, (0, 0), (frame_width, frame_height), (0, 255, 0), 2)
    cv2.putText(frame, f"{first_term} ({confidence:.2f})", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

    label = first_term if confidence > min_confidence else None
    return frame, label

# Main loop: move the cursor from face-mesh landmarks, click when the two
# eyelid landmarks come close together, then classify the screen region
# around the cursor with detect_objects().
detection_done = False
detected_object = None

# Variables to track accuracy
total_samples = 0
correct_predictions = 0

# try/finally guarantees the webcam and the OpenCV windows are released even
# when the loop raises or the kernel is interrupted.
try:
    while True:
        ok, frame = cam.read()
        if not ok or frame is None:
            # No frame could be grabbed; stop instead of crashing in cv2.flip().
            print("Failed to read a frame from the camera.")
            break

        frame = cv2.flip(frame, 1)  # Mirror horizontally.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        output = face_mesh.process(rgb_frame)
        landmark_points = output.multi_face_landmarks
        frame_h, frame_w, _ = frame.shape

        if landmark_points:
            landmarks = landmark_points[0].landmark

            # Landmarks 474-477 (presumably the iris points of one eye --
            # confirm against the MediaPipe landmark map). Each is drawn; the
            # second one (index 475) drives the cursor.
            for idx, landmark in enumerate(landmarks[474:478]):
                x = int(landmark.x * frame_w)
                y = int(landmark.y * frame_h)
                cv2.circle(frame, (x, y), 3, (0, 255, 0))

                if idx == 1:
                    # Landmark coordinates are scaled by the screen size to
                    # get the cursor target.
                    screen_x = int(screen_w * landmark.x)
                    screen_y = int(screen_h * landmark.y)
                    pyautogui.moveTo(screen_x, screen_y)

            # Landmarks 145 and 159 (presumably lower/upper eyelid -- confirm).
            eyelid_points = [landmarks[145], landmarks[159]]

            for landmark in eyelid_points:
                x = int(landmark.x * frame_w)
                y = int(landmark.y * frame_h)
                cv2.circle(frame, (x, y), 3, (0, 255, 255))

            # A small vertical gap between the two eyelid landmarks triggers a click.
            if (eyelid_points[0].y - eyelid_points[1].y) < 0.004 and not detection_done:
                pyautogui.click()
                time.sleep(1)

                # Capture a square region centred on the cursor target, clipped
                # to the top/left screen edges and shrunk at the bottom/right.
                region_size = 600  # Size of the region to capture.
                region_left = max(0, screen_x - region_size // 2)
                region_top = max(0, screen_y - region_size // 2)
                region_width = min(region_size, screen_w - region_left)
                region_height = min(region_size, screen_h - region_top)
                screenshot = pyautogui.screenshot(
                    region=(region_left, region_top, region_width, region_height)
                )
                screenshot = cv2.cvtColor(np.array(screenshot), cv2.COLOR_RGB2BGR)

                # Keep the annotated screenshot separate from the webcam frame
                # so the preview window keeps showing the camera image when
                # nothing is detected.
                annotated, best_object = detect_objects(screenshot)
                if best_object:
                    detected_object = best_object
                    detection_done = True
                    # Display the detection results for a short period:
                    cv2.imshow('Detected Objects', annotated)
                    cv2.waitKey(10000)  # Display the window for 10 seconds.
                    break  # Exit the while loop after detection.
                else:
                    print("No objects detected.")

            # Calculate accuracy by comparing predicted and actual screen coordinates.
            # NOTE(review): the "actual" position is the cursor position read
            # right after moveTo() was given the same target, so this checks
            # that the cursor move took effect rather than gaze accuracy.
            cursor = pyautogui.position()
            if abs(screen_x - cursor.x) < 20 and abs(screen_y - cursor.y) < 20:
                correct_predictions += 1
            total_samples += 1
            accuracy = (correct_predictions / total_samples) * 100
            print(f"Accuracy: {accuracy:.2f}%")

        cv2.imshow('Object Detection', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cam.release()
    cv2.destroyAllWindows()

if detected_object:
    print(f"Detected object: {detected_object}")


In [3]:
# Controlled Environment:

import cv2
import mediapipe as mp
import pyautogui
import time
import numpy as np

# Initialize face mesh:
cam = cv2.VideoCapture(0)
face_mesh = mp.solutions.face_mesh.FaceMesh(refine_landmarks=True)
screen_w, screen_h = pyautogui.size()

# Variables to track accuracy
total_samples = 0
correct_predictions = 0
accuracy_values = []  # Running (cumulative) accuracy in %, one entry per tracked frame.

# try/finally guarantees the webcam and the OpenCV windows are released even
# when the loop raises or the kernel is interrupted.
try:
    while True:
        ok, frame = cam.read()
        if not ok or frame is None:
            # No frame could be grabbed; stop instead of crashing in cv2.flip().
            print("Failed to read a frame from the camera.")
            break

        frame = cv2.flip(frame, 1)  # Mirror horizontally.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        output = face_mesh.process(rgb_frame)
        landmark_points = output.multi_face_landmarks
        frame_h, frame_w, _ = frame.shape

        if landmark_points:
            landmarks = landmark_points[0].landmark

            # Landmarks 474-477 (presumably the iris points of one eye --
            # confirm against the MediaPipe landmark map). Each is drawn; the
            # second one (index 475) drives the cursor.
            for idx, landmark in enumerate(landmarks[474:478]):
                x = int(landmark.x * frame_w)
                y = int(landmark.y * frame_h)
                cv2.circle(frame, (x, y), 3, (0, 255, 0))

                if idx == 1:
                    screen_x = int(screen_w * landmark.x)
                    screen_y = int(screen_h * landmark.y)
                    pyautogui.moveTo(screen_x, screen_y)

            # Landmarks 145 and 159 (presumably lower/upper eyelid -- confirm).
            eyelid_points = [landmarks[145], landmarks[159]]

            for landmark in eyelid_points:
                x = int(landmark.x * frame_w)
                y = int(landmark.y * frame_h)
                cv2.circle(frame, (x, y), 3, (0, 255, 255))

            # Calculate accuracy by comparing predicted and actual screen coordinates.
            # NOTE(review): the "actual" position is the cursor position read
            # right after moveTo() was given the same target, so this checks
            # that the cursor move took effect rather than gaze accuracy.
            cursor = pyautogui.position()
            if abs(screen_x - cursor.x) < 20 and abs(screen_y - cursor.y) < 20:
                correct_predictions += 1
            total_samples += 1
            accuracy = (correct_predictions / total_samples) * 100
            accuracy_values.append(accuracy)

        cv2.imshow('Gaze Detection Accuracy', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cam.release()
    cv2.destroyAllWindows()

# Calculate and display final results.
# The reported average/min/max are taken over the running cumulative accuracy
# values, not over independent per-frame hits.
if accuracy_values:
    average_accuracy = sum(accuracy_values) / len(accuracy_values)
    min_accuracy = min(accuracy_values)
    max_accuracy = max(accuracy_values)
    print(f"Average Accuracy: {average_accuracy:.2f}%")
    print(f"Accuracy Range: {min_accuracy:.2f}% - {max_accuracy:.2f}%")


Average Accuracy: 100.00%
Accuracy Range: 100.00% - 100.00%


In [4]:
# Dim Light:

import cv2
import mediapipe as mp
import pyautogui
import time
import numpy as np

# Initialize face mesh:
cam = cv2.VideoCapture(0)
face_mesh = mp.solutions.face_mesh.FaceMesh(refine_landmarks=True)
screen_w, screen_h = pyautogui.size()

# Variables to track accuracy
total_samples = 0
correct_predictions = 0
accuracy_values = []  # Running (cumulative) accuracy in %, one entry per tracked frame.

# try/finally guarantees the webcam and the OpenCV windows are released even
# when the loop raises or the kernel is interrupted.
try:
    while True:
        ok, frame = cam.read()
        if not ok or frame is None:
            # No frame could be grabbed; stop instead of crashing in cv2.flip().
            print("Failed to read a frame from the camera.")
            break

        frame = cv2.flip(frame, 1)  # Mirror horizontally.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        output = face_mesh.process(rgb_frame)
        landmark_points = output.multi_face_landmarks
        frame_h, frame_w, _ = frame.shape

        if landmark_points:
            landmarks = landmark_points[0].landmark

            # Landmarks 474-477 (presumably the iris points of one eye --
            # confirm against the MediaPipe landmark map). Each is drawn; the
            # second one (index 475) drives the cursor.
            for idx, landmark in enumerate(landmarks[474:478]):
                x = int(landmark.x * frame_w)
                y = int(landmark.y * frame_h)
                cv2.circle(frame, (x, y), 3, (0, 255, 0))

                if idx == 1:
                    screen_x = int(screen_w * landmark.x)
                    screen_y = int(screen_h * landmark.y)
                    pyautogui.moveTo(screen_x, screen_y)

            # Landmarks 145 and 159 (presumably lower/upper eyelid -- confirm).
            eyelid_points = [landmarks[145], landmarks[159]]

            for landmark in eyelid_points:
                x = int(landmark.x * frame_w)
                y = int(landmark.y * frame_h)
                cv2.circle(frame, (x, y), 3, (0, 255, 255))

            # Calculate accuracy by comparing predicted and actual screen coordinates.
            # NOTE(review): the "actual" position is the cursor position read
            # right after moveTo() was given the same target, so this checks
            # that the cursor move took effect rather than gaze accuracy.
            cursor = pyautogui.position()
            if abs(screen_x - cursor.x) < 20 and abs(screen_y - cursor.y) < 20:
                correct_predictions += 1
            total_samples += 1
            accuracy = (correct_predictions / total_samples) * 100
            accuracy_values.append(accuracy)

        cv2.imshow('Gaze Detection Accuracy', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cam.release()
    cv2.destroyAllWindows()

# Calculate and display final results.
# The reported average/min/max are taken over the running cumulative accuracy
# values, not over independent per-frame hits.
if accuracy_values:
    average_accuracy = sum(accuracy_values) / len(accuracy_values)
    min_accuracy = min(accuracy_values)
    max_accuracy = max(accuracy_values)
    print(f"Average Accuracy: {average_accuracy:.2f}%")
    print(f"Accuracy Range: {min_accuracy:.2f}% - {max_accuracy:.2f}%")


Average Accuracy: 100.00%
Accuracy Range: 100.00% - 100.00%


In [5]:
# Bright Light:

import cv2
import mediapipe as mp
import pyautogui
import time
import numpy as np

# Initialize face mesh:
cam = cv2.VideoCapture(0)
face_mesh = mp.solutions.face_mesh.FaceMesh(refine_landmarks=True)
screen_w, screen_h = pyautogui.size()

# Variables to track accuracy
total_samples = 0
correct_predictions = 0
accuracy_values = []  # Running (cumulative) accuracy in %, one entry per tracked frame.

# try/finally guarantees the webcam and the OpenCV windows are released even
# when the loop raises or the kernel is interrupted.
try:
    while True:
        ok, frame = cam.read()
        if not ok or frame is None:
            # No frame could be grabbed; stop instead of crashing in cv2.flip().
            print("Failed to read a frame from the camera.")
            break

        frame = cv2.flip(frame, 1)  # Mirror horizontally.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        output = face_mesh.process(rgb_frame)
        landmark_points = output.multi_face_landmarks
        frame_h, frame_w, _ = frame.shape

        if landmark_points:
            landmarks = landmark_points[0].landmark

            # Landmarks 474-477 (presumably the iris points of one eye --
            # confirm against the MediaPipe landmark map). Each is drawn; the
            # second one (index 475) drives the cursor.
            for idx, landmark in enumerate(landmarks[474:478]):
                x = int(landmark.x * frame_w)
                y = int(landmark.y * frame_h)
                cv2.circle(frame, (x, y), 3, (0, 255, 0))

                if idx == 1:
                    screen_x = int(screen_w * landmark.x)
                    screen_y = int(screen_h * landmark.y)
                    pyautogui.moveTo(screen_x, screen_y)

            # Landmarks 145 and 159 (presumably lower/upper eyelid -- confirm).
            eyelid_points = [landmarks[145], landmarks[159]]

            for landmark in eyelid_points:
                x = int(landmark.x * frame_w)
                y = int(landmark.y * frame_h)
                cv2.circle(frame, (x, y), 3, (0, 255, 255))

            # Calculate accuracy by comparing predicted and actual screen coordinates.
            # NOTE(review): the "actual" position is the cursor position read
            # right after moveTo() was given the same target, so this checks
            # that the cursor move took effect rather than gaze accuracy.
            cursor = pyautogui.position()
            if abs(screen_x - cursor.x) < 20 and abs(screen_y - cursor.y) < 20:
                correct_predictions += 1
            total_samples += 1
            accuracy = (correct_predictions / total_samples) * 100
            accuracy_values.append(accuracy)

        cv2.imshow('Gaze Detection Accuracy', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cam.release()
    cv2.destroyAllWindows()

# Calculate and display final results.
# The reported average/min/max are taken over the running cumulative accuracy
# values, not over independent per-frame hits.
if accuracy_values:
    average_accuracy = sum(accuracy_values) / len(accuracy_values)
    min_accuracy = min(accuracy_values)
    max_accuracy = max(accuracy_values)
    print(f"Average Accuracy: {average_accuracy:.2f}%")
    print(f"Accuracy Range: {min_accuracy:.2f}% - {max_accuracy:.2f}%")


Average Accuracy: 99.35%
Accuracy Range: 98.15% - 100.00%


In [6]:
# Artificial Light:

import cv2
import mediapipe as mp
import pyautogui
import time
import numpy as np

# Initialize face mesh:
cam = cv2.VideoCapture(0)
face_mesh = mp.solutions.face_mesh.FaceMesh(refine_landmarks=True)
screen_w, screen_h = pyautogui.size()

# Variables to track accuracy
total_samples = 0
correct_predictions = 0
accuracy_values = []  # Running (cumulative) accuracy in %, one entry per tracked frame.

# try/finally guarantees the webcam and the OpenCV windows are released even
# when the loop raises or the kernel is interrupted.
try:
    while True:
        ok, frame = cam.read()
        if not ok or frame is None:
            # No frame could be grabbed; stop instead of crashing in cv2.flip().
            print("Failed to read a frame from the camera.")
            break

        frame = cv2.flip(frame, 1)  # Mirror horizontally.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        output = face_mesh.process(rgb_frame)
        landmark_points = output.multi_face_landmarks
        frame_h, frame_w, _ = frame.shape

        if landmark_points:
            landmarks = landmark_points[0].landmark

            # Landmarks 474-477 (presumably the iris points of one eye --
            # confirm against the MediaPipe landmark map). Each is drawn; the
            # second one (index 475) drives the cursor.
            for idx, landmark in enumerate(landmarks[474:478]):
                x = int(landmark.x * frame_w)
                y = int(landmark.y * frame_h)
                cv2.circle(frame, (x, y), 3, (0, 255, 0))

                if idx == 1:
                    screen_x = int(screen_w * landmark.x)
                    screen_y = int(screen_h * landmark.y)
                    pyautogui.moveTo(screen_x, screen_y)

            # Landmarks 145 and 159 (presumably lower/upper eyelid -- confirm).
            eyelid_points = [landmarks[145], landmarks[159]]

            for landmark in eyelid_points:
                x = int(landmark.x * frame_w)
                y = int(landmark.y * frame_h)
                cv2.circle(frame, (x, y), 3, (0, 255, 255))

            # Calculate accuracy by comparing predicted and actual screen coordinates.
            # NOTE(review): the "actual" position is the cursor position read
            # right after moveTo() was given the same target, so this checks
            # that the cursor move took effect rather than gaze accuracy.
            cursor = pyautogui.position()
            if abs(screen_x - cursor.x) < 20 and abs(screen_y - cursor.y) < 20:
                correct_predictions += 1
            total_samples += 1
            accuracy = (correct_predictions / total_samples) * 100
            accuracy_values.append(accuracy)

        cv2.imshow('Gaze Detection Accuracy', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cam.release()
    cv2.destroyAllWindows()

# Calculate and display final results.
# The reported average/min/max are taken over the running cumulative accuracy
# values, not over independent per-frame hits.
if accuracy_values:
    average_accuracy = sum(accuracy_values) / len(accuracy_values)
    min_accuracy = min(accuracy_values)
    max_accuracy = max(accuracy_values)
    print(f"Average Accuracy: {average_accuracy:.2f}%")
    print(f"Accuracy Range: {min_accuracy:.2f}% - {max_accuracy:.2f}%")


Average Accuracy: 99.20%
Accuracy Range: 95.48% - 100.00%


In [7]:
# With Glasses:

import cv2
import mediapipe as mp
import pyautogui
import time
import numpy as np

# Initialize face mesh:
cam = cv2.VideoCapture(0)
face_mesh = mp.solutions.face_mesh.FaceMesh(refine_landmarks=True)
screen_w, screen_h = pyautogui.size()

# Variables to track accuracy
total_samples = 0
correct_predictions = 0
accuracy_values = []  # Running (cumulative) accuracy in %, one entry per tracked frame.

# try/finally guarantees the webcam and the OpenCV windows are released even
# when the loop raises or the kernel is interrupted.
try:
    while True:
        ok, frame = cam.read()
        if not ok or frame is None:
            # No frame could be grabbed; stop instead of crashing in cv2.flip().
            print("Failed to read a frame from the camera.")
            break

        frame = cv2.flip(frame, 1)  # Mirror horizontally.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        output = face_mesh.process(rgb_frame)
        landmark_points = output.multi_face_landmarks
        frame_h, frame_w, _ = frame.shape

        if landmark_points:
            landmarks = landmark_points[0].landmark

            # Landmarks 474-477 (presumably the iris points of one eye --
            # confirm against the MediaPipe landmark map). Each is drawn; the
            # second one (index 475) drives the cursor.
            for idx, landmark in enumerate(landmarks[474:478]):
                x = int(landmark.x * frame_w)
                y = int(landmark.y * frame_h)
                cv2.circle(frame, (x, y), 3, (0, 255, 0))

                if idx == 1:
                    screen_x = int(screen_w * landmark.x)
                    screen_y = int(screen_h * landmark.y)
                    pyautogui.moveTo(screen_x, screen_y)

            # Landmarks 145 and 159 (presumably lower/upper eyelid -- confirm).
            eyelid_points = [landmarks[145], landmarks[159]]

            for landmark in eyelid_points:
                x = int(landmark.x * frame_w)
                y = int(landmark.y * frame_h)
                cv2.circle(frame, (x, y), 3, (0, 255, 255))

            # Calculate accuracy by comparing predicted and actual screen coordinates.
            # NOTE(review): the "actual" position is the cursor position read
            # right after moveTo() was given the same target, so this checks
            # that the cursor move took effect rather than gaze accuracy.
            cursor = pyautogui.position()
            if abs(screen_x - cursor.x) < 20 and abs(screen_y - cursor.y) < 20:
                correct_predictions += 1
            total_samples += 1
            accuracy = (correct_predictions / total_samples) * 100
            accuracy_values.append(accuracy)

        cv2.imshow('Gaze Detection Accuracy', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cam.release()
    cv2.destroyAllWindows()

# Calculate and display final results.
# The reported average/min/max are taken over the running cumulative accuracy
# values, not over independent per-frame hits.
if accuracy_values:
    average_accuracy = sum(accuracy_values) / len(accuracy_values)
    min_accuracy = min(accuracy_values)
    max_accuracy = max(accuracy_values)
    print(f"Average Accuracy: {average_accuracy:.2f}%")
    print(f"Accuracy Range: {min_accuracy:.2f}% - {max_accuracy:.2f}%")


Average Accuracy: 100.00%
Accuracy Range: 100.00% - 100.00%
