In [1]:
import cv2
import mediapipe as mp
import pyautogui
import time
import numpy as np
from ultralytics import YOLO

# Load the YOLOv8 nano detector exactly once. Ultralytics downloads the
# weights automatically if "yolov8n.pt" is not present locally.
model = YOLO("yolov8n.pt")

# Open the default webcam (device index 0).
cam = cv2.VideoCapture(0)

# Face-mesh tracker. refine_landmarks=True is required because the tracking
# loop reads landmark indices 474-477, which only exist in the refined output.
face_mesh = mp.solutions.face_mesh.FaceMesh(refine_landmarks=True)

# Screen resolution in pixels, used to map normalised landmark coordinates
# onto cursor positions.
screen_w, screen_h = pyautogui.size()

def detect_objects(frame):
    """Run YOLO on ``frame`` and annotate the single most confident detection.

    The best detection's bounding box and class label are drawn onto ``frame``
    in place, its coordinates are printed, and the un-annotated region inside
    the box is shown in a "Cropped Detected Object" window.

    Args:
        frame: BGR image array to analyse; modified in place when a detection
            is found.

    Returns:
        ``(frame, label)`` where ``label`` is the class name of the most
        confident detection, or ``(frame, None)`` when nothing was detected.
    """
    results = model(frame)

    # Keep only the highest-confidence detection. The strict ">" keeps the
    # first one on ties, matching np.argmax semantics.
    best_box = None
    best_conf = None
    best_cls = None
    for result in results:
        for obj in result.boxes:
            conf = obj.conf.item()
            if best_conf is None or conf > best_conf:
                best_conf = conf
                best_cls = obj.cls.item()
                best_box = obj.xyxy.cpu().numpy().astype(int)[0]

    if best_box is None:
        return frame, None

    # Convert to plain Python ints: some OpenCV builds reject NumPy scalar
    # types for point arguments.
    x, y, x2, y2 = (int(v) for v in best_box)
    label = model.names[int(best_cls)]
    confidence = best_conf

    print(f"Detected {label} with confidence {confidence} at [{x}, {y}, {x2-x}, {y2-y}]")
    print(f"Bounding box coordinates: [{x}, {y}, {x2}, {y2}]")  # Print bounding box coordinates

    # Crop BEFORE drawing so the crop does not contain the green box outline.
    # .copy() is needed because slicing returns a view of ``frame``.
    cropped_image = frame[y:y2, x:x2].copy()

    cv2.rectangle(frame, (x, y), (x2, y2), (0, 255, 0), 2)
    # Keep the label inside the image when the box touches the top edge.
    label_y = max(y - 10, 15)
    cv2.putText(frame, label, (x, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    # cv2.imshow raises on an empty image, so skip degenerate (zero-area) boxes.
    if cropped_image.size > 0:
        cv2.imshow("Cropped Detected Object", cropped_image)  # Display the cropped image

    return frame, label

# Tunables for the gaze -> click -> detect loop.
BLINK_THRESHOLD = 0.004     # max normalised vertical gap between landmarks 145/159 treated as a blink
REGION_SIZE = 900           # side length (px) of the screen region captured around the cursor
RESULT_DISPLAY_MS = 10000   # how long the detection result window is shown (10 seconds)

detection_done = False
detected_object = None

try:
    while True:
        ok, frame = cam.read()
        if not ok or frame is None:
            # Camera unplugged / busy / failed to open: stop instead of crashing in cv2.flip.
            print("Failed to read a frame from the camera.")
            break

        # Mirror the image so that cursor movement matches the user's head movement.
        frame = cv2.flip(frame, 1)
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        output = face_mesh.process(rgb_frame)
        landmark_points = output.multi_face_landmarks
        frame_h, frame_w, _ = frame.shape

        if landmark_points:
            landmarks = landmark_points[0].landmark

            # Landmarks 474-477: draw all four, and drive the mouse cursor
            # from the second one (idx == 1).
            for idx, landmark in enumerate(landmarks[474:478]):
                x = int(landmark.x * frame_w)
                y = int(landmark.y * frame_h)
                cv2.circle(frame, (x, y), 3, (0, 255, 0))

                if idx == 1:
                    screen_x = int(screen_w * landmark.x)
                    screen_y = int(screen_h * landmark.y)
                    pyautogui.moveTo(screen_x, screen_y)

            # Landmarks 145 and 159: their vertical gap is used as the blink signal.
            # NOTE(review): presumably the lower/upper lid of one eye - confirm
            # against the MediaPipe landmark map.
            eyelids = [landmarks[145], landmarks[159]]

            for landmark in eyelids:
                x = int(landmark.x * frame_w)
                y = int(landmark.y * frame_h)
                cv2.circle(frame, (x, y), 3, (0, 255, 255))

            if (eyelids[0].y - eyelids[1].y) < BLINK_THRESHOLD and not detection_done:
                pyautogui.click()
                time.sleep(1)  # give the screen a moment to react to the click

                # Capture a region centred on the click location, clamped to
                # the screen so width/height stay positive even if the
                # landmark projected slightly off-screen.
                half = REGION_SIZE // 2
                left = max(0, min(screen_x - half, screen_w - 1))
                top = max(0, min(screen_y - half, screen_h - 1))
                width = min(REGION_SIZE, screen_w - left)
                height = min(REGION_SIZE, screen_h - top)
                print(f"Capturing region: left = {left}, top = {top}, width = {width}, height = {height}")
                screenshot = pyautogui.screenshot(region = (left, top, width, height))
                screenshot = cv2.cvtColor(np.array(screenshot), cv2.COLOR_RGB2BGR)

                print("Running object detection on the captured region...")
                # Use a separate name so the webcam preview frame is not
                # replaced by the screenshot when nothing is detected.
                annotated, best_object = detect_objects(screenshot)
                if best_object:
                    print("Best object detected:", best_object)
                    detected_object = best_object
                    detection_done = True
                    # Show the annotated screenshot, then stop tracking.
                    cv2.imshow('Detected Objects', annotated)
                    cv2.waitKey(RESULT_DISPLAY_MS)
                    break  # Exit the while loop after detection
                else:
                    print("No objects detected.")

        cv2.imshow('Object Detection', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close windows, even if an exception
    # (e.g. a pyautogui fail-safe) interrupts the loop.
    cam.release()
    cv2.destroyAllWindows()

if detected_object:
    print(f"Detected object: {detected_object}")




Capturing region: left = 770, top = 169, width = 600, height = 600
Running object detection on the captured region...

0: 640x640 13 bananas, 155.2ms
Speed: 4.0ms preprocess, 155.2ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)
Detected banana with confidence 0.7076525092124939 at [472, 174, 127, 153]
Bounding box coordinates: [472, 174, 599, 327]
Best object detected: banana
Detected object: banana


In [3]:
# Inspect the label stored by the detection loop (None if the loop ended without a detection).
detected_object

'banana'