In [11]:
from openai import OpenAI
import cv2
import mediapipe as mp
import pyautogui
import time
import numpy as np
import torch
from PIL import Image  
from transformers import ViTFeatureExtractor, ViTForImageClassification

##### Phase 1: Choose the action word

In [12]:
# Action word for the sentence; it is passed to classify_comment() in Phase 3
# as the first of the two words inserted into the user prompt.
prompt_phase_1 = "open"

##### Phase 2: Move the cursor by gaze, click by blinking, and classify the clicked screen region

In [13]:
# Webcam stream (device 0) and the MediaPipe face mesh, with refined landmarks requested:
cam = cv2.VideoCapture(0)
face_mesh = mp.solutions.face_mesh.FaceMesh(refine_landmarks=True)

# Screen dimensions, used below to scale landmark coordinates to screen pixels:
screen_w, screen_h = pyautogui.size()

# Pretrained ViT classifier and its preprocessor, both loaded from the same checkpoint:
vit_checkpoint = 'google/vit-base-patch16-224'
feature_extractor = ViTFeatureExtractor.from_pretrained(vit_checkpoint)
model = ViTForImageClassification.from_pretrained(vit_checkpoint)

# Function to detect objects using ViT:
def detect_objects(frame, min_confidence=0.5):
    """Classify an image with the ViT model and annotate it with the top label.

    Despite the name, this runs whole-image classification: a single label is
    predicted for the entire frame, and the "bounding box" drawn is simply the
    frame border.

    Args:
        frame: Image array in OpenCV BGR channel order (it is converted with
            ``cv2.COLOR_BGR2RGB`` before preprocessing). The array is drawn on
            in place.
        min_confidence: The label is only returned when the softmax
            probability of the top class is strictly greater than this value.
            Defaults to 0.5.

    Returns:
        A tuple ``(frame, label)`` where ``frame`` is the same array that was
        passed in (now annotated) and ``label`` is the first comma-separated
        term of the predicted class name, or ``None`` when the confidence does
        not exceed ``min_confidence``.
    """
    # The preprocessor is fed a PIL image in RGB order, so convert from BGR first.
    rgb_pixels = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(rgb_pixels)

    inputs = feature_extractor(images=pil_image, return_tensors="pt")

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        logits = model(**inputs).logits

    probabilities = torch.softmax(logits, dim=1)
    predicted_class_idx = logits.argmax(-1).item()
    confidence = probabilities[0, predicted_class_idx].item()

    # Class names may list several comma-separated synonyms; keep only the first.
    predicted_class = model.config.id2label[predicted_class_idx]
    first_term = predicted_class.split(',')[0].strip()

    print(f"Detected {first_term} with confidence {confidence}")

    # Illustrative annotation: a border around the whole frame plus the label text.
    frame_height, frame_width = frame.shape[0], frame.shape[1]
    cv2.rectangle(frame, (0, 0), (frame_width, frame_height), (0, 255, 0), 2)
    cv2.putText(frame, f"{first_term} ({confidence:.2f})", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

    label = first_term if confidence > min_confidence else None
    return frame, label


# Tunable constants for the gaze/blink loop.
BLINK_THRESHOLD = 0.020    # an eyelid gap (difference of landmark y values) below this counts as a blink
CAPTURE_REGION_SIZE = 600  # side length, in pixels, of the screen region captured around the cursor

detection_done = False
detected_object = None

# try/finally guarantees the camera and the OpenCV windows are released even if
# the loop raises (for example on a camera error or a pyautogui exception).
try:
    while True:
        frame_ok, frame = cam.read()
        if not frame_ok:
            # cam.read() signals failure through its first return value; the
            # frame is unusable in that case, so stop instead of crashing in cv2.flip.
            print("Could not read a frame from the camera; stopping.")
            break

        frame = cv2.flip(frame, 1)  # Horizontal flip so the preview behaves like a mirror.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        output = face_mesh.process(rgb_frame)
        landmark_points = output.multi_face_landmarks
        frame_h, frame_w, _ = frame.shape

        if landmark_points:
            landmarks = landmark_points[0].landmark

            # Landmarks 474-477 are drawn on the preview; the second one (index 475)
            # drives the mouse cursor.
            for index, landmark in enumerate(landmarks[474:478]):
                x = int(landmark.x * frame_w)
                y = int(landmark.y * frame_h)
                cv2.circle(frame, (x, y), 3, (0, 255, 0))

                if index == 1:
                    screen_x = int(screen_w * landmark.x)
                    screen_y = int(screen_h * landmark.y)
                    pyautogui.moveTo(screen_x, screen_y)

            # Left eye landmarks
            left_eye = [landmarks[145], landmarks[159]]
            # Right eye landmarks
            right_eye = [landmarks[374], landmarks[386]]

            for landmark in left_eye + right_eye:
                x = int(landmark.x * frame_w)
                y = int(landmark.y * frame_h)
                cv2.circle(frame, (x, y), 3, (0, 255, 255))

            # A blink is registered when the vertical gap between the two
            # landmarks of an eye drops below the threshold.
            left_blink = (left_eye[0].y - left_eye[1].y) < BLINK_THRESHOLD
            right_blink = (right_eye[0].y - right_eye[1].y) < BLINK_THRESHOLD

            if (left_blink or right_blink) and not detection_done:
                pyautogui.click()

                # Capture a square region centred on the cursor, clamped to the screen:
                left = max(0, screen_x - CAPTURE_REGION_SIZE // 2)
                top = max(0, screen_y - CAPTURE_REGION_SIZE // 2)
                width = min(CAPTURE_REGION_SIZE, screen_w - left)
                height = min(CAPTURE_REGION_SIZE, screen_h - top)
                print(f"Capturing region: left = {left}, top = {top}, width = {width}, height = {height}")
                screenshot = pyautogui.screenshot(region=(left, top, width, height))
                screenshot = cv2.cvtColor(np.array(screenshot), cv2.COLOR_RGB2BGR)

                print("Running object detection on the captured region...")
                # Keep the annotated screenshot in its own variable so the camera
                # preview below still shows the camera frame when nothing is detected.
                annotated_region, best_object = detect_objects(screenshot)
                if best_object:
                    print("Best object detected:", best_object)
                    detected_object = best_object
                    detection_done = True
                    # Display the detection results briefly:
                    cv2.imshow('Detected Objects', annotated_region)
                    cv2.waitKey(100)  # Display the window briefly.
                    break  # Exit the while loop after detection.
                else:
                    print("No objects detected.")

        cv2.imshow('Object Detection', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cam.release()
    cv2.destroyAllWindows()

if detected_object:
    print(f"Detected object: {detected_object}")

# Remains None when the loop ended without a confident detection (e.g. 'q' was pressed).
prompt_phase_2 = detected_object


Capturing region: left = 659, top = 293, width = 600, height = 600
Running object detection on the captured region...
Detected window shade with confidence 0.9854831695556641
Best object detected: window shade
Detected object: window shade


##### Phase 3: Generate a sentence from the action word and the detected object

In [14]:
# Never hardcode the API key in the notebook. When no api_key argument is given,
# the OpenAI client reads it from the OPENAI_API_KEY environment variable.
client = OpenAI()

def classify_comment(prompt_phase_1, prompt_phase_2, model="gpt-4o", max_tokens=300, temperature=0.5):
    """Ask the chat model for a sentence that uses the two given words.

    Args:
        prompt_phase_1: First word (the action chosen in Phase 1).
        prompt_phase_2: Second word (the object label produced in Phase 2).
        model: Chat model name sent to the API. Defaults to "gpt-4o".
        max_tokens: Upper bound on the length of the completion. Defaults to 300.
        temperature: Sampling temperature. Defaults to 0.5.

    Returns:
        The content of the first choice's message in the API response.

    Raises:
        ValueError: If either word is not a non-empty string. This happens,
            for example, when Phase 2 ended without a detection and
            ``prompt_phase_2`` is ``None``; sending the literal "None" to the
            model would produce a meaningless sentence.
    """
    # Validate before touching the network so a missing word fails fast and clearly.
    for arg_name, word in (("prompt_phase_1", prompt_phase_1), ("prompt_phase_2", prompt_phase_2)):
        if not isinstance(word, str) or not word.strip():
            raise ValueError(f"{arg_name} must be a non-empty string, got {word!r}")

    messages = [
        # Define the purpose of this model:
        {"role": "system", "content": "Generate a sentence using the given words in the exact order provided. The sentence must: Clearly describe an immediate action, need or observation directly related to the specific object detected in the user’s current environment. Avoid any reference to unrelated actions or contexts that could cause ambiguity. Be concise, direct and focused solely on the present moment."},
        # Prompt given by the user:
        {"role": "user", "content": f"Can you create a sentence using the following two words: {prompt_phase_1} and {prompt_phase_2}"},
    ]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
    )

    return response.choices[0].message.content

# Generate the sentence from the Phase 1 word and the object label found in Phase 2.
provided_response = classify_comment(prompt_phase_1, prompt_phase_2)

##### Output:

In [15]:
# Display the sentence returned by classify_comment().
provided_response

'Open the window shade to let in more light.'