# In [9]:
import cv2
import numpy as np
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import torch
import time

# Load the CLIP model and its processor. Both must be loaded from the same
# checkpoint, so the identifier is defined once and shared between them.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

# Initialize webcam
CAMERA_INDEX = 0  # 0 is the default webcam
cap = cv2.VideoCapture(CAMERA_INDEX)
if not cap.isOpened():
    # RuntimeError is still caught by any `except Exception`, but tells the
    # reader this is an environment failure rather than a programming error.
    raise RuntimeError("Error: Could not open webcam.")

# Fixed pool of scene descriptions that generate_caption() ranks against each
# frame. The order matters: the returned caption is looked up by list index.
candidate_captions = [
    # People
    "Person holding an object.",
    "Person performing an action.",
    "Person standing or sitting.",
    "Multiple people in the scene.",
    # Objects / empty scene
    "Object on a surface.",
    "No person or object detected.",
    # Other subjects
    "Animal in the scene.",
    "Vehicle or machinery present.",
    "Person using a device.",
    "Cluttered scene with multiple objects.",
]

def generate_caption(frame, captions=None):
    """Return the caption that CLIP scores highest for a single frame.

    Args:
        frame: Image array in OpenCV's BGR channel order (it is converted
            to RGB before being handed to the processor).
        captions: Optional list of caption strings to rank. When omitted,
            the module-level ``candidate_captions`` list is used.

    Returns:
        The caption string whose image-text logit is the largest.

    Raises:
        ValueError: If the caption list is empty.
    """
    if captions is None:
        captions = candidate_captions
    captions = list(captions)
    if not captions:
        raise ValueError("captions must contain at least one entry.")

    # Convert OpenCV frame (BGR) to PIL image (RGB)
    pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    # Preprocess image and captions
    inputs = processor(
        text=captions,
        images=pil_image,
        return_tensors="pt",
        padding=True,
    )
    # Keep the inputs on the same device as the model so this keeps working
    # if the model is moved off the CPU.
    inputs = inputs.to(model.device)

    # Inference only; no gradients needed
    with torch.no_grad():
        outputs = model(**inputs)

    # Only the winning index is needed. Softmax is monotonic, so taking the
    # argmax of the raw logits for the (single) image selects the same
    # caption without the extra softmax and NumPy round-trip.
    best_caption_idx = int(outputs.logits_per_image[0].argmax())
    return captions[best_caption_idx]

# Main loop for real-time processing: read a frame, refresh the caption on
# every `frame_skip`-th frame, draw the current caption, and display it.
frame_skip = 5  # Process every 5th frame to improve performance
frame_count = 0
caption = "Initializing..."  # Shown until the first captioned frame
frame_delay_ms = 50  # Per-frame pause that caps the display rate

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            print("Error: Failed to capture frame.")
            break

        frame_count += 1

        # Process every 'frame_skip' frame to reduce computational load;
        # the previous caption stays on screen in between.
        if frame_count % frame_skip == 0:
            caption = generate_caption(frame)

        # Overlay caption on the frame
        cv2.putText(
            frame,
            caption,
            (10, 30),  # Position at top-left
            cv2.FONT_HERSHEY_SIMPLEX,
            1,
            (0, 255, 0),  # Green text
            2,
            cv2.LINE_AA,
        )

        # Display the frame
        cv2.imshow("Webcam Feed with Captions", frame)

        # Press 'q' to quit. The frame-rate pause is done inside waitKey
        # rather than with time.sleep: waitKey is what services the window's
        # GUI events, so sleeping outside it leaves the window unresponsive
        # for the whole pause.
        if cv2.waitKey(frame_delay_ms) & 0xFF == ord('q'):
            break

except KeyboardInterrupt:
    print("Stopped by user.")

finally:
    # Clean up: always free the camera and close the display window, whether
    # the loop ended normally, on error, or via Ctrl+C.
    cap.release()
    cv2.destroyAllWindows()