In [1]:
import cv2
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch

def initialize_captioning_model(model_name="Salesforce/blip-image-captioning-base"):
    """Load the BLIP processor and captioning model from one checkpoint.

    Args:
        model_name: Checkpoint identifier passed to both ``from_pretrained``
            calls. Defaults to the checkpoint this notebook was written
            against, so calling the function with no arguments behaves as
            before.

    Returns:
        A ``(processor, model)`` tuple: the ``BlipProcessor`` and the
        ``BlipForConditionalGeneration`` instance loaded from ``model_name``.
    """
    # Use a single identifier for both objects so the processor and the model
    # can never be loaded from mismatched checkpoints.
    processor = BlipProcessor.from_pretrained(model_name)
    model = BlipForConditionalGeneration.from_pretrained(model_name)
    return processor, model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def generate_scene_description(frame, processor, model):
    """Produce a text caption for a single video frame.

    Args:
        frame: Image array in BGR channel order (it is converted with
            ``cv2.COLOR_BGR2RGB`` before captioning).
        processor: Object used to build the model inputs from the image and
            to decode the generated token ids back to text.
        model: Captioning model exposing ``generate``.

    Returns:
        The decoded caption for the first generated sequence, with special
        tokens stripped.
    """
    # PIL expects RGB ordering, so swap channels before wrapping the array.
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(rgb_frame)

    model_inputs = processor(images=pil_image, return_tensors="pt")
    generated_ids = model.generate(**model_inputs)

    # Only the first generated sequence is decoded.
    return processor.decode(generated_ids[0], skip_special_tokens=True)


In [3]:
# Load the processor and model once at notebook level. start_streaming()
# reads these two names as globals, so this cell must run before it is called.
processor, model = initialize_captioning_model()

In [4]:
def start_streaming(camera_index=0):
    """Stream a camera feed and overlay a generated caption on every frame.

    Each captured frame is captioned with ``generate_scene_description`` and
    the caption is drawn onto the frame before it is shown in an OpenCV
    window. The loop ends when a frame cannot be read or when 'q' is pressed.

    This function reads the notebook-level globals ``processor`` and
    ``model``; they must be defined before it is called.

    Args:
        camera_index: Device index passed to ``cv2.VideoCapture``. Defaults to
            0, the value the notebook originally hard-coded.
    """
    window_name = "Scene Description Stream"
    font = cv2.FONT_HERSHEY_SIMPLEX  # Loop-invariant, so set once up front

    cap = cv2.VideoCapture(camera_index)
    # Everything after acquiring the capture handle runs inside the try so the
    # camera is released even if window creation or captioning raises.
    try:
        if not cap.isOpened():
            print("Failed to open camera. Exiting...")
            return

        cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)
        print("In streaming function")

        while True:
            ret, frame = cap.read()
            if not ret:
                print("Failed to capture frame. Exiting...")
                break

            # Generate scene description (uses the global processor/model)
            description = generate_scene_description(frame, processor, model)

            # Overlay the description on the frame
            cv2.putText(frame, description, (10, 50), font, 1, (255, 0, 255), 2, cv2.LINE_AA)

            # Display the frame
            cv2.imshow(window_name, frame)

            # Exit on 'q' key press
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

    finally:
        cap.release()
        cv2.destroyAllWindows()

In [5]:
# Announce the start, then run the capture loop. start_streaming() only
# returns once a frame read fails or 'q' is pressed in the video window.
print("Starting!")
start_streaming()

Starting!
In streaming function


