In [1]:
import cv2
import numpy as np
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image
import time
import threading
import queue

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# The model weights, the image preprocessor and the tokenizer are all loaded
# from the same Hugging Face checkpoint, so its identifier is written once.
CAPTION_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"

model = VisionEncoderDecoderModel.from_pretrained(CAPTION_CHECKPOINT)
feature_extractor = ViTImageProcessor.from_pretrained(CAPTION_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(CAPTION_CHECKPOINT)



In [3]:
# Use CUDA when PyTorch reports it as available; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
# Left as the cell's final expression, so the notebook echoes the model repr.
model.to(device)

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_featur

In [4]:
# Decoding settings; `gen_kwargs` is unpacked into `model.generate`.
max_length, num_beams = 16, 4
gen_kwargs = dict(max_length=max_length, num_beams=num_beams)


def predict_step(image):
    """Generate a caption for a single image.

    The image is converted to RGB when its mode differs, preprocessed with the
    module-level ``feature_extractor``, moved to ``device`` and passed to
    ``model.generate`` together with ``gen_kwargs``.

    Args:
        image: An object exposing ``mode`` and ``convert`` (the caller in this
            notebook passes a ``PIL.Image``).

    Returns:
        The first decoded caption with surrounding whitespace removed, or
        ``None`` when the tokenizer returns no decoded text.
    """
    rgb_image = image if image.mode == "RGB" else image.convert(mode="RGB")

    encoded = feature_extractor(images=[rgb_image], return_tensors="pt")
    batch = encoded.pixel_values.to(device)

    generated_ids = model.generate(batch, **gen_kwargs)
    captions = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    if not captions:
        return None
    return captions[0].strip()


def _publish_latest(caption_queue, caption):
    """Put ``caption`` on ``caption_queue`` without blocking, evicting older entries if full."""
    while True:
        try:
            caption_queue.put_nowait(caption)
            return
        except queue.Full:
            # Drop the oldest unread caption to make room for the newer one.
            try:
                caption_queue.get_nowait()
            except queue.Empty:
                pass  # the consumer emptied it in the meantime; just retry


def caption_thread(frame_queue, caption_queue, stop_event):
    """Worker loop: caption frames from ``frame_queue`` into ``caption_queue``.

    Runs until ``stop_event`` is set or a ``None`` sentinel is read from
    ``frame_queue``. Each frame is converted from BGR to RGB, wrapped in a PIL
    image and captioned with ``predict_step``.

    Args:
        frame_queue: ``queue.Queue`` yielding BGR frames, or ``None`` to stop.
        caption_queue: ``queue.Queue`` receiving the generated captions.
        stop_event: ``threading.Event`` checked before every poll.

    The caption is published without blocking. A blocking ``put`` on a full,
    bounded ``caption_queue`` would park the worker where it cannot observe
    ``stop_event``, which can hang shutdown, and would keep a stale caption
    ahead of the fresh one.
    """
    while not stop_event.is_set():
        try:
            # Bounded wait so the stop event is re-checked at least once a second.
            frame = frame_queue.get(timeout=1)
        except queue.Empty:
            continue

        try:
            if frame is None:
                break

            pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            caption = predict_step(pil_image)
            _publish_latest(caption_queue, caption)
        finally:
            # Every successful get() is acknowledged, including the sentinel and
            # frames whose captioning raised, so frame_queue.join() cannot hang.
            frame_queue.task_done()

In [5]:
# Live webcam captioning: the main loop grabs and displays frames while a
# background thread (`caption_thread`) produces captions for them.

# Open webcam
cap = cv2.VideoCapture(0)

frame_queue = queue.Queue(maxsize=1)  # Keep only one frame in the queue
caption_queue = queue.Queue(maxsize=1)  # Keep only one caption in the queue

# Create a stop event for the thread
stop_event = threading.Event()
# Start the captioning thread
thread = threading.Thread(
    target=caption_thread, args=(frame_queue, caption_queue, stop_event), daemon=True
)
thread.start()

last_caption_time = time.time()
caption = ""

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Hand the worker its own copy: `frame` is drawn on in place below, and
        # sharing the array would let the caption overlay leak into the model
        # input (and mutate the array while the worker is reading it).
        if frame_queue.empty():  # Only add a new frame if the queue is empty
            frame_queue.put(frame.copy())

        # Update caption every 0.5 seconds
        current_time = time.time()
        if current_time - last_caption_time >= 0.5:
            last_caption_time = current_time

            # Take the latest caption if one is ready; never block the UI loop.
            try:
                caption = caption_queue.get_nowait()
            except queue.Empty:
                pass

        # Overlay translucent green background
        if caption:
            (w, h), _ = cv2.getTextSize(caption, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 1)
            x, y = 10, frame.shape[0] - 10

            # Create a translucent background for text
            overlay = frame.copy()
            cv2.rectangle(
                overlay, (x, y - h - 10), (x + w + 10, y + 10), (0, 255, 0), -1
            )
            # Blend 50/50 and write the result back into `frame`.
            cv2.addWeighted(overlay, 0.5, frame, 0.5, 0, frame)

            # Overlay the text
            cv2.putText(
                frame,
                caption,
                (x + 5, y - 5),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.6,
                (0, 0, 0),
                1,
                cv2.LINE_AA,
            )

        # Display frame with caption
        cv2.imshow("Webcam Captioning", frame)

        # Exit on 'q' key press
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

finally:
    # Release resources
    cap.release()
    cv2.destroyAllWindows()
    # Signal the caption thread to exit
    stop_event.set()
    # The worker may be parked in a blocking put on the bounded caption queue;
    # empty it so the worker can move on and notice the stop event.
    while True:
        try:
            caption_queue.get_nowait()
        except queue.Empty:
            break
    # The sentinel only shortens the wait (the worker polls `stop_event` at
    # least once per second), so never block here when the queue is full.
    try:
        frame_queue.put_nowait(None)
    except queue.Full:
        pass
    # Bounded join: the thread is a daemon, so a worker still busy with a slow
    # inference must not freeze the notebook cell indefinitely.
    thread.join(timeout=5)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
