In [1]:
!pip install ultralytics 

Collecting ultralytics
  Downloading ultralytics-8.2.75-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.0-py3-none-any.whl.metadata (8.5 kB)
Downloading ultralytics-8.2.75-py3-none-any.whl (865 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m865.6/865.6 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading ultralytics_thop-2.0.0-py3-none-any.whl (25 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ultralytics-8.2.75 ultralytics-thop-2.0.0


In [2]:
import cv2
import numpy as np
import datetime
from ultralytics import YOLO
import webcolors
from scipy.spatial import KDTree
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
from PIL import Image

import matplotlib.pyplot as plt

In [3]:
# Print the installed webcolors version; the colour table built further down
# reads webcolors.CSS3_HEX_TO_NAMES from this package.
print(webcolors.__version__)

1.13


In [None]:
# Paths of the four test images; the cells below pick them by index 0-3.
image_path = [
    "/kaggle/input/random-image-on-internet/test_image.png",
    "/kaggle/input/client-test-image/test_image_1.jpg",
    "/kaggle/input/client-test-image/test_image_2.jpg",
    "/kaggle/input/client-test-image/test_image_3.jpg",
]

In [None]:
def LoadImage(image_path):
    """Open an image file, show it inline, and return its pixels as an array.

    Args:
        image_path: Anything accepted by ``PIL.Image.open`` (typically a
            file path).

    Returns:
        np.ndarray: The image after conversion to PIL mode ``"RGB"``, i.e.
        the channels are in R, G, B order.
    """
    # Open through a context manager so the file handle is released
    # deterministically; convert() hands back a separate, fully loaded image
    # that stays usable after the file is closed.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    image_to_array = np.array(image)

    # Preview the image without axis ticks.
    plt.imshow(image)
    plt.axis("off")
    plt.show()

    return image_to_array

In [None]:
# BLIP captioning: the processor prepares model inputs and decodes outputs,
# the model (placed on the GPU) generates the caption tokens.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to("cuda")

# CSS3 colour table: colour name -> RGB triple.
css3_db = {
    color: webcolors.hex_to_rgb(code)
    for code, color in webcolors.CSS3_HEX_TO_NAMES.items()
}

# Parallel sequences: names[i] is the colour name for row i of rgb_values.
names = list(css3_db)
rgb_values = np.array(list(css3_db.values()))

# Spatial index over the RGB rows for nearest-neighbour colour lookup.
kd_tree = KDTree(rgb_values)

def get_color_name(rgb_color):
    """Return the CSS3 colour name closest to ``rgb_color``.

    The colour is looked up in the module-level ``kd_tree``; the index of the
    nearest entry is mapped back to its name through ``names``.
    """
    nearest_index = kd_tree.query(rgb_color)[1]
    return names[nearest_index]

def get_primary_color(image, bbox):
    """Name the average colour of a rectangular region of ``image``.

    Args:
        image: Image array indexed as ``image[row, column]``. The channel
            order of the averaged colour is reversed before the name lookup,
            so the array is presumably in BGR (OpenCV) order.
        bbox: ``(x, y, w, h)`` - left column, top row, width and height of
            the region.

    Returns:
        The name ``get_color_name`` gives for the region's average colour.
    """
    left, top, width, height = bbox
    patch = image[top:top + height, left:left + width]

    # Shrink the region to a 10x10 thumbnail, then average over its rows and
    # columns to get one value per channel (truncated to integers).
    thumbnail = cv2.resize(patch, (10, 10), interpolation=cv2.INTER_AREA)
    channel_means = thumbnail.mean(axis=0).mean(axis=0).astype(int)

    # Flip the channel order before querying the RGB colour table.
    return get_color_name(channel_means[::-1].tolist())

def generate_caption(image):
    """Generate an unconditional BLIP caption for a PIL image.

    Args:
        image: PIL image; converted to mode ``"RGB"`` first if it is in any
            other mode.

    Returns:
        str: The decoded caption with special tokens removed and surrounding
        whitespace stripped.
    """
    if image.mode != "RGB":
        image = image.convert(mode="RGB")

    # Unconditional image captioning. Send the inputs to whichever device the
    # model currently lives on rather than a hard-coded "cuda", so inputs and
    # weights can never end up on different devices.
    inputs = processor(image, return_tensors="pt").to(model.device)
    out = model.generate(**inputs)

    return processor.decode(out[0], skip_special_tokens=True).strip()

_YOLO_MODEL_CACHE = {}


def _get_yolo_model(weights='yolov10n.pt'):
    """Return the YOLO detector for ``weights``, constructing it only on first use."""
    if weights not in _YOLO_MODEL_CACHE:
        _YOLO_MODEL_CACHE[weights] = YOLO(weights)
    return _YOLO_MODEL_CACHE[weights]


def generate_vision_record(image: np.ndarray, yolo_model=None) -> dict:
    """Detect objects in a frame and describe each one plus the whole frame.

    Args:
        image: Frame as a height x width x 3 array in BGR channel order; it
            is converted with ``cv2.COLOR_BGR2RGB`` before captioning.
        yolo_model: Optional, already constructed YOLO detector. When omitted,
            the ``'yolov10n.pt'`` weights are loaded once and reused on
            later calls.

    Returns:
        dict with the keys:
            "Time": local timestamp string (``%Y-%m-%d %H:%M:%S.%f``).
            "Objects": class name of every detection.
            "Objects Activities": caption generated for each detection's crop.
            "Object Colors": colour name for each detection's box.
            "Object Bounding Boxes": ``[x1, y1, x2, y2]`` integer corners.
            "Frame Size": ``image.shape[:2]``, i.e. (rows, columns).
            "Frame Summary": caption generated for the whole frame.
        The four per-object lists are index-aligned.
    """
    if yolo_model is None:
        yolo_model = _get_yolo_model()

    results = yolo_model(image)

    detected_objects = []
    activities = []
    object_colors = []
    bounding_boxes = []

    for result in results:
        for bbox, class_id in zip(result.boxes.xyxy, result.boxes.cls):
            # YOLO reports corner coordinates (x1, y1, x2, y2).
            x1, y1, x2, y2 = bbox.int().tolist()

            cropped_img = image[y1:y2, x1:x2]
            if cropped_img.size == 0:
                # Integer truncation can collapse a sliver-thin box to zero
                # area; there is nothing to caption or colour-sample, so the
                # detection is dropped and the output lists stay aligned.
                continue

            detected_objects.append(yolo_model.names[int(class_id)])

            # Caption the cropped region (stored under 'activities').
            cropped_pil = Image.fromarray(cv2.cvtColor(cropped_img, cv2.COLOR_BGR2RGB))
            activities.append(generate_caption(cropped_pil))

            # get_primary_color expects (x, y, w, h), so convert the corner
            # box to origin + size before sampling the colour.
            object_colors.append(
                get_primary_color(image, (x1, y1, x2 - x1, y2 - y1))
            )

            bounding_boxes.append([x1, y1, x2, y2])

    frame_size = image.shape[:2]

    # Caption the entire frame as well.
    full_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    frame_summary = generate_caption(full_image)

    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")

    vision_record = {
        "Time": timestamp,
        "Objects": detected_objects,
        "Objects Activities": activities,
        "Object Colors": object_colors,
        "Object Bounding Boxes": bounding_boxes,
        "Frame Size": frame_size,
        "Frame Summary": frame_summary,
    }

    return vision_record

In [None]:
# Load and preview the first test image (index 0 of image_path).
image_1 = LoadImage(image_path[0])

In [None]:
# Generate the vision record.
# LoadImage returns an RGB array, whereas generate_vision_record treats its
# input as BGR (it applies cv2.COLOR_BGR2RGB before captioning), so swap the
# channels first; otherwise red and blue are exchanged in captions and colours.
vision_record = generate_vision_record(cv2.cvtColor(image_1, cv2.COLOR_RGB2BGR))

vision_record

In [None]:
# Load and preview the second test image (index 1 of image_path).
image_2 = LoadImage(image_path[1])

In [None]:
# Generate the vision record.
# LoadImage returns an RGB array, whereas generate_vision_record treats its
# input as BGR (it applies cv2.COLOR_BGR2RGB before captioning), so swap the
# channels first; otherwise red and blue are exchanged in captions and colours.
vision_record = generate_vision_record(cv2.cvtColor(image_2, cv2.COLOR_RGB2BGR))

vision_record

In [None]:
# Load and preview the third test image (index 2 of image_path).
image_3 = LoadImage(image_path[2])

In [None]:
# Generate the vision record.
# LoadImage returns an RGB array, whereas generate_vision_record treats its
# input as BGR (it applies cv2.COLOR_BGR2RGB before captioning), so swap the
# channels first; otherwise red and blue are exchanged in captions and colours.
vision_record = generate_vision_record(cv2.cvtColor(image_3, cv2.COLOR_RGB2BGR))

vision_record

In [None]:
# Load and preview the fourth test image (index 3 of image_path).
image_4 = LoadImage(image_path[3])

In [None]:
# Generate the vision record.
# LoadImage returns an RGB array, whereas generate_vision_record treats its
# input as BGR (it applies cv2.COLOR_BGR2RGB before captioning), so swap the
# channels first; otherwise red and blue are exchanged in captions and colours.
vision_record = generate_vision_record(cv2.cvtColor(image_4, cv2.COLOR_RGB2BGR))

vision_record