# Identify Object Positions in Images - YOLO vs VLM 

### Utils 
Taken from 
https://colab.research.google.com/drive/1eDvf_Ky9jLOZFShgHrm4GI-wkAaQnue6?usp=sharing#scrollTo=wizbxA1lm-Tj
https://colab.research.google.com/github/google-gemini/cookbook/blob/main/examples/Object_detection.ipynb#scrollTo=245bc92a470f


In [1]:
# @title Plotting Utils
import json
import random
import io
from PIL import Image, ImageDraw
from PIL import ImageColor

additional_colors = [colorname for (colorname, colorcode) in ImageColor.colormap.items()]

def plot_bounding_boxes(im, noun_phrases_and_positions):
    """
    Plots bounding boxes on an image with markers for each noun phrase, using PIL, normalized coordinates, and different colors.

    Args:
        img_path: The path to the image file.
        noun_phrases_and_positions: A list of tuples containing the noun phrases
         and their positions in normalized [y1 x1 y2 x2] format.
    """

    # Load the image
    img = im
    width, height = img.size
    print(img.size)
    # Create a drawing object
    draw = ImageDraw.Draw(img)

    # Define a list of colors
    colors = [
    'red',
    'green',
    'blue',
    'yellow',
    'orange',
    'pink',
    'purple',
    'brown',
    'gray',
    'beige',
    'turquoise',
    'cyan',
    'magenta',
    'lime',
    'navy',
    'maroon',
    'teal',
    'olive',
    'coral',
    'lavender',
    'violet',
    'gold',
    'silver',
    ] + additional_colors

    # Iterate over the noun phrases and their positions
    for i, (noun_phrase, (y1, x1, y2, x2)) in enumerate(
        noun_phrases_and_positions):
        # Select a color from the list
        color = colors[i % len(colors)]

        # Convert normalized coordinates to absolute coordinates
        abs_x1 = int(x1/1000 * width)
        abs_y1 = int(y1/1000 * height)
        abs_x2 = int(x2/1000 * width)
        abs_y2 = int(y2/1000 * height)

        # Draw the bounding box
        draw.rectangle(
            ((abs_x1, abs_y1), (abs_x2, abs_y2)), outline=color, width=4
        )

        # Draw the text
        draw.text((abs_x1 + 8, abs_y1 + 6), noun_phrase, fill=color)

    # Display the image
    img.show()

# @title Parsing utils
def parse_list_boxes(text):
  result = []
  for line in text.strip().splitlines():
    # Extract the numbers from the line, remove brackets and split by comma
    try:
      numbers = line.split('[')[1].split(']')[0].split(',')
    except:
      numbers =  line.split('- ')[1].split(',')

    # Convert the numbers to integers and append to the result
    result.append([int(num.strip()) for num in numbers])

  return result

def parse_list_boxes_with_label(text):
  text = text.split("```\n")[0]
  return json.loads(text.strip("```").strip("python").strip("json").replace("'", '"').replace('\n', '').replace(',}', '}'))





## 1.YOLO -> FUNKTIONIERT NICHT, LÖSUNGEN NEHMEN

In [None]:
# Import YOLO and load a pre-trained model
from ultralytics import YOLO
import cv2

# Load the YOLOv8 pre-trained model
model = YOLO('yolov8n.pt')  # nano model for quick inference

# Run inference on a sample image
results = model('images/table_scene.jpeg', save = False)  # Displays image with detections

for result in results:
    print(result.boxes)  # Boxes object for bounding box outputs


image 1/1 /workspaces/MultimodalInteraction_ObjDet/images/table_scene.jpeg: 640x640 1 cup, 2 potted plants, 2 dining tables, 4 books, 3 vases, 296.3ms
Speed: 25.4ms preprocess, 296.3ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 640)
ultralytics.engine.results.Boxes object with attributes:

cls: tensor([41., 58., 75., 75., 60., 73., 58., 73., 60., 73., 75., 73.])
conf: tensor([0.8666, 0.7697, 0.7267, 0.7095, 0.6095, 0.6043, 0.5443, 0.4876, 0.3747, 0.3382, 0.2693, 0.2544])
data: tensor([[6.1529e+02, 5.5336e+02, 8.4596e+02, 7.8049e+02, 8.6657e-01, 4.1000e+01],
        [3.8491e+02, 8.2020e+01, 5.8930e+02, 3.5290e+02, 7.6967e-01, 5.8000e+01],
        [7.2199e+02, 3.9503e+02, 9.6582e+02, 6.3202e+02, 7.2675e-01, 7.5000e+01],
        [6.1139e+02, 3.5216e+02, 7.2161e+02, 5.2007e+02, 7.0952e-01, 7.5000e+01],
        [0.0000e+00, 3.7149e+02, 1.0240e+03, 1.0240e+03, 6.0951e-01, 6.0000e+01],
        [8.7753e+01, 4.1335e+02, 4.2453e+02, 5.5627e+02, 6.0427e-01, 7.3000e+01],
        

In [19]:
plot_bounding_boxes("images/table_scene.jpeg", results)

AttributeError: 'str' object has no attribute 'size'

## 2. OpenAI

In [6]:
import openai
from dotenv import load_dotenv  
import os
import base64
import json
import textwrap

# Function to encode the image
def encode_image(image_path):
  with open(image_path, "rb") as image_file:
    return base64.b64encode(image_file.read()).decode('utf-8')


load_dotenv()
#openAIclient = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "<your OpenAI API key if not set as env var>"))
openAIclient = openai.OpenAI(api_key= os.getenv("OPENAI_API_KEY"))




TEXTMODEL = "gpt-4o-mini" 
IMGMODEL= "gpt-4o-mini" 

# Path to your image
img = "images/table_scene.jpeg"

In [11]:
def promptLLM(prompt : str = None, sysprompt : str = None,  image : str = None, wantJson : bool = False, returnDict : bool = False):
    returnValue = ""
    messages = [{"role": "system", "content" : sysprompt}]
    modelToUse = TEXTMODEL
    #force it to be a json answer prompt
    #prompt = prompt if not wantJson else returnJSONAnswerPrompt(prompt)
    messages.append({"role": "user", "content": [{ 
        "type" : "text", 
        "text" : prompt 
    }]})
    if image is not None:
        image = f"data:image/jpeg;base64,{image}"
        messages[1]["content"].append({"type": "image_url", "image_url": { "url" : image}})
        modelToUse = IMGMODEL

    if wantJson:
        returnValue = openAIclient.chat.completions.create(
            model=modelToUse,
            #max_tokens= 400,
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "img_extract",
                    "schema": {
                    "type": "object",
                    "properties": {
                        "numberOfObjects": {
                        "type": "integer",
                        "description": "The total number of objects in the scene",
                        "minimum": 0
                        },
                        "atmosphere": {
                        "type": "string",
                        "description": "Description of the atmosphere, e.g., calm, lively, etc."
                        },
                        "hourOfTheDay": {
                        "type": "integer",
                        "description": "The hour of the day in 24-hour format",
                        "minimum": 0,
                        "maximum": 23
                        },
                        "objects": {
                        "type": "array",
                        "description": "List of objects and their details",
                        "items": {
                            "type": "object",
                            "properties": {
                            "position": {
                                "type": "string",
                                "description": "Position of the object in the scene"
                            },
                            "colour": {
                                "type": "string",
                                "description": "Clour of object",
                                "minimum": 0
                            },
                            "size": {
                                "type": "string",
                                "description": "Size of the object, e.g., small, medium, big etc."
                            },
                            "type": {
                                "type": "string",
                                "description": "the type of object"
                            }
                            },
                            "required": ["position", "colour", "size"]
                        }
                        }
                    },
                    "required": ["numberOfObjects", "atmosphere", "hourOfTheDay", "objects"]
                    }}},
            messages=messages,
            temperature=0,
            #n=1,
        )
    else :
        returnValue = openAIclient.chat.completions.create(
            model=modelToUse,
            messages=messages,
            temperature=0,
            #n=1,
        )
    returnValue = returnValue.choices[0].message.content
    if returnDict:
        return json.loads(returnValue)
    return returnValue

In [12]:
output_image_analysis = promptLLM(prompt = "describe the image in detail",sysprompt = "you are a careful observer. the response should be in json format", image = encode_image(img), wantJson=True, returnDict=True)
output_image_analysis

{'numberOfObjects': 8,
 'atmosphere': 'calm and cozy',
 'hourOfTheDay': 10,
 'objects': [{'type': 'book',
   'colour': 'black',
   'size': 'medium',
   'position': 'top left'},
  {'type': 'book',
   'colour': 'black',
   'size': 'medium',
   'position': 'middle left'},
  {'type': 'plant', 'colour': 'green', 'size': 'small', 'position': 'middle'},
  {'type': 'plant', 'colour': 'green', 'size': 'medium', 'position': 'right'},
  {'type': 'mug',
   'colour': 'brown',
   'size': 'medium',
   'position': 'bottom right'},
  {'type': 'glasses',
   'colour': 'black',
   'size': 'small',
   'position': 'bottom left'},
  {'type': 'phone',
   'colour': 'black',
   'size': 'small',
   'position': 'bottom right'},
  {'type': 'compass',
   'colour': 'silver',
   'size': 'small',
   'position': 'bottom right'}]}

In [22]:
alert_sys_prompt = "You are a special trained observer"
promptLLM(prompt =  "Detect if there is a cup in the picture and reutrn its coordinates as a list in the format '[ymin,xmin, ymax, xmax]'. Just output the list.", sysprompt= alert_sys_prompt, image = encode_image(img)) 

'[400, 550, 480, 600]'

In [21]:
plot_bounding_boxes(img, output)

AttributeError: 'str' object has no attribute 'size'

## 3. Gemini

In [15]:
%matplotlib inline
import os
from dotenv import load_dotenv  
import google.generativeai as genai
from PIL import Image

load_dotenv()
#genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
im = Image.open(img)

genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))
model = genai.GenerativeModel("gemini-1.5-pro")

response = model.generate_content([
    im,
    (
        "Detect if there is a cup and reutrn its coordinates as a list in the format '[ymin,xmin, ymax, xmax]'. Just output the list.\n "
    ),
])
response.resolve()
print(response.text)

[543, 598, 755, 817]
