In [None]:
# Standard library
import json
import os
from typing import Literal, Optional, List, Dict

# VLM required libraries
import outlines
import torch
from transformers import AutoProcessor
from pydantic import BaseModel, Field

# Image stuff
from PIL import Image
import requests

from rich import pretty 

In [2]:
# Load the vision-language checkpoint through the outlines vision wrapper.
from transformers import Qwen2_5_VLForConditionalGeneration

# Checkpoint identifier; also reused later when the processor is loaded.
model_name = "Qwen/Qwen2.5-VL-7B-Instruct"
# Concrete transformers class handed to outlines for this checkpoint.
model_class = Qwen2_5_VLForConditionalGeneration

model = outlines.models.transformers_vision(
    model_name,
    model_class=model_class,
    # Loading options: automatic device placement, bfloat16 weights,
    # and permission to run code shipped with the checkpoint.
    model_kwargs=dict(
        device_map="auto",
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
    ),
)

Loading checkpoint shards: 100%|██████████| 5/5 [00:15<00:00,  3.15s/it]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
# Define JSON schema 
from typing import Dict
class DetectedObject(BaseModel):
    # One detected object: a free-text label plus its bounding box.
    # NOTE: documented with comments rather than a class docstring on purpose.
    # pydantic copies a class docstring into the JSON schema, and that schema
    # is embedded verbatim in the prompt sent to the model.
    label: str = Field(..., description="Label of the detected object")
    # A bounding box is exactly four integers. Constraining the length makes
    # the generated JSON schema carry minItems/maxItems, so structured
    # generation cannot emit a box with a missing or extra coordinate.
    bbox: List[int] = Field(
        ...,
        min_length=4,
        max_length=4,
        description="Bounding box in [x1, y1, x2, y2] format",
    )

class ImageDetectionResult(BaseModel):
    # Top-level result for one image: every detection plus a per-label tally.
    # Comments are used instead of a class docstring so the JSON schema
    # produced from this model stays unchanged.

    # All detections found in the image (required).
    objects: List[DetectedObject] = Field(
        description="List of detected objects with labels and bounding boxes",
    )
    # Mapping of label -> number of detections with that label (required).
    object_counts: Dict[str, int] = Field(
        description="Dictionary with the count of each detected object type",
    )

In [None]:
import re

# Batch object detection: run the vision-language model on every file in
# `directory_path` and write all structured results to a single JSON file.

# Path to the directory (library)
directory_path = "/home/jovyan/images"

# Keep only regular files (sub-directories are skipped). Sorted so that the
# processing order, and therefore the key order of the output JSON, does not
# depend on the arbitrary order returned by os.listdir.
files = sorted(
    file
    for file in os.listdir(directory_path)
    if os.path.isfile(os.path.join(directory_path, file))
)

# Detection results keyed by file name.
data = {}

# --- Loop-invariant setup (hoisted: none of this depends on the image) ---

# Processor used only to render the chat template into a prompt string.
processor = AutoProcessor.from_pretrained(
    model_name,
    trust_remote_code=True
)

# Structured generator constrained to the ImageDetectionResult schema.
# NOTE(review): multinomial sampling at temperature 0.8 with no seed set in
# this cell, so outputs will presumably differ between runs.
summary_generator = outlines.generate.json(
    model,
    ImageDetectionResult,
    sampler=outlines.samplers.multinomial(1,temperature=0.8)
)

# Instruction text shared by every image; the schema is interpolated once.
instruction_text = f"""
    You are very skilled at detecting simple objects in an image.
    Detect all the common, simple objects (e.g., dog, cat, car, chair) and their corresponding bounding boxes coordinates in the image. 
    Do not detect complex descriptions (e.g., 'dog in fire', 'cat on a table') or any text present in the image.
    Count how many of each objects there are, and return the results in the following JSON schema:
    {ImageDetectionResult.model_json_schema()}
                """

for index, file in enumerate(files):

    print(f"Processing image {file}, {index+1}/{len(files)}")
    # Image path
    image_path = os.path.join(directory_path, file)

    # Open inside a context manager so the file handle is released; the
    # converted RGB copy stays usable after the file is closed. A file that
    # cannot be read as an image is skipped instead of aborting the whole run.
    try:
        with Image.open(image_path) as opened_image:
            image = opened_image.convert("RGB")
    except OSError as err:
        print(f"Skipping {file}: {err}")
        continue

    # Prompt
    messages = [
        {
            "role": "user",
            "content": [
                {
                    # The image is provided as a PIL Image object
                    "type": "image",
                    "image": image,
                },
                {
                    "type": "text",
                    "text": instruction_text,
                },
            ],
        }
    ]

    # Render the chat template to a plain prompt string (no tokenization).
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Inference only: gradients are not needed.
    with torch.no_grad():
        result = summary_generator(prompt, [image])

    # model_dump() is the pydantic v2 replacement for the deprecated .dict().
    data[file] = result.model_dump()


# Persist all results in one JSON document.
json_name = 'Processed_Image.json'
with open(json_name, "w") as f:
    json.dump(data,f,indent=4)