### Vision Detection Agent scratch

⸻

### 10 step

* [1-3 step]: 이해 및 계획
<br> 사용자 요청 분석 → 이미지 로드: VQA(Visual Question Answering)로 이미지 이해

* [4-6 step]: 도구 선택
<br> suggestion() → get_tool_for_task() → 최적 도구 결정

* [7-9 step]: 실행 및 검증

* [10 step]: 최종 코드 생성 

#### [1-3 step]: 이해 및 계획
ImageLoader - 이미지 파일 로드 / 이미지 리사이즈 / PIL to Numpy / 유효성 검사
<br> VQAModel - 이미지에 대한 질문 / 전체 설명 / 특정 객체 개수 추정
<br> Planner - LMM 초기화 / 사용자 요청 분석 / 단계별 계획 생성 / 계획 to markdown / 검증
<br> Suggester - 작업별 추천 방법 / 제안 우선순위 정렬 / 제안 이유 설명 

In [None]:
# tools
import numpy as np
import os
import tempfile
import urllib.request
from PIL import Image

def load_image(image_path: str) -> np.ndarray:
    """Load an image and return it as an RGB numpy array.

    Args:
        image_path: Local file path (``str`` or ``os.PathLike``) or an
            ``http://`` / ``https://`` URL. An ``np.ndarray`` is also accepted
            and returned unchanged.

    Returns:
        ``np.array`` of the image after conversion to RGB.

    Raises:
        Whatever ``urllib.request.urlopen`` or ``PIL.Image.open`` raise for an
        unreachable URL, a missing file, or undecodable image data.
    """
    from io import BytesIO

    if isinstance(image_path, np.ndarray):
        return image_path

    # Accept pathlib.Path and other path-like objects, not only plain strings.
    image_path = os.fspath(image_path)

    # Match full URL schemes so that local paths which merely start with
    # "http" (e.g. "http_images/a.png") are still read from disk.
    if image_path.startswith(("http://", "https://")):
        with urllib.request.urlopen(image_path) as response:
            # Decode straight from memory; no temporary file is left behind.
            source = BytesIO(response.read())
    else:
        source = image_path

    # The context manager releases the underlying file handle once the pixel
    # data has been copied into the RGB image.
    with Image.open(source) as image:
        return np.array(image.convert("RGB"))

detection 입력 형식
<br> Sequence[Message] = [{
    "role": "user",
    "content": "Find tomatoes",
    "media": ["img1.png", "img2.png"]
}]

LLM 입력 형식
<br> fixed_chat = {
    "role": msg["role"],
    "content": [
        {"type": "text", "text": "Find tomatoes."},
        {"type": "image_base64", "image_base64": "..."},
    ]
}

In [None]:
# Utils
import base64
import numpy as np

from io import BytesIO
from pathlib import Path
from PIL import Image
from typing import Optional, Union


def image_to_base64(image: Image.Image, resize: Optional[int] = None) -> str:
    """Encode a PIL image as a base64 string of PNG bytes.

    Args:
        image: Image to encode. It is never modified.
        resize: When given, the encoded image is shrunk to fit inside a
            ``resize`` x ``resize`` box via ``Image.thumbnail``.

    Returns:
        UTF-8 text containing the base64 encoding of the RGB image saved as PNG.
    """
    if resize is not None:
        # Image.thumbnail resizes in place; operate on a copy so the caller's
        # image keeps its original size.
        image = image.copy()
        image.thumbnail((resize, resize))
    buffer = BytesIO()
    image.convert("RGB").save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")


def encode_media(
    media: Union[str, Path, np.ndarray, Image.Image],
    resize: Optional[int] = None,
) -> str:
    """Encode an image given in any supported form as base64 PNG text.

    Args:
        media: A numpy array, a PIL image, or a path to an image file whose
            suffix is one of .jpg, .jpeg, .png, .webp, .bmp (case-insensitive).
        resize: Optional bounding-box size forwarded to ``image_to_base64``.

    Returns:
        The base64 string produced by ``image_to_base64``.

    Raises:
        ValueError: If ``media`` is of an unsupported type or the path has an
            unsupported suffix.
    """
    if isinstance(media, np.ndarray):
        return image_to_base64(Image.fromarray(media), resize)

    if isinstance(media, Image.Image):
        return image_to_base64(media, resize)

    if isinstance(media, (str, Path)):
        path = Path(media)
        if path.suffix.lower() in {".jpg", ".jpeg", ".png", ".webp", ".bmp"}:
            # Close the file handle as soon as the image has been encoded.
            with Image.open(path) as image:
                return image_to_base64(image, resize)

    raise ValueError(f"Unsupported media type: {media}")

In [None]:
from __future__ import annotations

from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Iterator, Optional, Sequence, Union, TypedDict


class Message(TypedDict, total=False):
    """One chat turn exchanged with an ``LMM``.

    ``total=False`` marks every key as optional for type checkers.
    """

    # Speaker of the turn, e.g. "user".
    role: str
    # Text of the turn.
    content: str
    # Images attached to the turn, given as file paths.
    media: Sequence[Union[str, Path]]

# Result of an LMM call: the full text, or an iterator of text chunks (items may be None).
ReturnType = str | Iterator[str | None]

class LMM(ABC):
    """Abstract base for large multimodal model wrappers.

    Concrete subclasses provide ``generate`` and ``chat``; calling an instance
    picks one of the two from the type of its input.
    """

    @abstractmethod
    def generate(
        self,
        prompt: str,
        media: Optional[Sequence[Union[str, Path]]] = None,
        **kwargs: Any,
    ) -> ReturnType:
        """Answer a single text prompt, optionally accompanied by media."""
        raise NotImplementedError

    @abstractmethod
    def chat(
        self,
        chat: Sequence[Message],
        **kwargs: Any,
    ) -> ReturnType:
        """Answer a conversation given as role/content(+media) messages."""
        raise NotImplementedError

    def __call__(
        self,
        input: str | Sequence[Message],
        **kwargs: Any,
    ) -> ReturnType:
        """Send a plain string to ``generate`` and anything else to ``chat``."""
        handler = self.generate if isinstance(input, str) else self.chat
        return handler(input, **kwargs)


In [7]:
# LLM Class: AnthropicLMM, OpenAILMM
from typing import Any, Dict, Iterator, List, Optional, Sequence, Union, cast
from pathlib import Path
from openai import OpenAI

class OpenAILMM(LMM):
    """LMM wrapper around the OpenAI Chat Completions API.

    Images are sent inline as ``image_url`` content parts carrying a base64 PNG
    data URL. The wrapper-only call options ``resize`` and ``image_detail`` are
    consumed here and never forwarded to the API.
    """

    def __init__(
        self,
        model_name: str = "gpt-4o-2024-05-13",
        api_key: Optional[str] = None,
        max_tokens: int = 4096,
        json_mode: bool = False,
        image_size: int = 768,
        image_detail: str = "low",
        **kwargs: Any
    ):
        """Create the client and store default request options.

        Args:
            model_name: Model identifier sent with every request.
            api_key: Explicit API key; when falsy the client is built without
                one and resolves credentials itself.
            max_tokens: Default ``max_tokens``, unless given in ``kwargs`` or
                the model name starts with ``o1`` / ``o3``.
            json_mode: When true, requests use
                ``response_format={"type": "json_object"}``.
            image_size: Default bounding-box size images are shrunk to.
            image_detail: Default ``detail`` value attached to each image.
            **kwargs: Extra default parameters for every completion request.
        """
        self.client = OpenAI(api_key=api_key) if api_key else OpenAI()

        self.model_name = model_name
        self.image_size = image_size
        self.image_detail = image_detail
        # o1 does not use max_tokens
        if "max_tokens" not in kwargs and not model_name.startswith(("o1", "o3")):
            kwargs["max_tokens"] = max_tokens
        if json_mode:
            kwargs["response_format"] = {"type": "json_object"}
        self.kwargs = kwargs

    def __call__(
        self,
        input: Union[str, Sequence[Message]],
        **kwargs: Any
    ) -> str | Iterator[str | None]:
        """Dispatch a string to ``generate`` and a message sequence to ``chat``."""
        if isinstance(input, str):
            return self.generate(input, **kwargs)
        return self.chat(input, **kwargs)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _split_kwargs(
        self, kwargs: Dict[str, Any]
    ) -> tuple[Dict[str, Any], Optional[int], str]:
        """Merge call options over the defaults and peel off wrapper-only ones."""
        # Call-time values win over the defaults stored at construction.
        api_kwargs = self.kwargs | kwargs
        # `resize` / `image_detail` are not Chat Completions parameters; passing
        # them through would make `create()` fail on an unexpected keyword.
        resize = api_kwargs.pop("resize", self.image_size)
        image_detail = api_kwargs.pop("image_detail", self.image_detail)
        return api_kwargs, resize, image_detail

    def _image_parts(
        self,
        media: Optional[Sequence[Union[str, Path]]],
        resize: Optional[int],
        image_detail: str,
    ) -> List[Dict[str, Any]]:
        """Build one ``image_url`` content part per media item."""
        # Media is skipped entirely for "o3-mini".
        if not media or self.model_name == "o3-mini":
            return []
        return [
            {
                "type": "image_url",
                "image_url": {
                    # encode_media returns base64 of PNG bytes.
                    "url": f"data:image/png;base64,{encode_media(m, resize=resize)}",
                    "detail": image_detail,
                },
            }
            for m in media
        ]

    @staticmethod
    def _iter_stream(response: Any) -> Iterator[Optional[str]]:
        """Yield the text delta of every streamed chunk."""
        for chunk in response:
            # Skip chunks that carry no choices instead of failing on [0].
            if not chunk.choices:
                continue
            yield chunk.choices[0].delta.content

    def _complete(
        self, messages: List[Dict[str, Any]], api_kwargs: Dict[str, Any]
    ) -> Union[str, Iterator[Optional[str]]]:
        """Send the request and return either the text or a chunk iterator."""
        response = self.client.chat.completions.create(
            model=self.model_name, messages=messages, **api_kwargs
        )
        if api_kwargs.get("stream", False):
            return self._iter_stream(response)
        return cast(str, response.choices[0].message.content)

    # ------------------------------------------------------------------
    # Public APIs
    # ------------------------------------------------------------------

    def chat(
        self,
        chat: Sequence[Message],
        **kwargs: Any
    ) -> Union[str, Iterator[Optional[str]]]:
        """Chat with the model.

        Args:
            chat: Messages such as ``[{"role": "user", "content": "Hello!"}]``;
                a message may also carry ``"media": ["image1.jpg", ...]``.
            **kwargs: Per-call request parameters. ``resize`` and
                ``image_detail`` override the image defaults; ``stream=True``
                makes the call return an iterator of text chunks.

        Returns:
            The response text, or an iterator of chunks when streaming.
        """
        api_kwargs, resize, image_detail = self._split_kwargs(kwargs)

        messages: List[Dict[str, Any]] = []
        for msg in chat:
            parts: List[Dict[str, Any]] = [{"type": "text", "text": msg["content"]}]
            parts.extend(self._image_parts(msg.get("media"), resize, image_detail))
            messages.append({"role": msg["role"], "content": parts})

        return self._complete(messages, api_kwargs)

    def generate(
        self,
        prompt: str,
        media: Optional[Sequence[Union[str, Path]]] = None,
        **kwargs: Any
    ) -> Union[str, Iterator[Optional[str]]]:
        """Answer a single user prompt, optionally with images.

        Args:
            prompt: Text of the user message.
            media: Optional images to attach.
            **kwargs: Same per-call options as ``chat``. Images default to
                ``image_size`` here as well, so both entry points agree.

        Returns:
            The response text, or an iterator of chunks when streaming.
        """
        api_kwargs, resize, image_detail = self._split_kwargs(kwargs)

        parts: List[Dict[str, Any]] = [{"type": "text", "text": prompt}]
        parts.extend(self._image_parts(media, resize, image_detail))

        return self._complete([{"role": "user", "content": parts}], api_kwargs)

In [None]:
from typing import Any, Dict, Iterator, List, Optional, Sequence, Union, cast
import anthropic
from anthropic.types import ImageBlockParam, MessageParam, TextBlockParam, ThinkingBlockParam

class AnthropicLMM(LMM):
    """LMM wrapper around the Anthropic Messages API.

    Images are always sent inline as base64-encoded PNG data. The wrapper-only
    call option ``resize`` is consumed here and never forwarded to the API.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        model_name: str = "claude-3-5-sonnet-20240620",
        max_tokens: int = 4096,
        image_size: int = 768,
        **kwargs: Any,
    ):
        """Create the client and store default request options.

        Args:
            api_key: API key handed to ``anthropic.Anthropic``.
            model_name: Model identifier sent with every request.
            max_tokens: Default ``max_tokens`` unless given in ``kwargs``.
            image_size: Default bounding-box size images are shrunk to.
            **kwargs: Extra default parameters for every request.
        """
        self.client = anthropic.Anthropic(api_key=api_key)
        self.model_name = model_name
        self.image_size = image_size

        kwargs.setdefault("max_tokens", max_tokens)
        self.kwargs = kwargs

    def __call__(
        self,
        input: Union[str, Sequence[Message]],
        **kwargs: Any,
    ) -> str | Iterator[str | None]:
        """Dispatch a string to ``generate`` and a message sequence to ``chat``."""
        if isinstance(input, str):
            return self.generate(input, **kwargs)
        return self.chat(input, **kwargs)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _setup_chat_kwargs(
        self, kwargs: Dict[str, Any]
    ) -> tuple[Dict[str, Any], bool]:
        """Merge call options over the defaults and detect thinking mode.

        Returns:
            ``(api_kwargs, thinking_enabled)`` where ``api_kwargs`` is safe to
            pass to ``messages.create``.
        """
        tmp_kwargs = self.kwargs | kwargs
        # `resize` only controls local image encoding; it is not a Messages
        # API parameter and would be rejected as an unexpected keyword.
        tmp_kwargs.pop("resize", None)

        thinking = tmp_kwargs.get("thinking")
        thinking_enabled = isinstance(thinking, dict) and thinking.get("type") == "enabled"
        if thinking_enabled:
            # NOTE(review): temperature is forced to 1.0 in thinking mode,
            # presumably an API requirement - confirm against the Anthropic docs.
            tmp_kwargs["temperature"] = 1.0
        return tmp_kwargs, thinking_enabled

    def _image_block(
        self, media: Union[str, Path], resize: Optional[int]
    ) -> ImageBlockParam:
        """Encode one media item as a base64 image block."""
        return ImageBlockParam(
            type="image",
            source={
                "type": "base64",
                # encode_media returns base64 of PNG bytes.
                "media_type": "image/png",
                "data": encode_media(media, resize=resize),
            },
        )

    def _convert_messages(
        self,
        chat: Sequence[Message],
        thinking_enabled: bool,
        **kwargs: Any,
    ) -> List[MessageParam]:
        """Translate ``Message`` dicts into Anthropic message params.

        Args:
            chat: Conversation; only ``user`` and ``assistant`` roles are accepted.
            thinking_enabled: When true, assistant turns are rebuilt with
                thinking blocks via ``_create_thinking_message``.
            **kwargs: Call options; only ``resize`` is read here.

        Raises:
            ValueError: If a message has any other role.
        """
        resize = kwargs.get("resize", self.image_size)
        messages: List[MessageParam] = []

        for msg in chat:
            role = msg["role"]
            text = cast(str, msg["content"])

            if role == "user":
                content: List[Union[TextBlockParam, ImageBlockParam]] = [
                    TextBlockParam(type="text", text=text)
                ]
                content.extend(
                    self._image_block(media_path, resize)
                    for media_path in msg.get("media") or ()
                )
                messages.append(MessageParam(role="user", content=content))
            elif role == "assistant":
                if thinking_enabled:
                    messages.append(self._create_thinking_message(text))
                else:
                    messages.append(
                        MessageParam(
                            role="assistant",
                            content=[TextBlockParam(type="text", text=text)],
                        )
                    )
            else:
                raise ValueError(f"Unsupported role: {role}")

        return messages

    @staticmethod
    def _extract_tag(text: str, tag: str) -> Optional[str]:
        """Return the raw text inside the first ``<tag>...</tag>`` pair, or None."""
        import re

        name = re.escape(tag)
        found = re.search(rf"<{name}>(.*?)</{name}>", text, re.DOTALL)
        return found.group(1) if found else None

    def _create_thinking_message(self, text: str) -> MessageParam:
        """Rebuild an assistant turn whose text embeds thinking/signature tags.

        The ``<thinking>`` section becomes a thinking block (with the
        ``<signature>`` content, or an empty signature when absent) and the
        remaining text becomes a text block.
        """
        content: List[Union[TextBlockParam, ThinkingBlockParam]] = []

        # Uses the in-class extractor because no `extract_tag` helper is
        # defined or imported in this file.
        thinking = self._extract_tag(text, "thinking")
        signature = self._extract_tag(text, "signature")

        if thinking:
            content.append(
                ThinkingBlockParam(
                    type="thinking",
                    thinking=thinking.strip(),
                    signature=signature.strip() if signature else "",
                )
            )

        # Remove the tagged sections; what is left is the visible reply.
        clean_text = text
        if thinking:
            clean_text = clean_text.replace(f"<thinking>{thinking}</thinking>", "")
        if signature:
            clean_text = clean_text.replace(f"<signature>{signature}</signature>", "")

        if clean_text.strip():
            content.append(TextBlockParam(type="text", text=clean_text.strip()))

        return MessageParam(role="assistant", content=content)

    def _handle_stream(
        self,
        stream: anthropic.Stream[anthropic.MessageStreamEvent],
    ) -> Iterator[Optional[str]]:
        """Yield text deltas from a stream, then ``None`` at ``message_stop``."""
        for chunk in stream:
            if chunk.type == "content_block_delta":
                # Deltas without a `text` attribute are skipped.
                if hasattr(chunk.delta, "text"):
                    yield chunk.delta.text
            elif chunk.type == "message_stop":
                yield None

    @staticmethod
    def _response_text(message: Any) -> str:
        """Return the text of the last text block, or ``""`` if there is none."""
        for block in reversed(message.content):
            if getattr(block, "type", None) == "text":
                return block.text
        return ""

    def _create(
        self, messages: List[MessageParam], tmp_kwargs: Dict[str, Any]
    ) -> Union[str, Iterator[Optional[str]]]:
        """Send the request and return either the text or a chunk iterator."""
        response = self.client.messages.create(
            model=self.model_name,
            messages=messages,
            **tmp_kwargs,
        )
        if tmp_kwargs.get("stream", False):
            return self._handle_stream(
                cast(anthropic.Stream[anthropic.MessageStreamEvent], response)
            )
        return self._response_text(cast(anthropic.types.Message, response))

    # ------------------------------------------------------------------
    # Public APIs
    # ------------------------------------------------------------------

    def chat(
        self,
        chat: Sequence[Message],
        **kwargs: Any,
    ) -> Union[str, Iterator[Optional[str]]]:
        """Chat with the model.

        Args:
            chat: Conversation of ``user`` / ``assistant`` messages; user
                messages may carry ``media`` paths.
            **kwargs: Per-call request parameters. ``resize`` overrides the
                image size; ``stream=True`` returns an iterator of chunks.

        Returns:
            The response text, or an iterator of chunks when streaming.
        """
        tmp_kwargs, thinking_enabled = self._setup_chat_kwargs(kwargs)
        messages = self._convert_messages(chat, thinking_enabled, **kwargs)
        return self._create(messages, tmp_kwargs)

    def generate(
        self,
        prompt: str,
        media: Optional[Sequence[Union[str, Path]]] = None,
        **kwargs: Any,
    ) -> Union[str, Iterator[Optional[str]]]:
        """Answer a single user prompt, optionally with images.

        Args:
            prompt: Text of the user message.
            media: Optional images to attach.
            **kwargs: Same per-call options as ``chat``.

        Returns:
            The response text, or an iterator of chunks when streaming.
        """
        # Same option handling as chat(): strips `resize`, applies thinking setup.
        tmp_kwargs, _ = self._setup_chat_kwargs(kwargs)
        resize = kwargs.get("resize", self.image_size)

        content: List[Union[TextBlockParam, ImageBlockParam]] = [
            TextBlockParam(type="text", text=prompt)
        ]
        content.extend(self._image_block(m, resize) for m in media or ())

        return self._create([MessageParam(role="user", content=content)], tmp_kwargs)

확인

In [18]:
# ========================================
# Step 1: LMM Configuration & Initialization
# ========================================
import os
from pathlib import Path
from dotenv import load_dotenv

# Read API keys from the .env file one directory above the working directory;
# values in the file override variables that are already set.
env_path = Path.cwd().parent.joinpath(".env")
load_dotenv(dotenv_path=env_path, override=True)


# One LMM per role, all at temperature 0.0.
# Analyzer: breaks down the user request.
analyzer_lmm = AnthropicLMM(
    max_tokens=2048,
    temperature=0.0,
    model_name="claude-sonnet-4-5-20250929",
)

# Planner: same model with a larger token budget.
planner_lmm = AnthropicLMM(
    max_tokens=4096,
    temperature=0.0,
    model_name="claude-sonnet-4-5-20250929",
)

# VQA: image question answering.
vqa_lmm = OpenAILMM(
    max_tokens=1024,
    temperature=0.0,
    model_name="gpt-4o-mini",
)

# Emit the summary in one write; the text matches four separate prints.
print(
    "\n".join(
        [
            "LMM Configuration Complete",
            f"  - Analyzer: {analyzer_lmm.model_name}",
            f"  - Planner: {planner_lmm.model_name}",
            f"  - VQA: {vqa_lmm.model_name}",
        ]
    )
)


LMM Configuration Complete
  - Analyzer: claude-sonnet-4-5-20250929
  - Planner: claude-sonnet-4-5-20250929
  - VQA: gpt-4o-mini


In [20]:
# ========================================
# Step 2: User Request Analyzer
# ========================================

# Prompt template filled in with str.format(user_request=...). The doubled braces
# around the JSON example are format escapes that render as literal { and }.
ANALYZE_PROMPT = """**Role**: You are an expert request analyzer for vision AI tasks.

**Task**: Analyze the user's request and break it down into key components that will help in planning and execution.

**User Request**: {user_request}

**Instructions**:
1. Identify the main task (detection, counting, classification, segmentation, etc.)
2. Extract key objects or concepts mentioned
3. Determine if the task requires single or multiple steps
4. Identify any specific constraints or requirements
5. Suggest the type of vision tools that might be needed

**Output Format**:
Provide your analysis in the following JSON format:
{{
    "task_type": "detection|counting|segmentation|classification|tracking|other",
    "main_objects": ["object1", "object2"],
    "task_complexity": "simple|moderate|complex",
    "requirements": ["requirement1", "requirement2"],
    "suggested_tools": ["tool1", "tool2"],
    "reasoning": "Your explanation here"
}}
"""

def analyze_user_request(user_request: str, lmm) -> dict:
    """Ask an LMM to analyze a vision request and return the parsed JSON.

    Args:
        user_request: Free-form user request, substituted into ``ANALYZE_PROMPT``.
        lmm: Model wrapper exposing ``generate``. ``OpenAILMM`` is called with
            JSON response format; ``AnthropicLMM`` gets an extra JSON-only
            instruction appended to the prompt; anything else gets the plain
            prompt.

    Returns:
        The decoded JSON object on success. On failure, a dict with ``"error"``
        (reason) and ``"raw_response"`` (whatever the model returned).
    """
    import json
    import re

    def failure(reason) -> dict:
        return {"error": f"Failed to parse response: {reason}", "raw_response": response}

    prompt = ANALYZE_PROMPT.format(user_request=user_request)

    print(f"Using LMM type: {type(lmm).__name__}")

    if isinstance(lmm, OpenAILMM):
        # OpenAI supports response_format for JSON output.
        response = lmm.generate(
            prompt,
            response_format={"type": "json_object"}
        )
    elif isinstance(lmm, AnthropicLMM):
        # No response_format is used for Anthropic; state the requirement in the prompt.
        json_instruction = "\n\nYou MUST respond with ONLY a valid JSON object, no additional text."
        response = lmm.generate(prompt + json_instruction)
    else:
        # Any other LMM
        response = lmm.generate(prompt)

    # A streaming call yields an iterator, which cannot be parsed as one document.
    if not isinstance(response, str):
        return failure(f"expected str, got {type(response).__name__}")

    text = response.strip()
    # Prefer the contents of a complete ``` / ```json fence when present.
    fenced = re.search(r"```(?:json)?\s*(.*?)```", text, re.DOTALL | re.IGNORECASE)
    if fenced:
        text = fenced.group(1).strip()
    elif not text.startswith("{"):
        # Unterminated fence or surrounding prose: fall back to the outermost braces.
        start, end = text.find("{"), text.rfind("}")
        if start != -1 and end > start:
            text = text[start:end + 1]

    try:
        analysis = json.loads(text)
    except json.JSONDecodeError as e:
        return failure(e)

    # Callers iterate the result with .items(), so only a JSON object is valid.
    if not isinstance(analysis, dict):
        return failure(f"expected a JSON object, got {type(analysis).__name__}")
    return analysis

# Smoke-test the analyzer on a sample request (Korean: "detect the tomatoes in
# this image and count them").
test_request = "이 이미지에 있는 토마토를 감지하고 개수를 세어주세요"

# Banner: rule, title, rule - one line each.
print("=" * 70, "User Request Analysis", "=" * 70, sep="\n")
print(f"Request: {test_request}\n")

analysis = analyze_user_request(test_request, analyzer_lmm)

# Dump every field of the result, one per line.
print("Analysis Result:")
for key, value in analysis.items():
    print(f"  {key}: {value}")

User Request Analysis
Request: 이 이미지에 있는 토마토를 감지하고 개수를 세어주세요

Using LMM type: AnthropicLMM
Analysis Result:
  task_type: detection|counting
  main_objects: ['tomato']
  task_complexity: simple
  requirements: ['detect all tomatoes in image', 'count total number of tomatoes']
  suggested_tools: ['object_detection', 'instance_segmentation', 'counting_algorithm']
  reasoning: The user is asking to detect tomatoes in an image and count them. This is a two-step task that requires: 1) Object detection to locate all tomatoes in the image, and 2) Counting the detected instances. This is a straightforward computer vision task that can be accomplished using standard object detection models (like YOLO, Faster R-CNN) or instance segmentation models that can identify individual tomato instances and provide an accurate count.


In [None]:
# planner tools
def vaq():

In [None]:
# planner tools
def suggestion