### Vision Detection Agent scratch

⸻

### 10 step

* [1-3 step]: 이해 및 계획
<br> 사용자 요청 분석 → 이미지 로드: VQA(Visual Question Answering)로 이미지 이해

* [4-6 step]: 도구 선택
<br> suggestion() → get_tool_for_task() → 최적 도구 결정

* [7-9 step]: 실행 및 검증

* [10 step]: 최종 코드 생성 

#### [1-3 step]: 이해 및 계획
ImageLoader - 이미지 파일 로드 / 이미지 리사이즈 / PIL to Numpy / 유효성
<br> VQAModel - 이미지에 대한 질문 / 전체 설명 / 특정 객체 개수 추정
<br> Planner - LMM 초기화/ 사용자 요청 분석 / 단계별 계획 생성/ 계획 to markdown / 검증
<br> Suggester - 작업별 추천 방법 / 제안 우선순위 정렬 / 제안 이유 설명 

In [27]:
import copy
import tempfile
import base64

import abc
import base64
import logging
import os
import platform
import re
import sys
import traceback
import warnings
from enum import Enum
from pathlib import Path
from time import sleep
from typing import Any, Dict, Iterable, List, Optional, Union

import nbformat
from dotenv import load_dotenv
from nbclient import NotebookClient
from nbclient import __version__ as nbclient_version
from nbclient.exceptions import CellTimeoutError, DeadKernelError
from nbclient.util import run_sync
from nbformat.v4 import new_code_cell
from opentelemetry.context import get_current
from opentelemetry.trace import SpanKind, Status, StatusCode, get_tracer
from pydantic import BaseModel, field_serializer
from typing_extensions import Self


from pathlib import Path
from typing import List, Tuple, Union, Optional, cast , Dict, Any
from pydantic import BaseModel
from PIL import Image, ImageDraw, ImageFont
from PIL.Image import Image as ImageTypes
from io import BytesIO

* 입력 표준화
<br> AgentMessage(content, media) 만들고 내부 형식으로 변환 / 텍스트와 이미지의 묶음

In [None]:
from pathlib import Path
from typing import List, Literal, Optional, Union
from pydantic import BaseModel

class AgentMessage(BaseModel):
    role: Union[
        Literal["user"],
        Literal["assistant"],  # planner, coder and conversation are of type assistant
        Literal["observation "],
        Literal["final_observation"],  # the observation from the final code output
        Literal["error_observation"],  # the observation from the error message
        Literal["interaction"],
        Literal["interaction_response"],
        Literal["conversation"],
        Literal["planner"],
        Literal[
            "planner_update"
        ],  # an intermediate update from the planner to show partial information
        Literal["coder"],
    ]
    content: str
    media: Optional[List[Union[str, Path]]] = None

In [11]:
# src/utils/types.py
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional


@dataclass
class Logs:
    stdout: List[str] = field(default_factory=list)
    stderr: List[str] = field(default_factory=list)


@dataclass
class Error:
    name: str
    value: str
    traceback_raw: List[str] = field(default_factory=list)


@dataclass
class Result:
    is_main_result: bool
    data: Dict[str, Any]


@dataclass
class Execution:
    logs: Logs = field(default_factory=Logs)
    results: List[Result] = field(default_factory=list)
    error: Optional[Error] = None

    @property
    def ok(self) -> bool:
        return self.error is None

In [29]:
# # src/utils/excute.py
# 표준화 유틸

from dataclasses import dataclass, field

load_dotenv()
_LOGGER = logging.getLogger(__name__)
_SESSION_TIMEOUT = 600  # 10 minutes
WORKSPACE = Path(os.getenv("WORKSPACE", ""))

class CodeInterpreter(abc.ABC):
    
    """Code interpreter interface."""

    def __init__(
        self,
        timeout: int,
        remote_path: Optional[Union[str, Path]] = None,
        non_exiting: bool = False,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        self.timeout = timeout
        self.remote_path = Path(remote_path if remote_path is not None else WORKSPACE)
        self.non_exiting = non_exiting

    def __enter__(self) -> Self:
        return self

    def __exit__(self, *exc_info: Any) -> None:
        if not self.non_exiting:
            self.close()

    def close(self, *args: Any, **kwargs: Any) -> None:
        raise NotImplementedError()

    def restart_kernel(self) -> None:
        raise NotImplementedError()

    def exec_cell(self, code: str) -> Execution:
        raise NotImplementedError()

    def exec_isolation(self, code: str) -> Execution:
        self.restart_kernel()
        return self.exec_cell(code)

    def upload_file(self, file: Union[str, Path]) -> Path:
        # Default behavior is a no-op (for local code interpreter)
        return Path(file)

    def download_file(
        self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]
    ) -> Path:
        # Default behavior is a no-op (for local code interpreter)
        return Path(local_file_path)

In [25]:
# 표준화 유틸
# # src/utils/excute.py
from dataclasses import dataclass, field

load_dotenv()
_LOGGER = logging.getLogger(__name__)
_SESSION_TIMEOUT = 600  # 10 minutes
WORKSPACE = Path(os.getenv("WORKSPACE", ""))

class CodeInterpreterFactory:
    """Factory class for creating code interpreters.
    Could be extended to support multiple code interpreters.
    """

    _instance_map: Dict[str, CodeInterpreter] = {}
    _default_key = "default"

    @staticmethod
    def get_default_instance() -> CodeInterpreter:
        warnings.warn(
            "Use new_instance() instead for production usage, get_default_instance() is for testing and will be removed in the future."
        )
        inst_map = CodeInterpreterFactory._instance_map
        instance = inst_map.get(CodeInterpreterFactory._default_key)
        if instance:
            return instance
        instance = CodeInterpreterFactory.new_instance()
        inst_map[CodeInterpreterFactory._default_key] = instance
        return instance

    @staticmethod
    def new_instance(
        code_sandbox_runtime: Optional[str] = None,
        remote_path: Optional[Union[str, Path]] = None,
        non_exiting: bool = False,
    ) -> CodeInterpreter:
        if not code_sandbox_runtime:
            code_sandbox_runtime = os.getenv("CODE_SANDBOX_RUNTIME", "local")
        if code_sandbox_runtime == "local":
            instance = LocalCodeInterpreter(
                timeout=_SESSION_TIMEOUT,
                remote_path=remote_path,
                non_exiting=non_exiting,
            )
        else:
            raise ValueError(
                f"Unsupported code sandbox runtime: {code_sandbox_runtime}. Supported runtimes: local"
            )
        return instance


def _parse_local_code_interpreter_outputs(outputs: List[Dict[str, Any]]) -> Execution:
    """Parse notebook cell outputs to Execution object. Output types:
    https://nbformat.readthedocs.io/en/latest/format_description.html#code-cell-outputs
    """
    execution = Execution()
    for data in outputs:
        if data["output_type"] == "error":
            _LOGGER.debug("Cell finished execution with error")
            execution.error = Error(
                name=data["ename"],
                value=data["evalue"],
                traceback_raw=data["traceback"],
            )
        elif data["output_type"] == "stream":
            if data["name"] == "stdout":
                execution.logs.stdout.append(data["text"])
            elif data["name"] == "stderr":
                execution.logs.stderr.append(data["text"])
        elif data["output_type"] in "display_data":
            result = Result(is_main_result=False, data=data["data"])
            execution.results.append(result)
        elif data["output_type"] == "execute_result":
            result = Result(is_main_result=True, data=data["data"])
            execution.results.append(result)
        else:
            raise ValueError(f"Unknown output type: {data['output_type']}")
    return execution

def _remove_escape_and_color_codes(input_str: str) -> str:
    pattern = re.compile(r"\x1b\[[0-9;]*[mK]")
    return pattern.sub("", input_str)


In [None]:
# 표준화 유틸 

def b64_to_pil(b64_str: str) -> ImageTypes:
    if "," in b64_str:
        b64_str = b64_str.split(",")[1]
    return Image.open(BytesIO(base64.b64decode(b64_str)))

def add_media_to_chat(
    chat: List[AgentMessage],
    code_interpreter: Optional[CodeInterpreter] = None,
    append_to_prompt: bool = True,
) -> Tuple[List[AgentMessage], List[AgentMessage], List[Union[str, Path]]]:
    orig_chat = copy.deepcopy(chat)
    int_chat = copy.deepcopy(chat)
    media_list: List[Union[str, Path]] = []
    
    for chat_i in int_chat:
        if chat_i.media is not None:
            media_list_i: List[Union[str, Path]] = []
            for media in chat_i.media:
                if isinstance(media, str) and media.startswith("data:image/"):
                    media_pil = b64_to_pil(media)
                    with tempfile.NamedTemporaryFile(
                        mode="wb", suffix=".png", delete=False
                    ) as temp_file:
                        media_pil.save(temp_file, format="PNG")
                        media = str(temp_file.name)
                elif isinstance(media, str) and media.startswith("data:video/"):
                    ext = media.split(";")[0].split("/")[-1]
                    with tempfile.NamedTemporaryFile(
                        mode="wb", suffix=f".{ext}", delete=False
                    ) as temp_file:
                        media_bytes = base64.b64decode(media.split(",")[1])
                        temp_file.write(media_bytes)
                        media = str(temp_file.name)
                if code_interpreter is not None:
                    media = str(code_interpreter.upload_file(media))
                media_list_i.append(media)
                # don't duplicate appending media name and only add them for user messages
                if (
                    not str(chat_i.content).endswith(f" Media name {media}")
                    and chat_i.role == "user"
                    and append_to_prompt
                ):
                    chat_i.content += f" Media name {media}"
            chat_i.media = media_list_i if len(media_list_i) > 0 else None
            media_list.extend(media_list_i)

    int_chat = cast(
        List[AgentMessage],
        [
            (
                AgentMessage(
                    role=c.role,
                    content=c.content,
                    media=c.media,
                )
                if c.media is not None
                else AgentMessage(role=c.role, content=c.content, media=None)
            )
            for c in int_chat
        ],
    )
    return int_chat, orig_chat, media_list

In [18]:
import nbformat
from nbclient import NotebookClient

class LocalCodeInterpreter(CodeInterpreter):
    def __init__(
        self,
        timeout: int,
        remote_path: Optional[Union[str, Path]] = None,
        non_exiting: bool = False,
        kernel_name: str = "python3",
    ):
        super().__init__(timeout=timeout, remote_path=remote_path, non_exiting=non_exiting)
        self.kernel_name = kernel_name
        self.nb = nbformat.v4.new_notebook()
        self.client = NotebookClient(
            self.nb,
            timeout=self.timeout,
            kernel_name=self.kernel_name,
            allow_errors=True,
        )
        # 커널 시작
        self.client.create_kernel_manager()
        self.client.start_new_kernel()

    def close(self, *args: Any, **kwargs: Any) -> None:
        try:
            self.client.shutdown_kernel(now=True)
        except Exception:
            pass

    def restart_kernel(self) -> None:
        # 커널 재시작 + notebook reset
        try:
            self.client.shutdown_kernel(now=True)
        except Exception:
            pass
        self.nb = nbformat.v4.new_notebook()
        self.client = NotebookClient(
            self.nb,
            timeout=self.timeout,
            kernel_name=self.kernel_name,
            allow_errors=True,
        )
        self.client.create_kernel_manager()
        self.client.start_new_kernel()

    def exec_cell(self, code: str) -> Execution:
        cell = nbformat.v4.new_code_cell(code)
        self.nb.cells.append(cell)
        idx = len(self.nb.cells) - 1

        # 셀 실행
        self.client.execute_cell(cell, idx)

        outputs = cell.get("outputs", []) or []
        return _parse_local_code_interpreter_outputs(outputs)


* 확인

In [23]:
from pathlib import Path
from dotenv import load_dotenv

# env 로드
env_path = Path.cwd().parent / ".env"
load_dotenv(env_path, override=True)

# 입력 표준화 
user_request = "이 이미지에 있는 토마토를 감지하고 개수를 세어주세요."
image_path = Path("data/image.png")


chat = [
    AgentMessage(
        role="user",
        content=user_request,
        media=[str(image_path)],   # 이미지 경로를 media로 넣기
    )
]

with CodeInterpreterFactory.new_instance(None) as code_interpreter:
    int_chat, orig_chat, _ = add_media_to_chat(chat, code_interpreter)

print("\n===== orig_chat (deep copied original) =====")
for i, msg in enumerate(orig_chat):
    print(f"\n[orig_chat {i}]")
    print("role   :", msg.role)
    print("content:", msg.content)
    print("media  :", msg.media)

print("\n===== int_chat (internal standardized chat) =====")
for i, msg in enumerate(int_chat):
    print(f"\n[int_chat {i}]")
    print("role   :", msg.role)
    print("content:", msg.content)
    print("media  :", msg.media)

print("\nint_chat last message keys:", int_chat[-1].model_dump().keys())




===== orig_chat (deep copied original) =====

[orig_chat 0]
role   : user
content: 이 이미지에 있는 토마토를 감지하고 개수를 세어주세요.
media  : ['data\\image.png']

===== int_chat (internal standardized chat) =====

[int_chat 0]
role   : user
content: 이 이미지에 있는 토마토를 감지하고 개수를 세어주세요. Media name data\image.png
media  : ['data\\image.png']

int_chat last message keys: dict_keys(['role', 'content', 'media'])


* Planner

detection 입력 형식
<br> Sequence[Message] = {
    "role": "user",
    "content": "Find tomatoes",
    "media": ["img1.png", "img2.png"]
}

LLM 입력 형식
<br> fixed_chat = {
    "role": msg["role"],
    "content": [
        {"type": "text", "text": "Find tomatoes."},
        {"type": "image_base64", "image_base64": "..."},
    ]
}

In [None]:
# tools
import numpy as np
import os
import tempfile
import urllib.request
from PIL import Image

def load_image(image_path: str) -> np.ndarray:
    if isinstance(image_path, np.ndarray):
        return image_path
    image = Image.open(image_path).convert("RGB")
    return np.array(image)

In [None]:
# Utils
import base64
import numpy as np

from io import BytesIO
from pathlib import Path
from PIL import Image
from typing import Optional, Union


def image_to_base64(image: Image.Image, resize: Optional[int] = None) -> str:
    if resize is not None:
        image.thumbnail((resize, resize))
    buffer = BytesIO()
    image.convert("RGB").save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")


def encode_media(
    media: Union[str, Path, np.ndarray, Image.Image],
    resize: Optional[int] = None,
) -> str:
    if isinstance(media, np.ndarray):
        return image_to_base64(Image.fromarray(media), resize)

    if isinstance(media, Image.Image):
        return image_to_base64(media, resize)

    if isinstance(media, (str, Path)):
        path = Path(media)
        suffix = path.suffix.lower()

        if suffix in {".jpg", ".jpeg", ".png", ".webp", ".bmp"}:
            return image_to_base64(Image.open(path), resize)

    raise ValueError(f"Unsupported media type: {media}")

In [None]:
from __future__ import annotations

from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Iterator, Optional, Sequence, Union, TypedDict


class Message(TypedDict, total=False):
    role: str
    content: str
    media: Sequence[Union[str, Path]]

ReturnType = str | Iterator[str | None]

class LMM(ABC):
    @abstractmethod
    def generate(
        self,
        prompt: str,
        media: Optional[Sequence[Union[str, Path]]] = None,
        **kwargs: Any,
    ) -> ReturnType:
        """Single-prompt interface (optionally with media)."""
        raise NotImplementedError

    @abstractmethod
    def chat(
        self,
        chat: Sequence[Message],
        **kwargs: Any,
    ) -> ReturnType:
        """Chat interface with role/content(+media) messages."""
        raise NotImplementedError

    def __call__(
        self,
        input: str | Sequence[Message],
        **kwargs: Any,
    ) -> ReturnType:
        """Unified call interface: str -> generate, messages -> chat."""
        if isinstance(input, str):
            return self.generate(input, **kwargs)
        return self.chat(input, **kwargs)


In [7]:
# LLM Class: AnthropicLMM, OpsenAILMM
from typing import Any, Dict, Iterator, List, Optional, Sequence, Union, cast
from pathlib import Path
from openai import OpenAI

class OpenAILMM(LMM):
    # 초기화
    def __init__(
        self,
        model_name: str = "gpt-4o-2024-05-13",
        api_key: Optional[str] = None,
        max_tokens: int = 4096,
        json_mode: bool = False,
        image_size: int = 768,
        image_detail: str = "low",
        **kwargs: Any
    ):
        if not api_key:
            self.client = OpenAI()
        else:
            self.client = OpenAI(api_key=api_key)

        self.model_name = model_name
        self.image_size = image_size
        self.image_detail = image_detail
        # o1 does not use max_tokens
        if "max_tokens" not in kwargs and not (
            model_name.startswith("o1") or model_name.startswith("o3")
        ):
            kwargs["max_tokens"] = max_tokens
        if json_mode:
            kwargs["response_format"] = {"type": "json_object"}
        self.kwargs = kwargs

    # 호출 인터페이스
    def __call__(
        self,
        input: Union[str, Sequence[Message]],
        **kwargs: Any
    ) -> str | Iterator[str | None]:
        if isinstance(input, str):
            return self.generate(input, **kwargs)
        return self.chat(input, **kwargs)
    
    # 응답 생성
    def chat(
        self,
        chat: Sequence[Message],
        **kwargs: Any
    ) -> Union[str, Iterator[Optional[str]]]:
        """LLM 모델 과의 채팅
        입력 파라미터의 형태
            str: [{"role": "user", "content": "Hello!"}, ...]
            multimodal:[{"role": "user", "content": "Hello!", "media": ["image1.jpg", ...]}, ...]
        """
        fixed_chat = []
        for msg in chat:
            fixed_c = {"role": msg["role"]}
            fixed_c["content"] = [{"type": "text", "text": msg["content"]}]
        
            if msg.get("media") is not None and self.model_name != "o3-mini":
                for media in msg["media"]:
                    resize = kwargs["resize"] if "resize" in kwargs else self.image_size
                    image_detail = kwargs.get("image_detail", self.image_detail)
                    encoded_media = encode_media(cast(str, media), resize=resize)

                    fixed_c["content"].append(
                        {
                            "type": "image_base64",
                            "image_base64": encoded_media,
                            "detail": image_detail, 
                        }
                    )
            fixed_chat.append(fixed_c)

        tmp_kwargs = self.kwargs | kwargs  
        response = self.client.chat.completions.create(
            model=self.model_name, messages=fixed_chat, **tmp_kwargs 
        )
        if "stream" in tmp_kwargs and tmp_kwargs["stream"]:

            def f() -> Iterator[Optional[str]]:
                for chunk in response:
                    chunk_message = chunk.choices[0].delta.content
                    yield chunk_message

            return f()
        else:
            return cast(str, response.choices[0].message.content)
                
    # 코드 생성
    def generate(
        self,
        prompt: str,
        media: Optional[Sequence[Union[str, Path]]] = None,
        **kwargs: Any
    ) -> Union[str, Iterator[Optional[str]]]:
        message: List[Dict[str, Any]] = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
        if media and len(media) > 0 and self.model_name != "o3-mini":
            for m in media:
                resize = kwargs.get("resize")
                image_detail = kwargs.get("image_detail", self.image_detail)
                encoded_media = encode_media(m, resize=resize)

                message[0]["content"].append(
                    {
                        "type": "image_base64",
                        "image_base64": encoded_media,
                        "detail": image_detail,
                    }
                )
        # prefers kwargs from second dictionary over first
        tmp_kwargs = self.kwargs | kwargs

        response = self.client.chat.completions.create(
            model=self.model_name,
            messages=message,
            **tmp_kwargs,
        )

        if tmp_kwargs.get("stream", False):

            def f() -> Iterator[Optional[str]]:
                for chunk in response:
                    yield chunk.choices[0].delta.content 
            return f()
        return cast(str, response.choices[0].message.content)

In [None]:
from typing import Any, Dict, Iterator, List, Optional, Sequence, Union, cast
import anthropic
from anthropic.types import ImageBlockParam, MessageParam, TextBlockParam, ThinkingBlockParam

class AnthropicLMM(LMM):
    """An LMM class for Anthropic models (base64-only image handling)."""

    def __init__(
        self,
        api_key: Optional[str] = None,
        model_name: str = "claude-3-5-sonnet-20240620",
        max_tokens: int = 4096,
        image_size: int = 768,
        **kwargs: Any,
    ):
        self.client = anthropic.Anthropic(api_key=api_key)
        self.model_name = model_name
        self.image_size = image_size

        if "max_tokens" not in kwargs:
            kwargs["max_tokens"] = max_tokens
        self.kwargs = kwargs

    def __call__(
        self,
        input: Union[str, Sequence[Message]],
        **kwargs: Any,
    ) -> str | Iterator[str | None]:
        if isinstance(input, str):
            return self.generate(input, **kwargs)
        return self.chat(input, **kwargs)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _setup_chat_kwargs(
        self, kwargs: Dict[str, Any]
    ) -> tuple[Dict[str, Any], bool]:
        tmp_kwargs = self.kwargs | kwargs
        thinking_enabled = (
            "thinking" in tmp_kwargs
            and tmp_kwargs["thinking"].get("type") == "enabled"
        )
        if thinking_enabled:
            tmp_kwargs["temperature"] = 1.0
        return tmp_kwargs, thinking_enabled

    def _convert_messages(
        self,
        chat: Sequence[Message],
        thinking_enabled: bool,
        **kwargs: Any,
    ) -> List[MessageParam]:
        messages: List[MessageParam] = []

        for msg in chat:
            role = msg["role"]

            if role == "user":
                content: List[Union[TextBlockParam, ImageBlockParam]] = [
                    TextBlockParam(type="text", text=cast(str, msg["content"]))
                ]

                if msg.get("media"):
                    for media_path in msg["media"]:
                        resize = kwargs.get("resize", self.image_size)
                        encoded = encode_media(cast(str, media_path), resize=resize)

                        # encode_media는 base64를 반환한다고 가정
                        content.append(
                            ImageBlockParam(
                                type="image",
                                source={
                                    "type": "base64",
                                    "media_type": "image/png",
                                    "data": encoded,
                                },
                            )
                        )

                messages.append(MessageParam(role="user", content=content))

            elif role == "assistant":
                if thinking_enabled:
                    messages.append(
                        self._create_thinking_message(cast(str, msg["content"]))
                    )
                else:
                    messages.append(
                        MessageParam(
                            role="assistant",
                            content=[
                                TextBlockParam(
                                    type="text", text=cast(str, msg["content"])
                                )
                            ],
                        )
                    )
            else:
                raise ValueError(f"Unsupported role: {role}")

        return messages

    def _create_thinking_message(self, text: str) -> MessageParam:
        content: List[Union[TextBlockParam, ThinkingBlockParam]] = []

        thinking = extract_tag(text, "thinking")
        signature = extract_tag(text, "signature")

        if thinking:
            content.append(
                ThinkingBlockParam(
                    type="thinking",
                    thinking=thinking.strip(),
                    signature=signature.strip() if signature else "",
                )
            )

        clean_text = text
        if thinking:
            clean_text = clean_text.replace(f"<thinking>{thinking}</thinking>", "")
        if signature:
            clean_text = clean_text.replace(f"<signature>{signature}</signature>", "")

        if clean_text.strip():
            content.append(TextBlockParam(type="text", text=clean_text.strip()))

        return MessageParam(role="assistant", content=content)

    def _handle_stream(
        self,
        stream: anthropic.Stream[anthropic.MessageStreamEvent],
    ) -> Iterator[Optional[str]]:
        def f() -> Iterator[Optional[str]]:
            for chunk in stream:
                if chunk.type == "content_block_delta":
                    if hasattr(chunk.delta, "text"):
                        yield chunk.delta.text
                elif chunk.type == "message_stop":
                    yield None

        return f()

    # ------------------------------------------------------------------
    # Public APIs
    # ------------------------------------------------------------------

    def chat(
        self,
        chat: Sequence[Message],
        **kwargs: Any,
    ) -> Union[str, Iterator[Optional[str]]]:
        tmp_kwargs, thinking_enabled = self._setup_chat_kwargs(kwargs)
        messages = self._convert_messages(chat, thinking_enabled, **kwargs)

        response = self.client.messages.create(
            model=self.model_name,
            messages=messages,
            **tmp_kwargs,
        )

        if tmp_kwargs.get("stream", False):
            return self._handle_stream(
                cast(anthropic.Stream[anthropic.MessageStreamEvent], response)
            )

        # non-streaming
        msg = cast(anthropic.types.Message, response)
        return cast(anthropic.types.TextBlock, msg.content[-1]).text

    def generate(
        self,
        prompt: str,
        media: Optional[Sequence[Union[str, Path]]] = None,
        **kwargs: Any,
    ) -> Union[str, Iterator[Optional[str]]]:
        content: List[Union[TextBlockParam, ImageBlockParam]] = [
            TextBlockParam(type="text", text=prompt)
        ]

        if media:
            for m in media:
                resize = kwargs.get("resize", self.image_size)
                encoded = encode_media(cast(str, m), resize=resize)

                content.append(
                    ImageBlockParam(
                        type="image",
                        source={
                            "type": "base64",
                            "media_type": "image/png",
                            "data": encoded,
                        },
                    )
                )

        tmp_kwargs = self.kwargs | kwargs
        response = self.client.messages.create(
            model=self.model_name,
            messages=[MessageParam(role="user", content=content)],
            **tmp_kwargs,
        )

        if tmp_kwargs.get("stream", False):

            def f() -> Iterator[Optional[str]]:
                for chunk in response:
                    if chunk.type == "content_block_delta":
                        yield chunk.delta.text
                    elif chunk.type == "message_stop":
                        yield None

            return f()

        return cast(anthropic.types.TextBlock, response.content[-1]).text

planner 역할

Plan: LLM - 계획 만들게  -> LLM이 만든 코드 실행 ->
<br> Critic: 관찰값 -> 다시 LLM -> 
<br> Summerize: plan + instruction + code 정리

In [28]:
class Planner():
    
    def __init__(
        self,
        # 호출할 VLM 지정   
        planner: Optional[LMM] = None,
        critic: Optional[LMM] = None,
        summerizer: Optional[LMM] = None;
        verbose: bool = False,
    ):


SyntaxError: incomplete input (1617616477.py, line 2)

In [None]:
class Coder():

In [None]:
# Planner 호출
cfg = Config()

planner = Planner(
    planner=cfg.create_planner(),
    summarizer=cfg.create_summarizer(),
    critic=cfg.create_critic(),
    verbose=True,
)

with CodeInterpreterFactory.new_instance(None) as code_interpreter:
    plan_context = planner.generate_plan(
        int_chat,
        max_steps=10,                 # 필요하면 조절
        code_interpreter=code_interpreter,
    )

print("\nPlan context object type:", type(plan_context))
print("\nPlan context dump (if available):")
try:
    print(plan_context.model_dump())
except Exception:
    print(plan_context)

In [None]:
# planner tools
def suggestion