Rough한 코드

Utils 

In [None]:
# basic module
import os, sys, copy

# data handling / io
import io
import numpy as np
import pandas as pd
import json, base64

# image handling
from PIL import Image, ImageDraw, ImageFont
import cv2
import torch

# LLM
from openai import OpenAI
from PIL import Image, ImageDraw, ImageFont

# Utils
from tqdm import tqdm
import time, argparse, logging, uuid
from dotenv import load_dotenv
from pathlib import Path

# 환경 변수 로드
load_dotenv()

In [None]:
# project root 
def find_project_root(marker_filename=".project-root"):
    """Walk upward from the current working directory to the project root.

    The project root is the nearest directory (the working directory itself
    or one of its ancestors) that contains a regular file named
    ``marker_filename``.

    Args:
        marker_filename: Name of the marker file that identifies the root.

    Returns:
        Absolute path (``str``) of the directory containing the marker file.

    Raises:
        FileNotFoundError: If the top of the filesystem is reached without
            finding the marker file.
    """
    candidate = os.path.abspath(os.getcwd())
    while not os.path.isfile(os.path.join(candidate, marker_filename)):
        parent = os.path.dirname(candidate)
        # dirname() of the top-most directory is that directory itself:
        # there is nowhere left to climb.
        if parent == candidate:
            raise FileNotFoundError(f"Could not find {marker_filename} in any parent directory.")
        candidate = parent
    return candidate
        
# Create a path/output directory (intended for use after find_project_root()).
def ensure_dir(path):
    """Create the directory ``path`` unless it already exists.

    Missing intermediate directories are created as well; an existing
    directory is left untouched.
    """
    os.makedirs(path, exist_ok=True)

def get_project_path(*paths):
    """Join ``paths`` onto the current working directory.

    The base is ``os.getcwd()`` at call time (swap in a fixed absolute path
    here if needed); it is not the directory found by ``find_project_root()``.
    """
    return os.path.join(os.getcwd(), *paths)

In [None]:
# b64 encoder: jpg image -> base64 string / VLM API
def encode_image_to_base64(
    img_path,
    jpeg_quality: int = 85,
    max_size: int | None = 1024,   # None이면 리사이즈 안 함
):
    """
    - PNG 입력 시 → RGB JPEG로 변환 후 base64 인코딩
    - JPEG 입력 시 → 그대로 (필요하면 리사이즈)
    - max_size: 한 변 최대 길이 (planning/VLM 단계용)
    """
    try:
        img_path = Path(img_path)
        img = Image.open(img_path)

        # PNG / RGBA → RGB
        if img.mode in ("RGBA", "LA", "P"):
            img = img.convert("RGB")

        if max_size is not None:
            img.thumbnail((max_size, max_size))

        buf = io.BytesIO()

        # PNG면 JPEG로 변환
        if img_path.suffix.lower() == ".png":
            img.save(
                buf,
                format="JPEG",
                quality=jpeg_quality,
                optimize=True,
            )
        else:
            # jpg / jpeg 등
            img.save(
                buf,
                format="JPEG",
                quality=jpeg_quality,
                optimize=True,
            )

        buf.seek(0)
        return base64.b64encode(buf.read()).decode("utf-8")

    except Exception as e:
        print(f"Error encoding image: {e}")
        return None


1. VQA
<br> input: error image + user image + user request + prompt_VQA
<br> &nbsp;&nbsp;&nbsp;&nbsp;prompt_VQA:
<br> LLM-VQA 호출 및 첫번째 응답 파싱
<br> output: 요청과 이미지를 보고 판단한 내용


In [None]:
# tag parser

import json
import re
from typing import Any, Dict, Tuple

# Tags recognised in an LLM reply. <action_json> is the tag requested by the
# planner prompt in this notebook; it is treated as an alias of <plan_json>.
_TAG_RE = re.compile(
    r"<(?P<tag>analysis_log|plan_json|action_json)>(?P<body>[\s\S]*?)</(?P=tag)>",
    re.IGNORECASE,
)

# Optional markdown code fence (``` or ```json) wrapped around the JSON payload.
_JSON_FENCE_RE = re.compile(r"^```[A-Za-z0-9_-]*[ \t]*\n([\s\S]*?)\n?```$")


def parse_tagged_output(text: str) -> Tuple[str, Dict[str, Any]]:
    """Extract the analysis log and the JSON payload from a tagged LLM reply.

    The reply must contain ``<analysis_log>...</analysis_log>`` and a JSON
    payload in either ``<plan_json>...</plan_json>`` or
    ``<action_json>...</action_json>`` (``plan_json`` wins if both appear).
    Tag names are matched case-insensitively. A markdown code fence around
    the JSON payload is stripped before parsing.

    Args:
        text: Raw model output; ``None`` or an empty string is treated as a
            reply without tags.

    Returns:
        ``(analysis_log, plan)`` where ``analysis_log`` is the stripped text
        of the log tag and ``plan`` is the decoded JSON payload.

    Raises:
        ValueError: If a required tag is missing or the payload is not valid
            JSON.
    """
    text = (text or "").strip()
    # When a tag occurs more than once, the last occurrence wins.
    matches = {m.group("tag").lower(): m.group("body").strip() for m in _TAG_RE.finditer(text)}

    if "analysis_log" not in matches:
        raise ValueError("응답에 <analysis_log>...</analysis_log> 태그가 없습니다.")

    json_tag = next((tag for tag in ("plan_json", "action_json") if tag in matches), None)
    if json_tag is None:
        raise ValueError(
            "응답에 <plan_json>...</plan_json> 또는 <action_json>...</action_json> 태그가 없습니다."
        )

    analysis_log = matches["analysis_log"]
    payload = matches[json_tag]

    # Models sometimes wrap the JSON in a code fence despite the prompt rules.
    fenced = _JSON_FENCE_RE.match(payload)
    if fenced:
        payload = fenced.group(1).strip()

    try:
        plan = json.loads(payload)
    except json.JSONDecodeError as e:
        raise ValueError(f"<{json_tag}> 내부 JSON 파싱 실패: {e}\n\nJSON:\n{payload}") from e

    return analysis_log, plan


In [None]:
# LLM-VQA 호출 및 첫번째 응답 파싱

from typing import Optional, Union
from anthropic import Anthropic

def model_response_anthropic(
    anthropic_client: "Anthropic",
    prompt_text: str,
    model: str = "claude-3-7-sonnet-20250219",
    temperature: float = 0.1,
    max_tokens: int = 1000,
    parse_tags: bool = True,
    print_log: bool = True,
    # Optional image input
    img_b64: Optional[str] = None,
    media_type: str = "image/png",
) -> Union[str, Dict[str, Any]]:
    """Send one user message (optional image + text) to Claude and return the reply.

    Args:
        anthropic_client: Client whose ``messages.create`` is called.
        prompt_text: Prompt text; when ``parse_tags`` is true it must instruct
            the model to answer in the tagged format that
            ``parse_tagged_output`` understands.
        model: Model identifier forwarded to the API.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Output token limit forwarded to the API.
        parse_tags: If false, return the raw reply text without parsing.
        print_log: If true (and ``parse_tags`` is true), print the parsed
            analysis log.
        img_b64: Base64-encoded image data, or ``None`` for a text-only call.
        media_type: MIME type of ``img_b64``. The default is ``image/png``;
            ``encode_image_to_base64`` in this file produces JPEG, so pass
            ``"image/jpeg"`` for its output.

    Returns:
        ``parse_tags=False``: the raw reply text (``str``).
        ``parse_tags=True``: ``{"raw": str, "analysis_log": str, "plan": dict}``.

    Raises:
        ValueError: If ``parse_tags`` is true and the reply cannot be parsed.
    """
    # Build the content parts. The image goes first: Anthropic's vision guide
    # recommends an image-then-text layout.
    content = []
    if img_b64 is not None:
        content.append({
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": media_type,
                "data": img_b64,
            }
        })
    content.append({"type": "text", "text": prompt_text})

    resp = anthropic_client.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        messages=[{"role": "user", "content": content}],
    )

    # The reply arrives as a list of content blocks; keep only the text ones.
    raw = "".join(
        blk.text for blk in resp.content
        if getattr(blk, "type", None) == "text"
    ).strip()

    if not parse_tags:
        return raw

    try:
        analysis_log, plan = parse_tagged_output(raw)
    except ValueError as err:
        # A reply cut off at the token limit typically loses its closing tag;
        # say so instead of only reporting a missing tag / broken JSON.
        if getattr(resp, "stop_reason", None) == "max_tokens":
            raise ValueError(
                f"응답이 max_tokens={max_tokens} 한도에서 잘려 태그 파싱에 실패했습니다. "
                f"max_tokens를 늘려 다시 시도하세요. ({err})"
            ) from err
        raise

    if print_log:
        print("\n[analysis_log]\n" + analysis_log)

    return {"raw": raw, "analysis_log": analysis_log, "plan": plan}



확인

In [None]:
# === 1. VQA ===

# 1-1) image, request
    
# Resolve the project root once; the paths below are built relative to it.
PROJECT_ROOT = Path(find_project_root())


def get_project_path(*parts):
    """Return ``PROJECT_ROOT`` joined with ``parts`` as a ``Path``.

    Rebinds the earlier helper of the same name, which joined onto the
    current working directory instead.
    """
    return PROJECT_ROOT.joinpath(*parts)


env_path = get_project_path(".env")
img = get_project_path("notebook", "data", "tomato_error.png")
user_request = "semi-ripe 토마토 개수를 세어주세요."
img_b64 = encode_image_to_base64(img, 80, 1024)

# encode_image_to_base64 signals failure by returning None. Stop here: a None
# would crash the preview slice below and make the later LLM call go out
# without any image attached.
if img_b64 is None:
    raise RuntimeError(f"이미지 인코딩에 실패했습니다: {img}")

# Preview the image (fixed 512x512, aspect ratio not preserved) and the start
# of the encoded string.
display(Image.open(img).resize((512, 512)))
print("인코딩 문자열의 앞 50자: ")
print(img_b64[:50])

In [None]:
# 1-2) VQA prompt
# Asks the model for an <analysis_log> plus a <plan_json> describing the task.
# Literal JSON braces are doubled because this is an f-string; the only
# interpolated value is user_request.
# NOTE(review): the schema hard-codes "primary_object": "tomato" and
# "required_attributes": ["red"], while the sample request asks for semi-ripe
# tomatoes -- confirm these are meant as fixed values rather than placeholders.
prompt_VQA = f"""
You are an expert vision task planner.

You will be given:
- A user request (Korean)
- ONE image (provided to you as an image input)

Your job in this step is ONLY to analyze the user request and propose a concrete, tool-agnostic plan.
Do NOT run code. Do NOT claim results. Do NOT hallucinate object counts.

User request: {user_request}

Output MUST contain EXACTLY TWO TAGS in this order:
1) <analysis_log> ... </analysis_log>  (Korean, human-readable, step-by-step, short)
2) <plan_json> ... </plan_json>        (machine-readable, MUST be valid JSON)

Rules:
- Do not output anything outside the two tags.
- <analysis_log> should be concise: 5–10 lines, each starting with "Step N:".
- <plan_json> must be STRICT JSON (no trailing commas, no comments, no markdown).

<plan_json> JSON schema:
{{
  "language": "ko",
  "intent_summary": string,
  "task_type": "counting",
  "target_definition": {{
    "primary_object": "tomato",
    "required_attributes": ["red"],
    "exclusions": [string],
    "edge_cases": [string]
  }},
  "subtasks": [
    {{
      "name": string,
      "goal": string,
      "suggested_method": string
    }}
  ],
  "tool_requirements": {{
    "needs_localization": boolean,
    "needs_instance_separation": boolean,
    "needs_attribute_reasoning": boolean,
    "preferred_outputs": [string]
  }},
  "verification_checks": [string],
  "questions_if_ambiguous": [string]
}}
"""

In [None]:
# 1-3) LLM 호출 및 응답 생성, 확인
import os
from anthropic import Anthropic

# Stop early with a clear message when the API key is unset or empty.
anthropic_key = os.getenv("ANTHROPIC_API_KEY")
if anthropic_key is None or anthropic_key == "":
    raise RuntimeError("ANTHROPIC_API_KEY가 설정되어 있지 않습니다.")

anthropic_client = Anthropic(api_key=anthropic_key)

# VQA step: send the prompt together with the encoded image (declared as
# image/jpeg) and parse the tagged reply.
out = model_response_anthropic(
    anthropic_client=anthropic_client,
    prompt_text=prompt_VQA,
    img_b64=img_b64,
    media_type="image/jpeg",
    model="claude-sonnet-4-20250514",
    temperature=0.2,
    max_tokens=1200,
    parse_tags=True,
    print_log=True,
)

# Parsed JSON payload of the reply.
plan = out["plan"]

2. Plan
<br> input: error image + user request + LLM-VQA 응답 + prompt_plan
<br> &nbsp;&nbsp;&nbsp;&nbsp; prompt_plan: tuned for smart farm environment (tool list 포함)
<br> LLM-plan 호출 및 첫번째 응답 파싱
<br> output: tool list에서 tool 선택 후의 계획


In [None]:
import importlib, inspect
from typing import Any, Dict, List

def load_visionagent_tools_strict() -> List[Dict[str, Any]]:
    """Collect metadata for the tool functions of ``vision_agent.tools.tools``.

    A module member is reported only if it is a public function (name does not
    start with ``_``), is defined in that module itself rather than re-exported
    from elsewhere, and has a docstring containing the phrase " is a tool".

    Returns:
        One dict per tool with the keys ``name``, ``qualname``, ``type``
        (always ``"function"``), ``signature`` and ``doc``, in the order
        produced by ``inspect.getmembers``.
    """
    mod = importlib.import_module("vision_agent.tools.tools")

    def _is_tool(name: str, obj: Any) -> bool:
        # Public plain functions only.
        if name.startswith("_") or not inspect.isfunction(obj):
            return False
        # 1) Must be defined in tools.py itself (drops re-exported functions).
        if getattr(obj, "__module__", None) != mod.__name__:
            return False
        # 2) Docstring must follow the "... is a tool" pattern (drops utilities).
        #    Relax this condition if a wider selection is wanted.
        return " is a tool" in (inspect.getdoc(obj) or "")

    return [
        {
            "name": name,
            "qualname": f"{mod.__name__}.{name}",
            "type": "function",
            "signature": str(inspect.signature(obj)),
            "doc": inspect.getdoc(obj) or "",
        }
        for name, obj in inspect.getmembers(mod)
        if _is_tool(name, obj)
    ]


In [None]:
def format_tool_desc(
    tools: List[Dict[str, Any]],
    max_tools: int = 50,
    max_doc_chars: int = 300,
) -> str:
    """Render tool metadata as a plain-text list for inclusion in a prompt.

    Args:
        tools: Tool records with the keys ``name``, ``type``, ``qualname``,
            ``signature`` and ``doc``.
        max_tools: Only the first ``max_tools`` records are rendered.
        max_doc_chars: Each docstring is cut to this many characters before
            its newlines are replaced by spaces.

    Returns:
        One four-line entry per tool, entries joined by newlines; an empty
        string when ``tools`` is empty.
    """
    entries = []
    for t in tools[:max_tools]:
        # Flatten the docstring outside the f-string: a backslash inside an
        # f-string expression is a syntax error before Python 3.12, and the
        # previous '\\n' there matched a literal backslash-n, not a newline.
        doc = t["doc"][:max_doc_chars].replace("\n", " ")
        entries.append(
            f"- {t['name']} ({t['type']})\n"
            f"  qualname: {t['qualname']}\n"
            f"  signature: {t['signature']}\n"
            f"  doc: {doc}"
        )
    return "\n".join(entries)


* 확인

In [None]:
# === 2. Plan ===
# 2-1) input: reuse the VQA step's outputs as context for the planner.

vqa_log = out["analysis_log"]
vqa_struct = out["plan"]

# Pretty-print vqa_struct as JSON under a banner.
print("=" * 60, "[VQA Plan Structure (JSON)]", "=" * 60, sep="\n")
print(json.dumps(vqa_struct, indent=2, ensure_ascii=False))

In [None]:
# 2-2) tool list: discover the tools and render them for the planner prompt.
tools = load_visionagent_tools_strict()
tool_desc = format_tool_desc(tools, max_tools=80, max_doc_chars=350)
print(f"Loaded tools: {len(tools)}")
# print(tool_desc)
# Quick check: does the rendered tool list mention "qwen" anywhere?
print("qwen" in tool_desc.lower())

In [None]:
# 2-3) tool list 추가: detected_image_crop 

#

In [None]:
# 2-4) prompt_plan
# Planner prompt. Interpolates user_request, vqa_log, vqa_struct (as JSON) and
# tool_desc. Literal JSON braces in the two schemas are doubled ({{ }}) so the
# f-string does not try to evaluate them as replacement fields.
prompt_PLAN = f"""
You are a VisionAgent-style planner/controller.

Your job is to decide the NEXT ACTION(s) to take using the available tools, based on the user's request and the accumulated evidence. You do NOT execute tools. You only output tool calls or the final answer.

You will be given:
- A user request (Korean)
- ONE image (already annotated with detection boxes/labels overlaid)
- Tool list with available actions
- VQA log: chronological reasoning, detection notes, bounding-box/label evaluations, and any prior validation outcomes
- VQA structured JSON summary of the detection/analysis results
- Prior tool observations may also appear in the conversation history (as "observation").

Primary evidence:
- Build decisions primarily from [VQA_LOG] and [VQA_STRUCT_JSON].
- Do NOT hallucinate new detections, boxes, or attributes beyond the provided evidence and tool outputs.

User request (Korean):
{user_request}

[VQA_LOG]
{vqa_log}

[VQA_STRUCT_JSON]
{json.dumps(vqa_struct, ensure_ascii=False, indent=2)}

You MUST use ONLY the tools listed in [TOOLS] below.
Do NOT invent, rename, or assume any tools beyond this list.

[TOOLS]
{tool_desc}

────────────────────────────────
CORE CONTROL LOOP BEHAVIOR
────────────────────────────────
At each turn, output either:
(A) Tool calls for the NEXT immediate actions (one or more tool calls), OR
(B) A final answer if no more tools are needed.

Do NOT output a full end-to-end plan. Do NOT output steps[1..N].
The executor will run your tool calls in the order you provide, append observations, and call you again with updated context.

────────────────────────────────
DETECTION-SPECIFIC BEHAVIOR
────────────────────────────────
- The image is already annotated. Prefer verification of existing detections.
- If bounding-box coordinates are available in VQA_STRUCT_JSON, use them for cropping and verification.
- If bounding-box coordinates are NOT available, do NOT guess. Request the annotation file
  (COCO JSON / YOLO TXT / model output JSON) or propose a concrete method to obtain coordinates.

Recommended strategy for detection verification tasks:
1) Crop each existing detected region (one-by-one or batch) and verify whether it matches its label.
2) If misdetections or missing objects are suspected, run re-detection (only if a tool exists) and verify again.
3) When confident, produce the final count/summary.

────────────────────────────────
OUTPUT FORMAT (STRICT)
────────────────────────────────
Output MUST contain EXACTLY TWO tags in this exact order:
1) <analysis_log> ... </analysis_log>
2) <action_json> ... </action_json>

Do NOT output anything outside the two tags.

<analysis_log> rules:
- 3–7 lines only
- Each line must start with "Step N:"
- Only describe the immediate reasoning for the NEXT action(s), not a full multi-step plan.

<action_json> rules:
- MUST be STRICT JSON (no trailing commas, no comments, no markdown)
- Must match exactly one of the following schemas:

Schema 1: Tool calls
{{
  "language": "ko",
  "mode": "tool_calls",
  "selected_tools": [string],
  "tool_calls": [
    {{
      "id": int,
      "tool": string,
      "parameters": object,
      "expected_result": string
    }}
  ],
  "open_questions": [string]
}}

Schema 2: Final answer
{{
  "language": "ko",
  "mode": "final",
  "final_answer": string,
  "open_questions": [string]
}}

Additional rules:
- tool_calls must be listed in exact execution order; ids must start at 1 and increase strictly by 1 within this turn.
- Each tool call MUST reference a tool name from [TOOLS].
- Keep tool_calls minimal: only what is needed before the next observation.
- If you need missing inputs (e.g., box coordinates), set mode="final" and clearly request them in final_answer, or set open_questions accordingly.
"""


In [None]:
# Plan step: send the planner prompt with the same encoded image and parse the
# tagged reply.
out_plan = model_response_anthropic(
    anthropic_client=anthropic_client,
    prompt_text=prompt_PLAN,
    model="claude-sonnet-4-20250514",
    max_tokens=1400,
    temperature=0.2,
    parse_tags=True,
    print_log=True,
    img_b64=img_b64,
    media_type="image/jpeg",
)

plan_struct = out_plan["plan"]

# NOTE(review): this name looks like an editor auto-completion artifact (it
# holds the VQA step's payload, not the planner's). Kept only in case a later
# cell reads it -- delete if nothing does.
PlaySound_struct = out["plan"]

# Pretty-print the planner's parsed payload (plan_struct) as JSON.
print("=" * 60)
print("[Plan Structure (JSON)]")
print("=" * 60)
print(json.dumps(plan_struct, indent=2, ensure_ascii=False))