In [None]:
import argparse
import json
from pathlib import Path
import random
import os
from openai import OpenAI
import matplotlib.pyplot as plt
import cv2
from concurrent.futures import ThreadPoolExecutor, as_completed
import json

import boto3



# Load the baseline file once at import time. Only the payload stored under
# the top-level "data" key is kept; test_prompt() later reads data["items"].
path = Path("baseline.json")
data = json.loads(path.read_text(encoding="utf-8"))["data"]

def detect_image_format(path: Path) -> str:
    """Return the image format name implied by the extension of ``path``.

    The extension is lower-cased and stripped of its leading dot. ``jpg`` is
    reported as ``jpeg``; any other extension is returned as-is, and a path
    without an extension yields an empty string.
    """
    suffix = path.suffix.lower().lstrip(".")
    if suffix == "jpg":
        return "jpeg"
    return suffix

def load_image_bytes(path: Path) -> bytes:
    """Read the file at ``path`` in binary mode and return its full contents."""
    with path.open("rb") as handle:
        return handle.read()

def call_converse_once(client, model_id: str, prompt: str, image_bytes: bytes, image_format: str,) -> dict:
    """Send one text-plus-image user turn through ``client.converse``.

    Args:
        client: Object exposing a ``converse(modelId=..., messages=...)`` method.
        model_id: Identifier passed as ``modelId``.
        prompt: Text placed in the first content block of the user message.
        image_bytes: Raw image data placed in the second content block.
        image_format: Format string sent alongside the image bytes.

    Returns:
        A dict with ``model_id``, ``text`` (all text blocks of the reply
        concatenated and stripped) and ``raw`` (the unmodified response).
    """
    image_block = {"image": {"format": image_format, "source": {"bytes": image_bytes}}}
    user_turn = {"role": "user", "content": [{"text": prompt}, image_block]}

    resp = client.converse(modelId=model_id, messages=[user_turn])

    # A missing or empty "content" entry is treated as "no blocks".
    reply = resp.get("output", {}).get("message", {})
    blocks = reply.get("content", []) or []

    # Only blocks that carry a "text" key contribute to the description.
    pieces = [blk.get("text", "") for blk in blocks if "text" in blk]
    description = "".join(pieces).strip()

    return {"model_id": model_id, "text": description, "raw": resp}

def call_openai(client, model_id: str, prompt: str, image_path: Path, image_format: str,
                max_tokens: int = 400, temperature: float = 0.2, top_p: float = 0.9) -> dict:
    """Upload an image and ask a model to respond to ``prompt`` about it.

    The image is uploaded through ``client.files.create`` and then referenced
    by file id in a single user message sent with ``client.responses.create``.

    Args:
        client: Object exposing ``files.create`` and ``responses.create``.
        model_id: Model name passed as ``model``.
        prompt: Instruction text sent together with the image.
        image_path: Location of the image file to upload.
        image_format: Accepted so the signature parallels
            ``call_converse_once``; not used by this function.
        max_tokens: Accepted for interface compatibility; not forwarded.
        temperature: Accepted for interface compatibility; not forwarded.
        top_p: Accepted for interface compatibility; not forwarded.

    Returns:
        A dict with ``model_id``, ``text`` (the response's ``output_text``)
        and ``raw`` (the response object itself).
    """
    # Use a context manager so the file handle is released as soon as the
    # upload call returns, instead of being left open for the garbage collector.
    with open(image_path, "rb") as image_file:
        image = client.files.create(
            file=image_file,
            purpose="user_data",
        )

    messages = [{
        "role": "user",
        "content": [
            {
                "type": "input_image",
                "file_id": image.id,
            },
            {
                "type": "input_text",
                "text": prompt,
            },
        ],
    }]

    resp = client.responses.create(
        model=model_id,
        input=messages,
    )

    return {"model_id": model_id, "text": resp.output_text, "raw": resp}

def random_image_path(folder):
    """Pick one image file at random from the direct children of ``folder``.

    An entry counts as an image when its lower-cased suffix is one of the
    recognised extensions below. Returns ``None`` when nothing matches.
    """
    image_suffixes = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}

    candidates = []
    for entry in Path(folder).iterdir():
        if entry.suffix.lower() in image_suffixes:
            candidates.append(entry)

    if not candidates:
        return None
    return random.choice(candidates)

def _extract_total_tokens(raw):
    """Return the total token count found in a provider response, else "N/A".

    Dict responses (the shape produced by ``call_converse_once``) are read as
    ``raw["usage"]["totalTokens"]``; any other object is read as
    ``raw.usage.total_tokens``. Missing pieces yield ``"N/A"`` instead of
    raising, so a successful model reply is never discarded over token
    accounting.
    """
    if raw is None:
        return "N/A"
    if isinstance(raw, dict):
        usage = raw.get("usage") or {}
        total = usage.get("totalTokens")
    else:
        usage = getattr(raw, "usage", None)
        total = getattr(usage, "total_tokens", None)
    return "N/A" if total is None else total


def test_prompt(prompt, image_path, model_ids=None):
    """Run ``prompt`` against several models for one image and display the results.

    Every model is queried concurrently. Model ids starting with ``"gpt"`` go
    through ``call_openai``; all others go through ``call_converse_once`` on a
    Bedrock runtime client in ``us-east-1``. The image is then rendered next to
    the per-model answers plus a "baseline" entry built from the module-level
    ``data["items"]`` records whose ``frameImageURL`` contains the image's
    file name.

    Args:
        prompt: Instruction text sent to every model.
        image_path: Path to the image file to describe.
        model_ids: Model identifiers to query; defaults to a built-in list.

    Returns:
        None. The comparison is shown through ``IPython.display``.

    Raises:
        ValueError: If ``image_path`` is ``None`` (for example because
            ``random_image_path`` found no image).
    """
    # ``html`` is not imported at file level, and IPython is only needed here.
    # Importing up front means a missing dependency fails before any API call.
    import html
    from IPython.display import HTML, display

    if image_path is None:
        raise ValueError("image_path is None; there is no image to describe")
    image_path = Path(image_path)

    if model_ids is None:
        model_ids = [
            "us.amazon.nova-premier-v1:0",
            "us.anthropic.claude-opus-4-1-20250805-v1:0",
            "gpt-5",
        ]

    def _is_openai(model_id):
        return model_id.startswith("gpt")

    image_format = detect_image_format(image_path)
    image_bytes = load_image_bytes(image_path)

    # Build only the clients that are actually needed, so a run restricted to
    # one provider does not require the other provider's credentials.
    region = "us-east-1"
    bedrock_client = (
        boto3.client("bedrock-runtime", region_name=region)
        if any(not _is_openai(m) for m in model_ids)
        else None
    )
    openai_client = OpenAI() if any(_is_openai(m) for m in model_ids) else None

    def _submit(pool, model_id):
        # The OpenAI path uploads the file itself and therefore takes the path;
        # the Bedrock path takes the bytes that were already read.
        if _is_openai(model_id):
            return pool.submit(
                call_openai, openai_client, model_id, prompt, image_path, image_format
            )
        return pool.submit(
            call_converse_once, bedrock_client, model_id, prompt, image_bytes, image_format
        )

    results = []
    # max(1, ...) keeps the executor constructible when model_ids is empty.
    with ThreadPoolExecutor(max_workers=max(1, min(8, len(model_ids)))) as pool:
        futs = {_submit(pool, m): m for m in model_ids}
        for fut in as_completed(futs):
            try:
                result = fut.result()
            except Exception as e:
                # Best effort per model: one failing provider must not hide
                # the answers of the others, so the error is shown in place.
                results.append({
                    "model_id": futs[fut],
                    "text": f"[ERROR] {type(e).__name__}: {e}",
                    "raw": None,
                    "totalTokens": "N/A",
                })
                continue
            result["totalTokens"] = _extract_total_tokens(result.get("raw"))
            results.append(result)

    image_file_name = image_path.name
    needle = image_file_name.lower()
    # NOTE(review): the matching baseline records are shown as-is; the schema
    # of data["items"] is not visible here, so no single field is extracted.
    baseline_desc = [
        it for it in data["items"]
        if needle in str(it.get("frameImageURL")).lower()
    ]
    results.append({
        "model_id": "baseline",
        "text": baseline_desc,
        "totalTokens": "N/A"
    })

    text = "\n\n".join(
        f"{r['model_id']}:\n{r['text']}\nTotal Tokens: {r['totalTokens']}" for r in results
    )

    safe_text = html.escape(text)
    # The path lands inside a double-quoted HTML attribute, so escape quotes too.
    img_src = html.escape(image_path.as_posix(), quote=True)

    display(HTML(f"""
    <div style="display:flex; gap:24px; align-items:flex-start; margin:8px 0;">
    <img src="{img_src}"
        style="max-width:50%; height:auto; object-fit:contain; border:1px solid #ddd; border-radius:8px;"/>
    <div style="flex:1; white-space:pre-wrap; word-wrap:break-word; font-family:ui-monospace, SFMono-Regular, Menlo, Consolas, 'Liberation Mono', monospace; font-size:14px; line-height:1.5;">
        {safe_text}
    </div>
    </div>
    """))

# Pick the image used by every experiment cell below. random_image_path()
# returns None when the folder has no file with a recognised image extension.
image = random_image_path("pokemon-scenes-clips")

In [113]:
# Prompt variant 1: the long, six-section instruction set (action, entities,
# environment, key elements, interpretations, general guidelines) that asks
# for a single-paragraph description.
prompt = """You are an AI model designed to perform in-depth analysis of a single frame extracted from a video stream.
    The frame represents a clear image of a detected scene within the video. Your task is to analyze and describe the action, entities, and any relevant elements present in the image.
    Instructions:
    1. Identify and Describe the Action: 
    Determine if there is any action or movement occurring in the frame.
    Provide a detailed description of the action, including the subjects involved (e.g., humans, animals, vehicles).
    If multiple actions are occurring, describe each one separately.
    
    2. Identify and Describe Entities: 
    Identify all significant entities in the image, such as humans, animals, vehicles, or objects.
    For humans: Provide details about their number, gender (if identifiable), age group, posture, and interaction with other entities.
    For animals: Specify the type, number, and their behavior or interaction with the environment or other entities.
    For objects: Identify any important objects, their positioning, and their possible relevance to the scene.
    
    3. Environmental and Contextual Analysis: 
    Analyze the environment depicted in the frame, including the setting (e.g., indoor, outdoor, urban, rural).
    Note any specific environmental conditions such as weather, lighting, or time of day that might impact the scene.
    Consider the context of the action or entities—what might have led to this scene or what could happen next.
    
    4. Identify and Highlight Key Elements: 
    Point out any elements in the image that are crucial to understanding the scene, such as facial expressions, gestures, specific objects, or environmental details.
    Highlight any anomalies or unusual aspects in the frame.
    
    5. Provide Insights and Interpretations: 
    Offer potential interpretations of the scene, considering the interactions between entities and the context provided by the environment.
    Suggest any implications or consequences of the depicted action or scene, especially if they are important for further analysis or decision-making.
    
    6. General Guidelines: 
    Be as specific and detailed as possible in your descriptions.
    Avoid making assumptions beyond what can be directly inferred from the image.
    Focus on clarity and relevance, ensuring that your analysis is directly tied to the visual elements in the frame.
    Limit the number of returned tokens of the description to 8191 tokens.
    Directly describe the scene without any introductory phrases or explanations, like "in the image" or "the scene shows."
    
    Provide the results as a single paragraph for the overall information required by the instructions.
    Do not start the description with "in the image" or "the image shows" or "the scene depicts", or any other similar variants as it is already implied that you are describing the scene from the image.
    If the scene image does not contain any discernible features, object or entities visible, do not provide the description at all, just the rest of the information!
    The analysis will be used to understand the scene within the context of the video stream, potentially aiding in real-time decision-making or further processing of the video content."""

# First run uses test_prompt's built-in default model list.
test_prompt(prompt,image)
# Second run repeats the same prompt and image against an explicit model list.
model_ids = [
        "amazon.nova-pro-v1:0",
        "anthropic.claude-3-5-sonnet-20240620-v1:0",
        "gpt-4.1"
    ]
test_prompt(prompt,image,model_ids=model_ids)


In [114]:
# Prompt variant 2: a short four-rule version asking for a one-paragraph
# summary, named characters when recognised, and simple language.
prompt = """You are an AI model designed to perform in-depth analysis of a single frame extracted from a video stream.
    The frame represents a clear image of a detected scene within the video. Your task is to analyze and describe the action, entities, and any relevant elements present in the image.
    Instructions:
    1. Provide a single paragraph summary that encapsulates the overall scene, including key actions, entities, and environmental context.
    2. Be specific with the names of any characters you recognize. If you do not recognize any characters say nothing on the subject 
    3. use simple but descriptive language
    4. Directly describe the scene without any introductory phrases or explanations, like "in the image" or "the scene shows."""
# First run uses test_prompt's built-in default model list.
test_prompt(prompt,image)
# Second run repeats the same prompt and image against an explicit model list.
model_ids = [
        "amazon.nova-pro-v1:0",
        "anthropic.claude-3-5-sonnet-20240620-v1:0",
        "gpt-4.1"
    ]
test_prompt(prompt,image,model_ids=model_ids)

In [115]:
# Prompt variant 3: a condensed rewrite that lists the required content in
# order, adds style guardrails, and asks for an empty string on blank frames.
prompt = """You are an AI model tasked with in-depth analysis of a single video frame (a clear image of a detected scene). Produce exactly ONE paragraph that directly describes the scene (no phrases like “in the image,” “the scene shows,” or equivalents). Keep language simple but specific.

Content to include, in order:
1) Actions: State whether any action/motion is present; if multiple actions occur, describe each distinctly.
2) Entities: Name all significant entities (humans, animals, vehicles, other objects). For humans, include count and visible attributes (gender if clearly identifiable, age group, posture) and interactions. For animals, include type, count, and behavior. For objects, note important items, positions, and relevance.
3) Environment: Summarize setting (indoor/outdoor, urban/rural, etc.), conditions (weather, lighting, time-of-day cues), and any contextual cues that affect interpretation.
4) Key elements & anomalies: Call out details critical to understanding (facial expressions, gestures, notable objects, environmental details) and any unusual or out-of-place aspects.
5) Interpretations (cautious): Offer restrained, evidence-based implications of interactions or actions. Do not speculate beyond what is visually supported.

Style & guardrails:
- Use clear, concrete descriptions; avoid assumptions not directly supported by the frame.
- Provide proper names for characters if you recognize them; otherwise omit naming entirely.
- Be precise but concise; cap output well below the model’s token limit (keep it compact and readable).

Edge case:
- If the frame lacks discernible features/entities (e.g., blank/obstructed), return an empty string (no paragraph)."""
# First run uses test_prompt's built-in default model list.
test_prompt(prompt,image)
# Second run repeats the same prompt and image against an explicit model list.
model_ids = [
        "amazon.nova-pro-v1:0",
        "anthropic.claude-3-5-sonnet-20240620-v1:0",
        "gpt-4.1"
    ]
test_prompt(prompt,image,model_ids=model_ids)