In [17]:
import base64
import io
import json
import os
from dataclasses import dataclass
from pathlib import Path
from typing import List, Literal, Optional

import requests
from dotenv import load_dotenv
from PIL import Image
from pydantic import BaseModel, Field

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.runnables import RunnableLambda

# Pull settings from a local .env file (if present) into the environment.
# The helpers in this notebook read IMGBB_API_KEY and SERPAPI_API_KEY via os.getenv.
load_dotenv()

True

In [18]:
# -----------------------------
# 1) Output schema (Pydantic)
# -----------------------------
# Best-effort identification of the pictured place. Every field is optional
# and defaults to None, so unknown parts can simply be left unset.
# NOTE: documented with `#` comments on purpose -- a class docstring on a
# pydantic model is emitted as the schema "description", which would change
# the format instructions sent to the LLM.
class PlaceGuess(BaseModel):
    # Name of the place/landmark, if recognised.
    name: Optional[str] = None
    # City the place is believed to be in.
    city: Optional[str] = None
    # Country the place is believed to be in.
    country: Optional[str] = None


# Structured answer the LLM must return for one photo; this is the schema
# handed to PydanticOutputParser.
# NOTE: documented with `#` comments on purpose -- a class docstring would
# become part of the generated JSON schema.
class PlaceAnalysis(BaseModel):
    # Classification of the input; restricted to these three literal values.
    # The prompt asks for "ambiguous" when evidence is weak or contradictory.
    input_type: Literal["place_photo", "not_a_place", "ambiguous"]
    # Nested guess (name / city / country), each part optional.
    place_guess: PlaceGuess
    # Validated to lie in the closed interval [0.0, 1.0].
    confidence: float = Field(ge=0.0, le=1.0)
    # Visual observations about the photo, one string per observation.
    what_i_see: List[str]
    # Notes on the place's significance; the prompt asks for these only when
    # the identification is confident.
    significance: List[str]
    # Free-text answer for the user.
    response: str


# Parser bound to PlaceAnalysis. It supplies the format instructions that are
# embedded in the prompt and is the final stage of the chain.
parser = PydanticOutputParser(pydantic_object=PlaceAnalysis)

In [19]:
# -----------------------------
# 2) Helpers
# -----------------------------
def preprocess_image_to_b64(image_path: str) -> str:
    """
    Downscale an image, re-encode it as JPEG and return it base64-encoded.

    The image is converted to RGB, shrunk with ``thumbnail`` to fit within
    1600x1600 pixels, and saved as JPEG (quality 90). Encoding happens entirely
    in memory: nothing is written next to the source image, so the function
    works in read-only directories and cannot overwrite or delete an unrelated
    ``<stem>.tmp.jpg`` file.

    Args:
        image_path: Path of the image file to read.

    Returns:
        Base64 text of the JPEG bytes (no ``data:`` URL prefix).

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
    """
    p = Path(image_path)
    if not p.exists():
        raise FileNotFoundError(f"Image not found: {image_path}")

    # The context manager releases the source file handle as soon as the
    # converted copy has been produced.
    with Image.open(p) as src:
        img = src.convert("RGB")
    img.thumbnail((1600, 1600))

    buf = io.BytesIO()
    img.save(buf, format="JPEG", quality=90)
    return base64.b64encode(buf.getvalue()).decode("utf-8")


def upload_to_imgbb(image_path: str) -> str:
    """
    Push a local image to ImgBB and hand back a public URL for it.

    The image is first shrunk and base64-encoded by
    ``preprocess_image_to_b64``, then POSTed to
    ``https://api.imgbb.com/1/upload`` as form field ``image`` with the API
    key passed as the ``key`` query parameter.

    Args:
        image_path: Path of the image file to upload.

    Returns:
        ``data.display_url`` from the response when present and non-empty,
        otherwise ``data.url``.

    Raises:
        RuntimeError: If IMGBB_API_KEY is unset/empty, or the response JSON
            does not carry a truthy ``success`` flag.
        requests.HTTPError: On a non-success HTTP status (``raise_for_status``).
    """
    imgbb_key = os.getenv("IMGBB_API_KEY")
    if not imgbb_key:
        raise RuntimeError("Missing IMGBB_API_KEY in environment")

    response = requests.post(
        "https://api.imgbb.com/1/upload",
        params={"key": imgbb_key},
        data={"image": preprocess_image_to_b64(image_path)},
        timeout=60,
    )
    response.raise_for_status()

    body = response.json()
    if not body.get("success"):
        raise RuntimeError(f"ImgBB upload failed: {body}")

    # Prefer display_url; fall back to url when it is missing or empty.
    uploaded = body["data"]
    preferred = uploaded.get("display_url")
    return preferred if preferred else uploaded["url"]


def serpapi_reverse_image(public_image_url: str) -> dict:
    """
    Run a reverse-image lookup through SerpApi's Google Lens engine.

    Issues a GET to ``https://serpapi.com/search`` with ``engine=google_lens``,
    the image URL in ``url``, ``type=all`` and ``hl=en``.

    Args:
        public_image_url: Publicly reachable URL of the image to look up.

    Returns:
        The decoded JSON body of the SerpApi response.

    Raises:
        RuntimeError: If SERPAPI_API_KEY is unset or empty.
        requests.HTTPError: On a non-success HTTP status (``raise_for_status``).
    """
    serp_key = os.getenv("SERPAPI_API_KEY")
    if not serp_key:
        raise RuntimeError("Missing SERPAPI_API_KEY in environment")

    query = dict(
        engine="google_lens",
        url=public_image_url,  # the image to search for
        type="all",            # request every result category
        hl="en",
        api_key=serp_key,
    )
    response = requests.get("https://serpapi.com/search", params=query, timeout=90)
    response.raise_for_status()
    return response.json()


def _compact_matches(items, limit: int) -> list:
    """Reduce up to ``limit`` match dicts to their title/source/link fields."""
    if not isinstance(items, list):
        return []
    # Skip anything that is not a dict so a malformed entry cannot crash `.get`.
    rows = [item for item in items if isinstance(item, dict)]
    return [
        {field: item.get(field) for field in ("title", "source", "link")}
        for item in rows[:limit]
    ]


def extract_serp_evidence(serp_json: dict, max_items: int = 8) -> dict:
    """
    Condense a SerpApi response into a compact evidence dict for the LLM.

    Args:
        serp_json: Decoded SerpApi response.
        max_items: Maximum entries kept per match list; negative values are
            treated as 0.

    Returns:
        A dict with three keys:
        - ``knowledge_graph``: the non-empty values among title, type,
          description, website, address and location.
        - ``exact_matches`` / ``visual_matches``: lists of
          ``{"title", "source", "link"}`` dicts (missing fields are None).
    """
    limit = max(max_items, 0)

    kg = serp_json.get("knowledge_graph") or {}
    # The knowledge graph may arrive as a list of entity dicts rather than a
    # single dict (NOTE(review): confirm against the SerpApi Google Lens docs);
    # use the first entity so its hints are not silently dropped.
    if isinstance(kg, list):
        kg = next((entry for entry in kg if isinstance(entry, dict)), {})
    if not isinstance(kg, dict):
        kg = {}

    kg_fields = ("title", "type", "description", "website", "address", "location")
    return {
        "knowledge_graph": {k: kg[k] for k in kg_fields if kg.get(k)},
        "exact_matches": _compact_matches(serp_json.get("exact_matches"), limit),
        "visual_matches": _compact_matches(serp_json.get("visual_matches"), limit),
    }

In [20]:
# -----------------------------
# 3) PromptTemplate
# -----------------------------
# Instruction text sent to the model together with the photo. It has two
# placeholders: {serp_evidence_json} (compact reverse-image evidence) and
# {format_instructions} (schema instructions produced by the output parser).
_PLACE_PROMPT_TEMPLATE = (
    "You are a careful place-identification assistant.\n"
    "The user provides ONLY a photo, but you also receive reverse-image evidence.\n\n"
    "Rules:\n"
    "1) Decide: input_type is one of place_photo | not_a_place | ambiguous.\n"
    "2) Use the reverse-image evidence as grounding. Do NOT guess wildly.\n"
    "3) If evidence is weak or contradictory, set input_type=ambiguous and keep confidence low.\n"
    "4) Provide significance ONLY if you are confident the place is correctly identified.\n"
    "5) Output MUST be valid JSON matching the schema exactly.\n\n"
    "Reverse-image evidence (JSON):\n"
    "{serp_evidence_json}\n\n"
    "{format_instructions}\n"
)

prompt = PromptTemplate(
    template=_PLACE_PROMPT_TEMPLATE,
    input_variables=["format_instructions", "serp_evidence_json"],
)

In [None]:
# -----------------------------
# 4) LangChain chain (LCEL)
# -----------------------------
# Chat model used by the pipeline: Gemini "gemini-2.5-flash-lite" with a low
# sampling temperature (0.2).
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite", temperature=0.2)

# Disabled alternative backend (Hugging Face endpoint), kept for reference.
# NOTE: if re-enabled as written, `llm` is rebound to the raw endpoint and the
# chat model is exposed under the name `model`, not `llm`.
# from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

# llm = HuggingFaceEndpoint(
#     repo_id = "openai/gpt-oss-120b",
#     task = "text-generation",  
# )

# model = ChatHuggingFace(llm = llm)

def build_message(inputs: dict) -> List[HumanMessage]:
    """
    Turn ``{"image_path": ...}`` into the single multimodal message for the LLM.

    Steps: upload the image to ImgBB, run the reverse-image search on the
    resulting public URL, condense the search response into evidence, and
    render the prompt with that evidence plus the parser's format instructions.

    Args:
        inputs: Mapping that must contain the key ``"image_path"``.

    Returns:
        A one-element list holding a HumanMessage with two content parts: the
        rendered prompt text and the public image URL.
    """
    hosted_url = upload_to_imgbb(inputs["image_path"])
    compact_evidence = extract_serp_evidence(serpapi_reverse_image(hosted_url))

    rendered_prompt = prompt.format(
        format_instructions=parser.get_format_instructions(),
        serp_evidence_json=json.dumps(compact_evidence, ensure_ascii=False, indent=2),
    )

    # The model receives both the instructions and the image itself (by URL).
    text_part = {"type": "text", "text": rendered_prompt}
    image_part = {"type": "image_url", "image_url": {"url": hosted_url}}
    return [HumanMessage(content=[text_part, image_part])]

In [22]:
# End-to-end pipeline:
#   {"image_path": ...} -> [HumanMessage] -> chat model -> raw text -> PlaceAnalysis
# The model stage is `llm`, the chat model instantiated in this notebook; the
# name `model` is only bound inside commented-out code, so referencing it
# raises NameError on a fresh kernel.
chain = (
    RunnableLambda(build_message)
    | llm
    | (lambda ai_msg: ai_msg.content)
    | parser
)

In [23]:
# Photo to analyse; a relative path, so it is resolved against the kernel's
# current working directory.
image_path = "img/img3.jpg"  # Example image path

In [None]:
# Run the whole pipeline on the example image. This performs the ImgBB upload,
# the SerpApi lookup and the LLM call, so it needs network access and the
# corresponding API keys in the environment.
result: PlaceAnalysis = chain.invoke({"image_path": image_path})
print(result.model_dump_json(indent=2))

# Disabled variant that treats the chain output as a plain dict.
# result: dict = chain.invoke({"image_path": image_path})
# print(json.dumps(result, indent=2))

In [None]:
# import json

# def test_all(image_path: str):
#     print("\n[1] preprocess_image_to_b64")
#     b64 = preprocess_image_to_b64(image_path)
#     print("  ok | b64 length:", len(b64))

#     print("\n[2] upload_to_imgbb")
#     public_url = upload_to_imgbb(image_path)
#     print("  ok | url:", public_url)

#     print("\n[3] serpapi_reverse_image")
#     serp = serpapi_reverse_image(public_url)
#     print("  ok | returned keys:", list(serp.keys())[:12])

#     print("\n[4] extract_serp_evidence")
#     evidence = extract_serp_evidence(serp)
#     print("  ok | evidence preview:\n", json.dumps(evidence, indent=2)[:1200])

#     print("\n[5] chain.invoke")
#     out = chain.invoke({"image_path": image_path})
#     print("  ok | final output:\n", out.model_dump_json(indent=2))

#     return out

# # usage:
# test_all(image_path)



[1] preprocess_image_to_b64
  ok | b64 length: 19108

[2] upload_to_imgbb
  ok | url: https://i.ibb.co/DDnJFDkF/f97501121e1d.jpg

[3] serpapi_reverse_image
  ok | returned keys: ['search_metadata', 'search_parameters', 'ai_overview', 'visual_matches', 'related_content']

[4] extract_serp_evidence
  ok | evidence preview:
 {
  "knowledge_graph": {},
  "exact_matches": [],
  "visual_matches": [
    {
      "title": "Universitato Jahangirnagar - Vikipedio",
      "source": "Wikipedia",
      "link": "https://eo.wikipedia.org/wiki/Universitato_Jahangirnagar"
    },
    {
      "title": "The first light ft. Shaheed Minar, JU",
      "source": "Instagram",
      "link": "https://www.instagram.com/p/DN6BLkyCRLX/"
    },
    {
      "title": "JU suspends Sunday classes, postpones exams over JUCSU polls ...",
      "source": "Amader Barta",
      "link": "https://www.amaderbarta.net/en/news/ju-suspends-sunday-classes-postpones-exams-over-jucsu-polls-342761"
    },
    {
      "title": "Somoy -

PlaceAnalysis(input_type='place_photo', place_guess=PlaceGuess(name='Shaheed Minar', city='Savar', country='Bangladesh'), confidence=0.8, what_i_see=['A large red brick monument with a tall, angular structure in the center.', 'Paved pathways leading to the monument.', 'Grassy areas with manicured bushes on either side of the pathways.', 'Buildings in the background.', 'A bright blue sky with scattered clouds.'], significance=['The monument appears to be the Shaheed Minar at Jahangirnagar University, a prominent landmark in Bangladesh.', "Shaheed Minar (Martyrs' Monument) is a national monument in Dhaka, Bangladesh, built in memory of those killed in the Bengali Language Movement demonstrations in 1952."], response='The photo shows the Shaheed Minar, a significant monument located at Jahangirnagar University in Savar, Bangladesh. This monument is a symbol of the Bengali Language Movement.')