In [1]:
# Install the two third-party packages imported by the indexing cell below
# (sentence_transformers for embeddings, faiss for the vector index).
# NOTE(review): versions are not pinned, so a fresh run may resolve different releases.
%pip install sentence-transformers
%pip install faiss-cpu

Collecting sentence-transformers
  Downloading sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.55.4-py3-none-any.whl.metadata (41 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.8.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.7.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.16.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (61 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_

In [1]:
import base64
import os
import json

from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from openai import AzureOpenAI


# Azure OpenAI resource that the client below sends its requests to.
endpoint = "https://oai-aip-cv-ont-sdc.openai.azure.com/"
# NOTE(review): model_name is not referenced in the visible cells; only `deployment` is passed to the API calls.
model_name = "gpt-4o-mini"
# Deployment name passed as `model=` in the chat completion requests further down.
deployment = "gpt-4o-mini"
# Token provider built from the default Azure credential chain for the Cognitive Services scope;
# the notebook therefore contains no API key.
token_provider = get_bearer_token_provider(DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default")
api_version = "2024-12-01-preview"

# Shared client used by both chat completion cells below.
client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    azure_ad_token_provider=token_provider,
)

In [None]:
"""
Sidewalk Defects VLM + RAG (Hackathon Notebook)

Step 1: Indexing
- Expect five grade folders `A+`, `A`, `B`, `C`, `D`
- Each folder contains:
  • one text file (any `*.txt`) with the grade description
  • a few example images (`*.jpg|*.jpeg|*.png|*.webp`)

Step 2: Retrieval
- Given a query image:
  • Embed it
  • Retrieve the most similar grade example images and grade descriptions
  • Use the retrieved grade as context for the LLM to answer: "Which score would you give to this image?"
"""

import os
import glob
from dataclasses import dataclass
from typing import List, Dict, Tuple, Any

import numpy as np
from PIL import Image

# Embeddings
from sentence_transformers import SentenceTransformer
import faiss

# =====================
# Config
# =====================
# Resolved from the kernel's current working directory at execution time.
PROJECT_ROOT = os.getcwd()
# Folder that read_grade_dirs() scans; it expects one sub-folder per grade.
GRADES_DIR = os.path.join(PROJECT_ROOT, "data", "grades")
# load_st_model() tries PREFERRED_MODEL first and uses FALLBACK_MODEL if loading raises.
PREFERRED_MODEL = "jinaai/jina-clip-v2"
FALLBACK_MODEL = "clip-ViT-B-32"

# =====================
# Data classes
# =====================
@dataclass
class GradeDesc:
    """A grade label paired with the free-text description loaded for it."""
    grade: str  # name of the grade folder, e.g. "A+"
    description: str  # contents of the grade's .txt file, or a placeholder when none exists

@dataclass
class RetrievalResult:
    """One hit returned by FaissIndex.search."""
    idx: int  # position of the hit inside the FAISS index (and the parallel metadata list)
    score: float  # inner-product similarity reported by FAISS for this hit
    meta: Dict[str, str]  # metadata dict that was stored alongside the vector when it was added

# =====================
# Utilities
# =====================
def l2_normalize(x: np.ndarray, axis: int = -1, eps: float = 1e-12) -> np.ndarray:
    """Scale ``x`` to unit L2 length along ``axis``.

    Norms smaller than ``eps`` are clamped to ``eps`` so that all-zero vectors
    are returned unchanged instead of producing a division by zero.
    """
    lengths = np.linalg.norm(x, axis=axis, keepdims=True)
    safe_lengths = np.maximum(lengths, eps)
    return x / safe_lengths

def load_image(path: str) -> Image.Image:
    """Open the image file at ``path`` and return it as an RGB image.

    Args:
        path: Filesystem path of the image to read.

    Returns:
        A new ``PIL.Image.Image`` in ``"RGB"`` mode.
    """
    # The context manager closes the underlying file deterministically; convert()
    # returns a separate, fully loaded image that remains usable afterwards.
    with Image.open(path) as img:
        return img.convert("RGB")

def walk_images(root: str) -> List[str]:
    """List the image files located directly inside ``root`` (non-recursive).

    Files are selected by extension (.jpg, .jpeg, .png, .webp). The comparison is
    case-insensitive so that e.g. ``IMG_001.JPG`` is found on case-sensitive
    filesystems too, and ``root`` is used literally, so folder names containing
    glob metacharacters such as ``[`` work. Hidden files (leading dot) are
    skipped, matching the behaviour of the previous ``*.ext`` glob patterns.

    Args:
        root: Directory to scan.

    Returns:
        Sorted list of matching paths (``root`` joined with the file name);
        empty when ``root`` is not an existing directory.
    """
    exts = (".jpg", ".jpeg", ".png", ".webp")
    if not os.path.isdir(root):
        return []
    paths = []
    for name in os.listdir(root):
        if name.startswith("."):
            continue
        if os.path.splitext(name)[1].lower() in exts:
            paths.append(os.path.join(root, name))
    return sorted(paths)

# =====================
# Model
# =====================
def load_st_model() -> SentenceTransformer:
    """Load the embedding model, preferring PREFERRED_MODEL over FALLBACK_MODEL.

    Returns:
        A ``SentenceTransformer`` for PREFERRED_MODEL, or for FALLBACK_MODEL when
        loading the preferred model raises.

    Raises:
        Whatever ``SentenceTransformer(FALLBACK_MODEL)`` raises if the fallback
        cannot be loaded either.
    """
    import warnings

    try:
        return SentenceTransformer(PREFERRED_MODEL)
    except Exception as exc:
        # The fallback is a deliberate best-effort, but the two models are different,
        # so make it visible which one is in use and why the preferred one failed.
        warnings.warn(
            f"Could not load {PREFERRED_MODEL!r} ({exc!r}); falling back to {FALLBACK_MODEL!r}.",
            RuntimeWarning,
            stacklevel=2,
        )
        return SentenceTransformer(FALLBACK_MODEL)

class Embedder:
    """Wraps a SentenceTransformer so images and texts are embedded the same way."""

    def __init__(self, model: SentenceTransformer):
        self.model = model

    def _encode(self, items) -> np.ndarray:
        """Encode ``items`` with normalised embeddings and return them as float32."""
        vectors = self.model.encode(items, convert_to_numpy=True, normalize_embeddings=True)
        return vectors.astype("float32")

    def embed_images(self, imgs: List[Image.Image]) -> np.ndarray:
        """Embed a list of PIL images."""
        return self._encode(imgs)

    def embed_texts(self, texts: List[str]) -> np.ndarray:
        """Embed a list of strings."""
        return self._encode(texts)

# =====================
# Index
# =====================
class FaissIndex:
    """Inner-product FAISS index with a parallel list of per-vector metadata.

    Vectors are L2-normalised before insertion and queries are normalised before
    searching, so the inner-product scores behave like cosine similarities.
    """

    def __init__(self, dim: int):
        self.index = faiss.IndexFlatIP(dim)
        # meta[i] describes the vector stored at FAISS position i.
        self.meta: List[Dict[str, str]] = []
        self.dim = dim

    def add(self, vecs: np.ndarray, metas: List[Dict[str, str]]):
        """Append vectors and their metadata to the index.

        Args:
            vecs: Array of shape (n, d); a single 1-D vector of shape (d,) is
                accepted and treated as one row. An empty array is a no-op.
            metas: One metadata dict per row of ``vecs``.

        Raises:
            ValueError: If the number of vectors and metadata entries differ.
                Accepting a mismatch would misalign ``self.meta`` with the
                positions FAISS returns from ``search``.
        """
        vecs = np.asarray(vecs)
        if vecs.size == 0:
            n_vecs = 0
        else:
            if vecs.ndim == 1:
                vecs = vecs[None, :]
            n_vecs = vecs.shape[0]
        if n_vecs != len(metas):
            raise ValueError(
                f"Got {n_vecs} vectors but {len(metas)} metadata entries; they must match."
            )
        if n_vecs == 0:
            return
        vecs = l2_normalize(vecs)
        self.index.add(vecs.astype("float32"))
        self.meta.extend(metas)

    def search(self, q: np.ndarray, topk: int = 5) -> List[RetrievalResult]:
        """Cosine similarity search; accepts 1D (d,) or 2D (n,d) query arrays.

        Only the hits for the first query row are returned. Returns an empty list
        if the index is missing or empty, or if ``topk`` is not positive.
        """
        # Handle missing/empty index gracefully
        if getattr(self, "index", None) is None:
            return []
        try:
            ntotal = self.index.ntotal
        except Exception:
            ntotal = 0
        if ntotal == 0 or topk <= 0:
            return []

        # Ensure 2D shape for FAISS
        q = np.asarray(q)
        if q.ndim == 1:
            q = q[None, :]
        # Normalize (cosine via inner product) unless the rows are already unit length
        if not np.allclose(np.linalg.norm(q, axis=1), 1.0, atol=1e-3):
            q = l2_normalize(q)
        # Asking for more than ntotal neighbours only yields -1 padding, so cap it.
        scores, idxs = self.index.search(q.astype("float32"), min(topk, ntotal))
        results: List[RetrievalResult] = []
        for i, s in zip(idxs[0], scores[0]):
            if i == -1:  # FAISS marks "no neighbour" slots with -1
                continue
            results.append(RetrievalResult(idx=int(i), score=float(s), meta=self.meta[int(i)]))
        return results

# =====================
# Grade folder loader
# =====================
def read_grade_dirs(grades_root: str) -> Tuple[List[GradeDesc], List[str], List[Dict[str, str]]]:
    """Collect grade descriptions and example images from ``grades_root``.

    Every non-hidden sub-directory is treated as one grade named after the
    folder. Its description is read from the alphabetically first ``*.txt``
    file in the folder; a placeholder is used when there is none.

    Args:
        grades_root: Directory containing one sub-folder per grade.

    Returns:
        A tuple ``(grade_descs, img_paths, img_metas)`` where ``img_metas[i]``
        holds the ``grade`` and ``path`` of ``img_paths[i]``.
    """
    grade_descs: List[GradeDesc] = []
    img_paths: List[str] = []
    img_metas: List[Dict[str, str]] = []

    for grade in sorted(os.listdir(grades_root)):
        gdir = os.path.join(grades_root, grade)
        # Skip plain files and hidden folders (e.g. Jupyter's .ipynb_checkpoints),
        # which would otherwise be indexed as a bogus grade.
        if grade.startswith(".") or not os.path.isdir(gdir):
            continue
        # Escape the folder path so names with glob metacharacters still match, and
        # sort so the chosen description does not depend on filesystem order.
        txts = sorted(glob.glob(os.path.join(glob.escape(gdir), "*.txt")))
        print(f"Found txts for {grade}: {txts}")
        if txts:
            with open(txts[0], "r", encoding="utf-8") as fh:
                desc = fh.read().strip()
        else:
            desc = f"Grade {grade} (no description)"
        grade_descs.append(GradeDesc(grade=grade, description=desc))
        for p in walk_images(gdir):
            img_paths.append(p)
            img_metas.append({"grade": grade, "path": p})

    return grade_descs, img_paths, img_metas

# =====================
# Build indices
# =====================
def build_indices(embedder: Embedder, dim: int):
    """Build the text and image FAISS indices from the folders under GRADES_DIR.

    Args:
        embedder: Used to embed both the grade descriptions and the example images.
        dim: Embedding dimension used to create both indices.

    Returns:
        ``(text_index, img_index)``. An index is left empty when there is nothing
        to add to it; ``FaissIndex.search`` returns no hits for an empty index.
    """
    grade_descs, img_paths, img_metas = read_grade_dirs(GRADES_DIR)
    print(grade_descs)

    # Text index: one entry per grade description.
    text_index = FaissIndex(dim)
    if grade_descs:
        texts = [f"Grade {g.grade}: {g.description}" for g in grade_descs]
        text_metas = [{"grade": g.grade, "text": g.description} for g in grade_descs]
        text_index.add(embedder.embed_texts(texts), text_metas)

    # Image index: one entry per example image. Skip embedding entirely when no
    # images were found instead of passing an empty batch to the model and FAISS.
    img_index = FaissIndex(dim)
    if img_paths:
        imgs = [load_image(p) for p in img_paths]
        img_index.add(embedder.embed_images(imgs), img_metas)

    return text_index, img_index

# =====================
# Retrieval for a query image
# =====================
def retrieve_for_query(img_path: str, embedder: Embedder, text_index: FaissIndex, img_index: FaissIndex, topk: int = 3):
    """Embed the image at ``img_path`` and look it up in both indices.

    Returns ``(text_hits, img_hits)``, each holding up to ``topk`` results.
    """
    query_vec = embedder.embed_images([load_image(img_path)])
    # Text index is queried first, then the image index.
    hits_per_index = [index.search(query_vec, topk=topk) for index in (text_index, img_index)]
    return hits_per_index[0], hits_per_index[1]

# =====================
# Retrieval & LLM-context helpers
# =====================
from collections import defaultdict

@dataclass
class GradeDecision:
    """Bundle of the fused grading outcome and the retrieval hits behind it."""
    chosen_grade: str  # grade with the highest fused score ("N/A" when nothing was retrieved)
    fused_scores: Dict[str, float]  # grade -> fused image/text score as returned by decide_grade
    img_best_by_grade: Dict[str, Tuple[str, float]]  # grade -> (path, score) of its best example image
    top_img_hits: List[RetrievalResult]  # top image hits across all grades
    top_text_hits: List[RetrievalResult]  # top grade-description hits


def embed_query_image(path: str, embedder: Embedder) -> np.ndarray:
    """Load the image at ``path`` and return its embedding (first row of a one-image batch)."""
    batch = embedder.embed_images([load_image(path)])
    first_row = batch[0]
    return first_row


def retrieve_candidates(
    q_emb: np.ndarray,
    img_index: FaissIndex,
    text_index: FaissIndex,
    per_grade_top: int = 1,
    topk_img_global: int = 5,
    topk_text: int = 5,
) -> Tuple[Dict[str, List[Tuple[str, float]]], List[RetrievalResult], List[RetrievalResult]]:
    """Retrieve image and text candidates for a query embedding.

    Args:
        q_emb: Query embedding passed to both indices.
        img_index: Index of example images; hit metadata carries ``grade`` and ``path``.
        text_index: Index of grade descriptions.
        per_grade_top: Number of best image examples to keep per grade.
        topk_img_global: Number of image hits to return across all grades.
        topk_text: Number of text description hits to return.

    Returns:
    - best_per_grade: dict grade -> list of (path, score) for the top image examples within that grade
    - top_img_global: top-K image hits across all grades
    - top_text_hits: top-K text description hits
    """
    top_img_global = []
    per_grade = defaultdict(list)

    raw_img_index = getattr(img_index, "index", None)
    if raw_img_index is not None:
        # Rank the whole image index once. A fixed cut-off (previously 50) silently
        # dropped grades whose examples all ranked below it, and the global top-K
        # is simply the head of the same ranking, so a second search is unnecessary.
        depth = max(topk_img_global, int(getattr(raw_img_index, "ntotal", 0)))
        ranked = img_index.search(q_emb, topk=depth) if depth > 0 else []
        top_img_global = ranked[:max(topk_img_global, 0)]
        for hit in ranked:
            g = hit.meta.get("grade", "?")
            per_grade[g].append((hit.meta.get("path", ""), hit.score))

    best_per_grade = {
        g: sorted(v, key=lambda x: x[1], reverse=True)[:per_grade_top]
        for g, v in per_grade.items()
    }

    # text hits (guard if index missing)
    if getattr(text_index, "index", None) is not None:
        top_text_hits = text_index.search(q_emb, topk=topk_text)
    else:
        top_text_hits = []

    return best_per_grade, top_img_global, top_text_hits


def decide_grade(
    best_per_grade: Dict[str, List[Tuple[str, float]]],
    top_text_hits: List[RetrievalResult],
    weight_img: float = 0.6,
    weight_text: float = 0.4,
) -> Tuple[str, Dict[str, float]]:
    """Fuse per-grade image and text similarities and pick the best grade.

    For each grade the best image score and the best text score are combined as
    ``weight_img * image + weight_text * text``; a grade missing on one side
    contributes 0.0 for that side. When the largest fused value is positive the
    scores are min-max rescaled for readability.

    Returns:
        ``(chosen_grade, fused_scores)``, or ``("N/A", {})`` when there are no
        candidates at all.
    """
    # Best image similarity per grade; grades with no examples are skipped.
    img_scores: Dict[str, float] = {
        grade: max(score for _, score in examples)
        for grade, examples in best_per_grade.items()
        if examples
    }

    # Best text similarity per grade.
    text_scores: Dict[str, float] = {}
    for hit in top_text_hits:
        grade = hit.meta.get("grade", "?")
        text_scores[grade] = max(text_scores.get(grade, -1.0), hit.score)

    candidates = sorted(set(img_scores) | set(text_scores))
    if not candidates:
        return "N/A", {}

    fused: Dict[str, float] = {
        grade: weight_img * img_scores.get(grade, 0.0) + weight_text * text_scores.get(grade, 0.0)
        for grade in candidates
    }

    # Rescale to roughly [0, 1]; skipped when no fused score is positive.
    raw = np.array(list(fused.values()), dtype=float)
    if raw.max() > 0:
        scaled = (raw - raw.min()) / (raw.max() - raw.min() + 1e-6)
        fused = {grade: float(value) for grade, value in zip(fused, scaled.tolist())}

    # Ties resolve to the first grade in sorted order.
    chosen = max(fused, key=fused.get)
    return chosen, fused


def make_llm_context(
    query_image_path: str,
    decision: GradeDecision,
    max_examples: int = 3,
) -> Dict[str, Any]:
    """Prepare a compact context payload you can pass to your LLM alongside the user question.

    Args:
        query_image_path: Path of the image being graded; echoed into the payload.
        decision: Fused grading decision and the retrieval hits behind it.
        max_examples: Upper bound on the number of example images in the payload.

    Returns:
        A dict with the query image path, the proposed grade, the fused scores,
        up to ``max_examples`` example images (the chosen grade's best example
        first), up to three grade-description matches trimmed to 240 characters,
        and a fixed instruction string.
    """
    chosen = decision.chosen_grade
    examples: List[Dict[str, Any]] = []

    # chosen-grade example first; honour max_examples here too (0 means no examples)
    if max_examples > 0 and chosen in decision.img_best_by_grade:
        path, sc = decision.img_best_by_grade[chosen]
        examples.append({"grade": chosen, "path": path, "similarity": sc})

    # fill with top global hits (diverse grades), skipping paths already included
    seen = {e["path"] for e in examples}
    for h in decision.top_img_hits:
        if len(examples) >= max_examples:
            break
        p = h.meta.get("path", "")
        if p and p not in seen:
            examples.append({"grade": h.meta.get("grade", "?"), "path": p, "similarity": h.score})
            seen.add(p)

    # top text descriptions (trim)
    texts = []
    for h in decision.top_text_hits[:3]:
        texts.append({"grade": h.meta.get("grade","?"), "text": h.meta.get("text","")[:240], "similarity": h.score})

    return {
        "query_image": query_image_path,
        "proposed_grade": chosen,
        "fused_scores": decision.fused_scores,
        "example_images": examples,
        "grade_text_matches": texts,
        "instructions": (
            "Decide the sidewalk grade (A+, A, B, C, D). Use the example images and grade descriptions as guidance. "
            "Prefer visual similarity; use text to break ties. Explain briefly."
        ),
    }


def grade_query_image(
    query_image_path: str,
    embedder: Embedder,
    text_index: FaissIndex,
    img_index: FaissIndex,
    weight_img: float = 0.6,
    weight_text: float = 0.4,
) -> Tuple[str, Dict[str, Any]]:
    """Run the whole pipeline for one image: embed, retrieve, fuse, build LLM context.

    Returns:
        ``(chosen_grade, context_dict)``.
    """
    query_vec = embed_query_image(query_image_path, embedder)
    per_grade, global_img_hits, text_hits = retrieve_candidates(query_vec, img_index, text_index)

    chosen, fused = decide_grade(per_grade, text_hits, weight_img=weight_img, weight_text=weight_text)

    # Collapse each grade's example list to its first (path, score) pair for the context.
    top_example_by_grade = {grade: examples[0] for grade, examples in per_grade.items() if examples}

    decision = GradeDecision(
        chosen_grade=chosen,
        fused_scores=fused,
        img_best_by_grade=top_example_by_grade,
        top_img_hits=global_img_hits,
        top_text_hits=text_hits,
    )
    return chosen, make_llm_context(query_image_path, decision)

In [7]:
# =====================
# Demo
# =====================
if __name__ == "__main__":
    # Load the embedding model once and wrap it. Names created in this block
    # (embedder, text_index, img_index, sample_query) are reused by later cells.
    model = load_st_model()
    embedder = Embedder(model)
    # Embed a dummy string to discover the embedding dimension of the loaded model.
    dim = embedder.embed_texts(["probe"]).shape[1]

    print("Building indices...")
    text_index, img_index = build_indices(embedder, dim)

    # Example query
    images_folder = "local_data/2025_Centrum/images/"  # See other notebook on how to download an image from blob store
    image_name = "071a94bf-5563-4126-8684-dc73b3ab2025.jpeg"
    image_name_path = os.path.join(images_folder, image_name)
    # NOTE(review): os.path.join with a single argument returns it unchanged, so
    # sample_query is the same path as image_name_path.
    sample_query = os.path.join(image_name_path)
    if os.path.exists(sample_query):
        # Show the raw top hits from both indices for the sample image.
        t_hits, i_hits = retrieve_for_query(sample_query, embedder, text_index, img_index)
        print("\nText hits:")
        for h in t_hits:
            print(f"  Grade={h.meta['grade']} | score={h.score:.3f} | desc={h.meta['text'][:60]}…")
        print("\nImage hits:")
        for h in i_hits:
            print(f"  Grade={h.meta['grade']} | score={h.score:.3f} | img={os.path.basename(h.meta['path'])}")
    else:
        print("No sample query image found at", sample_query)

Building indices...
Found txts for A: ['/home/andrealombardo/GitHub/VLM-Hackathon/notebooks/data/grades/A/description.txt']
Found txts for A+: ['/home/andrealombardo/GitHub/VLM-Hackathon/notebooks/data/grades/A+/DescriptionA+.txt']
Found txts for B: ['/home/andrealombardo/GitHub/VLM-Hackathon/notebooks/data/grades/B/description.txt']
Found txts for C: ['/home/andrealombardo/GitHub/VLM-Hackathon/notebooks/data/grades/C/description.txt']
Found txts for D: ['/home/andrealombardo/GitHub/VLM-Hackathon/notebooks/data/grades/D/description.txt']
[GradeDesc(grade='A', description='Er ontbreken geen elementen uit de verharding. Er zijn nauwelijks beschadigde elementen.\t\nBeschadigde elementen\t1'), GradeDesc(grade='A+', description='Er ontbreken geen elementen uit de verharding. Er zijn geen beschadigde elementen.\t\nBeschadigde elementen: 0'), GradeDesc(grade='B', description='Er ontbreken geen elementen uit de verharding. Er zijn hier en daar beschadigde elementen.\t\nBeschadigde elementen\t2

In [8]:
# Run the full embed -> retrieve -> fuse pipeline on the demo image and print the
# context payload that would be handed to the LLM.
# NOTE(review): relies on sample_query, embedder, text_index and img_index from the
# demo cell; the existence of sample_query is not re-checked here.
chosen, ctx = grade_query_image(sample_query, embedder, text_index, img_index)
print("Chosen grade:", chosen)
print("LLM context:", json.dumps(ctx, indent=2))

Chosen grade: C
LLM context: {
  "query_image": "local_data/2025_Centrum/images/071a94bf-5563-4126-8684-dc73b3ab2025.jpeg",
  "proposed_grade": "C",
  "fused_scores": {
    "A": 0.7691913652137267,
    "A+": 0.0,
    "B": 0.40345896546843774,
    "C": 0.9999813645165238,
    "D": 0.6303082055727589
  },
  "example_images": [
    {
      "grade": "C",
      "path": "/home/andrealombardo/GitHub/VLM-Hackathon/notebooks/data/grades/C/beschadigde elementen_C.jpg",
      "similarity": 0.8905883431434631
    },
    {
      "grade": "A",
      "path": "/home/andrealombardo/GitHub/VLM-Hackathon/notebooks/data/grades/A/PXL_20250826_095437041.NIGHT.jpg",
      "similarity": 0.8723095059394836
    },
    {
      "grade": "D",
      "path": "/home/andrealombardo/GitHub/VLM-Hackathon/notebooks/data/grades/D/beschadigde elementen_D.jpg",
      "similarity": 0.8564962148666382
    }
  ],
  "grade_text_matches": [
    {
      "grade": "A+",
      "text": "Er ontbreken geen elementen uit de verharding. 

In [33]:
# Function to encode an image
def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as a base64 text string."""
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return str(encoded, "utf-8")

images_folder = "local_data/2025_Centrum/images/"  # See other notebook on how to download an image from blob store
image_name = "ecf38198-dabd-4895-ac67-6478d1e5cc53.jpeg"
image_path = os.path.join(images_folder, image_name)
# Reuse image_path instead of rebuilding the same path a second time.
base64_image = encode_image(image_path)

# `prompt`, `list_defect` and `content` form a plain defect-category instruction.
# `content` is not used by the request in this cell; the grading cell further
# down passes it as its system message.
prompt = "You are a sidewalk inspector for the municipality. Your task is " \
"to check whether there is any damage on the sidewalk. " \
"If so, choose one or multiple categories of damage from the following list: "

list_defect = [
  "Local subsidence",
  "Edge damage",
  "Tree root damage",
  "Transverse crack",
  "Broken elements",
  "Unfilled borehole",
  "Hole",
  "Local raise",
  "Longitudinal crack",
  "Asphalt trench elements (HOOR)",
  "Square inspection chamber connection",
  "Surface connection",
  "Fraying",
  "Edge restraint",
  "Loose elements",
  "Failed area",
  "Joint width of elements",
  "Missing elements"
]

content = f"{prompt} {list_defect}. Please answer with only the categories in the list. If you don't find any defects, please say so. In any case, provide an explanation."

# Lower-case category vocabulary interpolated into the system prompt below.
# NOTE(review): this list says 'gully connection' where list_defect says
# "Surface connection"; confirm which label is intended.
schadebeel = ['local subsidence', 'edge damage', 'tree root damage',
'transverse crack', 'broken elements', 'unfilled borehole', 'hole',
'local raise', 'longitudinal crack',
'asphalt trench elements (HOOR)',
'square inspection chamber connection', 'gully connection',
'fraying', 'edge restraint', 'loose elements',
'failed area', 'joint width of elements', 'missing elements']

# System prompt for the classification request. The closing instruction now asks
# for the four keys of the schema; it previously said "three keys", which
# contradicted both the schema and the "do not skip the reasoning step" rule.
system_prompt = f"""
You are a highly skilled sidewalk inspection assistant.
You work for Amsterdam, a city where soil subsidence is a major issue causing sidewalks to often be depressed because of the underlying soil conditions.

Before classifying an image, follow this reasoning process:

1. Examine the image carefully and detect any visible damage, cracks, or irregularities.
2. Decide which category from {schadebeel} best fits the observed damage. For example, if there are huge gaps, it would be a hole which needs new tiles to be placed.
3. Think what kind of maintenance needs to be carried out to fix the category. This could further help determine the category. Also, if its close to a treet, there are roots, it could be tree root damage or if its close to a bollard, it could be loose elements or gap.
4. Provide a short descriptive text explaining why you chose that category. Give me also a second likely option.
5. Output your final answer strictly in JSON format:

{{
    "category1": "<first estimation based on category>",
    "category2": "<second estimation based on category>",
    "description": "<short text explaining your assessment>",
    "reasoning_steps": "<optional, internal reasoning for traceability>"
}}

Important:
- Do not skip the reasoning step; it should briefly summarize your assessment process.
- Return only valid JSON with the four keys shown above.
"""

# Query endpoint: ask the model to classify the damage visible in the image.
response = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": system_prompt,
        },
        {
            "role": "user",
            "content": [
                { "type": "text", "text": "Find if there is any damage in this image." },
                {
                    "type": "image_url",
                    # `detail` is a field of the image_url object. It used to sit next
                    # to "type", outside that object, so it was not part of the image
                    # descriptor.
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}",
                        "detail": "low", # reduces token usage
                    },
                },
            ],
        }
    ],
    max_tokens=4096,
    temperature=1.0,
    top_p=1.0,
    model=deployment
)

print(response.choices[0].message.content)

{
    "category1": "hole",
    "category2": "unfilled borehole",
    "description": "The image shows a significant depression in the sidewalk with a visible gap in the paving bricks, indicating a hole that needs to be filled with new tiles or repairs.",
    "reasoning_steps": "Upon examining the image, there is a clear depression in the surface with an absence of paving elements. This aligns best with the 'hole' category, while nearby exposed ground could suggest an unfilled borehole as a secondary option."
}


In [36]:
prompt_2 = ""

# Categories for which the RAG system is used to decide the grade.
RAG_TRIGGER_CATEGORIES = {"hole", "missing elements"}


def _predicted_categories(answer_text):
    """Return the lower-cased category1/category2 values from the model's JSON answer.

    Only the text between the first "{" and the last "}" is parsed, so code fences
    or stray prose around the JSON do not matter. Returns None when no JSON object
    can be parsed.
    """
    try:
        payload = json.loads(answer_text[answer_text.index("{"): answer_text.rindex("}") + 1])
    except ValueError:  # no braces found, or invalid JSON (JSONDecodeError is a ValueError)
        return None
    return {str(payload.get(key, "")).strip().lower() for key in ("category1", "category2")}


# message.content can be None (e.g. a filtered response); treat that as an empty answer.
first_answer = response.choices[0].message.content or ""
categories = _predicted_categories(first_answer)
if categories is None:
    # Best-effort fallback when the answer is not parseable JSON: substring search.
    needs_rag = "hole" in first_answer.lower() or "missing elements" in first_answer.lower()
else:
    # Compare whole category values. A substring test on the raw answer also matched
    # "unfilled borehole" and any mention of a hole in the free-text fields.
    needs_rag = bool(categories & RAG_TRIGGER_CATEGORIES)

# if the predicted categories contain hole or missing elements, we use the RAG-system to decide the grade
if needs_rag:
    print("Using RAG system to decide the grade...")
    chosen, ctx = grade_query_image(image_path, embedder, text_index, img_index)
    print("Chosen grade:", chosen)
    print("LLM context:", json.dumps(ctx, indent=2))

    # A ". " separator now follows the context so it no longer runs straight into the instruction.
    prompt = f"The previous answer was {first_answer}. " \
    f"Now, based on the retrieved context: {ctx}. " \
    "Decide the sidewalk grade (A+, A, B, C, D). Use the example images and grade descriptions as guidance. Prefer visual similarity; use text to break ties. Explain briefly. "

    prompt_2 = prompt
else:
    prompt = f"The previous answer was {first_answer}. " \
    "Decide the sidewalk grade (A+, A, B, C, D). Use your own knowledge and reasoning. Explain your decision briefly. "
    prompt_2 = prompt
    print("No hole or missing elements found, using LLM reasoning only. Prompt for final grading:", prompt_2)

# Query endpoint: ask the model for the final grade, sending the same image again.
# NOTE(review): the system message is `content`, the defect-category instruction
# ("answer with only the categories in the list"), while the user prompt asks for
# a grade; confirm this pairing is intended.
response_2 = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": content,
        },
        {
            "role": "user",
            "content": [
                { "type": "text", "text": f"{prompt_2}"},
                {
                    "type": "image_url",
                    # `detail` is a field of the image_url object. It used to sit next
                    # to "type", outside that object, so it was not part of the image
                    # descriptor.
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}",
                        "detail": "low", # reduces token usage
                    },
                },
            ],
        }
    ],
    max_tokens=4096,
    temperature=0.3,
    top_p=1.0,
    model=deployment
)

print(response_2.choices[0].message.content)

Using RAG system to decide the grade...
Chosen grade: C
LLM context: {
  "query_image": "local_data/2025_Centrum/images/ecf38198-dabd-4895-ac67-6478d1e5cc53.jpeg",
  "proposed_grade": "C",
  "fused_scores": {
    "A": 0.8383898689905828,
    "A+": 0.0,
    "B": 0.7489669681978307,
    "C": 0.9999503374031107,
    "D": 0.8255494541812297
  },
  "example_images": [
    {
      "grade": "C",
      "path": "/home/andrealombardo/GitHub/VLM-Hackathon/notebooks/data/grades/C/beschadigde elementen_C.jpg",
      "similarity": 0.8655822277069092
    },
    {
      "grade": "B",
      "path": "/home/andrealombardo/GitHub/VLM-Hackathon/notebooks/data/grades/B/PXL_20250826_094441911.jpg",
      "similarity": 0.8767260313034058
    },
    {
      "grade": "A",
      "path": "/home/andrealombardo/GitHub/VLM-Hackathon/notebooks/data/grades/A/PXL_20250826_095437041.NIGHT.jpg",
      "similarity": 0.858893096446991
    }
  ],
  "grade_text_matches": [
    {
      "grade": "D",
      "text": "Er ontbreke