# Inference Notebook Template

**What this does:**
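1. **Helper** (`save_results`): dumps a results object (a `dict` or a list of `dict`s) into a timestamped JSON file under `results/<model_name>/`.
2. **Prompt & images**: one prompt and a list of image paths (`image_paths`)—just swap in your own strings/paths.
3. **Model loading**: loads the processor and model for the chosen variant from the Hugging Face Hub.
4. **Inference**: calls `.generate()` on each image together with the prompt.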
5. **Output**: prints the output and writes all metadata + result into JSON.

In [30]:
!pip install -qqq num2words

## 📚 Helper: Save any results dict to JSON

In [13]:
import json, os
from datetime import datetime

def save_results(data: "dict | list",
                 model_name: str,
                 variant: str,
                 output_dir: str = "results") -> str:
    """Write `data` as pretty-printed JSON to a timestamped file.

    The file is created at `<output_dir>/<model_name>/<variant>_<YYYYmmdd_HHMMSS>.json`.
    A `model_name` containing path separators (e.g. "org/model") yields nested
    directories, which are created on demand.

    Args:
        data: Any JSON-serialisable object; the notebook passes a list of
            per-image result dicts.
        model_name: Sub-directory (possibly nested) under `output_dir`.
        variant: Prefix of the output file name.
        output_dir: Root directory for all result files.

    Returns:
        The path of the file that was written.
    """
    # Ensure nested directories are created
    model_dir = os.path.join(output_dir, model_name)
    os.makedirs(model_dir, exist_ok=True)

    ts       = datetime.now().strftime("%Y%m%d_%H%M%S")
    fname    = f"{variant}_{ts}.json"
    out_path = os.path.join(model_dir, fname)

    # Explicit UTF-8 + ensure_ascii=False keeps non-ASCII model output readable
    # in the file and independent of the platform's default encoding.
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
    print(f"✅ Saved results to {out_path}")
    return out_path


# Variant: SmolVLM2-256M-Video-Instruct (supports both videos and images)
https://huggingface.co/HuggingFaceTB/SmolVLM2-256M-Video-Instruct

## 1️⃣ Prompt

In [14]:
# Instruction sent to the model together with every image.
prompt = "What is this best described as? Choose ONE from: 'Art', 'Graffiti', 'Vandalism', 'Activism', 'Advertisement', 'Other'."

# Images to classify. The inference cells iterate over `image_paths`, so it has to be
# defined here for the notebook to survive Restart & Run All.
# These are the two files shown in the recorded outputs; swap in your own paths.
image_paths = [
    "/content/input/image1.png",
    "/content/input/image2.png",
]

## 2️⃣ Load Processor and Model

In [15]:
# Authenticate with the Hugging Face Hub; this renders an interactive login
# widget in the cell output where an access token is entered.
# NOTE(review): the checkpoints used below look publicly accessible, so this
# step may be optional — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [31]:
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
import torch

# Hub repo id is "<MODEL_NAME>-<VARIANT>"; both parts are also stored in the saved results.
MODEL_NAME = "HuggingFaceTB/SmolVLM2"
VARIANT    = "256M-Video-Instruct"
repo_id    = f"{MODEL_NAME}-{VARIANT}"

# Fall back to CPU when no GPU is visible so this cell does not crash on
# machines without CUDA (inference is then much slower).
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(repo_id)
model     = AutoModelForImageTextToText.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16
).to(DEVICE).eval()

## 3️⃣ Inference

In [18]:
def infer_img(image_path: str, prompt: str, system_prompt: str = "") -> str:
    """Run one image plus a text prompt through the loaded model and return its reply.

    Uses the notebook-level `processor` and `model` objects created in the
    model-loading cell.

    Args:
        image_path: Path of the image file to send to the model.
        prompt: User instruction sent alongside the image.
        system_prompt: Optional system message; left out of the chat when empty.

    Returns:
        The decoded, whitespace-stripped text of the newly generated tokens.

    Raises:
        FileNotFoundError: If `image_path` does not exist.
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image not found: {image_path}")

    # Convert inside a context manager so the file handle is closed right away
    # instead of lingering until garbage collection.
    with Image.open(image_path) as img:
        raw_image = img.convert("RGB")

    # Build messages
    messages = []
    if system_prompt:
        messages.append({
            "role": "system",
            "content": [{"type": "text", "text": system_prompt}]
        })

    messages.append({
        "role": "user",
        "content": [
            {"type": "image", "image": raw_image},
            {"type": "text", "text": prompt}
        ]
    })

    # Tokenize; floating-point inputs follow the model's own dtype rather than
    # a hard-coded one.
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    ).to(model.device, dtype=model.dtype)

    # Greedy decoding. `temperature` is only consulted when sampling, so it is
    # not passed (passing it with do_sample=False just triggers a warning).
    with torch.inference_mode():
        generation = model.generate(**inputs, max_new_tokens=512, do_sample=False)

    # generate() returns the prompt tokens followed by the completion. Dropping
    # the prompt by token count is more robust than splitting the decoded text
    # on "Assistant:", which breaks if that marker appears in the prompt itself.
    prompt_len = inputs["input_ids"].shape[-1]
    new_tokens = generation[:, prompt_len:]

    return processor.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()


## 4️⃣ Package & Save to JSON

In [28]:
# One record per successfully processed image; failures are reported and skipped.
results = []

for image_path in image_paths:
    try:
        output = infer_img(image_path, prompt)
        print(f"{image_path} → {output}")

        # Keep the run metadata next to the answer so each record is self-describing.
        result = dict(
            image_path=image_path,
            model=MODEL_NAME,
            variant=VARIANT,
            prompt=prompt,
            output=output,
        )
        results += [result]
    except Exception as err:
        # Best effort: one bad image must not abort the whole batch.
        print(f"❌ Error processing {image_path}: {err}")

# Written a single time, once the loop has finished.
save_results(results, MODEL_NAME, VARIANT)


/content/input/image1.png → Graffiti
/content/input/image2.png → 'Graffiti'.
✅ Saved results to results/HuggingFaceTB/SmolVLM2/256M-Video-Instruct_20250502_092434.json


In [29]:
!sudo rm -rf ./results/*


# Variant: SmolVLM2-500M-Video-Instruct

https://huggingface.co/HuggingFaceTB/SmolVLM2-500M-Video-Instruct

In [26]:
# Hub repo id is "<MODEL_NAME>-<VARIANT>"; both parts are also stored in the saved results.
MODEL_NAME = "HuggingFaceTB/SmolVLM2"
VARIANT    = "500M-Video-Instruct"
repo_id    = f"{MODEL_NAME}-{VARIANT}"

# Fall back to CPU when no GPU is visible so this cell does not crash on
# machines without CUDA (inference is then much slower).
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(repo_id)
model     = AutoModelForImageTextToText.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16
).to(DEVICE).eval()



## 3️⃣ Inference

In [32]:
def infer_img(image_path: str, prompt: str, system_prompt: str = "") -> str:
    """Run one image plus a text prompt through the loaded model and return its reply.

    Uses the notebook-level `processor` and `model` objects created in the
    model-loading cell.

    Args:
        image_path: Path of the image file to send to the model.
        prompt: User instruction sent alongside the image.
        system_prompt: Optional system message; left out of the chat when empty.

    Returns:
        The decoded, whitespace-stripped text of the newly generated tokens.

    Raises:
        FileNotFoundError: If `image_path` does not exist.
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image not found: {image_path}")

    # Convert inside a context manager so the file handle is closed right away
    # instead of lingering until garbage collection.
    with Image.open(image_path) as img:
        raw_image = img.convert("RGB")

    # Build messages
    messages = []
    if system_prompt:
        messages.append({
            "role": "system",
            "content": [{"type": "text", "text": system_prompt}]
        })

    messages.append({
        "role": "user",
        "content": [
            {"type": "image", "image": raw_image},
            {"type": "text", "text": prompt}
        ]
    })

    # Tokenize; floating-point inputs follow the model's own dtype rather than
    # a hard-coded one.
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    ).to(model.device, dtype=model.dtype)

    # Greedy decoding. `temperature` is only consulted when sampling, so it is
    # not passed (passing it with do_sample=False just triggers a warning).
    with torch.inference_mode():
        generation = model.generate(**inputs, max_new_tokens=512, do_sample=False)

    # generate() returns the prompt tokens followed by the completion. Dropping
    # the prompt by token count is more robust than splitting the decoded text
    # on "Assistant:", which breaks if that marker appears in the prompt itself.
    prompt_len = inputs["input_ids"].shape[-1]
    new_tokens = generation[:, prompt_len:]

    return processor.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()


## 4️⃣ Package & Save to JSON

In [33]:
# One record per successfully processed image; failures are reported and skipped.
results = []

for image_path in image_paths:
    try:
        output = infer_img(image_path, prompt)
        print(f"{image_path} → {output}")

        # Keep the run metadata next to the answer so each record is self-describing.
        result = dict(
            image_path=image_path,
            model=MODEL_NAME,
            variant=VARIANT,
            prompt=prompt,
            output=output,
        )
        results += [result]
    except Exception as err:
        # Best effort: one bad image must not abort the whole batch.
        print(f"❌ Error processing {image_path}: {err}")

# Written a single time, once the loop has finished.
save_results(results, MODEL_NAME, VARIANT)




/content/input/image1.png → Graffiti
/content/input/image2.png → 'Graffiti'.
✅ Saved results to results/HuggingFaceTB/SmolVLM2/256M-Video-Instruct_20250502_093342.json


In [None]:
!sudo rm -rf ./results/*


# Variant: SmolVLM2-2.2B-Instruct


https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct

In [35]:
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
import torch
import os

# Hub repo id is "<MODEL_NAME>-<VARIANT>"; both parts are also stored in the saved results.
MODEL_NAME = "HuggingFaceTB/SmolVLM2"
VARIANT    = "2.2B-Instruct"
repo_id    = f"{MODEL_NAME}-{VARIANT}"

# Fall back to CPU when no GPU is visible so this cell does not crash on
# machines without CUDA (inference is then much slower).
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(repo_id)
model     = AutoModelForImageTextToText.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16
).to(DEVICE).eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

## 3️⃣ Inference

In [None]:
def infer_img(image_path: str, prompt: str, system_prompt: str = "") -> str:
    """Run one image plus a text prompt through the loaded model and return its reply.

    Uses the notebook-level `processor` and `model` objects created in the
    model-loading cell.

    Args:
        image_path: Path of the image file to send to the model.
        prompt: User instruction sent alongside the image.
        system_prompt: Optional system message; left out of the chat when empty.

    Returns:
        The decoded, whitespace-stripped text of the newly generated tokens.

    Raises:
        FileNotFoundError: If `image_path` does not exist.
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image not found: {image_path}")

    # Convert inside a context manager so the file handle is closed right away
    # instead of lingering until garbage collection.
    with Image.open(image_path) as img:
        raw_image = img.convert("RGB")

    # Build messages
    messages = []
    if system_prompt:
        messages.append({
            "role": "system",
            "content": [{"type": "text", "text": system_prompt}]
        })

    messages.append({
        "role": "user",
        "content": [
            {"type": "image", "image": raw_image},
            {"type": "text", "text": prompt}
        ]
    })

    # Tokenize; floating-point inputs follow the model's own dtype rather than
    # a hard-coded one.
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    ).to(model.device, dtype=model.dtype)

    # Greedy decoding. `temperature` is only consulted when sampling, so it is
    # not passed (passing it with do_sample=False just triggers a warning).
    with torch.inference_mode():
        generation = model.generate(**inputs, max_new_tokens=512, do_sample=False)

    # generate() returns the prompt tokens followed by the completion. Dropping
    # the prompt by token count is more robust than splitting the decoded text
    # on "Assistant:", which breaks if that marker appears in the prompt itself.
    prompt_len = inputs["input_ids"].shape[-1]
    new_tokens = generation[:, prompt_len:]

    return processor.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()


## 4️⃣ Package & Save to JSON

In [36]:
# One record per successfully processed image; failures are reported and skipped.
results = []

for image_path in image_paths:
    try:
        output = infer_img(image_path, prompt)
        print(f"{image_path} → {output}")

        # Keep the run metadata next to the answer so each record is self-describing.
        result = dict(
            image_path=image_path,
            model=MODEL_NAME,
            variant=VARIANT,
            prompt=prompt,
            output=output,
        )
        results += [result]
    except Exception as err:
        # Best effort: one bad image must not abort the whole batch.
        print(f"❌ Error processing {image_path}: {err}")

# Written a single time, once the loop has finished.
save_results(results, MODEL_NAME, VARIANT)


/content/input/image1.png → 'Advertisement'.
/content/input/image2.png → 'Advertisement'.
✅ Saved results to results/HuggingFaceTB/SmolVLM2/2.2B-Instruct_20250502_094228.json
