# Inference Notebook Template

**What this does:**
1. **Helper** (`save_results`): dumps a results object (a `dict` or a list of `dict`s) into a timestamped JSON file under `results/<model_name>/`.
2. **Prompt & images**: a single prompt is applied to every image found in the input directory—swap in your own prompt string and input path.
3. **Model loading**: loads the chosen variant from the Hugging Face Hub.
4. **Inference**: calls `.generate()` once per image with the prompt.
5. **Output**: prints the output and writes all metadata + result into JSON.

In [None]:
!pip install -qqq num2words

## 📚 Helper: Save any results dict to JSON

In [None]:
import json, os
from datetime import datetime

def save_results(data: "dict | list",
                 model_name: str,
                 variant: str,
                 output_dir: str = "results"):
    """Write ``data`` as indented JSON to a timestamped file.

    The file is created at
    ``<output_dir>/<model_name>/<variant>_<YYYYmmdd_HHMMSS>.json`` and the
    final path is printed.

    Args:
        data: JSON-serialisable payload. The notebook passes a list of
            per-image result dicts; a single dict works as well.
        model_name: Sub-directory under ``output_dir``. A name containing
            "/" (e.g. "Org/Model") yields nested directories.
        variant: Prefix of the output file name.
        output_dir: Root directory that receives all result files.

    Raises:
        TypeError: If ``data`` contains objects ``json`` cannot serialise.
        OSError: If the directory or file cannot be created.
    """
    # makedirs creates every missing level, so nested model names are fine
    model_dir = os.path.join(output_dir, model_name)
    os.makedirs(model_dir, exist_ok=True)

    # Second-resolution timestamp: two saves for the same variant within the
    # same second end up on the same path.
    ts       = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_path = os.path.join(model_dir, f"{variant}_{ts}.json")

    # Explicit UTF-8 plus ensure_ascii=False keeps non-ASCII model output
    # human-readable and independent of the platform's default encoding.
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
    print(f"✅ Saved results to {out_path}")


# Variant: SmolVLM-256M-Instruct
https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct

## 1️⃣ Prompt

In [None]:
# The one question asked about every image; the model is told to answer with
# exactly one of the listed labels.
prompt = (
    "What is this best described as? "
    "Choose ONE from: 'Art', 'Graffiti', 'Vandalism', 'Activism', 'Advertisement', 'Other'."
)

## 2️⃣ Load Processor and Model

In [None]:
# Authenticate with the Hugging Face Hub; this renders an interactive login
# widget in the notebook.
# NOTE(review): the SmolVLM repos appear to be public, so this step may be
# optional — confirm before relying on it in unattended runs.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
## 📷 Collect image files

from glob import glob

input_dir = "/content/input"
image_extensions = ["*.png", "*.jpg", "*.jpeg", "*.webp", "*.bmp"]

# glob matching is case-sensitive on Linux and returns matches in arbitrary
# order. Also try the upper-case form of each pattern (e.g. "*.JPG"),
# de-duplicate (case-insensitive filesystems return the same file for both
# patterns) and sort so every run processes the images in the same order.
# Mixed-case suffixes such as ".Jpg" are still not matched.
image_paths = sorted({
    path
    for ext in image_extensions
    for pattern in (ext, ext.upper())
    for path in glob(os.path.join(input_dir, pattern))
})

# Fail loudly rather than silently producing an empty results file later on
if not image_paths:
    print(f"⚠️ No images found in {input_dir}")

In [None]:
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
import torch

# Hub repo id is "<MODEL_NAME>-<VARIANT>"; the two parts are kept separate
# because save_results uses them for the output directory and file name.
MODEL_NAME = "HuggingFaceTB/SmolVLM"
VARIANT    = "256M-Instruct"
repo_id    = f"{MODEL_NAME}-{VARIANT}"

# Fall back to CPU so the cell still runs (slowly) on a runtime without a GPU
# instead of crashing on a hard-coded "cuda".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(repo_id)
model     = AutoModelForImageTextToText.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16
).to(DEVICE).eval()

## 3️⃣ Inference

In [None]:
def infer_img(image_path: str, prompt: str, system_prompt: str = ""):
    """Run the currently loaded model on one image and return its text reply.

    Relies on the notebook-level ``processor`` and ``model`` globals, so it
    always uses whichever variant was loaded most recently.

    Args:
        image_path: Path to an image file that PIL can open.
        prompt: User question sent together with the image.
        system_prompt: Optional system message; left out of the chat when empty.

    Returns:
        The decoded reply with special tokens removed and surrounding
        whitespace stripped.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image not found: {image_path}")

    # The context manager closes the file handle; convert() returns an
    # independent in-memory image, so it stays usable afterwards.
    with Image.open(image_path) as img:
        raw_image = img.convert("RGB")

    # Build messages
    messages = []
    if system_prompt:
        messages.append({
            "role": "system",
            "content": [{"type": "text", "text": system_prompt}]
        })

    messages.append({
        "role": "user",
        "content": [
            {"type": "image", "image": raw_image},
            {"type": "text", "text": prompt}
        ]
    })

    # Tokenize; inputs follow the model's own device and dtype rather than a
    # hard-coded one.
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    ).to(model.device, dtype=model.dtype)

    # Greedy decoding. temperature is not passed: it has no effect when
    # do_sample=False and only triggers a generation-config warning.
    with torch.inference_mode():
        generation = model.generate(**inputs, max_new_tokens=512, do_sample=False)

    # generate() returns prompt tokens followed by the new tokens. Slice the
    # prompt off by length instead of splitting the decoded text on
    # "Assistant:", which breaks when the prompt itself contains that marker.
    prompt_len = inputs["input_ids"].shape[-1]
    new_tokens = generation[:, prompt_len:]
    decoded = processor.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()

    # Remove an echoed prompt if the model repeats the question verbatim
    if decoded.startswith(prompt.strip()):
        decoded = decoded[len(prompt.strip()):].strip()

    return decoded


## 4️⃣ Package & Save to JSON

In [None]:
# Run the loaded variant over every collected image. A failure on one image is
# reported and skipped so the remaining images are still processed.
results = []

for image_path in image_paths:
    try:
        output = infer_img(image_path, prompt)
        print(f"{image_path} → {output}")
    except Exception as exc:
        print(f"❌ Error processing {image_path}: {exc}")
    else:
        # Only successful inferences are recorded
        result = dict(
            image_path=image_path,
            model=MODEL_NAME,
            variant=VARIANT,
            prompt=prompt,
            output=output,
        )
        results.append(result)

# One write for the whole batch, after every image has been attempted
save_results(results, MODEL_NAME, VARIANT)


✅ Saved results to results/HuggingFaceTB/SmolVLM/256M-Instruct_20250502_102244.json


In [None]:
# WARNING: irreversibly deletes everything under ./results — including the JSON
# written by the previous cell (save_results defaults to output_dir="results").
# Download or copy the results elsewhere before running this cell.
!sudo rm -rf ./results/*


# Variant: SmolVLM-500M-Instruct

https://huggingface.co/HuggingFaceTB/SmolVLM-500M-Instruct

In [None]:
import gc

# Hub repo id is "<MODEL_NAME>-<VARIANT>"; the two parts are kept separate
# because save_results uses them for the output directory and file name.
MODEL_NAME = "HuggingFaceTB/SmolVLM"
VARIANT    = "500M-Instruct"
repo_id    = f"{MODEL_NAME}-{VARIANT}"

# Release the previously loaded variant first so two models never occupy
# accelerator memory at the same time.
model = processor = None
gc.collect()
torch.cuda.empty_cache()

# Fall back to CPU so the cell still runs (slowly) on a runtime without a GPU
# instead of crashing on a hard-coded "cuda".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(repo_id)
model     = AutoModelForImageTextToText.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16
).to(DEVICE).eval()



processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/429 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/486 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.2k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.55M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.74k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

## 3️⃣ Inference

In [None]:
def infer_img(image_path: str, prompt: str, system_prompt: str = ""):
    """Run the currently loaded model on one image and return its text reply.

    Relies on the notebook-level ``processor`` and ``model`` globals, so it
    always uses whichever variant was loaded most recently.

    Args:
        image_path: Path to an image file that PIL can open.
        prompt: User question sent together with the image.
        system_prompt: Optional system message; left out of the chat when empty.

    Returns:
        The decoded reply with special tokens removed and surrounding
        whitespace stripped.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image not found: {image_path}")

    # The context manager closes the file handle; convert() returns an
    # independent in-memory image, so it stays usable afterwards.
    with Image.open(image_path) as img:
        raw_image = img.convert("RGB")

    # Build messages
    messages = []
    if system_prompt:
        messages.append({
            "role": "system",
            "content": [{"type": "text", "text": system_prompt}]
        })

    messages.append({
        "role": "user",
        "content": [
            {"type": "image", "image": raw_image},
            {"type": "text", "text": prompt}
        ]
    })

    # Tokenize; inputs follow the model's own device and dtype rather than a
    # hard-coded one.
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    ).to(model.device, dtype=model.dtype)

    # Greedy decoding. temperature is not passed: it has no effect when
    # do_sample=False and only triggers a generation-config warning.
    with torch.inference_mode():
        generation = model.generate(**inputs, max_new_tokens=512, do_sample=False)

    # generate() returns prompt tokens followed by the new tokens. Slice the
    # prompt off by length instead of splitting the decoded text on
    # "Assistant:", which breaks when the prompt itself contains that marker.
    prompt_len = inputs["input_ids"].shape[-1]
    new_tokens = generation[:, prompt_len:]
    decoded = processor.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()

    # Remove an echoed prompt if the model repeats the question verbatim
    if decoded.startswith(prompt.strip()):
        decoded = decoded[len(prompt.strip()):].strip()

    return decoded


## 4️⃣ Package & Save to JSON

In [None]:
# Run the loaded variant over every collected image. A failure on one image is
# reported and skipped so the remaining images are still processed.
results = []

for image_path in image_paths:
    try:
        output = infer_img(image_path, prompt)
        print(f"{image_path} → {output}")
    except Exception as exc:
        print(f"❌ Error processing {image_path}: {exc}")
    else:
        # Only successful inferences are recorded
        result = dict(
            image_path=image_path,
            model=MODEL_NAME,
            variant=VARIANT,
            prompt=prompt,
            output=output,
        )
        results.append(result)

# One write for the whole batch, after every image has been attempted
save_results(results, MODEL_NAME, VARIANT)


✅ Saved results to results/HuggingFaceTB/SmolVLM/500M-Instruct_20250502_102429.json


In [None]:
# WARNING: irreversibly deletes everything under ./results — including the JSON
# written by the previous cell (save_results defaults to output_dir="results").
# Download or copy the results elsewhere before running this cell.
!sudo rm -rf ./results/*


# Variant: SmolVLM-Instruct (2.2 B)

https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct

In [None]:
import gc

# Hub repo id is "<MODEL_NAME>-<VARIANT>"; the two parts are kept separate
# because save_results uses them for the output directory and file name.
MODEL_NAME = "HuggingFaceTB/SmolVLM"
VARIANT    = "Instruct"
repo_id    = f"{MODEL_NAME}-{VARIANT}"

# Release the previously loaded variant first so two models never occupy
# accelerator memory at the same time.
model = processor = None
gc.collect()
torch.cuda.empty_cache()

# Fall back to CPU so the cell still runs (slowly) on a runtime without a GPU
# instead of crashing on a hard-coded "cuda".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(repo_id)
model     = AutoModelForImageTextToText.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16
).to(DEVICE).eval()



processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/429 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/486 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.2k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.55M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.74k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

## 3️⃣ Inference

In [None]:
def infer_img(image_path: str, prompt: str, system_prompt: str = ""):
    """Run the currently loaded model on one image and return its text reply.

    Relies on the notebook-level ``processor`` and ``model`` globals, so it
    always uses whichever variant was loaded most recently.

    Args:
        image_path: Path to an image file that PIL can open.
        prompt: User question sent together with the image.
        system_prompt: Optional system message; left out of the chat when empty.

    Returns:
        The decoded reply with special tokens removed and surrounding
        whitespace stripped.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image not found: {image_path}")

    # The context manager closes the file handle; convert() returns an
    # independent in-memory image, so it stays usable afterwards.
    with Image.open(image_path) as img:
        raw_image = img.convert("RGB")

    # Build messages
    messages = []
    if system_prompt:
        messages.append({
            "role": "system",
            "content": [{"type": "text", "text": system_prompt}]
        })

    messages.append({
        "role": "user",
        "content": [
            {"type": "image", "image": raw_image},
            {"type": "text", "text": prompt}
        ]
    })

    # Tokenize; inputs follow the model's own device and dtype rather than a
    # hard-coded one.
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    ).to(model.device, dtype=model.dtype)

    # Greedy decoding. temperature is not passed: it has no effect when
    # do_sample=False and only triggers a generation-config warning.
    with torch.inference_mode():
        generation = model.generate(**inputs, max_new_tokens=512, do_sample=False)

    # generate() returns prompt tokens followed by the new tokens. Slice the
    # prompt off by length instead of splitting the decoded text on
    # "Assistant:", which breaks when the prompt itself contains that marker.
    prompt_len = inputs["input_ids"].shape[-1]
    new_tokens = generation[:, prompt_len:]
    decoded = processor.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()

    # Remove an echoed prompt if the model repeats the question verbatim
    if decoded.startswith(prompt.strip()):
        decoded = decoded[len(prompt.strip()):].strip()

    return decoded


## 4️⃣ Package & Save to JSON

In [None]:
# Run the loaded variant over every collected image. A failure on one image is
# reported and skipped so the remaining images are still processed.
results = []

for image_path in image_paths:
    try:
        output = infer_img(image_path, prompt)
        print(f"{image_path} → {output}")
    except Exception as exc:
        print(f"❌ Error processing {image_path}: {exc}")
    else:
        # Only successful inferences are recorded
        result = dict(
            image_path=image_path,
            model=MODEL_NAME,
            variant=VARIANT,
            prompt=prompt,
            output=output,
        )
        results.append(result)

# One write for the whole batch, after every image has been attempted
save_results(results, MODEL_NAME, VARIANT)


✅ Saved results to results/HuggingFaceTB/SmolVLM/500M-Instruct_20250502_102429.json
