In [1]:
import os
import torch
import pandas as pd
from pathlib import Path
from PIL import Image
from transformers import AutoProcessor, MllamaForConditionalGeneration

# ---- CONFIG ----
# Checkpoint identifier handed to `from_pretrained` for both model and processor.
base_model_id = "unsloth/Llama-3.2-11B-Vision-Instruct"
# Directory scanned for the `NNN_student.png` / `NNN_teacher.png` image pairs;
# the results CSV is written into this same directory.
folder = Path("/workspace", "50set")
output_csv = folder.joinpath("caption_results_base.csv")

# ---- LOAD BASE MODEL ONLY ----
# Loading options are gathered in one mapping so they can be read at a glance:
# weights are requested in bfloat16 and `device_map="auto"` is passed through
# to `from_pretrained` to decide placement.
model_load_options = {
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
}
model = MllamaForConditionalGeneration.from_pretrained(base_model_id, **model_load_options)

# ---- LOAD PROCESSOR ----
# The processor is loaded from the same checkpoint id as the model.
processor = AutoProcessor.from_pretrained(base_model_id)

# ---- INSTRUCTION ----
# Prompt text sent alongside every image. It is assembled one line per entry
# and joined with newlines; the empty entries produce the blank lines that
# separate the sections.
# NOTE(review): the rule "Separate categories with a comma and a space" does
# not match the template line or the example, which separate categories with
# a single space — confirm which format is intended before editing the text.
_instruction_lines = [
    "You are a vision-language assistant. Describe an image using exactly six categories in a single line:",
    "",
    "main_objects: ... main_object_attributes: ... location: ... action: ... surroundings: ... background: ...",
    "",
    "Format rules:",
    "- Each category must start with its name, followed by a colon and a space",
    "- Use detailed, specific descriptions",
    "- Separate categories with a comma and a space",
    "- If a category is unclear, write: [category name]: none",
    "- Do NOT include commentary, line breaks, or extra text",
    "",
    "Example:",
    "main_objects: rabbit main_object_attributes: brown fur, fluffy tail, big ears, happy expression location: meadow action: none surroundings: butterflies, flowers, green grass background: foliage, sunlight",
]
instruction = "\n".join(_instruction_lines)

# ---- HELPER FUNCTION ----
def generate_caption(image_path, max_new_tokens=200):
    """Generate a caption for a single image with the loaded model.

    The image is paired with the module-level ``instruction`` in one user
    turn, rendered through the processor's chat template, and passed to
    ``model.generate``.

    Args:
        image_path: Path of the image file. It is opened with PIL and
            converted to RGB before being handed to the processor.
        max_new_tokens: Upper bound on the number of tokens to generate.
            Defaults to 200, the value that used to be hard-coded.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        not included), with special tokens skipped and surrounding
        whitespace stripped.
    """
    # Release the file handle as soon as the pixels have been copied out;
    # convert() returns a separate image object that outlives the handle.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": instruction}]}]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    # The chat template already inserts the special tokens, hence
    # add_special_tokens=False when tokenizing.
    inputs = processor(image, input_text, add_special_tokens=False, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # The generated sequence starts with the prompt tokens. Drop them so the
    # instruction text is not echoed into every caption.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_tokens = outputs[0][prompt_length:]
    return processor.decode(generated_tokens, skip_special_tokens=True).strip()

# ---- PROCESS BATCH ----
# Image pairs are named "<NNN>_student.png" / "<NNN>_teacher.png", where NNN
# is a zero-padded index starting at 1.
NUM_PAIRS = 50
RESULT_COLUMNS = ["Row", "Student", "Teacher"]

results = []
for i in range(1, NUM_PAIRS + 1):
    index = f"{i:03d}"
    student_img = folder / f"{index}_student.png"
    teacher_img = folder / f"{index}_teacher.png"

    # Only complete pairs are captioned; a row with either file missing is
    # reported and skipped.
    if not (student_img.exists() and teacher_img.exists()):
        print(f"⚠️ Missing pair for row {i}.")
        continue

    student_caption = generate_caption(student_img)
    teacher_caption = generate_caption(teacher_img)
    results.append({"Row": i, "Student": student_caption, "Teacher": teacher_caption})
    print(f"✅ Row {i} done.")

# ---- SAVE TO CSV ----
# The column list is passed explicitly so the CSV still gets its header row
# when no pair was found and `results` is empty.
df = pd.DataFrame(results, columns=RESULT_COLUMNS)
df.to_csv(output_csv, index=False)
print(f"\n📄 Saved results to {output_csv}")


The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

✅ Row 1 done.
✅ Row 2 done.
✅ Row 3 done.
✅ Row 4 done.
✅ Row 5 done.
✅ Row 6 done.
✅ Row 7 done.
✅ Row 8 done.
✅ Row 9 done.
✅ Row 10 done.
✅ Row 11 done.
✅ Row 12 done.
✅ Row 13 done.
✅ Row 14 done.
✅ Row 15 done.
✅ Row 16 done.
✅ Row 17 done.
✅ Row 18 done.
✅ Row 19 done.
✅ Row 20 done.
✅ Row 21 done.
✅ Row 22 done.
✅ Row 23 done.
✅ Row 24 done.
✅ Row 25 done.
✅ Row 26 done.
✅ Row 27 done.
✅ Row 28 done.
✅ Row 29 done.
✅ Row 30 done.
✅ Row 31 done.
✅ Row 32 done.
✅ Row 33 done.
✅ Row 34 done.
✅ Row 35 done.
✅ Row 36 done.
✅ Row 37 done.
✅ Row 38 done.
✅ Row 39 done.
✅ Row 40 done.
✅ Row 41 done.
✅ Row 42 done.
✅ Row 43 done.
✅ Row 44 done.
✅ Row 45 done.
✅ Row 46 done.
✅ Row 47 done.
✅ Row 48 done.
✅ Row 49 done.
✅ Row 50 done.

📄 Saved results to /workspace/50set/caption_results_base.csv
