In [2]:
# --- Notebook Cell: Build the JSONL dataset for Qwen2.5-VL via Unsloth ---

import os, json, random, shutil
from pathlib import Path
import pandas as pd
from PIL import Image

# -------------------------------
# Configuration (edit if needed)
# -------------------------------
SRC_OVERLAYS_DIR = Path("./out/overlays_overlay")   # your green-box overlays
QC_CSV           = Path("./out/qc_labels.csv")      # file,label,reason
COCO_JSON        = Path("./train/_annotations.coco.json")  # optional consistency check

OUTPUT_DIR       = Path("./out/datasets/qwen_cardboard_qc")
IMAGES_OUT_DIR   = OUTPUT_DIR / "images"
TRAIN_JSONL      = OUTPUT_DIR / "train.jsonl"
VAL_JSONL        = OUTPUT_DIR / "val.jsonl"
SPLIT_VAL_FRAC   = 0.15
RANDOM_SEED      = 42

# Optional: normalize long side for all images (recommended 300–1000 px).
# Set to an int like 768 to enable, or None to skip.
NORMALIZE_LONG_SIDE = None  # e.g., 768

# Your task prompt (kept verbatim, as provided)
PROMPT = (
    "You are an expert cardboard quality inspector. Analyze the cardboard pieces within the green bounding boxes in this image.\n\n"
    "Focus on:\n"
    "1. WARP: Look for major bending, curving, or deformation from a flat surface, slight bending is acceptable.\n"
    " 2. Reason: A short description. \n"
    "Return ONLY this exact JSON format with no additional text:\n"
    "{\"warp\": true, \"reason\": \"good\"}\n\n"
    "warp: true if ANY major bendingm/curving, false if completely flat or slight bending"
)

# -------------------------------
# Helpers
# -------------------------------
def ensure_dir(p: Path):
    p.mkdir(parents=True, exist_ok=True)

def resize_to_long_side(img: Image.Image, long_side: int) -> Image.Image:
    w, h = img.size
    if max(w, h) == long_side:
        return img
    if w >= h:
        new_w = long_side
        new_h = int(h * (long_side / float(w)))
    else:
        new_h = long_side
        new_w = int(w * (long_side / float(h)))
    return img.resize((max(1, new_w), max(1, new_h)), Image.BICUBIC)

def stratified_split_indices(indices_by_class, val_frac, rng):
    val_indices = set()
    for cls, idxs in indices_by_class.items():
        idxs = idxs[:]  # copy
        rng.shuffle(idxs)
        n_val = max(1, int(len(idxs) * val_frac)) if len(idxs) > 0 else 0
        val_indices.update(idxs[:n_val])
    return val_indices

# -------------------------------
# Load inputs
# -------------------------------
assert SRC_OVERLAYS_DIR.exists(), f"Overlays dir not found: {SRC_OVERLAYS_DIR}"
assert QC_CSV.exists(), f"Labels CSV not found: {QC_CSV}"

df = pd.read_csv(QC_CSV)
expected_cols = {"file", "label", "reason"}
missing_cols = expected_cols - set(df.columns.str.lower())
if missing_cols:
    raise ValueError(f"qc_labels.csv is missing columns: {missing_cols}. Expect columns: file,label,reason")

# Normalize column names
df.columns = df.columns.str.lower()

# Optional: COCO check (best-effort, no failure if missing)
coco_image_set = set()
if COCO_JSON.exists():
    try:
        with open(COCO_JSON, "r", encoding="utf-8") as f:
            coco = json.load(f)
        for im in coco.get("images", []):
            coco_image_set.add(im.get("file_name"))
    except Exception as e:
        print(f"[WARN] Could not read COCO file: {e}")

# -------------------------------
# Prepare output dirs
# -------------------------------
ensure_dir(OUTPUT_DIR)
ensure_dir(IMAGES_OUT_DIR)

# -------------------------------
# Build examples
# -------------------------------
records = []
missing_files = []
copied = 0

for i, row in df.iterrows():
    filename = row["file"]
    label    = str(row["label"]).strip()
    reason   = str(row["reason"]).strip()

    src_path = SRC_OVERLAYS_DIR / filename
    if not src_path.exists():
        missing_files.append(filename)
        continue

    # Target (assistant) mapping:
    # Pass  -> warp = False
    # Fail  -> warp = True
    warp = (label.lower() != "pass")
    assistant_obj = {"warp": bool(warp), "reason": reason}
    assistant_text = json.dumps(assistant_obj, ensure_ascii=False)  # ensures true/false lowercase

    # Copy (and optional resize) to dataset images folder using the same basename
    dst_path = IMAGES_OUT_DIR / filename
    if NORMALIZE_LONG_SIDE is not None:
        try:
            with Image.open(src_path) as im:
                im = im.convert("RGB")
                im = resize_to_long_side(im, NORMALIZE_LONG_SIDE)
                im.save(dst_path, quality=95)
        except Exception as e:
            print(f"[WARN] Could not resize {src_path}: {e}. Copying as-is.")
            shutil.copy2(src_path, dst_path)
    else:
        if not dst_path.exists():
            shutil.copy2(src_path, dst_path)
    copied += 1

    # Messages use the repo-relative path "images/<filename>"
    rel_image = f"images/{filename}"
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text",  "text": PROMPT},
                {"type": "image", "image": rel_image}
            ],
        },
        {
            "role": "assistant",
            "content": [
                {"type": "text", "text": assistant_text}
            ],
        },
    ]

    records.append({
        "id": Path(filename).stem,
        "messages": messages,
        # optional metadata for traceability:
        "label": label,
        "reason": reason,
        "image": rel_image
    })

print(f"Loaded {len(df)} CSV rows; usable overlays: {len(records)}; copied: {copied}")
if missing_files:
    print(f"[INFO] {len(missing_files)} filenames in CSV missing in overlays:\n  - " + "\n  - ".join(missing_files[:10]) + ("\n  ..." if len(missing_files) > 10 else ""))

if coco_image_set:
    not_in_coco = [r["id"] for r in records if (Path(r["image"]).name not in coco_image_set)]
    if not_in_coco:
        print(f"[INFO] {len(not_in_coco)} overlays not listed in COCO. That's OK for finetune, just FYI.")

# -------------------------------
# Stratified split & write JSONL
# -------------------------------
rng = random.Random(RANDOM_SEED)
indices_by_class = {False: [], True: []}
for idx, r in enumerate(records):
    warp = json.loads(r["messages"][1]["content"][0]["text"])["warp"]
    indices_by_class[warp].append(idx)

val_indices = stratified_split_indices(indices_by_class, SPLIT_VAL_FRAC, rng)

def write_jsonl(path: Path, idx_set):
    with open(path, "w", encoding="utf-8") as f:
        for i, r in enumerate(records):
            if (i in idx_set) == True:
                f.write(json.dumps({"messages": r["messages"]}, ensure_ascii=False) + "\n")

def write_jsonl_complement(path: Path, idx_set):
    with open(path, "w", encoding="utf-8") as f:
        for i, r in enumerate(records):
            if (i in idx_set) == False:
                f.write(json.dumps({"messages": r["messages"]}, ensure_ascii=False) + "\n")

write_jsonl(VAL_JSONL, val_indices)
write_jsonl_complement(TRAIN_JSONL, val_indices)

print(f"Wrote: {TRAIN_JSONL} and {VAL_JSONL}")
print(f"Train size: {sum(1 for _ in open(TRAIN_JSONL, 'r', encoding='utf-8'))}  |  Val size: {sum(1 for _ in open(VAL_JSONL, 'r', encoding='utf-8'))}")

# Quick peek at one line
with open(TRAIN_JSONL, "r", encoding="utf-8") as f:
    print("Sample JSONL line:\n", f.readline().strip()[:300] + " ...")


Loaded 168 CSV rows; usable overlays: 168; copied: 168
Wrote: out\datasets\qwen_cardboard_qc\train.jsonl and out\datasets\qwen_cardboard_qc\val.jsonl
Train size: 143  |  Val size: 25
Sample JSONL line:
 {"messages": [{"role": "user", "content": [{"type": "text", "text": "You are an expert cardboard quality inspector. Analyze the cardboard pieces within the green bounding boxes in this image.\n\nFocus on:\n1. WARP: Look for major bending, curving, or deformation from a flat surface, slight bending i ...


## Push to HF

In [None]:
%%capture
# --- Notebook Cell: Push JSONL + images to Hugging Face Datasets repo ---

# pip installs in-notebook if needed:
# %pip install -qU huggingface_hub

from huggingface_hub import HfApi, create_repo, upload_folder, upload_file, whoami

HF_TOKEN      = "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"     # <-- paste your token
HF_USERNAME   = None                      # set to None to auto-detect from token
DATASET_NAME  = "cardboard-qc-qwen-vl"    # you can rename

api = HfApi(token=HF_TOKEN)
if HF_USERNAME is None:
    HF_USERNAME = whoami(token=HF_TOKEN)["name"]

repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
create_repo(repo_id, repo_type="dataset", private=True, exist_ok=True, token=HF_TOKEN)

# Upload images folder
upload_folder(
    repo_id=repo_id,
    repo_type="dataset",
    folder_path=str(IMAGES_OUT_DIR),
    path_in_repo="images",
    token=HF_TOKEN,
)

# Upload JSONL files
upload_file(
    path_or_fileobj=str(TRAIN_JSONL),
    path_in_repo="train.jsonl",
    repo_id=repo_id,
    repo_type="dataset",
    token=HF_TOKEN,
)
upload_file(
    path_or_fileobj=str(VAL_JSONL),
    path_in_repo="val.jsonl",
    repo_id=repo_id,
    repo_type="dataset",
    token=HF_TOKEN,
)

# Minimal dataset card with usage snippet
readme_text = """---
pretty_name: Cardboard QC (Qwen2.5-VL)
task_categories:
- visual-question-answering
- image-classification
license: cc-by-4.0
---

# Cardboard QC — Qwen2.5-VL

Each row is a chat with **messages** following Unsloth's vision finetune format:

```json
{{
  "messages": [
    {{
      "role": "user",
      "content": [
        {{"type": "text", "text": "INSTRUCTION..."}},
        {{"type": "image", "image": "images/IMG_....jpg"}}
      ]
    }},
    {{
      "role": "assistant",
      "content": [{{"type": "text", "text": "{{\\"warp\\": true/false, \\"reason\\": \\"...\\}}"}}]
    }}
  ]
}}
Example load
from datasets import load_dataset
ds = load_dataset("{repo_id}")
print(ds["train"][0]["messages"][0]["content"])
"""
Images are under images/. The user instruction asks the model to focus on green bounding boxes drawn in the overlays.

readme_path = OUTPUT_DIR / "README.md"
with open(readme_path, "w", encoding="utf-8") as f:
f.write(readme_text)

upload_file(
path_or_fileobj=str(readme_path),
path_in_repo="README.md",
repo_id=repo_id,
repo_type="dataset",
token=HF_TOKEN,
)

print(f"✅ Uploaded dataset to https://huggingface.co/datasets/{repo_id}")



## Play with Chat temp

In [1]:
import torch
print(torch.cuda.is_available())
from unsloth.chat_templates import CHAT_TEMPLATES
print(list(CHAT_TEMPLATES.keys()))

False


NotImplementedError: Unsloth currently only works on NVIDIA GPUs and Intel GPUs.