In [1]:
# Assumes you are working inside the project (root: iqa_vlm/)
# and that utils/dataset_utils.py has already been created there.

import os
import logging
from pathlib import Path

import matplotlib.pyplot as plt
from datasets import load_from_disk

# Make the project root importable so `utils.dataset_utils` resolves whether the
# kernel was started in the project root itself or in a sub-folder (notebooks/).
# The root is the nearest directory (cwd first, then its ancestors) that contains
# utils/dataset_utils.py; if none is found, fall back to the previous assumption
# that the notebook runs exactly one level below the root.
import sys

_cwd = Path.cwd().resolve()
root = next(
    (
        candidate
        for candidate in (_cwd, *_cwd.parents)
        if (candidate / "utils" / "dataset_utils.py").is_file()
    ),
    Path("..").resolve(),
)
if str(root) not in sys.path:
    sys.path.insert(0, str(root))

from utils.dataset_utils import (
    ProcessedDatasetLoader,
    inspect_sample,
    make_test_subset,
)

# Basic logging
# force=True removes any handlers already attached to the root logger before
# applying this configuration. Without it, basicConfig() silently does nothing
# once the root logger has handlers (e.g. when this cell is re-run), so the
# level/format below would be ignored.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    force=True,
)


ModuleNotFoundError: No module named 'matplotlib'

In [1]:
# Adjust if your processed data is somewhere else
data_dir = "data/processed"

# The relative path above only resolves when the kernel's working directory is
# the project root. If it is not found there (e.g. the kernel was started in
# notebooks/), fall back to the same relative path under `root`, the project
# root computed in the setup cell. data_dir stays a str in both cases.
if not Path(data_dir).is_dir() and (root / data_dir).is_dir():
    data_dir = str(root / data_dir)

loader = ProcessedDatasetLoader(data_dir=data_dir)

loader.print_summary()


NameError: name 'ProcessedDatasetLoader' is not defined

In [None]:
# Run the loader's validation and report the outcome; `ok` is reused by the
# next cell to decide whether loading may proceed.
print("Validating datasets...")
ok = loader.validate()
verdict = "✅ PASS" if ok else "❌ FAIL"
print(f"\nValidation result: {verdict}")


In [None]:
# Refuse to continue with datasets that did not pass validation.
if not ok:
    raise RuntimeError("Datasets failed validation; fix before proceeding.")

# Load the three splits through the shared loader.
train_ds = loader.load_train()
val_ds = loader.load_val()
test_ds = loader.load_test()

# Cell output: (train, val, test) sample counts.
tuple(map(len, (train_ds, val_ds, test_ds)))


In [None]:
# Inspect the first training sample (index 0). load_image=False is passed —
# presumably this skips decoding the image itself; confirm in utils/dataset_utils.py.
inspect_sample(train_ds, index=0, load_image=False)


In [None]:
# Inspect a handful of training samples, again passing load_image=False.
for idx in (0, 5, 10):
    banner = f"\n=== SAMPLE {idx} ==="
    print(banner)
    inspect_sample(train_ds, index=idx, load_image=False)


In [None]:
from PIL import Image
import numpy as np

def show_image_from_sample(ds, index=0):
    """Display the image of one dataset sample, titled with its category and MOS.

    Args:
        ds: Dataset-like object exposing ``column_names`` and integer indexing
            that returns one mapping per sample.
        index: Position of the sample to display.

    Raises:
        ValueError: If the dataset has neither an ``images`` nor an
            ``image_path`` column, or if the sample's ``images`` entry is an
            empty sequence.
    """
    sample = ds[index]
    cols = set(ds.column_names)

    if "images" in cols:
        # tif_direct mode: image is already in the dataset
        img = sample["images"]
        # The column name is plural; tolerate a per-sample list of images by
        # showing the first one.
        if isinstance(img, (list, tuple)):
            if not img:
                raise ValueError(f"Sample {index} has an empty 'images' entry.")
            img = img[0]
    elif "image_path" in cols:
        # png_paths mode: open from path. Copy the pixels out inside the
        # context manager so the underlying file handle is closed right away
        # instead of staying open for the lifetime of the kernel.
        with Image.open(sample["image_path"]) as opened:
            img = opened.copy()
    else:
        raise ValueError(f"No image or image_path column in dataset. Columns: {cols}")

    # Missing or None metadata must not prevent the image from being shown.
    metadata = (sample["metadata"] if "metadata" in cols else None) or {}

    # Only PIL images carry `.mode`; arrays and other image-like values are
    # passed to imshow with the default colormap.
    is_grayscale = getattr(img, "mode", None) == "L"

    fig, ax = plt.subplots(figsize=(4, 4))
    ax.imshow(img, cmap="gray" if is_grayscale else None)
    ax.axis("off")
    ax.set_title(
        f"Index {index} | {metadata.get('category', '?')} | MOS={metadata.get('mos', '?')}"
    )
    plt.show()

show_image_from_sample(train_ds, index=0)


In [None]:
import json

# JSONL file locations exposed by the loader, one per split. Presumably these
# are the raw exports the HF datasets were built from — confirm in
# ProcessedDatasetLoader.
train_jsonl_path = loader.train_jsonl
val_jsonl_path   = loader.val_jsonl
test_jsonl_path  = loader.test_jsonl

def count_jsonl(path: Path) -> int:
    """Count the records in a JSON Lines file.

    Blank or whitespace-only lines are skipped, so trailing newlines or padding
    do not inflate the count when it is compared with a dataset length.

    Args:
        path: Location of the ``.jsonl`` file (``Path`` or ``str``).

    Returns:
        Number of non-blank lines, or 0 (after printing a notice) when the
        file does not exist.
    """
    path = Path(path)
    if not path.exists():
        print(f"{path} missing")
        return 0
    with path.open("r", encoding="utf-8") as f:
        return sum(1 for line in f if line.strip())

# Compare the number of JSONL lines with the loaded dataset size, per split.
# Labels are padded so the colons line up in the printed report.
print("JSONL vs HF counts:")
for split_label, split_jsonl, split_ds in (
    ("train", train_jsonl_path, train_ds),
    ("val  ", val_jsonl_path, val_ds),
    ("test ", test_jsonl_path, test_ds),
):
    print(f"  {split_label}:", count_jsonl(split_jsonl), "vs HF", len(split_ds))


In [None]:
# Build a 32-sample subset of the training split and persist it next to the
# processed data.
tiny_train = make_test_subset(train_ds, num_samples=32)
# Only the last expression of a cell is displayed, so the subset size has to be
# printed explicitly to be visible here.
print(f"Tiny train subset size: {len(tiny_train)}")
tiny_dir = Path(data_dir) / "train_dataset_tiny"
tiny_dir.mkdir(parents=True, exist_ok=True)
tiny_train.save_to_disk(str(tiny_dir))
print(f"Tiny train dataset saved to {tiny_dir}")



In [None]:
from collections import Counter

def mos_distribution(ds, name="train"):
    """Print how many samples fall into each metadata ``category``.

    Samples whose metadata is missing, lacks a ``category`` key, or has a
    ``None`` category are grouped under ``"?"`` instead of raising.

    Args:
        ds: Object whose ``ds["metadata"]`` yields one mapping per sample.
        name: Label printed in the report header.
    """
    cats = []
    for m in ds["metadata"]:
        cat = (m or {}).get("category")
        # Normalise to str so sorting and the ':10s' format below cannot fail
        # on None or non-string category values.
        cats.append("?" if cat is None else str(cat))

    c = Counter(cats)
    print(f"\n{name} distribution:")
    # `or 1` avoids a division by zero for an empty dataset.
    total = sum(c.values()) or 1
    for k, v in sorted(c.items()):
        print(f"  {k:10s}: {v:5d} ({100.0*v/total:5.1f}%)")

# Print the category distribution of every split, in train / val / test order.
for split_name, split_ds in (("train", train_ds), ("val", val_ds), ("test", test_ds)):
    mos_distribution(split_ds, split_name)
