In [2]:
! pip install datasets

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading aiohttp-3.12.15-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)
Collecting aiosigna

In [3]:
# AdImageNet: download → filter → save → manifest
# Requirements:
#   pip install -U datasets pillow pandas tqdm huggingface_hub

from datasets import load_dataset
from PIL import Image
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import io, os, re, sys

# ======================
# Config
# ======================
# Root output folder. Set the ADIMAGENET_OUT_ROOT environment variable to
# redirect the output without editing the notebook; the previous hard-coded
# location is kept as the default.
OUT_ROOT   = Path(os.environ.get("ADIMAGENET_OUT_ROOT", "/data/thesis/AdImageNet"))
IMG_DIR    = OUT_ROOT / "images"                    # saved JPEG creatives
CSV_PATH   = OUT_ROOT / "adimagenet_manifest.csv"   # one row per saved creative
KEEP_TEXT_MAX_LEN = 80   # records whose text is longer than this are dropped
MIN_W = 250              # minimum width a creative must have to be kept
MIN_H = 250              # minimum height a creative must have to be kept

# Create the output folders up front so the save loop can write immediately.
OUT_ROOT.mkdir(parents=True, exist_ok=True)
IMG_DIR.mkdir(parents=True, exist_ok=True)

# ======================
# Load dataset
# ======================
# The dataset is gated on the Hub, so you must be authenticated to download it.
# If you get 401/403 or DatasetNotFoundError: run `huggingface-cli login` in the
# same environment (you may also need to request access on the dataset page).
try:
    ds = load_dataset("PeterBrendan/AdImageNet", split="train")
except ValueError:
    # Fallback only for an unknown/undefined split (reported as ValueError).
    # Authentication and network failures are deliberately NOT caught here:
    # retrying the download would just fail again and bury the real error.
    dsdict = load_dataset("PeterBrendan/AdImageNet")
    ds = next(iter(dsdict.values()))

# ======================
# Helpers
# ======================
# Matches each run of digits, e.g. "(300, 250)" -> ["300", "250"].
dim_re = re.compile(r"\d+")

def get_dims(record, pil_img: Image.Image | None):
    """
    Resolve the (width, height) of a record.

    The first two integers found in the string record['dimensions'] win.
    When that field is missing, not a string, or holds fewer than two
    numbers, the size of ``pil_img`` is used instead. If no image is
    supplied either, the result is (None, None).
    """
    declared = record.get("dimensions")
    width = height = None

    if isinstance(declared, str):
        found = dim_re.findall(declared)
        if len(found) >= 2:
            width, height = (int(tok) for tok in found[:2])

    # Without an image there is nothing left to fall back on.
    if pil_img is None:
        return width, height

    pil_w, pil_h = pil_img.size
    if width is None:
        width = pil_w
    if height is None:
        height = pil_h
    return width, height

def get_pil(record):
    """
    Build an RGB PIL.Image from record['image'].

    Accepted payloads:
      * a PIL image (what the dataset's Image feature yields when decoded),
      * raw bytes / bytearray holding an encoded image,
      * a string path to an existing file,
      * a dict with "bytes" and/or "path" keys (the undecoded form of the
        Image feature); bytes are preferred over the path.

    Returns:
        A new RGB image that does not depend on any open file handle.

    Raises:
        KeyError: if record['image'] is missing or is none of the above.
    """
    val = record.get("image")

    # Undecoded Image feature: unwrap to bytes (preferred) or a path.
    if isinstance(val, dict):
        val = val.get("bytes") or val.get("path")

    if isinstance(val, Image.Image):
        return val.convert("RGB")
    if isinstance(val, (bytes, bytearray)):
        # convert() loads the pixel data and returns a separate image, so the
        # source can be closed as soon as the conversion is done.
        with Image.open(io.BytesIO(val)) as src:
            return src.convert("RGB")
    if isinstance(val, str) and os.path.exists(val):
        # Context manager guarantees the file handle is released even for
        # formats that keep the file open after loading.
        with Image.open(val) as src:
            return src.convert("RGB")
    raise KeyError("No usable image payload in record['image'].")

def pass_filters(record, pil_img):
    """
    Decide whether a record should be kept.

    A record is rejected when its text is a string longer than
    KEEP_TEXT_MAX_LEN, when its dimensions cannot be resolved, or when
    either side is smaller than MIN_W / MIN_H. Returns True otherwise.
    """
    # Text-length gate (a missing or empty text always passes).
    caption = record.get("text") or ""
    too_long = isinstance(caption, str) and len(caption) > KEEP_TEXT_MAX_LEN
    if too_long:
        return False

    # Size gate: both width and height must reach their thresholds.
    width, height = get_dims(record, pil_img)
    if any(side is None for side in (width, height)):
        return False
    return not (width < MIN_W or height < MIN_H)

# ======================
# Save loop + manifest
# ======================
rows = []      # one manifest record per saved creative
kept = 0       # creatives written to disk
skipped = 0    # creatives filtered out OR dropped because of an error

for i, rec in enumerate(tqdm(ds, desc="Filtering & saving")):
    try:
        img = get_pil(rec)
        if not pass_filters(rec, img):
            skipped += 1
            continue

        # organize optionally by dimensions folder, e.g. "(300, 250)"
        sub = str(rec.get("dimensions") or "")
        save_dir = IMG_DIR / sub if sub else IMG_DIR
        save_dir.mkdir(parents=True, exist_ok=True)

        # filename: prefer dataset file_name, else generate.
        # Only the final path component is kept so a file_name containing
        # directories cannot point outside save_dir or into a missing folder.
        default_name = f"ad_{i:06d}.jpg"
        fname = Path(str(rec.get("file_name") or default_name)).name or default_name

        # The data written below is always JPEG, so the extension must say so:
        # anything other than .jpg/.jpeg (including .png/.webp) becomes .jpg.
        stem, ext = os.path.splitext(fname)
        if ext.lower() not in (".jpg", ".jpeg"):
            fname = f"{stem}.jpg"

        out_path = save_dir / fname
        img.save(out_path, format="JPEG", quality=95, optimize=True)

        w, h = get_dims(rec, img)
        rows.append({
            "file_path": str(out_path.resolve()),
            "file_name": fname,
            "text": rec.get("text") or "",
            "dimensions": rec.get("dimensions") or "",
            "width": w, "height": h,
        })
        kept += 1

    except Exception as e:
        # Best effort: one bad record must not abort the whole export.
        # The failure is reported on stderr and counted as skipped.
        skipped += 1
        print(f"[warn] row {i}: {e}", file=sys.stderr)

# ======================
# Write manifest
# ======================
# Explicit column list: even when no creative was kept, the CSV still gets a
# header row with the expected schema instead of being written column-less.
MANIFEST_COLUMNS = ["file_path", "file_name", "text", "dimensions", "width", "height"]
df = pd.DataFrame(rows, columns=MANIFEST_COLUMNS)
df.to_csv(CSV_PATH, index=False)

# Short run summary: counts plus where the outputs were written.
print(f"\nDone. Kept {kept} creatives, skipped {skipped}.")
print(f"Images → {IMG_DIR}")
print(f"Manifest → {CSV_PATH}")


README.md:   0%|          | 0.00/3.68k [00:00<?, ?B/s]

DatasetNotFoundError: Dataset 'PeterBrendan/AdImageNet' is a gated dataset on the Hub. You must be authenticated to access it.