# Notes on Reproducibility:
These scripts were used to download images and generate the data manifests employed during training and evaluation. They represent the actual runs that produced the manifests used in this dissertation project.
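While the scripts remain functional and can be executed successfully, their outputs may differ from the original manifests if the underlying data sources have changed since the original collection (for example, LAION image URLs that have since gone offline are skipped by the download script, so fewer or different images would be saved).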
For access to the exact manifests used in this work, please contact the author.

# LAION 2B - Download Script

In [None]:
from datasets import load_dataset
import itertools, os, requests
import pandas as pd
from tqdm import tqdm
from PIL import Image
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor, as_completed

# ----------------------------------------------------------------------
# Run configuration
# ----------------------------------------------------------------------
# Upper bound on the number of filtered stream examples to attempt
# (failed downloads are skipped, so fewer images end up on disk).
total_size = 5_000_000
# Number of examples handed to the thread pool per batch.
batch_size = 50_000
output_dir = "/data/thesis/laion_5m_images"
csv_path = "/data/thesis/laion_5m_manifest.csv"

os.makedirs(output_dir, exist_ok=True)

# ----------------------------------------------------------------------
# Open the dataset as a stream
# ----------------------------------------------------------------------
# Raises KeyError immediately if HF_TOKEN is not set in the environment.
hf_token = os.environ["HF_TOKEN"]
ds = load_dataset(
    "laion/laion2B-en-aesthetic",
    split="train",
    streaming=True,
    token=hf_token,
)

def good(example, min_score=5.0):
    """Return True when the example's aesthetic score is at least ``min_score``.

    Args:
        example: Mapping for one dataset row; the score is read from its
            ``"aesthetic"`` key.
        min_score: Inclusive lower bound on the score. Defaults to 5.0.

    Returns:
        bool: False when the score is below ``min_score``. A missing or
        ``None`` score is treated as 0, and a score that cannot be converted
        to float is rejected instead of raising, so one malformed row does
        not abort a long streaming run.
    """
    score = example.get("aesthetic")
    if score is None:
        score = 0
    try:
        return float(score) >= min_score
    except (TypeError, ValueError):
        return False

# Lazy, single-pass iterator over the rows accepted by good(); nothing is
# read from the dataset until the iterator is consumed.
stream = (example for example in ds if good(example))

# ======================
# Download function
# ======================
def fetch_and_save(idx, ex, output_dir):
    """Download one image, re-encode it as an RGB JPEG and return its manifest row.

    Args:
        idx: Integer used for the ``id`` field and for the zero-padded
            file name (``{idx:07d}.jpg``).
        ex: Mapping for one dataset row. The URL is read from ``"URL"`` or
            ``"url"``, the caption from ``"TEXT"`` or ``"text"``, and the
            score from ``"aesthetic"`` (default 0).
        output_dir: Directory the JPEG is written to.

    Returns:
        dict | None: A dict with keys ``id``, ``url``, ``caption``,
        ``aesthetic`` and ``local_path`` on success. ``None`` when the row has
        no URL, the response status is not 200, or any error occurs while
        requesting, decoding or saving the image.
    """
    url = ex.get("URL") or ex.get("url")
    if not url:
        # Nothing to download; skip without issuing a request.
        return None
    caption = ex.get("TEXT") or ex.get("text")
    aesthetic = ex.get("aesthetic", 0)
    local_path = os.path.join(output_dir, f"{idx:07d}.jpg")

    try:
        r = requests.get(url, timeout=10)
        if r.status_code != 200:
            return None
        # convert() returns a separate image, so the source can be closed.
        with Image.open(BytesIO(r.content)) as src:
            rgb = src.convert("RGB")
    except Exception:
        # Deliberate best effort: dead links, timeouts and undecodable
        # payloads are expected at this scale and must not stop the run.
        return None

    try:
        rgb.save(local_path, "JPEG")
    except Exception:
        # Do not leave a truncated file behind that the manifest never lists.
        try:
            os.remove(local_path)
        except OSError:
            pass  # the file was never created
        return None

    return {
        "id": idx,
        "url": url,
        "caption": caption,
        "aesthetic": aesthetic,
        "local_path": local_path,
    }


# ======================
# Batched Download Loop
# ======================
# Manifest rows for every successfully saved image, across all batches.
records = []
for batch_start in range(0, total_size, batch_size):
    batch_no = batch_start // batch_size + 1
    print(f"\n=== Starting batch {batch_no} ===")
    # `stream` is one shared iterator, so each islice call continues where
    # the previous batch stopped.
    batch_stream = itertools.islice(stream, batch_size)

    with ThreadPoolExecutor(max_workers=64) as executor:
        # Ids start at batch_start so file names stay unique across batches.
        futures = [
            executor.submit(fetch_and_save, idx, ex, output_dir)
            for idx, ex in enumerate(batch_stream, start=batch_start)
        ]

        # Use the real number of submitted tasks: a short final batch would
        # otherwise leave the progress bar stuck below 100%.
        for future in tqdm(as_completed(futures), total=len(futures), desc=f"Batch {batch_no}"):
            result = future.result()
            if result:
                records.append(result)

    # Save partial CSV after each batch
    pd.DataFrame(records).to_csv(csv_path, index=False)
    print(f"✅ Batch {batch_no} complete. Saved {len(records)} records so far.")

    if len(futures) < batch_size:
        # islice only comes up short when the underlying stream has ended,
        # so further batches would be empty.
        print("Stream exhausted before total_size was reached; stopping.")
        break


print(f"\n🎉 Done. Total records saved: {len(records)}")


Resolving data files:   0%|          | 0/128 [00:00<?, ?it/s]


=== Starting batch 1 ===


Batch 1: 100%|██████████| 50000/50000 [04:45<00:00, 175.14it/s]  


✅ Batch 1 complete. Saved 33579 records so far.

=== Starting batch 2 ===


Batch 2: 100%|██████████| 50000/50000 [06:00<00:00, 138.64it/s] 


✅ Batch 2 complete. Saved 67055 records so far.

=== Starting batch 3 ===


Batch 3: 100%|██████████| 50000/50000 [06:43<00:00, 124.01it/s] 


✅ Batch 3 complete. Saved 100362 records so far.

=== Starting batch 4 ===


Batch 4: 100%|██████████| 50000/50000 [07:37<00:00, 109.37it/s] 


✅ Batch 4 complete. Saved 133839 records so far.

=== Starting batch 5 ===


Batch 5: 100%|██████████| 50000/50000 [07:11<00:00, 115.87it/s] 


✅ Batch 5 complete. Saved 167332 records so far.

=== Starting batch 6 ===


Batch 6: 100%|██████████| 50000/50000 [07:06<00:00, 117.20it/s] 


✅ Batch 6 complete. Saved 200988 records so far.

=== Starting batch 7 ===


Batch 7: 100%|██████████| 50000/50000 [07:13<00:00, 115.26it/s] 


✅ Batch 7 complete. Saved 234383 records so far.

=== Starting batch 8 ===


Batch 8: 100%|██████████| 50000/50000 [08:25<00:00, 98.93it/s]  


✅ Batch 8 complete. Saved 267846 records so far.

=== Starting batch 9 ===


Batch 9: 100%|██████████| 50000/50000 [08:10<00:00, 101.89it/s] 


✅ Batch 9 complete. Saved 300877 records so far.

=== Starting batch 10 ===


Batch 10: 100%|██████████| 50000/50000 [08:55<00:00, 93.33it/s] 


✅ Batch 10 complete. Saved 334407 records so far.

=== Starting batch 11 ===


Batch 11:  45%|████▍     | 22317/50000 [02:44<05:14, 88.16it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Batch 13: 100%|██████████| 50000/50000 [07:48<00:00, 106.62it/s] 


✅ Batch 13 complete. Saved 434478 records so far.

=== Starting batch 14 ===


Batch 14: 100%|██████████| 50000/50000 [07:00<00:00, 118.98it/s] 


✅ Batch 14 complete. Saved 467758 records so far.

=== Starting batch 15 ===


Batch 15: 100%|██████████| 50000/50000 [07:19<00:00, 113.73it/s] 


✅ Batch 15 complete. Saved 501047 records so far.

=== Starting batch 16 ===


Batch 16: 100%|██████████| 50000/50000 [07:21<00:00, 113.24it/s] 


✅ Batch 16 complete. Saved 534518 records so far.

=== Starting batch 17 ===


Batch 17: 100%|██████████| 50000/50000 [07:02<00:00, 118.44it/s] 


✅ Batch 17 complete. Saved 567911 records so far.

=== Starting batch 18 ===


Batch 18: 100%|██████████| 50000/50000 [07:13<00:00, 115.41it/s] 


✅ Batch 18 complete. Saved 601385 records so far.

=== Starting batch 19 ===


Batch 19: 100%|██████████| 50000/50000 [07:35<00:00, 109.69it/s] 


✅ Batch 19 complete. Saved 634802 records so far.

=== Starting batch 20 ===


Batch 20: 100%|██████████| 50000/50000 [07:08<00:00, 116.57it/s] 


✅ Batch 20 complete. Saved 668109 records so far.

=== Starting batch 21 ===


Batch 21: 100%|██████████| 50000/50000 [07:10<00:00, 116.20it/s] 


✅ Batch 21 complete. Saved 701344 records so far.

=== Starting batch 22 ===


Batch 22: 100%|██████████| 50000/50000 [07:06<00:00, 117.32it/s] 


✅ Batch 22 complete. Saved 734644 records so far.

=== Starting batch 23 ===


Batch 23: 100%|██████████| 50000/50000 [07:07<00:00, 116.88it/s] 


✅ Batch 23 complete. Saved 768159 records so far.

=== Starting batch 24 ===


Batch 24: 100%|██████████| 50000/50000 [06:52<00:00, 121.21it/s] 


✅ Batch 24 complete. Saved 801781 records so far.

=== Starting batch 25 ===


Batch 25: 100%|██████████| 50000/50000 [07:04<00:00, 117.90it/s] 


✅ Batch 25 complete. Saved 835091 records so far.

=== Starting batch 26 ===


Batch 26: 100%|██████████| 50000/50000 [07:13<00:00, 115.30it/s] 


✅ Batch 26 complete. Saved 868706 records so far.

=== Starting batch 27 ===


Batch 27: 100%|██████████| 50000/50000 [06:59<00:00, 119.22it/s] 


✅ Batch 27 complete. Saved 902188 records so far.

=== Starting batch 28 ===


Batch 28: 100%|██████████| 50000/50000 [07:46<00:00, 107.22it/s] 


✅ Batch 28 complete. Saved 935570 records so far.

=== Starting batch 29 ===


Batch 29: 100%|██████████| 50000/50000 [07:10<00:00, 116.15it/s] 


✅ Batch 29 complete. Saved 968873 records so far.

=== Starting batch 30 ===


Batch 30: 100%|██████████| 50000/50000 [06:38<00:00, 125.50it/s] 


✅ Batch 30 complete. Saved 1002481 records so far.

=== Starting batch 31 ===


Batch 31: 100%|██████████| 50000/50000 [07:09<00:00, 116.43it/s] 


✅ Batch 31 complete. Saved 1035872 records so far.

=== Starting batch 32 ===


Batch 32: 100%|██████████| 50000/50000 [07:06<00:00, 117.28it/s] 


✅ Batch 32 complete. Saved 1069257 records so far.

=== Starting batch 33 ===


Batch 33: 100%|██████████| 50000/50000 [07:18<00:00, 114.05it/s] 


✅ Batch 33 complete. Saved 1102616 records so far.

=== Starting batch 34 ===


Batch 34: 100%|██████████| 50000/50000 [07:32<00:00, 110.44it/s] 


✅ Batch 34 complete. Saved 1136228 records so far.

=== Starting batch 35 ===


Batch 35: 100%|██████████| 50000/50000 [06:59<00:00, 119.22it/s] 


✅ Batch 35 complete. Saved 1169923 records so far.

=== Starting batch 36 ===


Batch 36: 100%|██████████| 50000/50000 [07:45<00:00, 107.42it/s] 


✅ Batch 36 complete. Saved 1203271 records so far.

=== Starting batch 37 ===


Batch 37: 100%|██████████| 50000/50000 [07:03<00:00, 118.08it/s] 


✅ Batch 37 complete. Saved 1236588 records so far.

=== Starting batch 38 ===


Batch 38: 100%|██████████| 50000/50000 [07:18<00:00, 114.02it/s] 


✅ Batch 38 complete. Saved 1269932 records so far.

=== Starting batch 39 ===


Batch 39:  64%|██████▍   | 32017/50000 [03:47<02:34, 116.43it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Batch 57: 100%|██████████| 50000/50000 [06:39<00:00, 125.05it/s]


✅ Batch 57 complete. Saved 1902866 records so far.

=== Starting batch 58 ===


Batch 58:  70%|███████   | 35101/50000 [04:23<02:17, 108.46it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Batch 63: 100%|██████████| 50000/50000 [06:57<00:00, 119.90it/s]


✅ Batch 63 complete. Saved 2102807 records so far.

=== Starting batch 64 ===


Batch 64: 100%|██████████| 50000/50000 [06:46<00:00, 123.09it/s] 


✅ Batch 64 complete. Saved 2135957 records so far.

=== Starting batch 65 ===


Batch 65: 100%|██████████| 50000/50000 [06:43<00:00, 123.80it/s] 


✅ Batch 65 complete. Saved 2169228 records so far.

=== Starting batch 66 ===


Batch 66: 100%|██████████| 50000/50000 [07:32<00:00, 110.50it/s] 


✅ Batch 66 complete. Saved 2202505 records so far.

=== Starting batch 67 ===


Batch 67: 100%|██████████| 50000/50000 [07:05<00:00, 117.59it/s] 


✅ Batch 67 complete. Saved 2236026 records so far.

=== Starting batch 68 ===


Batch 68: 100%|██████████| 50000/50000 [06:59<00:00, 119.31it/s] 


✅ Batch 68 complete. Saved 2269319 records so far.

=== Starting batch 69 ===


Batch 69: 100%|██████████| 50000/50000 [07:02<00:00, 118.32it/s] 


✅ Batch 69 complete. Saved 2302855 records so far.

=== Starting batch 70 ===


Batch 70: 100%|██████████| 50000/50000 [06:42<00:00, 124.27it/s] 


✅ Batch 70 complete. Saved 2336397 records so far.

=== Starting batch 71 ===


Batch 71: 100%|██████████| 50000/50000 [07:01<00:00, 118.64it/s] 


✅ Batch 71 complete. Saved 2369695 records so far.

=== Starting batch 72 ===


Batch 72: 100%|██████████| 50000/50000 [07:05<00:00, 117.46it/s] 


✅ Batch 72 complete. Saved 2403203 records so far.

=== Starting batch 73 ===


Batch 73: 100%|██████████| 50000/50000 [07:25<00:00, 112.29it/s]


✅ Batch 73 complete. Saved 2436397 records so far.

=== Starting batch 74 ===


Batch 74: 100%|██████████| 50000/50000 [07:14<00:00, 115.00it/s] 


✅ Batch 74 complete. Saved 2469575 records so far.

=== Starting batch 75 ===


Batch 75: 100%|██████████| 50000/50000 [07:18<00:00, 114.08it/s] 


✅ Batch 75 complete. Saved 2502905 records so far.

=== Starting batch 76 ===


Batch 76: 100%|██████████| 50000/50000 [06:44<00:00, 123.72it/s] 


✅ Batch 76 complete. Saved 2536294 records so far.

=== Starting batch 77 ===


Batch 77: 100%|██████████| 50000/50000 [07:27<00:00, 111.82it/s] 


✅ Batch 77 complete. Saved 2569569 records so far.

=== Starting batch 78 ===


Batch 78: 100%|██████████| 50000/50000 [07:04<00:00, 117.74it/s] 


✅ Batch 78 complete. Saved 2602851 records so far.

=== Starting batch 79 ===


Batch 79: 100%|██████████| 50000/50000 [07:23<00:00, 112.63it/s] 


✅ Batch 79 complete. Saved 2636186 records so far.

=== Starting batch 80 ===


Batch 80: 100%|██████████| 50000/50000 [07:01<00:00, 118.63it/s] 


✅ Batch 80 complete. Saved 2669432 records so far.

=== Starting batch 81 ===


Batch 81: 100%|██████████| 50000/50000 [06:57<00:00, 119.69it/s] 


✅ Batch 81 complete. Saved 2702651 records so far.

=== Starting batch 82 ===


Batch 82: 100%|██████████| 50000/50000 [07:05<00:00, 117.44it/s] 


✅ Batch 82 complete. Saved 2736058 records so far.

=== Starting batch 83 ===


Batch 83: 100%|██████████| 50000/50000 [06:51<00:00, 121.56it/s] 


✅ Batch 83 complete. Saved 2769632 records so far.

=== Starting batch 84 ===


Batch 84: 100%|██████████| 50000/50000 [06:57<00:00, 119.89it/s] 


✅ Batch 84 complete. Saved 2802908 records so far.

=== Starting batch 85 ===


Batch 85: 100%|██████████| 50000/50000 [06:49<00:00, 122.24it/s] 


✅ Batch 85 complete. Saved 2836460 records so far.

=== Starting batch 86 ===


Batch 86: 100%|██████████| 50000/50000 [07:03<00:00, 118.08it/s] 


✅ Batch 86 complete. Saved 2869767 records so far.

=== Starting batch 87 ===


Batch 87: 100%|██████████| 50000/50000 [06:51<00:00, 121.59it/s] 


✅ Batch 87 complete. Saved 2903112 records so far.

=== Starting batch 88 ===


Batch 88: 100%|██████████| 50000/50000 [07:06<00:00, 117.31it/s] 


✅ Batch 88 complete. Saved 2936573 records so far.

=== Starting batch 89 ===


Batch 89: 100%|██████████| 50000/50000 [07:19<00:00, 113.72it/s] 


✅ Batch 89 complete. Saved 2970095 records so far.

=== Starting batch 90 ===


Batch 90: 100%|██████████| 50000/50000 [06:58<00:00, 119.56it/s] 


✅ Batch 90 complete. Saved 3003287 records so far.

=== Starting batch 91 ===


Batch 91: 100%|██████████| 50000/50000 [07:03<00:00, 118.04it/s] 


✅ Batch 91 complete. Saved 3036634 records so far.

=== Starting batch 92 ===


Batch 92: 100%|██████████| 50000/50000 [06:28<00:00, 128.74it/s] 


✅ Batch 92 complete. Saved 3069985 records so far.

=== Starting batch 93 ===


Batch 93: 100%|██████████| 50000/50000 [07:43<00:00, 107.93it/s] 


✅ Batch 93 complete. Saved 3103125 records so far.

=== Starting batch 94 ===


Batch 94: 100%|██████████| 50000/50000 [07:07<00:00, 117.00it/s] 


✅ Batch 94 complete. Saved 3136506 records so far.

=== Starting batch 95 ===


Batch 95: 100%|██████████| 50000/50000 [06:51<00:00, 121.47it/s] 


✅ Batch 95 complete. Saved 3169774 records so far.

=== Starting batch 96 ===


Batch 96: 100%|██████████| 50000/50000 [06:51<00:00, 121.46it/s] 


✅ Batch 96 complete. Saved 3202920 records so far.

=== Starting batch 97 ===


Batch 97: 100%|██████████| 50000/50000 [07:20<00:00, 113.47it/s] 


✅ Batch 97 complete. Saved 3236029 records so far.

=== Starting batch 98 ===


Batch 98: 100%|██████████| 50000/50000 [07:08<00:00, 116.63it/s] 


✅ Batch 98 complete. Saved 3269368 records so far.

=== Starting batch 99 ===


Batch 99: 100%|██████████| 50000/50000 [06:53<00:00, 120.95it/s] 


✅ Batch 99 complete. Saved 3302936 records so far.

=== Starting batch 100 ===


Batch 100: 100%|██████████| 50000/50000 [06:52<00:00, 121.26it/s] 


✅ Batch 100 complete. Saved 3336240 records so far.

🎉 Done. Total records saved: 3336240


# COCO - Download Script

In [1]:
import os, json, zipfile, hashlib, requests
from pathlib import Path
from tqdm import tqdm
import pandas as pd

# ----------------------------------------------------------------------
# Paths
# ----------------------------------------------------------------------
root_dir = Path("/data/thesis/coco2017")
# Image archives are saved and unpacked here (train2017/ and val2017/).
img_dir = root_dir.joinpath("images")
ann_dir = root_dir.joinpath("annotations")
csv_path = root_dir.joinpath("coco_manifest.csv")

for _folder in (root_dir, img_dir, ann_dir):
    _folder.mkdir(parents=True, exist_ok=True)

# COCO 2017 archives, keyed by role: (download URL, directory the zip is
# saved into). These are the canonical URLs; mirrors sometimes differ.
URLS = {
    "train_images": (
        "http://images.cocodataset.org/zips/train2017.zip",
        img_dir,
    ),
    "val_images": (
        "http://images.cocodataset.org/zips/val2017.zip",
        img_dir,
    ),
    "annotations": (
        "http://images.cocodataset.org/annotations/annotations_trainval2017.zip",
        ann_dir,
    ),
}

def download_with_progress(url: str, dest: Path):
    """Download ``url`` to ``dest`` with a byte progress bar, skipping existing files.

    The body is streamed into ``<dest>.part`` and moved onto ``dest`` only
    after the transfer finished, so an interrupted run never leaves a
    truncated ``dest`` that a later run would mistake for a complete file.

    Args:
        url: Address to fetch.
        dest: Final file path; its parent directory is created if needed.

    Returns:
        Path: ``dest``. Returned immediately, without a request, when the
        file already exists and is non-empty.

    Raises:
        requests.HTTPError: If the response status indicates an error.
        OSError: If fewer or more bytes than the announced Content-Length
            were received, or on file-system errors.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    if dest.exists() and dest.stat().st_size > 0:
        print(f"✔ Already present: {dest}")
        return dest

    tmp = dest.with_suffix(dest.suffix + ".part")
    try:
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            total = int(r.headers.get("Content-Length", 0))
            written = 0
            # total=None lets tqdm show a plain counter when the size is unknown.
            with open(tmp, "wb") as f, tqdm(
                total=total or None, unit="B", unit_scale=True, desc=dest.name
            ) as pbar:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
                        written += len(chunk)
                        pbar.update(len(chunk))
            # With a Content-Encoding the decoded size legitimately differs
            # from Content-Length, so only compare for unencoded bodies.
            if total and not r.headers.get("Content-Encoding") and written != total:
                raise OSError(
                    f"Incomplete download of {url}: got {written} of {total} bytes"
                )
        # replace() also overwrites an existing zero-byte dest, where
        # rename() would fail on Windows.
        tmp.replace(dest)
    finally:
        # No-op after a successful replace; removes the partial file otherwise.
        tmp.unlink(missing_ok=True)
    return dest

def safe_unzip(zip_path: Path, target_dir: Path):
    """Extract every member of ``zip_path`` into ``target_dir`` with a progress bar.

    Args:
        zip_path: Archive to read.
        target_dir: Directory to extract into; created if missing.

    Raises:
        RuntimeError: If a member's path would resolve outside ``target_dir``.
    """
    target_dir.mkdir(parents=True, exist_ok=True)
    # Loop-invariant, so resolve it once instead of once per member.
    root = target_dir.resolve()
    with zipfile.ZipFile(zip_path, "r") as zf:
        for m in tqdm(zf.infolist(), desc=f"Extract {zip_path.name}"):
            out_path = target_dir / m.filename
            # Zip-slip guard. Compare path components rather than string
            # prefixes: "/x/images_evil" starts with "/x/images" as a string
            # but is not inside that directory.
            if not out_path.resolve().is_relative_to(root):
                raise RuntimeError("Zip path traversal detected")
            if m.is_dir():
                out_path.mkdir(parents=True, exist_ok=True)
            else:
                out_path.parent.mkdir(parents=True, exist_ok=True)
                zf.extract(m, path=target_dir)

# ----------------------------------------------------------------------
# 1) Fetch the archives
# ----------------------------------------------------------------------
downloaded = {}
for key, (url, target_dir) in URLS.items():
    fname = url.rsplit("/", 1)[-1]  # archive name = last URL segment
    dest = target_dir.joinpath(fname)
    downloaded[key] = download_with_progress(url, dest)

# ----------------------------------------------------------------------
# 2) Unpack
# ----------------------------------------------------------------------
# Both image zips are unpacked next to themselves in img_dir; each is
# expected to bring its own train2017/ or val2017/ subfolder. The
# annotations zip is expected to unpack its json files into
# ann_dir/annotations/.
for _archive_key, _unpack_dir in (
    ("train_images", img_dir),
    ("val_images", img_dir),
    ("annotations", ann_dir),
):
    safe_unzip(downloaded[_archive_key], _unpack_dir)

# Locations the manifest step reads from.
train_img_root = img_dir.joinpath("train2017")
val_img_root = img_dir.joinpath("val2017")

ann_root = ann_dir.joinpath("annotations")
cap_train_json = ann_root.joinpath("captions_train2017.json")
cap_val_json = ann_root.joinpath("captions_val2017.json")

# Stop here if extraction did not produce the expected layout.
for _required in (train_img_root, val_img_root, cap_train_json, cap_val_json):
    assert _required.exists(), f"Missing {_required}"

# ======================
# 3) Build CSV manifest from captions
# ======================
def build_manifest(captions_json: Path, split: str, images_root: Path):
    """Build one manifest row per caption annotation in a COCO captions file.

    Args:
        captions_json: JSON file with an ``"images"`` list (entries carrying
            ``"id"`` and ``"file_name"``) and an ``"annotations"`` list
            (entries carrying ``"image_id"`` and optionally ``"caption"``).
        split: Label stored in every row's ``"split"`` field.
        images_root: Directory that holds the image files for this split.

    Returns:
        list[dict]: Rows with keys ``split``, ``image_id``, ``file_name``,
        ``local_path`` and ``caption``, in annotation order. Annotations whose
        ``image_id`` has no matching image entry are skipped.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(captions_json, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Map image_id -> file_name
    id_to_name = {img["id"]: img["file_name"] for img in data["images"]}

    # Resolve the directory once rather than touching the filesystem for
    # every annotation. Note: a symlink at the image-file level is therefore
    # not followed in local_path.
    resolved_root = images_root.resolve()

    rows = []
    for ann in data["annotations"]:
        image_id = ann["image_id"]
        file_name = id_to_name.get(image_id)
        if not file_name:
            continue
        rows.append({
            "split": split,
            "image_id": image_id,
            "file_name": file_name,
            "local_path": str(resolved_root / file_name),
            # `or ""` also covers an explicit null caption.
            "caption": (ann.get("caption") or "").strip(),
        })
    return rows

# Train rows first, then validation rows, in a single list.
records = build_manifest(cap_train_json, "train", train_img_root)
records.extend(build_manifest(cap_val_json, "val", val_img_root))

df = pd.DataFrame(records)
df.to_csv(csv_path, index=False)
print(f"✅ Wrote manifest with {len(df)} rows to {csv_path}")
# Quick visual check of the first rows.
print(df.head())


train2017.zip:  83%|████████▎ | 16.0G/19.3G [13:05<02:05, 26.7MB/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

train2017.zip: 100%|██████████| 19.3G/19.3G [15:48<00:00, 20.4MB/s]
val2017.zip: 100%|██████████| 816M/816M [00:50<00:00, 16.2MB/s] 
annotations_trainval2017.zip: 100%|██████████| 253M/253M [00:11<00:00, 22.6MB/s] 
Extract train2017.zip: 100%|██████████| 118288/118288 [00:48<00:00, 2450.52it/s]
Extract val2017.zip: 100%|██████████| 5001/5001 [00:02<00:00, 2481.80it/s]
Extract annotations_trainval2017.zip: 100%|██████████| 6/6 [00:03<00:00,  1.73it/s]


✅ Wrote manifest with 616767 rows to /data/thesis/coco2017/coco_manifest.csv
   split  image_id         file_name  \
0  train    203564  000000203564.jpg   
1  train    322141  000000322141.jpg   
2  train     16977  000000016977.jpg   
3  train    106140  000000106140.jpg   
4  train    106140  000000106140.jpg   

                                          local_path  \
0  /data/thesis/coco2017/images/train2017/0000002...   
1  /data/thesis/coco2017/images/train2017/0000003...   
2  /data/thesis/coco2017/images/train2017/0000000...   
3  /data/thesis/coco2017/images/train2017/0000001...   
4  /data/thesis/coco2017/images/train2017/0000001...   

                                             caption  
0  A bicycle replica with a clock as the front wh...  
1  A room with blue walls and a white sink and door.  
2  A car that seems to be parked illegally behind...  
3  A large passenger airplane flying through the ...  
4  There is a GOL plane taking off in a partly cl...  


# AdImageNet - Download Script

In [None]:
# AdImageNet: download → filter → save → manifest
# Requirements:
#   pip install -U datasets pillow pandas tqdm huggingface_hub

from datasets import load_dataset
from PIL import Image
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import io, os, re, sys

# ----------------------------------------------------------------------
# Settings
# ----------------------------------------------------------------------
OUT_ROOT = Path("/data/thesis/AdImageNet")
IMG_DIR = OUT_ROOT.joinpath("images")
CSV_PATH = OUT_ROOT.joinpath("adimagenet_manifest.csv")
# Records whose text is longer than this many characters are dropped.
KEEP_TEXT_MAX_LEN = 80
# Minimum width and height, in pixels, for a record to be kept.
MIN_W = 250
MIN_H = 250

for _folder in (OUT_ROOT, IMG_DIR):
    _folder.mkdir(parents=True, exist_ok=True)

# ----------------------------------------------------------------------
# Dataset
# ----------------------------------------------------------------------
# On 401/403 errors, run `huggingface-cli login` in the same environment.
try:
    ds = load_dataset("PeterBrendan/AdImageNet", split="train")
except Exception:
    # Fallback: load every split and take the first one returned.
    dsdict = load_dataset("PeterBrendan/AdImageNet")
    ds = next(iter(dsdict.values()))

# ======================
# Helpers
# ======================
# Matches each run of digits, e.g. the two numbers in "(300, 250)".
dim_re = re.compile(r"\d+")


def get_dims(record, pil_img: Image.Image | None):
    """
    Work out (width, height) for a record.

    The first two integers found in a string-valued record['dimensions'] take
    priority. When they are unavailable and an image is supplied, the image's
    own size is used. Without either source the result is (None, None).
    """
    declared = record.get("dimensions")
    width = height = None
    if isinstance(declared, str):
        found = dim_re.findall(declared)
        if len(found) >= 2:
            width, height = (int(token) for token in found[:2])

    if pil_img is None:
        return width, height

    # Fill in only what the declared dimensions did not provide.
    actual_w, actual_h = pil_img.size
    if width is None:
        width = actual_w
    if height is None:
        height = actual_h
    return width, height

def get_pil(record):
    """
    Return record['image'] as an RGB PIL image.

    Accepts a PIL image, raw encoded bytes, or a string path to an existing
    file. Images opened here from bytes or from a path are closed again once
    the RGB conversion has been taken, so no file handle stays open.

    Raises:
        KeyError: If record['image'] is none of the supported payload types.
    """
    val = record.get("image")
    if isinstance(val, Image.Image):
        return val.convert("RGB")
    if isinstance(val, (bytes, bytearray)):
        with Image.open(io.BytesIO(val)) as im:
            return im.convert("RGB")
    if isinstance(val, str) and os.path.exists(val):
        # convert() loads the pixel data and returns a separate image, so
        # closing the source afterwards is safe.
        with Image.open(val) as im:
            return im.convert("RGB")
    raise KeyError("No usable image payload in record['image'].")

def pass_filters(record, pil_img):
    """
    Decide whether a record is kept.

    A record is rejected when its text is a string longer than
    KEEP_TEXT_MAX_LEN, when its dimensions cannot be determined, or when its
    width is below MIN_W or its height is below MIN_H.
    """
    caption = record.get("text") or ""
    too_long = isinstance(caption, str) and len(caption) > KEEP_TEXT_MAX_LEN
    if too_long:
        return False

    width, height = get_dims(record, pil_img)
    if width is None or height is None:
        return False

    # Both sides must reach their minimum.
    return not (width < MIN_W or height < MIN_H)

# ======================
# Save loop + manifest
# ======================
rows = []      # one manifest entry per saved image
kept = 0       # records saved to disk
skipped = 0    # records filtered out or failed

for i, rec in enumerate(tqdm(ds, desc="Filtering & saving")):
    try:
        img = get_pil(rec)
        if not pass_filters(rec, img):
            skipped += 1
            continue

        # organize optionally by dimensions folder, e.g. "(300, 250)"
        sub = str(rec.get("dimensions") or "")
        save_dir = IMG_DIR / sub if sub else IMG_DIR
        save_dir.mkdir(parents=True, exist_ok=True)

        # filename: prefer dataset file_name, else generate
        fname = rec.get("file_name") or f"ad_{i:06d}.jpg"
        # The bytes written below are always JPEG, so any non-JPEG extension
        # (including .png / .webp) is replaced rather than left to mislabel
        # the file contents.
        stem, ext = os.path.splitext(fname)
        if ext.lower() not in (".jpg", ".jpeg"):
            fname = f"{stem}.jpg"

        out_path = save_dir / fname
        img.save(out_path, format="JPEG", quality=95, optimize=True)

        w, h = get_dims(rec, img)
        rows.append({
            "file_path": str(out_path.resolve()),
            "file_name": fname,
            "text": rec.get("text") or "",
            "dimensions": rec.get("dimensions") or "",
            "width": w, "height": h,
        })
        kept += 1

    except Exception as e:
        # Best effort per record: report the failure and carry on.
        skipped += 1
        print(f"[warn] row {i}: {e}", file=sys.stderr)

# ======================
# Write manifest
# ======================
df = pd.DataFrame(rows)
df.to_csv(CSV_PATH, index=False)

print(f"\nDone. Kept {kept} creatives, skipped {skipped}.")
print(f"Images → {IMG_DIR}")
print(f"Manifest → {CSV_PATH}")


README.md:   0%|          | 0.00/3.68k [00:00<?, ?B/s]

data/train-00000-of-00002-6e587552aa3c8a(…):   0%|          | 0.00/344M [00:00<?, ?B/s]

data/train-00001-of-00002-823ac5dae71e0e(…):   0%|          | 0.00/338M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9003 [00:00<?, ? examples/s]

Filtering & saving: 100%|██████████| 9003/9003 [00:17<00:00, 519.40it/s]



Done. Kept 2080 creatives, skipped 6923.
Images → /data/thesis/AdImageNet/images
Manifest → /data/thesis/AdImageNet/adimagenet_manifest.csv
