In [1]:
# Install the package in development mode if needed
# !pip install -e .

import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from multiprocessing import cpu_count
import os
import time
import platform
import psutil
import shutil

# Import the DEGIS package
import degis
from degis.data.dataset import UnifiedImageDataset
import config

In [4]:
# build_cost_matrices.py
# Creates ground-cost matrices consistent with your *fast* histograms:
# - RGB512  : bins over [0,256) for each channel (8×8×8 = 512 bins)
# - LAB514  : same 8×8×8 grid but mapped back to (L*,a*,b*), plus 2 neutral bins (black, white)
# - HCL514  : L*∈[0,100], C∈[0,c_max], H∈[0,360); mapped to (L*,a*,b*) via a=C cosH, b=C sinH; + neutrals
#
# Output: cost_rgb512.npy, cost_lab514.npy, cost_hcl514.npy  (float32, symmetric, zero diagonal)

import os, hashlib
import numpy as np

# --------------------------- helpers ---------------------------------
def bin_centers(low: float, high: float, bins: int) -> np.ndarray:
    edges = np.linspace(low, high, bins + 1, dtype=np.float64)
    return (edges[:-1] + edges[1:]) * 0.5  # midpoints (float64)

def grid_centers(c1, c2, c3) -> np.ndarray:
    X, Y, Z = np.meshgrid(c1, c2, c3, indexing="ij")
    P = np.stack([X, Y, Z], axis=-1).reshape(-1, 3)
    return P  # [K,3] float64

def pairwise_euclidean(X: np.ndarray) -> np.ndarray:
    # X: [K,3] float64 -> returns [K,K] float32
    diff = X[:, None, :] - X[None, :, :]
    D = np.sqrt((diff * diff).sum(axis=-1), dtype=np.float64)
    return D.astype(np.float32)

def add_neutral_bins(points_Lab: np.ndarray) -> np.ndarray:
    # points_Lab: [K,3] in (L*, a*, b*)
    black = np.array([[0.0,   0.0, 0.0]])
    white = np.array([[100.0, 0.0, 0.0]])
    return np.vstack([points_Lab, black, white])

def check_and_save(M: np.ndarray, path: str):
    assert M.ndim == 2 and M.shape[0] == M.shape[1], f"Not square: {M.shape}"
    assert np.allclose(M, M.T, atol=1e-6), "Matrix not symmetric"
    assert np.allclose(np.diag(M), 0.0, atol=1e-6), "Diagonal not zero"
    os.makedirs(os.path.dirname(path), exist_ok=True)
    np.save(path, M.astype(np.float32))
    h = hashlib.sha256(open(path, "rb").read()).hexdigest()[:16]
    print(f"✓ {os.path.basename(path)}  shape={M.shape}  sha256={h}")

# --------------------------- builders --------------------------------
def build_rgb_cost(bins: int = 8) -> np.ndarray:
    # histogram space: R,G,B ∈ [0,256) as in fast_rgb_histogram
    c = bin_centers(0.0, 256.0, bins)         # e.g., [16, 48, ..., 240]
    P = grid_centers(c, c, c)                 # [512,3] in RGB units
    return pairwise_euclidean(P)

def build_lab_cost(bins: int = 8) -> np.ndarray:
    # histogram space used 0..255 buckets after transforms:
    # L8 in [0,256) corresponds to L* via L* = L8 * 100/255
    # a8 = a* + 128, b8 = b* + 128
    c8 = bin_centers(0.0, 256.0, bins)        # midpoints in 0..256
    Ls = c8 * (100.0 / 255.0)                 # back to L* domain
    As = c8 - 128.0                           # back to a*
    Bs = c8 - 128.0                           # back to b*
    P = grid_centers(Ls, As, Bs)              # [512,3] in (L*,a*,b*)
    P = add_neutral_bins(P)                   # + black/white => [514,3]
    return pairwise_euclidean(P)

def build_hcl_cost(bins: int = 8, c_max: float = 150.0) -> np.ndarray:
    # histogram space: L*∈[0,100], C∈[0,c_max], H∈[0,360)
    Lc = bin_centers(0.0, 100.0, bins)
    Cc = bin_centers(0.0, c_max, bins)
    Hc = bin_centers(0.0, 360.0, bins)        # degrees
    # Build grid in (L*,C,H), then map (C,H) -> (a*,b*)
    Lg, Cg, Hg = np.meshgrid(Lc, Cc, Hc, indexing="ij")
    a = Cg * np.cos(np.deg2rad(Hg))
    b = Cg * np.sin(np.deg2rad(Hg))
    P = np.stack([Lg, a, b], axis=-1).reshape(-1, 3)  # [512,3] in (L*,a*,b*)
    P = add_neutral_bins(P)                            # [514,3]
    return pairwise_euclidean(P)

# ----------------------------- main ----------------------------------
if __name__ == "__main__":
    outdir = "assets/costs"
    rgb = build_rgb_cost(bins=8)
    lab = build_lab_cost(bins=8)
    hcl = build_hcl_cost(bins=8, c_max=150.0)

    check_and_save(rgb, os.path.join(outdir, "cost_rgb512.npy"))
    check_and_save(lab, os.path.join(outdir, "cost_lab514.npy"))
    check_and_save(hcl, os.path.join(outdir, "cost_hcl514.npy"))


✓ cost_rgb512.npy  shape=(512, 512)  sha256=5c8ac2fb072dd176
✓ cost_lab514.npy  shape=(514, 514)  sha256=3ba7d8807f777a4c
✓ cost_hcl514.npy  shape=(514, 514)  sha256=46db118d1adf911a


In [6]:
! pip install POT

Collecting POT
  Downloading POT-0.9.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (34 kB)
Downloading POT-0.9.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (901 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m901.7/901.7 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: POT
Successfully installed POT-0.9.5
[0m

In [9]:
import json, numpy as np, hashlib, os

def edges(a,b,nb): return np.linspace(a,b,nb+1).tolist()
def centers(a,b,nb): 
    e = np.linspace(a,b,nb+1); return ((e[:-1]+e[1:])/2).tolist()

cfg = {
  "version": "v1-fast",
  "bins": 8,
  "rgb512": {
    "axes": {"R":{"range":[0,256],"edges":edges(0,256,8)},
             "G":{"range":[0,256],"edges":edges(0,256,8)},
             "B":{"range":[0,256],"edges":edges(0,256,8)}},
    "dim": 512,
    "neutrals": None
  },
  "lab514": {
    # fast_lab_histogram: L∈[0,100]→L8, a,b shifted by +128 to [0,256]
    "axes": {"L8":{"range":[0,256],"edges":edges(0,256,8)},
             "a8":{"range":[0,256],"edges":edges(0,256,8)},
             "b8":{"range":[0,256],"edges":edges(0,256,8)}},
    "neutrals": {"type":"black_white",
                 "black":{"L<":10, "abs(a),abs(b)<":2.0},
                 "white":{"L>":90, "abs(a),abs(b)<":2.0}},
    "dim": 514,
    "neutral_bin_indices": {"black":512, "white":513}
  },
  "hcl514": {
    # fast_hcl_histogram: L∈[0,100], C∈[0,150], H∈[0,360)
    "axes": {"L":{"range":[0,100],"edges":edges(0,100,8)},
             "C":{"range":[0,150],"edges":edges(0,150,8)},
             "H":{"range":[0,360],"edges":edges(0,360,8)}},
    "neutrals": {"type":"black_white",
                 "black":{"L<":10, "C<":2.0},
                 "white":{"L>":90, "C<":2.0}},
    "dim": 514,
    "neutral_bin_indices": {"black":512, "white":513}
  }
}

os.makedirs("assets/config", exist_ok=True)
out = "assets/config/hist_config.json"
with open(out,"w") as f: json.dump(cfg, f, indent=2)
print(f"Wrote {out}")


Wrote assets/config/hist_config.json


In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from PIL import Image

import config  # ✅ will work if notebook is in same folder as config.py
from data.dataset import UnifiedImageDataset

In [11]:
layout_df = pd.read_csv("/data/thesis/adimagenet_manifest.csv")
layout_dataset = UnifiedImageDataset(layout_df, mode="file_df", size=(224,224))
layout_df.head()

Unnamed: 0,file_path,file_name,text,dimensions,width,height
0,"/data/thesis/AdImageNet/images/(300, 250)/ad_0...",ad_000001.jpg,$3\nSTULZ\nDifferential for 2nd Shift\nManufac...,"(300, 250)",300,250
1,"/data/thesis/AdImageNet/images/(300, 250)/ad_0...",ad_000009.jpg,VULTURE\ninto\nwith\nSam Sanders\nApple Podcasts,"(300, 250)",300,250
2,"/data/thesis/AdImageNet/images/(300, 250)/ad_0...",ad_000017.jpg,smart\ncare\nO\ndesign\nbuild\n& install\nrepa...,"(300, 250)",300,250
3,"/data/thesis/AdImageNet/images/(300, 600)/ad_0...",ad_000020.jpg,"TREE\nSANTOR\nMatch On!\nThe Showstopper,\nLuc...","(300, 600)",300,600
4,"/data/thesis/AdImageNet/images/(300, 250)/ad_0...",ad_000021.jpg,Local experts connecting\ncustomers to YOUR bu...,"(300, 250)",300,250


In [13]:
import json, numpy as np, os
from PIL import Image
from degis.features.edge_maps import compute_edge_map_canny

meta = {"method":"canny", "low":150, "high":200, "resize": [224,224]}
os.makedirs("assets/edges", exist_ok=True)

samples = [(layout_df['file_path'][0],"assets/edges/sample1.png"),
           (layout_df['file_path'][1],"assets/edges/sample2.png"),
           (layout_df['file_path'][2],"assets/edges/sample3.png")]

arr = []
for src, dst in samples:
    e = compute_edge_map_canny(Image.open(src).convert("RGB"), (224,224))
    arr.append(e.reshape(224,224))
    Image.fromarray((arr[-1]*255).astype(np.uint8)).save(dst)

np.save("assets/edges/samples.npy", np.stack(arr,0))
with open("assets/edges/meta.json","w") as f: json.dump(meta,f,indent=2)
print("Saved edge samples + meta.")


Saved edge samples + meta.


In [14]:
import json, torch
clip_meta = {
  "open_clip_vit_h14": {
    "source": "open_clip",
    "model_name": "ViT-H-14",
    "pretrained": "laion2b_s32b_b79k",
    "embedding_dim": 1024,
    "dtype_runtime": "fp16_on_cuda, cast_to_fp32_on_save",
    "pooling": "model.encode_image() global projection",
    "normalize": "none (downstream code may L2-normalize)"
  },
  "hf_clip_bigg14": {
    "source": "transformers",
    "model_id": "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
    "embedding_api": "CLIPVisionModelWithProjection.image_embeds",
    "projection_dim": 1024,
    "dtype_runtime": "fp16_on_cuda, cast_to_fp32_on_save",
    "normalize": "L2 normalization applied (see code compute_clip_embedding_xl)"
  }
}
os.makedirs("assets/clip", exist_ok=True)
with open("assets/clip/clip_meta.json","w") as f: json.dump(clip_meta,f,indent=2)
print("Wrote assets/clip/clip_meta.json")


Wrote assets/clip/clip_meta.json


In [8]:
import numpy as np
import ot  # pip install POT

def emd_hist(p, q, M):
    # p, q: 1D histograms (sum ~ 1), M: ground cost matrix (KxK)
    p = p.astype(np.float32); q = q.astype(np.float32)
    p = p / (p.sum() + 1e-8); q = q / (q.sum() + 1e-8)
    return ot.emd2(p, q, M).item()  # scalar Earth Mover's Distance^2

# Example (HCL with neutrals)
M = np.load("assets/costs/cost_hcl514.npy")
emd = emd_hist(h_gen, h_ref, M)

NameError: name 'h_gen' is not defined

In [None]:
# === verify_feature_assets.py ===
# Purpose: load your precomputed EMD cost matrices, sanity-check them,
# compute SHA256 checksums, and emit a tiny manifest (CSV + JSON) you can cite.

import os, json, hashlib, csv
import numpy as np

# ---- EDIT THESE PATHS to your actual files ----
ASSETS_DIR = "/data/thesis/feature_assets"   # where to write summaries/plots
os.makedirs(ASSETS_DIR, exist_ok=True)

COST_FILES = {
    "rgb512":  config.COLOR_HIST_PATH_RGB,
    "lab514":  config.COLOR_HIST_PATH_LAB_514,
    "hcl514":  config.COLOR_HIST_PATH_HCL_514,
}

# Canonical histogram config you actually used (fast_* implementations)
HIST_CONFIG = {
    "bins": 8,
    "rgb":  {"ranges": {"R": [0,256], "G": [0,256], "B": [0,256]}, "neutral_bins": 0},
    "lab":  {
        "ranges": {"L": [0,100], "a": "rgb2lab(a)", "b": "rgb2lab(b)"},
        "neutral_bins": 2,
        "neutral_rule": {"black": {"L_lt": 10, "C_lt": 2.0},
                         "white": {"L_gt": 90, "C_lt": 2.0}},
        "dim": 8**3 + 2
    },
    "hcl":  {
        "ranges": {"L": [0,100], "C": [0,150.0], "H": [0,360]},
        "neutral_bins": 2,
        "neutral_rule": {"black": {"L_lt": 10, "C_lt": 2.0},
                         "white": {"L_gt": 90, "C_lt": 2.0}},
        "dim": 8**3 + 2
    },
}

def sha256(path):
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1024*1024), b""):
            h.update(chunk)
    return h.hexdigest()

def check_matrix(M, name):
    assert isinstance(M, np.ndarray), f"{name}: not an ndarray"
    n, m = M.shape
    assert n == m, f"{name}: not square ({n}x{m})"
    assert np.allclose(M, M.T, atol=1e-6), f"{name}: not symmetric"
    diag = np.diag(M)
    assert np.all(diag >= -1e-7), f"{name}: negative diagonal?"
    assert np.allclose(diag, 0.0, atol=1e-6), f"{name}: diagonal not ~0"
    assert np.nanmin(M) >= -1e-7, f"{name}: negative entries?"
    return {
        "shape": f"{n}x{m}",
        "min": float(np.min(M)),
        "max": float(np.max(M)),
        "mean": float(np.mean(M)),
        "median": float(np.median(M)),
        "std": float(np.std(M)),
    }

# Load, validate, and record
rows = []
summary_json = {}

for key, path in COST_FILES.items():
    assert os.path.exists(path), f"Missing: {path}"
    M = np.load(path)
    stats = check_matrix(M, key)
    digest = sha256(path)
    row = {
        "name": key,
        "path": path,
        "shape": stats["shape"],
        "sha256": digest,
        "min": stats["min"],
        "max": stats["max"],
        "mean": stats["mean"],
    }
    rows.append(row)
    summary_json[key] = {**stats, "sha256": digest, "path": path}

# Write CSV manifest
csv_path = os.path.join(ASSETS_DIR, "cost_matrices_manifest.csv")
with open(csv_path, "w", newline="") as f:
    w = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
    w.writeheader(); w.writerows(rows)

# Write JSON summary (plus your canonical histogram config)
json_path = os.path.join(ASSETS_DIR, "feature_assets_summary.json")
with open(json_path, "w") as f:
    json.dump({"costs": summary_json, "hist_config": HIST_CONFIG}, f, indent=2)

print(f"✔ Wrote:\n- {csv_path}\n- {json_path}")

# (Optional) save tiny downsampled heatmaps for the thesis (as sanity visuals)
try:
    import matplotlib.pyplot as plt
    for key, path in COST_FILES.items():
        M = np.load(path)
        # downsample if >256 to keep figures light
        step = max(1, M.shape[0] // 256)
        Mv = M[::step, ::step]
        plt.figure()
        plt.imshow(Mv)
        plt.title(f"{key} ground-cost (downsample {step}x)")
        plt.colorbar()
        out = os.path.join(ASSETS_DIR, f"{key}_cost_heatmap.png")
        plt.savefig(out, dpi=200, bbox_inches="tight")
        plt.close()
        print(f"  • saved {out}")
except Exception as e:
    print(f"(skipped heatmaps: {e})")


ValueError: operands could not be broadcast together with shapes (3336240,512) (512,3336240) 

In [4]:
def print_system_profile():
    print("=== SYSTEM PROFILE ===")
    print("Python:", platform.python_version())
    print("PyTorch:", torch.__version__)
    print("CPU cores:", psutil.cpu_count(logical=True))
    vm = psutil.virtual_memory()
    print(f"RAM: {vm.total/1e9:.1f} GB, free {vm.available/1e9:.1f} GB")
    
    # Check if /data exists, otherwise check current directory
    if os.path.exists("/data"):
        du = shutil.disk_usage("/data")
        print(f"/data disk: total {du.total/1e9:.1f} GB, free {du.free/1e9:.1f} GB")
    else:
        du = shutil.disk_usage(".")
        print(f"Current disk: total {du.total/1e9:.1f} GB, free {du.free/1e9:.1f} GB")
    
    print("CUDA available:", torch.cuda.is_available())
    if torch.cuda.is_available():
        i = torch.cuda.current_device()
        print("GPU:", torch.cuda.get_device_name(i))
        print(f"VRAM total: {torch.cuda.get_device_properties(i).total_memory/1e9:.1f} GB")
    print("======================")

print_system_profile()


=== SYSTEM PROFILE ===
Python: 3.12.3
PyTorch: 2.8.0+cu128
CPU cores: 256
RAM: 540.8 GB, free 475.4 GB
/data disk: total 1099.5 GB, free 249.7 GB
CUDA available: True
GPU: NVIDIA GeForce RTX 5090
VRAM total: 33.7 GB


In [5]:
batch_size = 512
embeddings_size = "base"

In [12]:
# name = 'coco'
name = 'laion_5m'
# name = 'adimagenet'
csv_path = f"/data/thesis/{name}_manifest.csv"

In [13]:
df = pd.read_csv(csv_path)
print(f"Dataset loaded: {len(df)} images")
print(f"Columns: {df.columns.tolist()}")
print(df.shape)
df.head()

Dataset loaded: 3336240 images
Columns: ['id', 'url', 'caption', 'aesthetic', 'local_path']
(3336240, 5)


Unnamed: 0,id,url,caption,aesthetic,local_path
0,6744,https://t0.gstatic.com/images?q=tbn:ANd9GcQM-D...,Wrought Iron King Headboard And Footboard by B...,7.2972,/data/thesis/laion_5m_images/0006744.jpg
1,5801,https://i.dailymail.co.uk/1s/2020/03/24/16/263...,Kayley and Ryan said they were determined to '...,7.451863,/data/thesis/laion_5m_images/0005801.jpg
2,8959,https://cdn.shopify.com/s/files/1/2068/6307/pr...,Pink Minnie Mouse suspender dress,7.530277,/data/thesis/laion_5m_images/0008959.jpg
3,9283,http://t0.gstatic.com/images?q=tbn:ANd9GcSxmV1...,Xmas Home Decorating Ideas by Best 25 Christma...,7.74255,/data/thesis/laion_5m_images/0009283.jpg
4,2750,https://cdn.smokymountains.com/vacation-rental...,Photo of a Gatlinburg Cabin named The Blue Spr...,7.035913,/data/thesis/laion_5m_images/0002750.jpg


In [15]:
# efficiency_logger.py
import time, csv, torch
from pathlib import Path

def timed_generate(run_name, gen_fn, *, resolution, steps, cfg,
                   attn_ip, ip_scale, text_scale, cn_scale=None,
                   precision="fp16"):
    # warmup
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    _ = gen_fn(warmup=True)

    # measure
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
    t0 = time.perf_counter()
    _ = gen_fn()
    if torch.cuda.is_available():
        torch.cuda.synchronize()
        peak = torch.cuda.max_memory_allocated() / (1024**2)
    else:
        peak = 0.0
    dt_ms = (time.perf_counter() - t0) * 1000

    row = dict(
        run=run_name, res=resolution, steps=steps, cfg=cfg,
        attn_ip=attn_ip, ip_scale=ip_scale, text_scale=text_scale,
        cn_scale=cn_scale if cn_scale is not None else "",
        latency_ms=round(dt_ms, 1), peak_vram_mb=int(peak), precision=precision
    )
    out = Path("efficiency_logs.csv"); new = not out.exists()
    with out.open("a", newline="") as f:
        w = csv.DictWriter(f, fieldnames=row.keys())
        if new: w.writeheader()
        w.writerow(row)
    return row

# Example:
timed_generate(
  "sdxl_1024_ip0.6", lambda warmup=False: run_pipeline(warmup),
  resolution=1024, steps=50, cfg=7.0, attn_ip=0.6, ip_scale=0.4, text_scale=1.1, cn_scale=""
)


NameError: name 'run_pipeline' is not defined