In [3]:
# Install the package in development (editable) mode if needed.
# Prefer %pip over !pip so the install targets this kernel's environment:
# %pip install -e .

import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from multiprocessing import cpu_count
import os
import time
import platform
import psutil
import shutil

# Import the DEGIS package
import degis
from degis.data.dataset import UnifiedImageDataset

In [4]:
def print_system_profile():
    """Print a summary of the software versions and hardware of this host.

    Reported in order: Python and PyTorch versions, logical CPU count,
    total and available RAM, disk usage of ``/data`` (or of the current
    working directory when ``/data`` does not exist), CUDA availability and,
    when CUDA is available, the current GPU's name and total VRAM.

    Sizes are shown in decimal gigabytes (bytes / 1e9). Everything is written
    to stdout; the function returns ``None``.
    """
    gb = 1e9  # bytes per decimal gigabyte

    print("=== SYSTEM PROFILE ===")
    print("Python:", platform.python_version())
    print("PyTorch:", torch.__version__)
    print("CPU cores:", psutil.cpu_count(logical=True))

    mem = psutil.virtual_memory()
    # "free" here is psutil's `available` figure, not its `free` field.
    print(f"RAM: {mem.total/gb:.1f} GB, free {mem.available/gb:.1f} GB")

    # Prefer the /data volume; otherwise report the disk holding the CWD.
    disk_label, disk_path = ("/data", "/data") if os.path.exists("/data") else ("Current", ".")
    usage = shutil.disk_usage(disk_path)
    print(f"{disk_label} disk: total {usage.total/gb:.1f} GB, free {usage.free/gb:.1f} GB")

    has_cuda = torch.cuda.is_available()
    print("CUDA available:", has_cuda)
    if has_cuda:
        device_index = torch.cuda.current_device()
        print("GPU:", torch.cuda.get_device_name(device_index))
        vram_bytes = torch.cuda.get_device_properties(device_index).total_memory
        print(f"VRAM total: {vram_bytes/gb:.1f} GB")
    print("======================")

# Emit the environment summary into this cell's output.
print_system_profile()


=== SYSTEM PROFILE ===
Python: 3.12.3
PyTorch: 2.8.0+cu128
CPU cores: 256
RAM: 540.8 GB, free 475.4 GB
/data disk: total 1099.5 GB, free 249.7 GB
CUDA available: True
GPU: NVIDIA GeForce RTX 5090
VRAM total: 33.7 GB


In [5]:
# Notebook-wide settings. Their consumers are outside this section; presumably
# the DataLoader batch size and the embedding-model variant (TODO confirm).
batch_size: int = 512
embeddings_size: str = "base"

In [12]:
# Dataset selector: exactly one `name` is active; the commented-out
# alternatives are kept so the dataset can be switched by editing this cell.
# name = "coco"
# name = "adimagenet"
name = "laion_5m"

# Manifest CSV of the selected dataset.
csv_path = "/data/thesis/{}_manifest.csv".format(name)

In [13]:
# Load the manifest selected via `csv_path` and report its size and columns.
df = pd.read_csv(csv_path)
print("Dataset loaded:", len(df), "images")
print("Columns:", df.columns.tolist())
print(df.shape)
# Bare last expression so the notebook renders the first rows as a table.
df.head()

Dataset loaded: 3336240 images
Columns: ['id', 'url', 'caption', 'aesthetic', 'local_path']
(3336240, 5)


Unnamed: 0,id,url,caption,aesthetic,local_path
0,6744,https://t0.gstatic.com/images?q=tbn:ANd9GcQM-D...,Wrought Iron King Headboard And Footboard by B...,7.2972,/data/thesis/laion_5m_images/0006744.jpg
1,5801,https://i.dailymail.co.uk/1s/2020/03/24/16/263...,Kayley and Ryan said they were determined to '...,7.451863,/data/thesis/laion_5m_images/0005801.jpg
2,8959,https://cdn.shopify.com/s/files/1/2068/6307/pr...,Pink Minnie Mouse suspender dress,7.530277,/data/thesis/laion_5m_images/0008959.jpg
3,9283,http://t0.gstatic.com/images?q=tbn:ANd9GcSxmV1...,Xmas Home Decorating Ideas by Best 25 Christma...,7.74255,/data/thesis/laion_5m_images/0009283.jpg
4,2750,https://cdn.smokymountains.com/vacation-rental...,Photo of a Gatlinburg Cabin named The Blue Spr...,7.035913,/data/thesis/laion_5m_images/0002750.jpg


In [14]:
# efficiency_logger.py
import time, csv, torch
from pathlib import Path

def timed_generate(run_name, gen_fn, *, resolution, steps, cfg,
                   attn_ip, ip_scale, text_scale, cn_scale=None,
                   precision="fp16", log_path="efficiency_logs.csv"):
    """Time one call of ``gen_fn`` and append the measurement to a CSV log.

    ``gen_fn`` is invoked twice: once as ``gen_fn(warmup=True)`` (untimed) and
    once as ``gen_fn()`` (timed). Its return values are discarded.

    Args:
        run_name: Label stored in the ``run`` column.
        gen_fn: Callable accepting an optional ``warmup`` keyword argument.
        resolution, steps, cfg, attn_ip, ip_scale, text_scale, precision:
            Metadata recorded verbatim in the log row; this function does not
            interpret them.
        cn_scale: Recorded verbatim; ``None`` is stored as an empty string.
        log_path: CSV file to append to. Defaults to ``efficiency_logs.csv``
            in the current working directory.

    Returns:
        dict: The row that was appended. ``latency_ms`` is the wall-clock time
        of the timed call rounded to 0.1 ms. ``peak_vram_mb`` is the peak CUDA
        memory allocated during the timed call, in units of 1024**2 bytes
        truncated to an int (0 when CUDA is unavailable).
    """
    use_cuda = torch.cuda.is_available()

    # Warmup: drain any earlier GPU work, then run once untimed.
    if use_cuda:
        torch.cuda.synchronize()
    gen_fn(warmup=True)

    if use_cuda:
        # CUDA kernels are launched asynchronously. Wait for the warmup to
        # finish so its remaining work is not billed to the timed run.
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

    # Measure.
    t0 = time.perf_counter()
    gen_fn()
    if use_cuda:
        torch.cuda.synchronize()
    # Stop the clock before doing any bookkeeping.
    dt_ms = (time.perf_counter() - t0) * 1000

    peak = torch.cuda.max_memory_allocated() / (1024**2) if use_cuda else 0.0

    row = dict(
        run=run_name, res=resolution, steps=steps, cfg=cfg,
        attn_ip=attn_ip, ip_scale=ip_scale, text_scale=text_scale,
        cn_scale=cn_scale if cn_scale is not None else "",
        latency_ms=round(dt_ms, 1), peak_vram_mb=int(peak), precision=precision
    )

    out = Path(log_path)
    # An existing but empty file still needs a header row.
    write_header = not out.exists() or out.stat().st_size == 0
    with out.open("a", newline="") as f:
        w = csv.DictWriter(f, fieldnames=list(row))
        if write_header:
            w.writeheader()
        w.writerow(row)
    return row

# Example:
# timed_generate(
#   "sdxl_1024_ip0.6", lambda warmup=False: run_pipeline(warmup),
#   resolution=1024, steps=50, cfg=7.0, attn_ip=0.6, ip_scale=0.4, text_scale=1.1, cn_scale=""
# )
