# Use
This notebook extracts a subset of WikiArt (style images) and MS COCO (content images) for use in neural style transfer (NST).

## Extracting Style Images

The goal is to have 50 style images. The first 12 are already defined, so this section selects the remaining 38.

In [2]:
import os
from datasets import load_dataset, Dataset, disable_caching
import itertools, datasets
import nest_asyncio; nest_asyncio.apply()

# Directory holding the style images. NOTE(review): the cells visible below never read
# this name; the save cell defines its own `out` with the same path.
styles_path = "evaluation_pictures/style"

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Stream WikiArt rather than downloading it in full; only a small sample is needed.
stream_ds = load_dataset(
    "huggan/wikiart",
    split="train",
    streaming=True,
)
# Buffered shuffle with a fixed seed so the sampled images are repeatable.
stream_ds = stream_ds.shuffle(seed=42, buffer_size=5000)
# Thread count for decoding, capped at 32.
num_threads = min(32, (os.cpu_count() or 1) + 4)
stream_ds = stream_ds.decode(num_threads=num_threads)
# Candidate pool of 200 examples that the following cells count and filter.
first_200 = stream_ds.take(200)

In [4]:
# Feature object describing the integer-encoded "style" column.
style_feature = stream_ds.features["style"]

# Integer label -> style name, for every class the feature declares.
idx_to_style = {}
for class_id in range(style_feature.num_classes):
    idx_to_style[class_id] = style_feature.int2str(class_id)

# Style name -> integer label (bound method of the feature).
style_to_idx = style_feature.str2int


In [13]:
# Tally the sampled images per style; styles absent from the sample stay at 0.
style_count = dict.fromkeys(idx_to_style.values(), 0)

for record in first_200:
    style_count[idx_to_style[record["style"]]] += 1

print("Style counts in the subset of 200 images:")
for name in style_count:
    print(f"{name}: {style_count[name]}")

Style counts in the subset of 200 images:
Abstract_Expressionism: 8
Action_painting: 0
Analytical_Cubism: 0
Art_Nouveau: 8
Baroque: 16
Color_Field_Painting: 7
Contemporary_Realism: 1
Cubism: 8
Early_Renaissance: 7
Expressionism: 18
Fauvism: 1
High_Renaissance: 2
Impressionism: 25
Mannerism_Late_Renaissance: 4
Minimalism: 1
Naive_Art_Primitivism: 7
New_Realism: 2
Northern_Renaissance: 7
Pointillism: 1
Pop_Art: 0
Post_Impressionism: 17
Realism: 23
Rococo: 9
Romanticism: 12
Symbolism: 9
Synthetic_Cubism: 1
Ukiyo_e: 6


In [14]:
# Pick up to 38 images: at most the first three seen of every style that has at
# least three samples in the pool and is not one of the excluded styles.
excluded_styles = ('Baroque', 'Impressionism', 'Realism', 'Romanticism')
final_subset = []
# Per-style count of images seen so far in this pass (selected or not).
style_count_final = dict.fromkeys(idx_to_style.values(), 0)

for record in first_200:
    if len(final_subset) >= 38:
        break

    style = idx_to_style[record["style"]]
    eligible = style_count[style] >= 3 and style not in excluded_styles

    if eligible and style_count_final[style] < 3:
        final_subset.append(record)

    style_count_final[style] += 1

In [22]:
# Keep only the image object of every selected record, in selection order.
images_final_subset = [record["image"] for record in final_subset]

# Sanity check: show the first selected record and its style name.
first_record = final_subset[0]
print(first_record)
print(idx_to_style[first_record["style"]])

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1862x1382 at 0x79C187159E90>, 'artist': 0, 'genre': 2, 'style': 7}
Cubism


In [None]:
# Write every selected image to disk as "<style>_<n>.jpg", numbering images per style.
out = "evaluation_pictures/style"
os.makedirs(out, exist_ok=True)  # make sure the target directory exists before saving
counts = {}  # style name -> number of images already written for that style

for img, rec in zip(images_final_subset, final_subset):
    # idx_to_style maps the integer label directly to the style name (a str);
    # indexing that string again with ["style"] raised a TypeError.
    style = idx_to_style[rec["style"]]
    i = counts.get(style, 0)
    img.save(os.path.join(out, f"{style}_{i}.jpg"), "JPEG")
    counts[style] = i + 1


## Extracting Content Images

In [2]:
import json, random, re, requests, zipfile, io, tempfile
from pathlib import Path
from pycocotools.coco import COCO

# Upper bound on how many categories are sampled; one image is fetched per sampled category.
N_IMAGES = 50
# COCO split name; selects the instances_<SPLIT>.json annotation file inside the archive.
SPLIT = "val2017"
# Directory the downloaded images are written to (created if missing).
OUT_DIR = Path("coco_images_only")
# Seed passed to random.seed(); None leaves the generator unseeded.
SEED = None

def slugify(s):
    """Return a filename-friendly slug of ``s``.

    The text is lower-cased, stripped of surrounding whitespace, and its spaces
    are replaced by hyphens; every remaining character outside ``a-z``, ``0-9``,
    ``-``, ``_`` and ``.`` is then removed.
    """
    hyphenated = s.lower().strip().replace(" ", "-")
    return re.sub(r"[^a-z0-9\-_.]+", "", hyphenated)

def load_coco(split):
    """Download the COCO 2017 annotation archive and return an indexed COCO object.

    Args:
        split: Split name such as "val2017"; selects the archive member whose
            name ends with ``instances_<split>.json``.

    Returns:
        A ``COCO`` instance whose ``dataset`` is the parsed annotation JSON and
        whose index has been built with ``createIndex()``.

    Raises:
        requests.HTTPError: If the download responds with an error status.
        IndexError: If the archive contains no instances file for ``split``
            (same exception type the previous list-indexing lookup produced).
    """
    url = "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"
    suffix = f"instances_{split}.json"
    with requests.get(url, stream=True, timeout=60) as r:
        r.raise_for_status()
        # Spool the archive to a temporary file in 1 MiB chunks instead of holding it in memory.
        with tempfile.TemporaryFile() as tmp:
            for ch in r.iter_content(1 << 20):
                if ch:
                    tmp.write(ch)
            tmp.seek(0)
            # Read the zip through the open handle: reopening a named temporary
            # file by path while it is still open does not work on Windows.
            with zipfile.ZipFile(tmp) as z:
                name = next((n for n in z.namelist() if n.endswith(suffix)), None)
                if name is None:
                    raise IndexError(f"no member ending with {suffix!r} in {url}")
                with z.open(name) as f:
                    data = json.load(f)
    c = COCO()
    c.dataset = data
    c.createIndex()
    return c

# Seed only when requested; with SEED = None the generator is left unseeded.
if SEED is not None:
    random.seed(SEED)
OUT_DIR.mkdir(parents=True, exist_ok=True)
coco = load_coco(SPLIT)
cats = coco.loadCats(coco.getCatIds())
# One image is fetched per sampled category, so at most len(cats) images are downloaded.
chosen = random.sample(cats, k=min(N_IMAGES, len(cats)))
seen = set()  # image ids already picked, so no image is saved under two categories
# The context manager closes the session once all downloads are done.
with requests.Session() as s:
    for cat in chosen:
        ids = coco.getImgIds(catIds=[cat["id"]])
        random.shuffle(ids)
        # First image of this category (in shuffled order) that was not picked before.
        pick = next((coco.loadImgs([i])[0] for i in ids if i not in seen), None)
        if not pick:
            continue
        seen.add(pick["id"])
        url = pick.get("coco_url") or pick.get("COCO_URL")
        if not url:
            continue
        dest = OUT_DIR / f"{slugify(cat['name'])}__{pick['file_name']}"
        with s.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            # Stream the image to disk in 1 MiB chunks.
            with open(dest, "wb") as f:
                for ch in r.iter_content(1 << 20):
                    if ch:
                        f.write(ch)


creating index...
index created!
