<a href="https://colab.research.google.com/github/Bhargav-Hazarika/UDA-with-NST/blob/main/NST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Mount Google Drive so the dataset folders below are reachable from this runtime.
drive.mount('/content/drive')

# Single source of truth for the dataset location; every path below derives from it,
# so moving the dataset means editing one line instead of twelve.
DATASET_ROOT = "/content/drive/MyDrive/NST Dataset"

# Base paths
# Root folder that receives everything the preprocessing pipeline writes.
GDRIVE_BASE = f"{DATASET_ROOT}/filter_preproc"

# Sub-folders of DATASET_ROOT holding the raw source images.
RAW_STYLE_FOLDERS = (
    "Northern_Renaissance",
    "High_Renaissance",
    "Cubism",
    "Early_Renaissance",
    "Color_Field_Painting",
    "Contemporary_Realism",
    "Baroque",
    "Art_Nouveau_Modern",
    "Analytical_Cubism",
    "Action_painting",
    "Abstract_Expressionism",
)
# Full paths of the raw source folders, in the same order as RAW_STYLE_FOLDERS.
RAW_ROOTS = [f"{DATASET_ROOT}/{folder}" for folder in RAW_STYLE_FOLDERS]


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

base = GDRIVE_BASE # '/content/preprocess'

# Directory layout under `base`; each tuple is a path relative to it.
# The empty tuple stands for `base` itself.
_layout = (
    (),
    ("original_raw",),
    ("preproc", "images_jpeg"),
    ("manifests",),
    ("stylized",),  # optional
    ("logs",),
)
for _relative in _layout:
    # exist_ok=True makes re-running this cell harmless.
    os.makedirs(os.path.join(base, *_relative), exist_ok=True)

print("Created folder structure under", base)


Created folder structure under /content/drive/MyDrive/NST Dataset/filter_preproc


In [None]:
!pip install imagehash

Collecting imagehash
  Downloading ImageHash-4.3.2-py2.py3-none-any.whl.metadata (8.4 kB)
Downloading ImageHash-4.3.2-py2.py3-none-any.whl (296 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.7/296.7 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: imagehash
Successfully installed imagehash-4.3.2


In [None]:
# Preprocessing core
import os, hashlib, pathlib, time, sys
from PIL import Image, ImageFile
import imagehash
import pandas as pd
from tqdm import tqdm

In [None]:
# CONFIG

# --- image parameters ---
RES = 256                      # default side length (pixels) of the square output images
MIN_SIDE = 128                 # inputs whose shorter side is below this are rejected
JPEG_QUALITY = 92              # quality setting passed to the JPEG encoder
PHASH_FUNC = imagehash.phash   # perceptual hash applied to each processed image

# --- locations ---
# base = '/content/preprocess'
BASE = GDRIVE_BASE
SRC_DIRS = RAW_ROOTS  # from earlier cell
_MANIFEST_DIR = os.path.join(BASE, "manifests")
OUT_DIR = os.path.join(BASE, "preproc", "images_jpeg")
MANIFEST_PQ = os.path.join(_MANIFEST_DIR, "manifest.parquet")
MANIFEST_CSV = os.path.join(_MANIFEST_DIR, "manifest.csv")
LOGFILE = os.path.join(BASE, "logs", "preproc_log.txt")

In [None]:
# Helpers
def sha256_file(path):
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is consumed in 8192-byte pieces, so its full content is never
    held in memory at once.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while True:
            piece = stream.read(8192)
            if not piece:  # empty read means end of file
                break
            digest.update(piece)
    return digest.hexdigest()

def process_and_save(in_path, out_path, res=RES):
    """Center-crop an image to a square, resize it and save it as a JPEG.

    Args:
        in_path: Path of the source image to read.
        out_path: Path the processed JPEG is written to.
        res: Side length, in pixels, of the square output image.

    Returns:
        A dict that always contains "valid", "width" and "height":

        * success   -- valid=True, "sha_proc" (SHA-256 hex digest of the written
          file), "phash" (string form of PHASH_FUNC applied to the processed
          image), width == height == res.
        * too small -- valid=False, notes="too_small", width/height of the source.
        * failure   -- valid=False, notes="error:<message>", width/height None.

    Exceptions raised while reading, transforming or writing the image are
    caught and reported through "notes" instead of propagating.
    """
    write_started = False
    try:
        with Image.open(in_path) as im:
            w, h = im.size
            # Reject small inputs before paying for the full RGB conversion.
            if min(w, h) < MIN_SIDE:
                return {"valid": False, "notes": "too_small", "width": w, "height": h}
            im = im.convert("RGB")
            # center crop square
            side = min(w, h)
            left = (w - side) // 2
            top = (h - side) // 2
            im = im.crop((left, top, left + side, top + side)).resize((res, res), Image.LANCZOS)
            ph = str(PHASH_FUNC(im))
            # save
            write_started = True
            im.save(out_path, "JPEG", quality=JPEG_QUALITY, optimize=True)
            sha_proc = sha256_file(out_path)
            return {"valid": True, "sha_proc": sha_proc, "phash": ph, "width": res, "height": res}
    except Exception as e:
        if write_started:
            # A failed write can leave a truncated file behind; remove it so no
            # broken JPEG sits in the output folder.
            try:
                os.remove(out_path)
            except OSError:
                pass
        # width/height are None (not missing) so callers never have to invent a
        # placeholder of a different type for a numeric manifest column.
        return {"valid": False, "notes": f"error:{e}", "width": None, "height": None}

In [None]:
# Resume support: reuse the manifest saved by a previous run when there is one,
# otherwise start from an empty table with the expected columns.
_MANIFEST_COLUMNS = [
    "id", "orig_path", "proc_path", "sha_orig", "sha_proc", "phash",
    "width", "height", "valid", "notes", "source", "timestamp",
]
manifest = (
    pd.read_parquet(MANIFEST_PQ)
    if os.path.exists(MANIFEST_PQ)
    else pd.DataFrame(columns=_MANIFEST_COLUMNS)
)

In [None]:
# Collect every path under the source folders (searched recursively) whose
# extension, compared case-insensitively, is one of `exts`; sorted so the
# processing order is the same on every run.
exts = {".jpg",".jpeg",".png",".bmp",".tiff"}
files = sorted(
    str(candidate)
    for root in SRC_DIRS
    for candidate in pathlib.Path(root).rglob("*")
    if candidate.suffix.lower() in exts
)

In [None]:
# Originals already recorded in the manifest are skipped by the processing loop.
processed_orig = set(manifest["orig_path"].astype(str)) if not manifest.empty else set()

# Output files are named <number>.jpg, counting on from start_idx. The dedupe
# cells drop rows and save the manifest back, so len(manifest) can be lower than
# the highest number already used in a file name; continuing from the row count
# would then overwrite images that surviving rows still point to. Continue from
# whichever is larger. On a manifest that was never deduplicated both are equal.
start_idx = len(manifest)
if not manifest.empty:
    used_numbers = pd.to_numeric(
        manifest["proc_path"].astype(str).str.extract(r"(\d+)\.jpg$", expand=False),
        errors="coerce",
    ).dropna()
    if not used_numbers.empty:
        start_idx = max(start_idx, int(used_numbers.max()))

In [None]:
# Number of newly processed files between manifest checkpoints. The pass takes
# hours, so the manifest is written periodically: after an interrupted session
# the next run reloads it and skips everything already recorded.
CHECKPOINT_EVERY = 500


def _write_manifest(existing, new_rows):
    """Append ``new_rows`` to ``existing``, save parquet and CSV, return the result."""
    combined = pd.concat([existing, pd.DataFrame(new_rows)], ignore_index=True)
    # Save both formats
    combined.to_parquet(MANIFEST_PQ, index=False)
    combined.to_csv(MANIFEST_CSV, index=False)
    return combined


rows = []
for fp in tqdm(files, desc="Preprocess pass"):
    if fp in processed_orig:
        continue
    idx = start_idx + len(rows) + 1
    out_name = f"{idx:06d}.jpg"
    out_path = os.path.join(OUT_DIR, out_name)
    sha_orig = sha256_file(fp)
    resu = process_and_save(fp, out_path, res=RES)
    path_parts = pathlib.Path(fp).parts
    row = {
        "id": f"{idx:06d}",
        "orig_path": fp,
        "proc_path": out_path,
        "sha_orig": sha_orig,
        "sha_proc": resu.get("sha_proc", ""),
        "phash": resu.get("phash", ""),
        # None (not "") when unknown: a column mixing ints and strings cannot be
        # written to parquet.
        "width": resu.get("width"),
        "height": resu.get("height"),
        "valid": resu.get("valid", False),
        "notes": resu.get("notes", ""),
        # NOTE(review): parts[-3] is the grandparent directory of the file; for a
        # file sitting directly inside one of SRC_DIRS that is the folder above
        # the source folder, not the source folder itself -- confirm the layout.
        "source": path_parts[-3] if len(path_parts) >= 3 else path_parts[0],
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    }
    rows.append(row)
    if len(rows) % CHECKPOINT_EVERY == 0:
        _write_manifest(manifest, rows)

if rows:
    manifest = _write_manifest(manifest, rows)
    print("Appended", len(rows), "rows. Manifest saved.")
else:
    print("No new files processed. Manifest unchanged.")

Preprocess pass: 100%|██████████| 21195/21195 [3:52:07<00:00,  1.40it/s]

Appended 21195 rows. Manifest saved.


## Deduplicate (exact matches, then near-duplicates via pHash)

In [None]:
MANIFEST_PQ = os.path.join(GDRIVE_BASE,"manifests","manifest.parquet")
manifest = pd.read_parquet(MANIFEST_PQ)

# Rows that failed preprocessing carry an empty sha_proc. Deduplicating on
# sha_proc would treat them all as copies of each other and keep one arbitrary
# failed row, whose proc_path does not exist. Remove them explicitly first.
has_output = manifest["valid"].astype(bool) & manifest["sha_proc"].fillna("").ne("")
print("Rows without a processed image (dropped):", int((~has_output).sum()))
manifest = manifest[has_output].reset_index(drop=True)

# Exact duplicates (same sha_proc)
dups = manifest[manifest.duplicated("sha_proc", keep=False)]
print("Exact dupe rows:", dups.shape[0], "in", dups["sha_proc"].nunique(), "groups")

# Keep first occurrence for exact duplicates
manifest = manifest.drop_duplicates("sha_proc", keep="first").reset_index(drop=True)

Exact dupe groups: 198


In [None]:
# Near-duplicate via pHash (bucket by phash prefix to reduce comparisons)
from collections import defaultdict

PHAMM_TH = 6           # Hamming threshold: hashes at most this many bits apart count as near-duplicates
PHASH_PREFIX_LEN = 6   # leading hex characters used as the bucket key (tunable)

# Parse every hash once and bucket rows by hash prefix. Only rows in the same
# bucket are compared, so near-duplicates whose hashes differ inside the prefix
# are not detected -- the trade-off for avoiding an all-pairs comparison.
hashes = {}
buckets = defaultdict(list)
for row_idx, ph in manifest["phash"].items():
    if pd.isnull(ph) or not ph:
        continue  # rows without a hash cannot be compared
    hashes[row_idx] = imagehash.hex_to_hash(ph)
    buckets[ph[:PHASH_PREFIX_LEN]].append(row_idx)

_orig_sizes = {}


def _orig_size(row_idx):
    """Size in bytes of the row's original file, looked up on disk once per row."""
    if row_idx not in _orig_sizes:
        _orig_sizes[row_idx] = os.path.getsize(manifest.loc[row_idx, "orig_path"])
    return _orig_sizes[row_idx]


to_drop = set()
for bucket in buckets.values():
    for pos, first in enumerate(bucket):
        if first in to_drop:
            # An image that is already being removed must not eliminate others.
            continue
        for second in bucket[pos + 1:]:
            if second in to_drop:
                continue
            if hashes[first] - hashes[second] <= PHAMM_TH:
                # pick which to drop: keep higher orig file size (proxy for quality)
                loser = second if _orig_size(first) >= _orig_size(second) else first
                to_drop.add(loser)
                if loser == first:
                    break

print("Near-duplicates to drop:", len(to_drop))
manifest = manifest.drop(index=list(to_drop)).reset_index(drop=True)

Near-duplicates to drop: 54


In [None]:
# Renumber the surviving rows (optional): order them by proc_path so the result
# is stable, then give each a zero-padded six-digit id counting from 1.
manifest = manifest.sort_values("proc_path").reset_index(drop=True)
manifest["id"] = [f"{position:06d}" for position in range(1, len(manifest) + 1)]

# Persist the deduplicated manifest in both formats.
_manifest_csv_path = os.path.join(GDRIVE_BASE,"manifests","manifest.csv")
manifest.to_parquet(MANIFEST_PQ, index=False)
manifest.to_csv(_manifest_csv_path, index=False)
print("Dedup done. Manifest now has", len(manifest), "rows.")

Dedup done. Manifest now has 21040 rows.


## Sampling

In [None]:
import pandas as pd, numpy as np, os
MANIFEST_PQ = os.path.join(GDRIVE_BASE,"manifests","manifest.parquet")
manifest = pd.read_parquet(MANIFEST_PQ)

# Subset sizes, capped at the number of rows available.
n_total = len(manifest)
n_small = min(2000, n_total)    # tune
n_medium = min(15000, n_total) # tune

# Shuffle once with a fixed seed; each subset is then a prefix of the same
# order, so the small sample is contained in the medium one.
manifest = manifest.sample(frac=1, random_state=42).reset_index(drop=True)

_sampling_dir = os.path.join(GDRIVE_BASE,"manifests")
_subsets = (
    ("sampling_small.csv", n_small),
    ("sampling_medium.csv", n_medium),
    ("sampling_full.csv", n_total),
)
for _csv_name, _row_count in _subsets:
    manifest.iloc[:_row_count].to_csv(os.path.join(_sampling_dir, _csv_name), index=False)

print("Sampling files written. small:", n_small, "medium:", n_medium, "full:", n_total)

Sampling files written. small: 2000 medium: 15000 full: 21040


## Check the size of the processed images on disk

In [None]:
MANIFEST_PQ = os.path.join(GDRIVE_BASE,"manifests","manifest.parquet")
m = pd.read_parquet(MANIFEST_PQ)

# Only measure files that are actually on disk; a manifest row whose processed
# image is missing is counted and reported instead of aborting the cell.
present_paths = [p for p in m['proc_path'] if os.path.exists(p)]
n_missing = len(m) - len(present_paths)
sizes = [os.path.getsize(p) for p in present_paths]

total_bytes = sum(sizes)                            # exact byte total
avg = total_bytes / len(sizes) if sizes else 0.0    # guard against an empty manifest

if n_missing:
    print("Missing processed files:", n_missing)
print("Images:", len(sizes))
print("Avg per image (MB):", avg/1024/1024)
print("Estimated total (GB):", total_bytes/1024/1024/1024)

Images: 21040
Avg per image (MB): 0.02402570320172908
Estimated total (GB): 0.4936531204730272
