# DIFFUSION MODEL IMPLEMENTATION FOR DATA AUGMENTATION

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install imagehash

Collecting imagehash
  Downloading ImageHash-4.3.2-py2.py3-none-any.whl.metadata (8.4 kB)
Downloading ImageHash-4.3.2-py2.py3-none-any.whl (296 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/296.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.7/296.7 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: imagehash
Successfully installed imagehash-4.3.2


In [None]:
!pip install lpips

Collecting lpips
  Downloading lpips-0.1.4-py3-none-any.whl.metadata (10 kB)
Downloading lpips-0.1.4-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lpips
Successfully installed lpips-0.1.4


# Imports

In [None]:
import os, math, random
import numpy as np
import pandas as pd
import torch
import cv2
from tqdm.auto import tqdm
from PIL import Image, ImageChops, ImageDraw, ImageFilter
from torchvision import transforms
import matplotlib.pyplot as plt
from torchvision.transforms import functional as F
import seaborn as sns

from diffusers import (
    StableDiffusionImg2ImgPipeline,
    EulerAncestralDiscreteScheduler,
    DPMSolverMultistepScheduler,
)
import lpips
from skimage.metrics import structural_similarity as ssim

# Generation loop



Comment on first trial
- Pass rate (31%): the QC thresholds were too strict for some classes, especially oiliness under blue illumination. Those images have naturally low contrast, so they fail the blur check, and the outputs sometimes look “too similar” to the input, which shows up as a high SSIM.
- Oiliness artifacts: SD 1.5 has no concept of the blue-illumination imaging mode; with too much strength/CFG it tends to blow highlights into a uniform cyan wash or a flashlight-like hotspot, or to collapse to a very dark frame.

In [None]:
# ============================================================
# Proportional oversampling (+80%) with SD 1.5 img2img
# ============================================================

# --------------------
# Paths & run params
# --------------------
# Input table of original training samples and the folder / CSV that receive
# the synthetic images and their per-image metrics.
# NOTE(review): the output folder is named "..._70_2" while OVERSAMPLE_RATIO
# below is 0.80 -- confirm which figure is intended.
TRAIN_ORIGINAL_CSV = "/content/drive/MyDrive/Skin_project/train_original.csv"
OUT_DIR            = "/content/drive/MyDrive/Skin_project/diffusion_oversample_70_2"
LOG_CSV            = os.path.join(OUT_DIR, "oversample_log_2.csv")
os.makedirs(OUT_DIR, exist_ok=True)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
TARGET_SD_SIZE = 512   # side length (px) of the square image handed to the img2img pipeline
GLOBAL_SEED = 2024     # used for the pandas bucket sampling and the module-level torch generator

# Oversampling: +80% per (feature, score) bucket
OVERSAMPLE_RATIO = 0.80
MAX_PER_BUCKET   = None   # optional cap on new samples per bucket; None disables the cap

# When False, images that fail QC are still saved and logged with keep=False;
# when True, failing images are skipped entirely (neither saved nor logged).
FILTER_BY_QC = False

# --------------------
# Compact prompts
# --------------------
# Switches that append extra wording to the score descriptors:
#   SHOW_BLUE_OVERLAY  -> blue-contour phrasing on hyperpigmentation descriptors
#   MOISTURE_TONE_LOCK -> ", keep input skin tone" on the high-moisture descriptor
SHOW_BLUE_OVERLAY  = True
MOISTURE_TONE_LOCK = True

# Base (positive) prompt per imaging mode; BASE_DEFAULT is used for every
# feature that has no dedicated base.
BASE_DEFAULT = "macro skin patch, clinical close-up, high-detail microtexture, square crop, soft even lighting, natural color, no face"
BASE_OIL     = "macro skin patch, blue-illum dermatology imaging, square crop, soft even lighting, no face"
BASE_MOIST   = "macro skin patch, neutral non-blue lighting, square crop, soft even lighting, no face"
BASE_RED     = "macro skin patch with circular measurement patch in view, neutral lighting, soft even lighting, no face"

# Negative terms shared by every feature; per-feature extras are appended to this list.
NEG_BASE = [
    "face","eyes","nose","mouth","hair","portrait","selfie","text","logo","watermark",
    "arrows","grid","legend","scale bar","numbers","excessive blur","heavy noise","oversaturated"
]
NEG_HP_EXTRA     = ["blue tint","solid blue blobs","filled blue shapes","heatmap","colormap"]
# Oiliness: harder negatives against wash/hotspot/exposure artifacts
NEG_OIL_EXTRA    = [
    "heatmap","colormap","contour overlay","uniform blue wash","blue fog","posterization",
    "overexposed","blown highlights","hotspot","bloom","lens flare","vignette","flashlight beam","banding","tiling","haze","overglow"
]
NEG_MOIST_EXTRA  = ["sweat droplets","water droplets","oily glare","lotion smears","blue illumination","blue tint"]
NEG_ELAST_EXTRA  = ["oily glare","sweat droplets","deep wrinkles","surgical tape marks"]
# NOTE(review): "legend" and "scale bar" already appear in NEG_BASE, so they are
# repeated in the redness negative prompt and use up part of the token budget.
NEG_RED_EXTRA    = ["heatmap","colormap","legend","scale bar","blue cast","false color","uniform fill","airbrushed","gaussian blur"]

def join(t):
    """Return the items of ``t`` as one string separated by comma + space."""
    separator = ", "
    return separator.join(t)

# Descriptors (oiliness now includes coverage/scatter)
def oiliness_desc(score:int):
    """Return the prompt descriptor for an oiliness score.

    Args:
        score: Oiliness label; anything accepted by ``int()``. Values at or
            below -1 map to the "low" descriptor, 0 to "balanced", and values
            at or above 1 to the "high" descriptor.

    Returns:
        A descriptor string (never ``None``).

    Raises:
        TypeError / ValueError: if ``score`` cannot be converted by ``int()``.
    """
    s = int(score)
    if s <= -1:
        return "low oiliness; matte; sparse faint blue micro-specular points (<5% area); microtexture clearly visible"
    if s == 0:
        return "balanced oil; soft luster; scattered small blue highlights (10-20% area); microtexture visible"
    # s >= 1. Previously only s == 1 was handled, so larger scores fell off the
    # end of the function and returned None; clamp them to "high", mirroring
    # the `<= -1` clamp on the low side.
    return "high oiliness; glossy; coherent blue specular regions (30-50% area) but pores and lines still visible"

# Prompt descriptor lookup: score_desc[feature][score] -> text appended to the
# base prompt. Every feature defines the three scores -1, 0 and 1.
# Note the label direction visible in the strings: for oiliness -1 means low
# oil and 1 means high oil, whereas for the other features -1 is the most
# pronounced condition (rough / severe / dehydrated / slack / strong erythema)
# and 1 the mildest.
score_desc = {
    "texture": {
        -1: "rough texture; visible pores and micro ridges",
         0: "moderately even texture with some fine pores",
         1: "smooth even texture; refined pores"
    },
    # Built once at definition time from oiliness_desc().
    "oiliness": {
        -1: oiliness_desc(-1),
         0: oiliness_desc(0),
         1: oiliness_desc(1),
    },
    # The blue-contour clause is only appended when SHOW_BLUE_OVERLAY is truthy.
    "hyperpigmentation": {
        -1: "severe hyperpigmentation; dense clustered melanin microspots" + (", thin semi-transparent blue contours around spots" if SHOW_BLUE_OVERLAY else ""),
         0: "moderate hyperpigmentation; noticeable microspots; mild clustering" + (", moderate thin blue contours" if SHOW_BLUE_OVERLAY else ""),
         1: "minimal hyperpigmentation; few small, widely separated microspots" + (", few thin blue contours" if SHOW_BLUE_OVERLAY else ""),
    },
    # Only the score-1 descriptor receives the tone-lock clause (MOISTURE_TONE_LOCK).
    "moisture": {
        -1: "low moisture; dehydrated; matte; fine lines and micro-cracks pronounced; narrow highlights",
         0: "average moisture; balanced hydration; soft highlights; clear microtexture",
         1: "high moisture; plump; diffuse sheen; broader low-contrast highlights; fine lines reduced" + (", keep input skin tone" if MOISTURE_TONE_LOCK else ""),
    },
    "elasticity": {
        -1: "low elasticity; slack microfolds; creases persist; broad dull highlights",
         0: "average elasticity; balanced micro-relief; moderate highlights",
         1: "high elasticity; taut microtexture; minimal creasing; tighter brighter highlights",
    },
    "redness": {
        -1: "strong diffuse erythema inside the circular patch; natural warm pink-red; microtexture visible",
         0: "moderate erythema inside the circular patch; gentle warm pink tone; microtexture visible; not uniform",
         1: "minimal erythema; patch mostly natural tone with subtle pinkness; microtexture preserved",
    },
}

def build_prompt(feature: str, score: int):
    """Assemble the positive and negative prompt for one (feature, score) pair.

    The positive prompt is ``<base>, <score descriptor>`` (leading/trailing
    commas and spaces stripped) followed by a feature-specific suffix. The
    negative prompt is NEG_BASE plus the feature's extra negatives, joined
    with ", ". Unknown features use BASE_DEFAULT, no suffix and no extras.

    Returns:
        tuple[str, str]: ``(prompt, negative_prompt)``.
    """
    # Feature -> dedicated base prompt (anything else uses BASE_DEFAULT).
    base_by_feature = {
        "oiliness": BASE_OIL,
        "moisture": BASE_MOIST,
        "redness": BASE_RED,
    }
    # Feature -> (suffix appended to the prompt, extra negative terms).
    tweaks_by_feature = {
        "texture": (
            ", sharp microtexture, fine micro-lines, no global blur",
            ["flat uniform surface"],
        ),
        "hyperpigmentation": ("", NEG_HP_EXTRA),
        "oiliness": (
            ", blue reflective highlights (not an overlay), natural exposure, no false color",
            NEG_OIL_EXTRA,
        ),
        "moisture": (", no hue shift", NEG_MOIST_EXTRA),
        "elasticity": (
            ", micro-relief indicates recoil, not oil shine",
            NEG_ELAST_EXTRA,
        ),
        "redness": (
            ", natural pink-red inside the circular patch; microtexture visible; not airbrushed",
            NEG_RED_EXTRA,
        ),
    }

    descriptor = score_desc.get(feature, {}).get(int(score), "")
    base_text = base_by_feature.get(feature, BASE_DEFAULT)
    positive = f"{base_text}, {descriptor}".strip(", ")

    suffix, extra_negatives = tweaks_by_feature.get(feature, ("", []))
    negatives = NEG_BASE.copy()
    negatives.extend(extra_negatives)

    return positive + suffix, join(negatives)

# Token-safe trimming
PROMPT_MAX_TOKENS = 75
def trim_to_max_tokens(pipe, text, max_tokens=PROMPT_MAX_TOKENS):
    """Truncate ``text`` to the tokenizer's ``max_tokens`` budget and decode it back.

    Args:
        pipe: Object exposing a ``tokenizer`` that is callable and provides
            ``batch_decode``.
        text: Prompt text to shorten.
        max_tokens: Value passed as ``max_length`` to the tokenizer.

    Returns:
        The decoded (special tokens skipped) text of the truncated encoding.
    """
    tokenizer = pipe.tokenizer
    encoded = tokenizer(
        text,
        truncation=True,
        max_length=max_tokens,
        return_tensors="pt",
    )
    decoded_batch = tokenizer.batch_decode(encoded["input_ids"], skip_special_tokens=True)
    return decoded_batch[0]

# --------------------
# Feature-specific hyperparams
# Oiliness uses lower strength/CFG by default
# --------------------
# Base img2img settings per feature. After jitter() they are passed to the
# pipeline as strength / guidance_scale (cfg) / num_inference_steps (steps).
HPARAMS = {
    "texture":             dict(strength=0.28, cfg=4.8, steps=28),
    "oiliness":            dict(strength=0.23, cfg=3.8, steps=26),
    "hyperpigmentation":   dict(strength=0.35, cfg=5.2, steps=30),
    "moisture":            dict(strength=0.25, cfg=4.5, steps=28),
    "elasticity":          dict(strength=0.25, cfg=4.5, steps=28),
    "redness":             dict(strength=0.30, cfg=4.5, steps=30),
}
def jitter(feature, h):
    """Return a randomly perturbed copy of the base hyperparameters ``h``.

    Draws, in this order, from NumPy's global RNG: a uniform offset for
    ``strength``, a uniform offset for ``cfg`` and a step offset from
    ``[-2, 0, 2]``.

    * oiliness: strength +/-0.04 clipped to [0.15, 0.30], cfg +/-0.6 clipped
      to [2.8, 4.6], steps clipped to [24, 30] (tighter, to avoid washouts).
    * any other feature: strength +/-0.03 clipped to [0.20, 0.55], cfg +/-0.4
      (not clipped), steps clipped to [24, 36].

    Returns:
        dict with ``strength`` (float), ``cfg`` (float) and ``steps`` (int).
    """
    is_oil = feature == "oiliness"
    strength_amp = 0.04 if is_oil else 0.03
    cfg_amp = 0.6 if is_oil else 0.4
    strength_lo, strength_hi = (0.15, 0.30) if is_oil else (0.20, 0.55)
    steps_hi = 30 if is_oil else 36

    # Keep the draw order strength -> cfg -> steps so seeded runs match.
    raw_strength = h["strength"] + np.random.uniform(-strength_amp, strength_amp)
    raw_cfg = h["cfg"] + np.random.uniform(-cfg_amp, cfg_amp)
    raw_steps = h["steps"] + np.random.choice([-2, 0, 2])

    if is_oil:
        raw_cfg = np.clip(raw_cfg, 2.8, 4.6)

    return dict(
        strength=float(np.clip(raw_strength, strength_lo, strength_hi)),
        cfg=float(raw_cfg),
        steps=int(np.clip(raw_steps, 24, steps_hi)),
    )

# --------------------
# Helpers
# --------------------
def center_square_resize(img: Image.Image, size=TARGET_SD_SIZE) -> Image.Image:
    """Center-crop ``img`` to a square, convert to RGB and resize to ``size`` x ``size``.

    The crop side is the shorter image dimension; resizing uses LANCZOS.
    """
    width, height = img.size
    edge = min(width, height)
    left = (width - edge) // 2
    top = (height - edge) // 2
    square = img.crop((left, top, left + edge, top + edge))
    return square.convert("RGB").resize((size, size), Image.LANCZOS)

def normalize_exposure(pil: Image.Image, clip_limit=2.0, tile_grid=(8,8)) -> Image.Image:
    """Equalize local contrast by applying CLAHE to the L channel in LAB space.

    Intended to tame blown hotspots / dark frames typical of blue-illum images.
    The A and B channels are passed through unchanged.

    Args:
        pil: Input image; it is converted with ``COLOR_RGB2BGR``, so a
            3-channel RGB image is expected.
        clip_limit: CLAHE ``clipLimit``.
        tile_grid: CLAHE ``tileGridSize``.

    Returns:
        A new RGB PIL image.
    """
    equalizer = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid)

    bgr_in = cv2.cvtColor(np.array(pil), cv2.COLOR_RGB2BGR)
    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(bgr_in, cv2.COLOR_BGR2LAB))

    lab_out = cv2.merge([equalizer.apply(lightness), chan_a, chan_b])
    bgr_out = cv2.cvtColor(lab_out, cv2.COLOR_LAB2BGR)
    rgb_out = cv2.cvtColor(bgr_out, cv2.COLOR_BGR2RGB)
    return Image.fromarray(rgb_out)

# --------------------
# Metrics & QC
# --------------------
# Shared LPIPS model (AlexNet trunk), moved to DEVICE and put in eval mode once
# at module level; compute_metrics() reuses it for every real/synthetic pair.
LPIPS_FN = lpips.LPIPS(net='alex').to(DEVICE).eval()

def to_lpips_tensor(pil_img, size=224):
    """Convert a PIL image to a (1, 3, size, size) float tensor in [-1, 1] on DEVICE.

    The image is converted to RGB and resized to ``size`` x ``size`` with
    LANCZOS before being scaled from [0, 255] to [-1, 1].
    """
    resized = pil_img.convert("RGB").resize((size, size), Image.LANCZOS)
    channels_first = torch.tensor(np.array(resized)).permute(2, 0, 1)
    unit_range = channels_first.unsqueeze(0).float() / 255.0
    return (unit_range * 2 - 1).to(DEVICE)

def _lap_var(arr_rgb):
    """Return the variance of the Laplacian of the grayscale image (sharpness proxy)."""
    gray = cv2.cvtColor(arr_rgb, cv2.COLOR_RGB2GRAY)
    laplacian = cv2.Laplacian(gray, cv2.CV_64F)
    return float(laplacian.var())

def compute_metrics(pil_real: Image.Image, pil_syn: Image.Image, size=224):
    """Compute similarity and sharpness metrics between a real and a synthetic image.

    Both images are converted to RGB and resized to ``size`` x ``size``
    (aspect ratio is NOT preserved), so callers should pass images with the
    same framing.

    Args:
        pil_real: Reference (real) image.
        pil_syn: Generated image.
        size: Side length used for every metric.

    Returns:
        dict with keys
            ``ssim``       - structural similarity over the RGB channels,
            ``lpips``      - LPIPS distance from the module-level LPIPS_FN,
            ``blur``       - Laplacian variance of the synthetic image,
            ``blur_real``  - Laplacian variance of the real image,
            ``blur_ratio`` - ``blur / (blur_real + 1e-9)``.
    """
    r = np.array(pil_real.convert("RGB").resize((size,size), Image.LANCZOS))
    s = np.array(pil_syn.convert("RGB").resize((size,size), Image.LANCZOS))
    # SSIM for logging. Newer scikit-image takes `channel_axis`; older releases
    # reject it with TypeError and expect `multichannel=True` instead.
    try:
        ssim_val = float(ssim(r, s, channel_axis=2))
    except TypeError:
        ssim_val = float(ssim(r, s, multichannel=True))
    # Pure inference: disable autograd so no graph is built for the LPIPS
    # forward pass (saves memory/time across the whole generation loop).
    with torch.no_grad():
        lp = float(LPIPS_FN(to_lpips_tensor(pil_real,size), to_lpips_tensor(pil_syn,size)).item())
    blur_real = _lap_var(r)
    blur_syn  = _lap_var(s)
    # Epsilon guards against division by zero for a perfectly flat real image.
    blur_ratio = float(blur_syn / (blur_real + 1e-9))
    return dict(ssim=ssim_val, lpips=lp, blur=blur_syn, blur_real=blur_real, blur_ratio=blur_ratio)

# Oiliness specular coverage (blue-illum proxy)
def specular_coverage_blue(pil_img, size=224):
    """Return the fraction of pixels that are both blue-dominant and bright.

    A pixel counts when its blue channel exceeds the larger of red/green by
    more than 15 and its maximum channel value exceeds 80 (0-255 scale). The
    image is converted to RGB and resized to ``size`` x ``size`` first.
    """
    resized = pil_img.convert("RGB").resize((size, size), Image.LANCZOS)
    rgb = np.array(resized).astype(np.float32)
    red, green, blue = rgb[:, :, 0], rgb[:, :, 1], rgb[:, :, 2]
    brightness = np.max(rgb, axis=2)

    blue_dominant = (blue - np.maximum(red, green)) > 15
    bright = brightness > 80
    return float((blue_dominant & bright).mean())

# LPIPS band + per-feature blur ratio + fallback floor
# Threshold keys, as consumed by passes_qc():
#   lpips_min / lpips_max : accepted LPIPS range (inclusive)
#   blur_ratio_min        : minimum synthetic/real Laplacian-variance ratio
#   blur_floor            : absolute Laplacian variance of the synthetic image
#                           that is accepted even when the ratio check fails
# QC_DEFAULT is the fallback for features missing from FEATURE_QC.
QC_DEFAULT = {"lpips_min": 0.18, "lpips_max": 0.65, "blur_ratio_min": 0.50, "blur_floor": 60.0}
FEATURE_QC = {
    "elasticity":         {"lpips_min": 0.18, "lpips_max": 0.65, "blur_ratio_min": 0.35, "blur_floor": 25.0},
    "moisture":           {"lpips_min": 0.18, "lpips_max": 0.65, "blur_ratio_min": 0.45, "blur_floor": 40.0},
    "oiliness":           {"lpips_min": 0.18, "lpips_max": 0.70, "blur_ratio_min": 0.50, "blur_floor": 60.0},
    "texture":            {"lpips_min": 0.22, "lpips_max": 0.60, "blur_ratio_min": 0.70, "blur_floor": 90.0},
    "hyperpigmentation":  {"lpips_min": 0.22, "lpips_max": 0.65, "blur_ratio_min": 0.70, "blur_floor": 100.0},
    "redness":            {"lpips_min": 0.18, "lpips_max": 0.65, "blur_ratio_min": 0.50, "blur_floor": 70.0},
}

# Oiliness: acceptable specular coverage per label
# score -> (min, max) fraction returned by specular_coverage_blue(); adjacent
# ranges overlap.
OIL_COVERAGE_RANGE = {
    -1: (0.00, 0.12),
     0: (0.08, 0.35),
     1: (0.25, 0.60),
}

def passes_qc(feature, score, m, syn_img=None):
    """Decide whether a synthetic image passes quality control.

    Checks, using the thresholds for ``feature`` (QC_DEFAULT if unknown):
      * LPIPS lies inside [lpips_min, lpips_max];
      * sharpness: blur_ratio >= blur_ratio_min OR blur >= blur_floor;
      * oiliness only, when ``syn_img`` is given: blue specular coverage lies
        in the range for ``score`` (fallback range (0.05, 0.60)).

    Args:
        feature: Feature name.
        score: Label of the sample (used for the oiliness coverage range).
        m: Metrics dict with at least ``lpips``, ``blur_ratio`` and ``blur``.
        syn_img: Synthetic PIL image; required for the coverage check.

    Returns:
        tuple[bool, float | None]: ``(passed, coverage)`` where ``coverage``
        is None unless the oiliness coverage check ran.
    """
    thresholds = FEATURE_QC.get(feature, QC_DEFAULT)

    lpips_in_band = thresholds["lpips_min"] <= m["lpips"] <= thresholds["lpips_max"]
    sharp_enough = (
        m["blur_ratio"] >= thresholds["blur_ratio_min"]
        or m["blur"] >= thresholds["blur_floor"]
    )

    cov = None
    coverage_ok = True
    if feature == "oiliness" and syn_img is not None:
        cov = specular_coverage_blue(syn_img)
        cov_min, cov_max = OIL_COVERAGE_RANGE.get(int(score), (0.05, 0.60))
        coverage_ok = cov_min <= cov <= cov_max

    verdict = bool(lpips_in_band and sharp_enough and coverage_ok)
    return verdict, cov

# --------------------
# Load CSV & build full table of usable rows
# --------------------
df = pd.read_csv(TRAIN_ORIGINAL_CSV)

# feature name -> CSV column holding that feature's image path.
# The matching label is read from the "<feature>_score" column.
feature2imgcol = {
    "moisture": "moisture_img",
    "oiliness": "oiliness_img",
    "elasticity": "elasticity_img",
    "texture": "texture_img",
    "redness": "redness_img",
    "hyperpigmentation": "hyperpigmentation_img",
}

def to_int(v, default=0):
    """Best-effort conversion of ``v`` to ``int``.

    Tries ``int(v)`` first, then ``int(float(v))`` (so strings such as "1.0"
    work), and returns ``default`` when neither conversion is possible
    (e.g. None, NaN, infinity, non-numeric text).

    Only conversion errors are caught; KeyboardInterrupt / SystemExit are no
    longer swallowed as they were by the previous bare ``except:``.
    """
    conversion_errors = (TypeError, ValueError, OverflowError)
    try:
        return int(v)
    except conversion_errors:
        pass
    try:
        return int(float(v))
    except conversion_errors:
        return default

# Flatten the wide CSV into one row per (sample, feature): a row is kept only
# when the feature's image path is a non-empty string that exists on disk.
rows = []
for _, r in df.iterrows():
    pid = r.get("patient_id")
    region = r.get("region")
    for feat, col in feature2imgcol.items():
        pth = r.get(col)
        if isinstance(pth, str) and pth and os.path.exists(pth):
            # A missing or unparsable score falls back to 0 via to_int().
            sc = to_int(r.get(f"{feat}_score", 0), 0)
            rows.append((pid, region, feat, sc, pth))
base_df = pd.DataFrame(rows, columns=["patient_id","region","feature","score","image_path"])
print("Usable base rows:", len(base_df))

# --------------------
# Oversampling plan
# --------------------
# For every (feature, score) bucket, plan ceil(n * OVERSAMPLE_RATIO) new
# generations (optionally capped by MAX_PER_BUCKET) and pick the source images.
plan_rows = []
for (feat, sc), g in base_df.groupby(["feature","score"]):
    n = len(g)
    n_new = int(math.ceil(n * OVERSAMPLE_RATIO))
    if MAX_PER_BUCKET is not None:
        n_new = min(n_new, MAX_PER_BUCKET)
    if n_new <= 0:
        continue
    # Sampling is WITH replacement, so a source image can be picked more than
    # once even when n_new <= n; the same random_state is used for every bucket.
    # NOTE(review): confirm replacement is intended when the ratio is below 1.
    sampled = g.sample(n_new, replace=True, random_state=GLOBAL_SEED)
    plan_rows.append(sampled)
# An empty plan still yields a frame with the expected columns.
plan_df = pd.concat(plan_rows, ignore_index=True) if plan_rows else pd.DataFrame(columns=base_df.columns)
print("Planned new generations:", len(plan_df))

# --------------------
# Load pipeline
# --------------------
# Load SD 1.5 img2img; half precision on GPU, full precision on CPU.
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16 if DEVICE=="cuda" else torch.float32,
).to(DEVICE)
# default scheduler (used for non-oiliness)
pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
# Replace the safety checker with a pass-through that flags nothing.
pipe.safety_checker = lambda images, **kwargs: (images, [False]*len(images))

# Optional memory optimisations. Each one is attempted on its own so a failure
# of the first no longer skips the second, and failures are reported instead
# of being silently swallowed by a bare `except: pass`.
for _opt_name in ("enable_attention_slicing", "enable_vae_tiling"):
    try:
        getattr(pipe, _opt_name)()
    except Exception as e:
        print(f"{_opt_name} unavailable, continuing without it:", e)

# scheduler config
# Both schedulers are rebuilt per sample from the same (Euler) config object.
_euler_cfg = pipe.scheduler.config
_dpm_cfg   = _euler_cfg

# --------------------
# Generation loop
# --------------------
# Seed every RNG the loop draws from (jitter uses np.random; per-sample seeds
# and filename suffixes use `random`). Previously only the pandas sampling was
# seeded, so runs were not reproducible despite GLOBAL_SEED.
random.seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
# `rng` is not consumed below (each sample builds its own generator); it is
# kept so any later cell that references it keeps working.
rng = torch.Generator(device=DEVICE).manual_seed(GLOBAL_SEED)

records = []
used_paths = set()  # output paths written in this run; prevents silent overwrites
for _, row in tqdm(plan_df.iterrows(), total=len(plan_df), desc="Oversampling"):
    pid, region, feature, score, real_path = row.tolist()
    try:
        # Decode inside the context manager: the file handle is closed promptly
        # and truncated/corrupt files (which only fail when pixels are read)
        # are skipped instead of aborting the whole run.
        with Image.open(real_path) as init_raw:
            real_ref = center_square_resize(init_raw, TARGET_SD_SIZE)
    except Exception as e:
        print("open fail:", real_path, e)
        continue

    # The pipeline input; only oiliness gets exposure normalisation.
    init_img = real_ref
    if feature == "oiliness":
        init_img = normalize_exposure(real_ref, clip_limit=2.0, tile_grid=(8,8))

    p_full, n_full = build_prompt(feature, score)
    prompt  = trim_to_max_tokens(pipe, p_full)
    neg     = trim_to_max_tokens(pipe, n_full)

    base_h = HPARAMS.get(feature, dict(strength=0.30, cfg=4.8, steps=28))
    h      = jitter(feature, base_h)

    # different seed per sample for variety
    g = torch.Generator(device=DEVICE).manual_seed(random.randint(0, 2_000_000_000))

    # switch scheduler per feature
    if feature == "oiliness":
        pipe.scheduler = DPMSolverMultistepScheduler.from_config(_dpm_cfg)
    else:
        pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(_euler_cfg)

    out = pipe(
        prompt              = prompt,
        negative_prompt     = neg,
        image               = init_img,
        strength            = h["strength"],
        guidance_scale      = h["cfg"],
        num_inference_steps = h["steps"],
        generator           = g
    )
    syn = out.images[0]

    # metrics vs real (SSIM logged only).
    # Compare against the center-cropped real image: the synthetic image was
    # generated from that crop, and compute_metrics() resizes both inputs to a
    # square, so passing the uncropped original would distort non-square
    # sources and skew SSIM/LPIPS/blur.
    mets = compute_metrics(real_ref, syn)
    keep, cov = passes_qc(feature, score, mets, syn_img=syn)

    if FILTER_BY_QC and not keep:
        continue

    # save
    feat_dir = os.path.join(OUT_DIR, feature)
    os.makedirs(feat_dir, exist_ok=True)
    # Redraw the random suffix if this run already used the path, so two
    # samples from the same source never overwrite each other.
    while True:
        fname = f"{feature}_{pid}_{region}_s{score}_aug{random.randint(100000,999999)}.png"
        syn_path = os.path.join(feat_dir, fname)
        if syn_path not in used_paths:
            break
    used_paths.add(syn_path)
    syn.save(syn_path)

    records.append({
        "patient_id": pid,
        "region": region,
        "feature": feature,
        "score": score,
        "real_path": real_path,
        "synthetic_path": syn_path,
        "strength": h["strength"],
        "guidance_scale": h["cfg"],
        "steps": h["steps"],
        "prompt": prompt,
        "negative": neg,
        "ssim": mets["ssim"],
        "lpips": mets["lpips"],
        "blur": mets["blur"],
        "blur_real": mets["blur_real"],
        "blur_ratio": mets["blur_ratio"],
        "specular_cov": cov if feature=="oiliness" else np.nan,
        "keep": bool(keep),
    })

# Log
log_df = pd.DataFrame(records)
log_df.to_csv(LOG_CSV, index=False)
print(f"Wrote {len(log_df)} synthetic images to {OUT_DIR}")
print(f"Metrics log: {LOG_CSV}")
print("Pass rate (LPIPS + blur-ratio QC):", round(100.0 * (log_df['keep'].mean() if len(log_df) else 0), 1), "%")


Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off]




Downloading: "https://download.pytorch.org/models/alexnet-owt-7be5be79.pth" to /root/.cache/torch/hub/checkpoints/alexnet-owt-7be5be79.pth


100%|██████████| 233M/233M [00:01<00:00, 189MB/s]


Loading model from: /usr/local/lib/python3.12/dist-packages/lpips/weights/v0.1/alex.pth
Usable base rows: 366
Planned new generations: 300


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model_index.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

merges.txt: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

scheduler_config.json:   0%|          | 0.00/308 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

text_encoder/model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

safety_checker/model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

unet/diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

vae/diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

`torch_dtype` is deprecated! Use `dtype` instead!


Oversampling:   0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

✅ Wrote 300 synthetic images to /content/drive/MyDrive/Skin_project/diffusion_oversample_70_2
🧾 Metrics log: /content/drive/MyDrive/Skin_project/diffusion_oversample_70_2/oversample_log_2.csv
Pass rate (LPIPS + blur-ratio QC): 48.0 %


## CHECKS ON GENERATED IMAGES

*  Fail/Keep rate
*  Visualisation

In [None]:
# --- Setup & imports --
import sys, subprocess, io, os, random
import pandas as pd
import numpy as np
from PIL import Image

try:
    from docx import Document
    from docx.shared import Inches, Pt
    from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
except ImportError:
    # Attempt to install python-docx (works in Colab)
    print("Installing python-docx...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "python-docx", "--quiet"])
    from docx import Document
    from docx.shared import Inches, Pt
    from docx.enum.text import WD_PARAGRAPH_ALIGNMENT

# =========================
# CONFIG
# =========================
OUT_DIR = "/content/drive/MyDrive/Skin_project/diffusion_oversample_70_2"
LOG_CSV = os.path.join(OUT_DIR, "oversample_log_2.csv")
REPORT_DOCX = os.path.join(OUT_DIR, "oversample_report.docx")

# Whether to show images inline in the notebook and/or export them into Word
SHOW_INLINE = True
EXPORT_DOCX = True

# Display/figure settings
MAX_SIDE = 480        # display size for each image in the pair (square)
RANDOM_SEED = 0       # seed for reproducible random picks
PAIR_FIGSIZE = (10, 5)  # inches

# =========================
# Load and validate log
# =========================
assert os.path.exists(LOG_CSV), f"Log not found: {LOG_CSV}"
log = pd.read_csv(LOG_CSV)

required_cols = {"feature","score","keep","real_path","synthetic_path"}
missing = required_cols - set(log.columns)
assert not missing, f"Log is missing columns: {missing}"

# coerce keep -> bool
# Non-boolean columns are compared as normalised text. "1.0" is accepted because a
# 1/0 flag column containing a missing value is parsed as float and stringifies as
# "1.0"/"0.0"; surrounding whitespace is stripped so padded values like " True" match.
if log["keep"].dtype != bool:
    keep_text = log["keep"].astype(str).str.strip().str.lower()
    log["keep"] = keep_text.isin(["true","1","1.0","yes","y"])
# coerce score -> int (robustly)
def to_int(v, default=0):
    """
    Convert `v` to an int, falling back to `default` when it cannot be converted.

    Tries `int(v)` first, then `int(float(v))` (so "3.9" -> 3). Only conversion
    failures (TypeError, ValueError, OverflowError - e.g. None, "abc", NaN, inf)
    trigger the fallback; anything else, such as KeyboardInterrupt, propagates.
    """
    try:
        return int(v)
    except (TypeError, ValueError, OverflowError):
        pass
    try:
        return int(float(v))
    except (TypeError, ValueError, OverflowError):
        return default
log["score"] = log["score"].apply(to_int)

def _exists(p):
    """Return True only when `p` is a string that names an existing filesystem path."""
    if not isinstance(p, str):
        return False
    return os.path.exists(p)

# Retain a row only when both its real and its synthetic image are on disk.
real_ok = log["real_path"].apply(_exists)
synth_ok = log["synthetic_path"].apply(_exists)
log = log[real_ok & synth_ok].reset_index(drop=True)

n_valid = len(log)
print(f"Rows with valid files: {n_valid}")
if not n_valid:
    raise SystemExit("No rows to show. Double-check your OUT_DIR/LOG_CSV paths and that image files exist.")

# Optional metadata/metric columns: create them as NaN when the log lacks them,
# so later code can index them unconditionally.
for col in ("patient_id", "region", "ssim", "lpips", "blur"):
    if col in log.columns:
        continue
    log[col] = np.nan

# =========================
# Computations
# =========================
def _keep_fail_counts(frame, by):
    """
    Count kept/failed rows of `frame` for each combination of the `by` columns.

    Returns a DataFrame with the `by` columns plus "failed", "kept", "total" and
    "fail_rate_%" (rounded to one decimal).

    The reindex guarantees that both outcome columns exist: when every row shares
    the same `keep` value, unstack() alone yields a single column and the
    "kept" + "failed" sum below would raise KeyError.
    """
    counts = (
        frame.groupby(list(by) + ["keep"]).size()
             .unstack(fill_value=0)
             .reindex(columns=[False, True], fill_value=0)
             .rename(columns={True:"kept", False:"failed"})
             .reset_index()
    )
    counts["total"] = counts["kept"] + counts["failed"]
    counts["fail_rate_%"] = (100.0 * counts["failed"] / counts["total"]).round(1)
    return counts

# 1) Counts per feature, worst fail rate first
feat_counts = _keep_fail_counts(log, ["feature"]).sort_values("fail_rate_%", ascending=False)

print("\n=== Fail/Keep per feature ===")
display(feat_counts)

# 2) Counts per (feature, score)
bucket_counts = _keep_fail_counts(log, ["feature","score"]).sort_values(["feature","score"])

print("\n=== Fail/Keep per (feature, score) ===")
display(bucket_counts)

# =========================
# Word helpers
# =========================
def add_heading(doc, text, level=0):
    """Append a heading with `text` at `level` to `doc`; return whatever `doc.add_heading` returns."""
    return doc.add_heading(text, level=level)

def add_df_table(doc, df: pd.DataFrame, caption: str = None):
    """
    Render `df` into `doc` as a table: one header row of column names, then one
    row per DataFrame row. Missing values become empty cells; everything else is
    written via str(). A bold `caption` paragraph precedes the table when given,
    and an empty paragraph follows it as spacing.
    """
    if caption:
        caption_run = doc.add_paragraph().add_run(caption)
        caption_run.bold = True

    n_rows, n_cols = df.shape
    # One extra row at the top for the header
    table = doc.add_table(rows=n_rows + 1, cols=n_cols)

    # Use the preferred style only if the document defines it; otherwise keep the current one
    preferred_style = "Light List Accent 1"
    available_styles = [style.name for style in doc.styles]
    table.style = preferred_style if preferred_style in available_styles else table.style

    # Header row
    for c in range(n_cols):
        table.cell(0, c).text = str(df.columns[c])

    # Body rows (offset by one because of the header)
    for r in range(n_rows):
        for c in range(n_cols):
            value = df.iat[r, c]
            cell_text = "" if pd.isna(value) else str(value)
            table.cell(r + 1, c).text = cell_text

    # Blank paragraph as spacing after the table
    doc.add_paragraph()

def fig_pair_from_row(row, title_left="REAL", title_right="SYN"):
    """
    Build a two-panel matplotlib figure (real image left, synthetic image right)
    for one log row.

    Returns (fig, meta), where `meta` is a one-line caption string with the row's
    pid/region/SSIM/LPIPS/BLUR values ('NA' where missing). The figure is not
    shown here. If either image cannot be opened, a message is printed and
    (None, None) is returned.
    """
    panels = []
    try:
        for path_key in ("real_path", "synthetic_path"):
            panels.append(Image.open(row[path_key]).convert("RGB"))
    except Exception as e:
        print(" could not open:", row.get("synthetic_path"), e)
        return None, None

    # Plain resize to a common square size (aspect ratio is not preserved)
    target_size = (MAX_SIDE, MAX_SIDE)
    panels = [panel.resize(target_size) for panel in panels]

    fig, axes = plt.subplots(1, 2, figsize=PAIR_FIGSIZE)
    for axis, panel, panel_title in zip(axes, panels, (title_left, title_right)):
        axis.axis("off")
        axis.imshow(panel)
        axis.set_title(panel_title, fontsize=11)
    plt.tight_layout()

    # Caption pieces; metric locals are suffixed so they do not shadow imported names
    pid = row.get("patient_id", np.nan)
    reg = row.get("region", np.nan)
    ssim_val, lpips_val, blur_val = (row.get(key, np.nan) for key in ("ssim", "lpips", "blur"))

    ssim_txt = 'NA' if pd.isna(ssim_val) else ssim_val
    lpips_txt = 'NA' if pd.isna(lpips_val) else lpips_val
    blur_txt = 'NA' if pd.isna(blur_val) else int(blur_val)

    meta = (
        f"pid:{pid}  reg:{reg}  "
        f"SSIM={ssim_txt}  "
        f"LPIPS={lpips_txt}  "
        f"BLUR={blur_txt}"
    )
    return fig, meta

def add_figure_to_doc(doc, fig, width_inches=6.0, caption=None):
    """
    Render `fig` to an in-memory PNG (200 dpi, tight bounding box), close the
    figure, and insert the image into `doc` at `width_inches` wide. When
    `caption` is given, a centred caption paragraph is added below the picture.
    """
    png_buffer = io.BytesIO()
    fig.savefig(png_buffer, format='png', bbox_inches="tight", dpi=200)
    plt.close(fig)

    # Rewind so the document reads the PNG from its first byte
    png_buffer.seek(0)
    doc.add_picture(png_buffer, width=Inches(width_inches))

    if not caption:
        return
    caption_par = doc.add_paragraph(caption)
    caption_par.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

# =========================
# Document creation
# =========================
if EXPORT_DOCX:
    doc = Document()

    # Title line: bold, 20 pt
    title = doc.add_paragraph()
    run = title.add_run("Diffusion Oversampling Report")
    run.font.size = Pt(20)
    run.bold = True

    # Provenance lines, then an empty paragraph as spacing
    for intro_line in (
        f"Source log: {LOG_CSV}",
        f"Total rows with valid files: {len(log)}",
    ):
        doc.add_paragraph(intro_line)
    doc.add_paragraph()

    # Summary tables: (heading, table, caption) in document order
    summary_sections = (
        ("Summary by Feature", feat_counts, "Fail/Keep per feature"),
        ("Summary by (Feature, Score)", bucket_counts, "Fail/Keep per (feature, score)"),
    )
    for section_title, section_df, section_caption in summary_sections:
        add_heading(doc, section_title, level=1)
        add_df_table(doc, section_df, caption=section_caption)

# =========================
# 3) Random examples per (feature, score)
# =========================
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def _emit_example(rows, label):
    """
    Show and/or export one randomly sampled real-vs-synthetic pair from `rows`.

    `label` is "failed" or "kept"; it appears lower-case in the panel title and
    upper-case in the printed/caption text. When `rows` is empty, a
    "none in this bucket" note is printed (and written to the report) instead.
    The sample uses random_state=RANDOM_SEED so the pick is reproducible.
    """
    tag = label.upper()

    if len(rows) == 0:
        note = f"{tag} example: none in this bucket."
        print(note)
        if EXPORT_DOCX:
            doc.add_paragraph(note)
        return

    row = rows.sample(1, random_state=RANDOM_SEED).iloc[0]
    title_left = f"REAL  • pid:{row['patient_id']} reg:{row['region']}"
    title_right = (
        f"SYN ({label}) • SSIM={row.get('ssim',np.nan):.2f}  "
        f"LPIPS={row.get('lpips',np.nan):.2f}  BLUR={row.get('blur',np.nan):.0f}"
    )
    fig, meta = fig_pair_from_row(row, title_left=title_left, title_right=title_right)

    print(f"{tag} example:")
    if fig is None:
        # fig_pair_from_row already reported which image could not be opened
        return

    if SHOW_INLINE:
        plt.figure(fig.number)  # ensure current
        plt.show()
    if EXPORT_DOCX:
        # add_figure_to_doc closes the figure after saving it
        add_figure_to_doc(doc, fig, width_inches=6.0, caption=f"{tag} example — {meta}")
    else:
        # Without export nothing else closes the figure; close it here so figures
        # do not pile up in memory or get rendered at cell end when SHOW_INLINE is False.
        plt.close(fig)

print("\n=== Random examples per (feature, score) ===")
for feat in sorted(log["feature"].unique()):
    sub_feat = log[log["feature"] == feat]
    for sc in sorted(sub_feat["score"].unique()):
        sub = sub_feat[sub_feat["score"] == sc]
        failed = sub[sub["keep"] == False]
        kept   = sub[sub["keep"] == True]

        header = f"\n— {feat} | score={sc} —  (kept={len(kept)}, failed={len(failed)})"
        print(header)

        if EXPORT_DOCX:
            add_heading(doc, f"{feat} | score={sc}", level=2)
            doc.add_paragraph(f"(kept={len(kept)}, failed={len(failed)})")

        # One FAILED and one KEPT example per bucket, in that order
        _emit_example(failed, "failed")
        _emit_example(kept, "kept")

# =========================
# Save document
# =========================
if EXPORT_DOCX:
    os.makedirs(os.path.dirname(REPORT_DOCX), exist_ok=True)
    doc.save(REPORT_DOCX)
    print(f"\n Word report saved to: {REPORT_DOCX}")

print("\nDone.")


In [None]:
######################################################################

# Creating 3 different datasets:
- real only
- real + synthetic
- synthetic only

In [None]:
# ================================================
# Build 3 datasets:
#  - real-only            -> dataset_real_only.csv
#  - real + generated     -> dataset_real_plus_generated.csv
#  - generated-only       -> dataset_generated_only.csv
# Uses oversample_log_2.csv (see LOG_CSV below) produced by the generator.
# ================================================

import os
import pandas as pd

# Inputs: the original training CSV and the generator's output folder / metrics log.
# From your generator script
TRAIN_ORIGINAL_CSV = "/content/drive/MyDrive/Skin_project/train_original.csv"
OUT_DIR            = "/content/drive/MyDrive/Skin_project/diffusion_oversample_70_2"
LOG_CSV            = os.path.join(OUT_DIR, "oversample_log_2.csv")

# Outputs: the three dataset CSVs, all written into OUT_DIR.
REAL_ONLY_CSV      = os.path.join(OUT_DIR, "dataset_real_only.csv")
REAL_PLUS_GEN_CSV  = os.path.join(OUT_DIR, "dataset_real_plus_generated.csv")
GEN_ONLY_CSV       = os.path.join(OUT_DIR, "dataset_generated_only.csv")

# If True, only include synthetics with keep==True
# NOTE(review): with False, synthetic rows the log marks keep==False are also
# written to the generated-only and real+generated datasets - confirm this is intended.
ONLY_KEEP_SYNTHETIC = False

# --- load inputs ---
df_orig = pd.read_csv(TRAIN_ORIGINAL_CSV)
log_df  = pd.read_csv(LOG_CSV)

# Map feature -> image column and score column in your original CSV
# Both dicts share the same keys; the real-row loop iterates feature2imgcol and
# looks up the matching score column in feature2scorecol by feature name.
feature2imgcol = {
    "moisture": "moisture_img",
    "oiliness": "oiliness_img",
    "elasticity": "elasticity_img",
    "texture": "texture_img",
    "redness": "redness_img",
    "hyperpigmentation": "hyperpigmentation_img",
}
feature2scorecol = {
    "moisture": "moisture_score",
    "oiliness": "oiliness_score",
    "elasticity": "elasticity_score",
    "texture": "texture_score",
    "redness": "redness_score",
    "hyperpigmentation": "hyperpigmentation_score",
}

def _to_int(x, default=0):
    try: return int(x)
    except:
        try: return int(float(x))
        except: return default

# --- build REAL rows ---
# Column order of every dataset CSV. Passing it explicitly keeps the schema (and
# the CSV header) intact even when no real image is found, instead of producing
# a column-less frame that later breaks the groupby in summary().
dataset_columns = ["patient_id", "region", "feature", "image_path", "score", "source"]

real_rows = []
for _, r in df_orig.iterrows():
    pid = r.get("patient_id")
    region = r.get("region")
    for feat, img_col in feature2imgcol.items():
        pth = r.get(img_col, "")
        # Emit a row only for a non-empty path whose file actually exists
        if isinstance(pth, str) and len(pth) > 0 and os.path.exists(pth):
            # NOTE(review): a missing/unparseable score falls back to 0 - confirm
            # that 0 is an acceptable label for such rows.
            sc = _to_int(r.get(feature2scorecol[feat], 0), 0)
            real_rows.append({
                "patient_id": pid,
                "region": region,
                "feature": feat,
                "image_path": pth,
                "score": sc,
                "source": "real",
            })
real_df = pd.DataFrame(real_rows, columns=dataset_columns)
real_df.to_csv(REAL_ONLY_CSV, index=False)
print(f"✅ Real-only dataset: {len(real_df)} rows -> {REAL_ONLY_CSV}")

# --- build SYNTHETIC rows from the log ---
# Columns without which no synthetic row can be built; fail early with a clear message.
missing_log_cols = {"feature", "score", "synthetic_path"} - set(log_df.columns)
if missing_log_cols:
    raise KeyError(f"Log is missing columns: {missing_log_cols}")

syn_df = log_df.copy()
if ONLY_KEEP_SYNTHETIC and "keep" in syn_df.columns:
    keep_flags = syn_df["keep"]
    # A non-boolean flag column (e.g. "True"/"yes" text or 1.0/0.0 floats) would make
    # `== True` drop every row; normalise it the same way the report cell does.
    if keep_flags.dtype != bool:
        keep_flags = keep_flags.astype(str).str.strip().str.lower().isin(["true","1","1.0","yes","y"])
    syn_df = syn_df[keep_flags]

# Normalize column names to the common schema
syn_df = syn_df.rename(columns={
    "synthetic_path": "image_path"
})
# reindex (rather than plain selection) creates patient_id/region as NaN when the
# log lacks them, matching the report cell, which treats them as optional.
synthetic_df = syn_df.reindex(columns=[
    "patient_id", "region", "feature", "score", "image_path"
]).copy()
synthetic_df["source"] = "synthetic"

# Optionally drop rows whose image files are missing (defensive)
synthetic_df = synthetic_df[synthetic_df["image_path"].map(lambda p: isinstance(p, str) and os.path.exists(p))]

# Save generated-only
synthetic_df.to_csv(GEN_ONLY_CSV, index=False)
print(f"✅ Generated-only dataset: {len(synthetic_df)} rows -> {GEN_ONLY_CSV}")

# --- build REAL + GENERATED union ---
real_plus_gen = pd.concat([real_df, synthetic_df], ignore_index=True)
real_plus_gen.to_csv(REAL_PLUS_GEN_CSV, index=False)
print(f"✅ Real + Generated dataset: {len(real_plus_gen)} rows -> {REAL_PLUS_GEN_CSV}")

# --- quick summaries ---
def summary(df, name):
    """Print a feature-by-score table of row counts for `df`, under a header line naming the dataset."""
    print(f"\n{name} — counts by (feature, score):")
    bucket_sizes = df.groupby(["feature","score"]).size().rename("n").reset_index()
    count_table = bucket_sizes.pivot(index="feature", columns="score", values="n")
    # (feature, score) pairs absent from the data are NaN after the pivot; print them as 0
    print(count_table.fillna(0).astype(int))

# Print the (feature, score) count table for each of the three datasets, in order.
synthetic_label = "SYNTHETIC (kept only)" if ONLY_KEEP_SYNTHETIC else "SYNTHETIC (all)"
for dataset_frame, dataset_label in (
    (real_df, "REAL"),
    (synthetic_df, synthetic_label),
    (real_plus_gen, "REAL + SYNTHETIC"),
):
    summary(dataset_frame, dataset_label)


✅ Real-only dataset: 366 rows -> /content/drive/MyDrive/Skin_project/diffusion_oversample_70_2/dataset_real_only.csv
✅ Generated-only dataset: 300 rows -> /content/drive/MyDrive/Skin_project/diffusion_oversample_70_2/dataset_generated_only.csv
✅ Real + Generated dataset: 666 rows -> /content/drive/MyDrive/Skin_project/diffusion_oversample_70_2/dataset_real_plus_generated.csv

REAL — counts by (feature, score):
score              -1   0   1
feature                      
elasticity          4  42  15
hyperpigmentation   6  36  19
moisture           42  18   1
oiliness           58   1   2
redness             0   2  59
texture             6  39  16

SYNTHETIC (all) — counts by (feature, score):
score              -1   0   1
feature                      
elasticity          4  34  12
hyperpigmentation   5  29  16
moisture           34  15   1
oiliness           47   1   2
redness             0   2  48
texture             5  32  13

REAL + SYNTHETIC — counts by (feature, score):
score      