In [None]:
# Mount Google Drive so the /content/drive/MyDrive/... paths used by later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip -q install -U huggingface_hub pillow scikit-learn tqdm

import tensorflow as tf
print("Colab TF version:", tf.__version__)

# (Optional) If you have a HF token, call login() so the hub can pull gated weights.
# NOTE: this cell only imports `login`; it never calls it.
from huggingface_hub import login

Colab TF version: 2.19.0


In [None]:
# ====== Imports & basic setup =================================================
import os, io, time, gc, hashlib, warnings
warnings.filterwarnings("ignore", category=UserWarning)

import numpy as np
import pandas as pd
from PIL import Image
from concurrent.futures import ThreadPoolExecutor, as_completed

import tensorflow as tf
from huggingface_hub import from_pretrained_keras

from tqdm.auto import tqdm
from sklearn.model_selection import GroupShuffleSplit, StratifiedGroupKFold, LeaveOneGroupOut
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    f1_score, balanced_accuracy_score, matthews_corrcoef, classification_report, confusion_matrix
)
from sklearn.utils.class_weight import compute_class_weight
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import ParameterGrid

# ====== Devices (Derm inference is CPU-only; head can use GPU if present) ====
GPUS = tf.config.list_physical_devices('GPU')
for g in GPUS:  # no-op when the list is empty
    try:
        # Ask TF to grow GPU memory on demand.
        tf.config.experimental.set_memory_growth(g, True)
    except Exception as err:
        # Best effort: keep going, but say why the setting was not applied
        # instead of silently ignoring the failure.
        print(f"[devices] Could not enable memory growth on {g}: {err}")

# Device strings used with tf.device(...): training prefers the first GPU,
# embedding inference is pinned to the CPU.
TRAIN_DEVICE = '/GPU:0' if GPUS else '/CPU:0'
INFER_DEVICE = '/CPU:0'

## Transforming dataset from long to wide format



## TRAD AUGMENTATION DATASET

In [None]:
# ===================== LONG -> WIDE (robust) =====================
# Input (LONG):  columns ≈ patient_id, region?, feature, image_path, score
# Output (WIDE): columns = patient_id, region,
#                <feat>_img for all feats + <feat>_score for all feats
# Only the current feature's two columns are filled in each output row.

import os, numpy as np, pandas as pd
from IPython.display import display

# Feature names the converter below knows about. The helpers in this cell read
# only FEATURES_ALL and SCORE_COL_ORDER; they do not depend on FEATURES or
# feature2imgcol.
FEATURES_ALL = ["moisture","oiliness","elasticity","texture","redness","hyperpigmentation"]

# (Optional) match train_original.csv *visual* order of SCORE columns
SCORE_COL_ORDER = ["moisture","texture","oiliness","redness","hyperpigmentation","elasticity"]

def _coerce_long_schema(df):
    """Rename common variants to standard long schema names."""
    # normalize column names
    dn = {c: c.strip().lower() for c in df.columns}
    df = df.rename(columns=dn)

    # synonyms for key fields
    rename_map = {}
    def first_present(*cands):
        for c in cands:
            if c in df.columns:
                return c
        return None

    pid_col = first_present("patient_id","patientid","pid","id")
    feat_col = first_present("feature","feat")
    img_col  = first_present("image_path","img_path","path","image")
    score_col= first_present("score","label","y","target")
    region_col = first_present("region","site","area")

    missing = [n for n,(ncol) in {"patient_id":pid_col,"feature":feat_col,"image_path":img_col,"score":score_col}.items() if ncol is None]
    if missing:
        raise ValueError(f"Input long CSV is missing required columns: {missing}")

    rename_map[pid_col] = "patient_id"
    rename_map[feat_col] = "feature"
    rename_map[img_col] = "image_path"
    rename_map[score_col] = "score"
    if region_col and region_col != "region":
        rename_map[region_col] = "region"

    df = df.rename(columns=rename_map)
    if "region" not in df.columns:
        df["region"] = ""

    # clean values
    df["patient_id"] = df["patient_id"].astype(str)
    df["region"] = df["region"].astype(str)
    df["feature"] = df["feature"].astype(str).str.strip().str.lower()
    df["image_path"] = df["image_path"].astype(str)
    # numeric score -> int, allow NaN
    df["score"] = pd.to_numeric(df["score"], errors="coerce").round()

    return df

def long_to_wide_compatible(in_csv, out_csv, allowed_features=FEATURES_ALL, check_files=False, drop_dupes=True):
    """
    Convert a LONG CSV to WIDE, one output row per input row (only that feature filled).

    Args:
        in_csv: path of the long CSV (read with pd.read_csv, then normalized by
            _coerce_long_schema).
        out_csv: path the wide CSV is written to.
        allowed_features: feature names to keep. Names that are not in
            FEATURES_ALL are ignored because the wide layout has no column for them.
        check_files: when True, drop rows whose filled image path does not exist.
            False avoids accidental row drops when Drive isn't mounted.
        drop_dupes: when True, remove duplicate rows (same patient_id, region,
            feature, image_path).

    Returns:
        The wide DataFrame that was written to ``out_csv``.
    """
    df_long = pd.read_csv(in_csv)
    df_long = _coerce_long_schema(df_long)

    # Keep only features that are both requested and representable in the wide
    # layout; anything else would produce a row with no column filled in.
    known = [f for f in allowed_features if f in FEATURES_ALL]
    df_long = df_long[df_long["feature"].isin(known)].copy()

    if drop_dupes:
        df_long = df_long.drop_duplicates(subset=["patient_id","region","feature","image_path"]).reset_index(drop=True)

    # build a wide row per input
    rows = []
    for _, r in df_long.iterrows():
        feat = r["feature"]
        row = {"patient_id": r["patient_id"], "region": r["region"]}
        # init all *_img and *_score columns
        for f in FEATURES_ALL:
            row[f"{f}_img"] = ""
            row[f"{f}_score"] = np.nan
        # fill current feature only
        row[f"{feat}_img"] = r["image_path"]
        row[f"{feat}_score"] = (int(r["score"]) if pd.notna(r["score"]) else np.nan)
        rows.append(row)

    out = pd.DataFrame(rows)

    # exact column order (imgs then scores). Score columns in SCORE_COL_ORDER for readability
    img_cols   = [f"{f}_img" for f in FEATURES_ALL]
    score_cols = [f"{f}_score" for f in SCORE_COL_ORDER] + [f"{f}_score" for f in FEATURES_ALL if f not in SCORE_COL_ORDER]
    ordered_cols = ["patient_id", "region"] + img_cols + score_cols
    out = out.reindex(columns=ordered_cols)

    # OPTIONAL: only drop rows whose SINGLE filled image path doesn't exist
    if check_files:
        def _has_valid_img(row):
            # find which feature is filled and check just that one
            for f in FEATURES_ALL:
                p = row[f"{f}_img"]
                if isinstance(p, str) and p:
                    return os.path.exists(p)
            return False

        # Build the mask explicitly: DataFrame.apply(axis=1) returns a DataFrame
        # (not a boolean Series) for a zero-row frame, which broke the count below.
        mask = pd.Series(
            [_has_valid_img(row) for _, row in out.iterrows()],
            index=out.index, dtype=bool,
        )
        dropped = int((~mask).sum())
        out = out[mask].reset_index(drop=True)
        print(f"[wide] Dropped {dropped} rows with non-existing image files (check_files=True).")

    out.to_csv(out_csv, index=False)
    print(f"[wide] Wrote {out_csv} — rows: {len(out)}, cols: {len(out.columns)}")
    return out

def preview_wide(path, n=3):
    """Read a wide CSV, show its first ``n`` rows and a per-feature row count.

    A row is attributed to the first feature (in FEATURES_ALL order) whose
    ``<feat>_img`` cell is a non-empty string; rows with no such cell are not
    counted.
    """
    df = pd.read_csv(path)
    print(f"[preview] {path} — shape={df.shape}")
    display(df.head(n))

    def _filled_feature(rec):
        # Name of the first feature with a usable image cell, or None.
        return next(
            (f for f in FEATURES_ALL
             if isinstance(rec[f"{f}_img"], str) and rec[f"{f}_img"]),
            None,
        )

    # quick counts by which feature column is filled
    which = [_filled_feature(rec) for _, rec in df.iterrows()]
    s = pd.Series(which).value_counts().sort_index()
    print("[per-feature rows]")
    display(pd.DataFrame({"rows": s}))


In [None]:
# Input (long) and output (wide) CSV paths for the trad-augmentation dataset.
CSV_TRAD_AUG_LONG = "/content/drive/MyDrive/Skin_project/trad_augmented_dataset.csv"
CSV_TRAIN_TRAD_AUG  = "/content/drive/MyDrive/Skin_project/trad_aug_as_wide.csv"

# Convert with the defaults (check_files=False, drop_dupes=True), then re-read
# the written file and preview it.
long_to_wide_compatible(CSV_TRAD_AUG_LONG,CSV_TRAIN_TRAD_AUG)
preview_wide(CSV_TRAIN_TRAD_AUG)

## DIFFUSION BASED AUGMENTATION DATASET

In [None]:
# DIFFUSION BASED AUGM DATASET

import os, numpy as np, pandas as pd
from IPython.display import display

# Global lists (match your project).
# Same values as the FEATURES_ALL / SCORE_COL_ORDER assigned in the
# trad-augmentation cell; running this cell rebinds both names.
FEATURES_ALL = ["moisture","oiliness","elasticity","texture","redness","hyperpigmentation"]
SCORE_COL_ORDER = ["moisture","texture","oiliness","redness","hyperpigmentation","elasticity"]

# --- helpers ---

def _coerce_long_schema(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize a 'long' CSV to canonical column names & types."""
    # 1) normalize headers
    df = df.rename(columns={c: c.strip().lower() for c in df.columns})

    def first_present(*cands):
        for c in cands:
            if c in df.columns: return c
        return None

    pid_col   = first_present("patient_id","patientid","pid","id")
    feat_col  = first_present("feature","feat")
    path_col  = first_present("image_path","img_path","path","image")
    score_col = first_present("score","label","y","target")
    region_col= first_present("region","site","area")
    source_col= first_present("source","origin","dataset","provider")

    missing = [n for n, col in {
        "patient_id": pid_col, "feature": feat_col,
        "image_path": path_col, "score": score_col
    }.items() if col is None]
    if missing:
        raise ValueError(f"Input long CSV is missing required columns: {missing}")

    rename_map = {
        pid_col: "patient_id",
        feat_col: "feature",
        path_col: "image_path",
        score_col: "score",
    }
    if region_col and region_col != "region": rename_map[region_col] = "region"
    if source_col and source_col != "source": rename_map[source_col] = "source"

    df = df.rename(columns=rename_map)

    # 2) clean values
    if "region" not in df.columns:
        df["region"] = ""
    if "source" not in df.columns:
        df["source"] = ""

    df["patient_id"] = df["patient_id"].astype(str)
    df["region"]     = df["region"].astype(str)
    df["feature"]    = df["feature"].astype(str).str.strip().str.lower()
    df["image_path"] = df["image_path"].astype(str)
    df["source"]     = df["source"].astype(str)

    # robust score → int {-1,0,1}
    _label_map = {"low": -1, "avg": 0, "average": 0, "med": 0, "medium": 0, "high": 1}
    def _to_int(v):
        try: return int(round(float(v)))
        except: return _label_map.get(str(v).strip().lower(), 0)
    df["score"] = df["score"].apply(_to_int)

    return df


def long_to_wide_compatible(
    in_csv: str,
    out_csv: str,
    allowed_features=FEATURES_ALL,
    check_files: bool=False,
    drop_dupes: bool=True,
    keep_source: bool=True,
    source_col_name: str="source",
):
    """
    Convert a LONG CSV (one row per (patient, region, feature, path, score[, source]))
    into a WIDE CSV where each row has columns:
      patient_id, region, [<feat>_img]*6, [<feat>_score]*6 [, source]
    Only the current feature's *_img and *_score are filled per row.

    Args:
        in_csv: path of the long CSV.
        out_csv: path the wide CSV is written to.
        allowed_features: feature names to keep. Names that are not in
            FEATURES_ALL are ignored because the wide layout has no column for them.
        check_files: when True, drop rows whose filled image path does not exist.
        drop_dupes: when True, drop rows repeating the same
            (patient_id, region, feature, image_path).
        keep_source: when True and ``source_col_name`` is a column of the
            normalized long frame, copy it into the output.
        source_col_name: name of that column.

    Returns:
        The wide DataFrame that was written to ``out_csv``.
    """
    df_long = pd.read_csv(in_csv)
    df_long = _coerce_long_schema(df_long)

    # Restrict to features that are both requested and representable in the
    # wide layout; anything else would produce a row with no column filled in.
    known = [f for f in allowed_features if f in FEATURES_ALL]
    df_long = df_long[df_long["feature"].isin(known)].copy()

    if drop_dupes:
        df_long = df_long.drop_duplicates(
            subset=["patient_id","region","feature","image_path"]
        ).reset_index(drop=True)

    rows = []
    has_source = keep_source and (source_col_name in df_long.columns)
    for _, r in df_long.iterrows():
        feat = r["feature"]
        row = {"patient_id": r["patient_id"], "region": r["region"]}
        # init all *_img and *_score
        for f in FEATURES_ALL:
            row[f"{f}_img"] = ""
            row[f"{f}_score"] = np.nan
        # fill only this feature; a missing score stays NaN instead of crashing int()
        row[f"{feat}_img"]   = r["image_path"]
        row[f"{feat}_score"] = int(r["score"]) if pd.notna(r["score"]) else np.nan
        if has_source:
            row[source_col_name] = r[source_col_name]
        rows.append(row)

    out = pd.DataFrame(rows)

    # Column order
    img_cols   = [f"{f}_img" for f in FEATURES_ALL]
    score_cols = [f"{f}_score" for f in SCORE_COL_ORDER] + \
                 [f"{f}_score" for f in FEATURES_ALL if f not in SCORE_COL_ORDER]
    ordered = ["patient_id","region"] + img_cols + score_cols
    if has_source: ordered.append(source_col_name)
    out = out.reindex(columns=ordered)

    if check_files:
        def _has_valid_img(row):
            # Check the first filled image cell only.
            for f in FEATURES_ALL:
                p = row[f"{f}_img"]
                if isinstance(p, str) and p:
                    return os.path.exists(p)
            return False

        # Build the mask explicitly: DataFrame.apply(axis=1) returns a DataFrame
        # (not a boolean Series) for a zero-row frame, which broke the count below.
        mask = pd.Series(
            [_has_valid_img(row) for _, row in out.iterrows()],
            index=out.index, dtype=bool,
        )
        dropped = int((~mask).sum())
        out = out[mask].reset_index(drop=True)
        print(f"[wide] Dropped {dropped} rows with non-existing image files.")

    out.to_csv(out_csv, index=False)
    print(f"[wide] Wrote {out_csv} — rows={len(out)}, cols={len(out.columns)}")
    return out


def preview_wide(path, n=5):
    """Read a wide CSV, show its first ``n`` rows and a per-feature row count.

    A row is attributed to the first feature (in FEATURES_ALL order) whose
    ``<feat>_img`` cell is a non-empty string. A ``<feat>_img`` column that is
    absent from the file is treated as empty instead of raising KeyError.
    """
    df = pd.read_csv(path)
    print(f"[preview] {path} — shape={df.shape}")
    display(df.head(n))
    # per-feature row counts (which feature is filled)
    which = []
    for _, r in df.iterrows():
        fnd = None
        for f in FEATURES_ALL:
            # Read the cell once via .get so a missing column cannot raise.
            cell = r.get(f"{f}_img", "")
            if isinstance(cell, str) and cell:
                fnd = f; break
        which.append(fnd)
    s = pd.Series(which, dtype="object").value_counts().sort_index()
    print("[per-feature rows]"); display(pd.DataFrame({"rows": s}))


In [None]:
# Input (long) and output (wide) CSV paths for the diffusion-generated dataset.
CSV_SYN_LONG = "/content/drive/MyDrive/Skin_project/dataset_generated_only.csv"
CSV_SYN_WIDE = "/content/drive/MyDrive/Skin_project/synthetic_as_wide.csv"

# Convert, dropping rows whose image file is not on disk; the returned frame is
# discarded because the preview below re-reads the written CSV.
_ = long_to_wide_compatible(
        CSV_SYN_LONG, CSV_SYN_WIDE,
        check_files=True,     # set False if Drive isn't mounted yet
        keep_source=True      # keeps 'source' column ("synthetic")
    )
preview_wide(CSV_SYN_WIDE)


# GDF Embeddings Extraction Code

In [None]:
# ====== Paths / config ========================================================
# Wide-format CSVs read by load_feature_df_one. CSV_SYN_WIDE repeats the path
# assigned in the diffusion conversion cell.
CSV_TRAIN_POOL   = "/content/drive/MyDrive/Skin_project/train_original.csv"
#CSV_TRAIN_TRAD   = "/content/drive/MyDrive/Skin_project/trad_aug_as_wide.csv"
CSV_SYN_WIDE     = "/content/drive/MyDrive/Skin_project/synthetic_as_wide.csv"
CSV_TEST_FIXED   = "/content/drive/MyDrive/Skin_project/test_original.csv"

# Keep redness out for now
# (feature2imgcol below still maps "redness", so it can be re-added here.)
FEATURES = ["texture","hyperpigmentation","oiliness","moisture","elasticity"]

# Feature name -> name of the image-path column in the wide CSVs.
feature2imgcol = {
    "moisture": "moisture_img",
    "oiliness": "oiliness_img",
    "elasticity": "elasticity_img",
    "texture": "texture_img",
    "redness": "redness_img",
    "hyperpigmentation": "hyperpigmentation_img",
}

# Base labels
DEFAULT_LABELS = [-1, 0, 1]

# Binary remaps for the two very-imbalanced tasks
# (original label -> remapped label; applied by load_feature_df_one)
BINARY_MAP = {
    "oiliness": { -1: 0, 0: 1, 1: 1 },   # dry vs non-dry
    "moisture": { -1: 0, 0: 1, 1: 1 },   # low vs non-low
}
# Labels kept per feature after any remap; features not listed use DEFAULT_LABELS.
FEATURE_LABELS = {
    "oiliness":  [0, 1],
    "moisture":  [0, 1],
    # others keep DEFAULT_LABELS
}

# Defaults for make_val_from_trainpool: validation share of the train pool and split seed.
VAL_FRAC_WITHIN_TRAIN = 0.125
RANDOM_SEED = 42

# Embeddings cache
# One .npy file per image path is stored in this directory (see _emb_cache_path).
CACHE_DIR = "/content/drive/MyDrive/Skin_project/derm_emb_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

# NOTE(review): neither constant below is referenced by the cells visible here;
# the embedding code uses ENCODE_WORKERS / ENCODE_CHUNK / STREAM_BATCH instead — confirm they are still needed.
BATCH_SIZE_INFER = 128
NUM_WORKERS      = min(8, os.cpu_count() or 4)

In [None]:
# ====== Derm foundation model (inference signature only) =====================
# Load the Keras model from the Hugging Face Hub repo "google/derm-foundation"
# and keep its "serving_default" signature as the inference callable.
# build_embeddings_cache calls derm_infer(inputs=...) and reads outs["embedding"].
derm_model = from_pretrained_keras("google/derm-foundation")
derm_infer = derm_model.signatures["serving_default"]

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]



In [None]:

# Tune these if needed (smaller = less RAM)
# NOTE: build_embeddings_cache uses these as default argument values, which are
# bound when that function is defined — run this cell before defining it.
# precache_from_csvs passes them explicitly at call time.
ENCODE_WORKERS = 1            # threads that read & PNG-encode
ENCODE_CHUNK   = 64         # how many images to encode before inferring
STREAM_BATCH   = 32        # inference batch size for Derm (CPU)

In [None]:
def _chunked(it, n):
    for i in range(0, len(it), n):
        yield it[i:i+n]


In [None]:
# ====== Small progress helper =================================================
def make_phase_bar(desc, phases, position=1, leave=True):
    """Create a tqdm bar with one step per entry of ``phases``.

    Returns ``(bar, tick, close)``. ``tick(label=None)`` advances the bar by one
    step and shows ``label`` as the postfix; without a label it shows the next
    unused entry of ``phases`` (nothing once they are exhausted). ``close()``
    closes the bar.
    """
    bar = tqdm(total=len(phases), desc=desc, position=position, leave=leave)
    cursor = 0  # index of the next phase name to display

    def tick(label=None):
        nonlocal cursor
        shown = label
        if shown is None and cursor < len(phases):
            shown = phases[cursor]
        if shown is not None:
            bar.set_postfix_str(str(shown))
        bar.update(1)
        cursor += 1

    def close():
        bar.close()

    return bar, tick, close

# ====== Embedding I/O helpers =================================================
def _png_bytes_from_pil(pil_img: Image.Image) -> bytes:
    """Convert the image to RGB and return it encoded as PNG bytes."""
    rgb = pil_img.convert("RGB")
    with io.BytesIO() as sink:
        rgb.save(sink, format="PNG")
        return sink.getvalue()

def _example_from_png_bytes(image_bytes: bytes) -> bytes:
    """Serialize a tf.train.Example holding ``image_bytes`` under 'image/encoded'."""
    encoded = tf.train.Feature(bytes_list=tf.train.BytesList(value=[image_bytes]))
    features = tf.train.Features(feature={'image/encoded': encoded})
    return tf.train.Example(features=features).SerializeToString()

def _sha1(s: str) -> str:
    return hashlib.sha1(s.encode("utf-8")).hexdigest()

def _emb_cache_path(img_path: str) -> str:
    """Return the cache file for ``img_path``: CACHE_DIR/<sha1 of the path string>.npy.

    The key is derived from the path text only, not from the image contents.
    """
    file_name = f"{_sha1(img_path)}.npy"
    return os.path.join(CACHE_DIR, file_name)

def _load_embed_if_cached(img_path: str):
    """Return the cached embedding for ``img_path``, or None.

    None is returned when the cache file is absent, cannot be loaded, or does
    not hold a 1-D float32 array of length 6144.
    """
    fp = _emb_cache_path(img_path)
    if not os.path.exists(fp):
        return None
    try:
        arr = np.load(fp, allow_pickle=False)
        usable = arr.ndim == 1 and arr.shape[0] == 6144 and arr.dtype == np.float32
    except Exception:
        # Unreadable or malformed cache entries are treated as a cache miss.
        return None
    return arr if usable else None

def _save_embed_to_cache(img_path: str, emb: np.ndarray):
    """Write ``emb`` (cast to float32) to the cache file for ``img_path``.

    The array is written to a temporary file in the cache directory and then
    moved into place with os.replace, so an interrupted run does not leave a
    truncated .npy under the final name. Saving stays best-effort: a failure is
    reported with a printed warning and does not raise.
    """
    fp = _emb_cache_path(img_path)
    tmp = f"{fp}.{os.getpid()}.tmp"
    try:
        # Pass a file object: given a path, np.save appends ".npy" when it is missing.
        with open(tmp, "wb") as fh:
            np.save(fh, emb.astype(np.float32), allow_pickle=False)
        os.replace(tmp, fp)
    except Exception as err:
        print(f"⚠️ Could not cache embedding for {img_path}: {err}")
        try:
            os.remove(tmp)
        except OSError:
            pass  # nothing to clean up

def _encode_one_path(img_path: str):
    """Open an image and return ``(img_path, serialized_example)``.

    The second element is None when the file cannot be opened, converted or
    serialized; no exception is raised.
    """
    serialized = None
    try:
        with Image.open(img_path) as im:
            png = _png_bytes_from_pil(im)
        serialized = _example_from_png_bytes(png)
    except Exception:
        serialized = None
    return img_path, serialized

def build_embeddings_cache(paths, desc="build cache", position=2, allow_infer=True,
                           encode_workers=ENCODE_WORKERS, encode_chunk=ENCODE_CHUNK,
                           stream_batch=STREAM_BATCH, return_map=True):
    """
    Streamed, RAM-safe cache builder.
    If return_map=False, embeddings are NOT accumulated in RAM (best for pre-cache).

    Args:
        paths: iterable of image paths; non-strings and paths that do not exist
            are ignored, duplicates are processed once.
        desc, position: passed to the tqdm progress bar.
        allow_infer: when False, only already-cached embeddings are loaded and a
            warning with the number of missing ones is printed.
        encode_workers: threads used to read and PNG-encode images.
        encode_chunk: number of images encoded before inference starts on them.
        stream_batch: number of images per derm_infer call.
        return_map: when True, return {path: embedding}; otherwise return {}.

    Returns:
        dict mapping image path to its float32 embedding (empty when
        return_map=False). Images that could not be read are skipped and
        reported in a printed summary.
    """
    # Only unique, existing paths (de-duplicate first so each file is stat'ed once)
    candidates = {p for p in paths if isinstance(p, str)}
    uniq = sorted(p for p in candidates if os.path.exists(p))

    cache_map, missing = ({}, [])
    for p in uniq:
        emb = _load_embed_if_cached(p)
        if emb is not None:
            if return_map:
                cache_map[p] = emb
        else:
            missing.append(p)

    if not missing or not allow_infer:
        if missing and not allow_infer:
            print(f"⚠️ {len(missing)} missing embeddings; not inferring (allow_infer=False).")
        return cache_map

    failed = []  # paths whose image could not be read/encoded

    # ---- encode + infer in bounded chunks ----
    pbar = tqdm(total=len(missing), desc=f"{desc}", unit="img", position=position)
    try:
        for chunk_paths in _chunked(missing, encode_chunk):
            # Encode this chunk (bounded)
            encoded = []
            with ThreadPoolExecutor(max_workers=encode_workers) as ex:
                futs = {ex.submit(_encode_one_path, p): p for p in chunk_paths}
                for fut in as_completed(futs):
                    p, ex_bytes = fut.result()
                    if ex_bytes is not None:
                        encoded.append((p, ex_bytes))
                    else:
                        failed.append(p)

            # Infer this chunk in small batches and immediately write to disk
            for i in range(0, len(encoded), stream_batch):
                batch = encoded[i:i+stream_batch]
                batch_paths = [p for p, _ in batch]
                batch_exs   = [exb for _, exb in batch]
                with tf.device(INFER_DEVICE):
                    outs = derm_infer(inputs=tf.constant(batch_exs))
                    embs = outs["embedding"].numpy().astype(np.float32)

                for p, e in zip(batch_paths, embs):
                    # save to disk first
                    _save_embed_to_cache(p, e)
                    # keep in RAM only if requested
                    if return_map:
                        cache_map[p] = e

                # aggressively free batch buffers
                del batch, batch_paths, batch_exs, embs
                gc.collect()

            # free the encoded chunk
            del encoded
            gc.collect()

            pbar.update(len(chunk_paths))
    finally:
        # Close the bar even if inference raises, so it does not linger in the output.
        pbar.close()

    if failed:
        print(f"⚠️ {len(failed)} image(s) could not be read/encoded and were skipped (e.g. {failed[0]}).")
    return cache_map



def clear_embedding_cache():
    """Delete every ``.npy`` file directly inside CACHE_DIR and print a summary.

    Files that cannot be removed are counted and reported instead of being
    ignored silently. Other files and sub-directories are left untouched.
    """
    if not os.path.isdir(CACHE_DIR):
        print("No cache dir to clear."); return
    removed, failed = 0, 0
    for fn in os.listdir(CACHE_DIR):
        if not fn.endswith(".npy"):
            continue
        try:
            os.remove(os.path.join(CACHE_DIR, fn))
            removed += 1
        except OSError:
            failed += 1
    print(f"Cleared {removed} cached files from {CACHE_DIR}.")
    if failed:
        print(f"⚠️ {failed} cached file(s) could not be removed.")

# ====== Data loading (handles binary remaps) =================================
def load_feature_df_one(csv_path, feature):
    """Load one feature's rows from a wide CSV as (patient_id, image_path, label[, region]).

    Rows are kept only when the image path is a string that exists on disk and
    the label is present. Labels are rounded to int, remapped through
    BINARY_MAP when the feature has an entry there, and finally restricted to
    the labels allowed for the feature (FEATURE_LABELS, else DEFAULT_LABELS).
    """
    table = pd.read_csv(csv_path)
    src_img = feature2imgcol[feature]
    src_lbl = f"{feature}_score"

    wanted = ["patient_id", src_img, src_lbl]
    if "region" in table.columns:
        wanted.append("region")
    df = table[wanted].rename(columns={src_img: "image_path", src_lbl: "label"})

    # Keep rows that actually have an image on disk
    on_disk = df["image_path"].apply(lambda p: isinstance(p, str) and os.path.exists(p))
    df = df[on_disk].dropna(subset=["label"])
    df = df.assign(label=df["label"].astype(float).round().astype(int))

    # Optional binary remap (oiliness/moisture); labels without a mapping are dropped
    if feature in BINARY_MAP:
        remapped = df["label"].map(BINARY_MAP[feature])
        df = df.assign(label=remapped).dropna(subset=["label"]).astype({"label": int})

    allowed = FEATURE_LABELS.get(feature, DEFAULT_LABELS)
    return df[df["label"].isin(allowed)].reset_index(drop=True)

def load_feature_df_multi(csv_paths, feature):
    """Load ``feature`` from several wide CSVs and stack the results.

    When the same image_path appears more than once, the first occurrence (in
    ``csv_paths`` order) is kept.
    """
    frames = []
    for path in csv_paths:
        frames.append(load_feature_df_one(path, feature))
    stacked = pd.concat(frames, ignore_index=True)
    return stacked.drop_duplicates(subset=["image_path"]).reset_index(drop=True)

def df_to_Xy(df, emb_map):
    """Build ``(X, y, rows)`` from the rows of ``df`` whose image_path is a key of ``emb_map``.

    ``X`` stacks the matching embeddings, ``y`` holds the labels as int64 and
    ``rows`` is the matching subset of ``df`` with a fresh index. When nothing
    matches, ``X`` is an empty (0, 6144) float32 array, ``y`` an empty int64
    array and ``rows`` an empty slice of ``df``.
    """
    hits = [pos for pos, path in enumerate(df.image_path.values) if path in emb_map]
    if len(hits) == 0:
        empty_X = np.zeros((0, 6144), np.float32)
        empty_y = np.zeros((0,), np.int64)
        return empty_X, empty_y, df.iloc[[]]

    rows = df.iloc[hits].reset_index(drop=True)
    vectors = [emb_map[path] for path in rows.image_path.values]
    X = np.stack(vectors, axis=0)
    y = rows.label.values.astype(np.int64)
    return X, y, rows

def remove_patient_overlap(df_trainpool, df_test):
    """Drop train-pool rows whose patient_id also appears in the test set.

    IDs are compared by their string form so that the same patient stored as
    ``12`` in one CSV and ``"12"`` in the other is still recognised; comparing
    raw values would let such a patient leak into both splits. Missing IDs
    never count as an overlap.

    Returns:
        ``df_trainpool`` itself when nothing overlaps, otherwise a filtered copy
        with a fresh index. Column dtypes are left unchanged.
    """
    train_ids = df_trainpool.patient_id
    train_keys = train_ids.astype(str)
    test_keys = set(df_test.patient_id.dropna().astype(str).unique())

    leaked = train_ids.notna() & train_keys.isin(test_keys)
    n_overlap = int(train_keys[leaked].nunique())
    if n_overlap:
        print(f"⚠️ Removing {n_overlap} overlapping patient_id(s) to prevent leakage.")
        df_trainpool = df_trainpool[~leaked].reset_index(drop=True)
    return df_trainpool

def make_val_from_trainpool(df_trainpool, val_frac_within_train=VAL_FRAC_WITHIN_TRAIN, seed=RANDOM_SEED):
    """Split the train pool into (train, validation) frames grouped by patient_id.

    A single GroupShuffleSplit is drawn with ``test_size=val_frac_within_train``
    and ``random_state=seed``; both returned frames get a fresh index.
    """
    patient_groups = df_trainpool["patient_id"].values
    splitter = GroupShuffleSplit(n_splits=1, test_size=val_frac_within_train, random_state=seed)
    train_rows, val_rows = next(splitter.split(df_trainpool, groups=patient_groups))

    df_train = df_trainpool.iloc[train_rows].reset_index(drop=True)
    df_val = df_trainpool.iloc[val_rows].reset_index(drop=True)
    return df_train, df_val

# ====== Precache convenience ==================================================
def precache_from_csvs(features, train_csvs, include_test=True, clear_cache=False):
    """Compute and store embeddings for every image referenced by the given CSVs.

    Args:
        features: feature names whose image columns are collected.
        train_csvs: wide CSV paths forming the training pool.
        include_test: when True, images from CSV_TEST_FIXED are cached as well.
        clear_cache: when True, existing .npy cache files are deleted first.

    Embeddings are streamed to disk and not accumulated in RAM
    (return_map=False). A summary is printed, followed by a warning when some
    images still have no cache file afterwards.
    """
    if clear_cache:
        clear_embedding_cache()

    csvs = list(train_csvs) + ([CSV_TEST_FIXED] if include_test else [])

    # load_feature_df_one already keeps only string paths that exist on disk,
    # so no second existence check is needed here.
    paths = set()
    for feat in features:
        for csvp in csvs:
            paths.update(load_feature_df_one(csvp, feat).image_path.values)
    uniq = sorted(paths)

    t0 = time.time()
    cached_before = sum(os.path.exists(_emb_cache_path(p)) for p in uniq)

    # STREAMED + DO NOT ACCUMULATE IN RAM
    build_embeddings_cache(
        uniq, desc="precache", allow_infer=True,
        encode_workers=ENCODE_WORKERS, encode_chunk=ENCODE_CHUNK,
        stream_batch=STREAM_BATCH, return_map=False
    )

    still_missing = [p for p in uniq if not os.path.exists(_emb_cache_path(p))]
    cached_after = len(uniq) - len(still_missing)
    print(f"Pre-cache: {len(uniq)} unique images | cached {cached_after} (new {cached_after-cached_before}) in {time.time()-t0:.1f}s.")
    if still_missing:
        print(f"⚠️ {len(still_missing)} image(s) still have no cached embedding (e.g. {still_missing[0]}).")


In [None]:
# 1) Choosing which CSVs to use in the training pool
# NOTE: the trad-aug alternative needs CSV_TRAIN_TRAD, whose assignment is
# commented out in the config cell — uncomment both lines together.


#TRAIN_CSVS = [CSV_TRAIN_POOL, CSV_TRAIN_TRAD]  # real + trad aug
TRAIN_CSVS = [CSV_TRAIN_POOL, CSV_SYN_WIDE]    # real + diffusion

In [None]:
# Same CACHE_DIR value and makedirs call as in the paths/config cell; repeated
# here so the cache directory exists before pre-caching.
CACHE_DIR = "/content/drive/MyDrive/Skin_project/derm_emb_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

In [None]:
# 2) warm the cache once
# Embeds every train/test image of FEATURES that is not cached yet; images that
# already have a valid cache file are skipped, so re-running is cheap.
precache_from_csvs(FEATURES, TRAIN_CSVS, include_test=True)

precache:   0%|          | 0/250 [00:00<?, ?img/s]

Pre-cache: 635 unique images | cached 635 (new 250) in 1924.2s.
