In [2]:
# 1) Install kagglehub if you haven't:
#    pip install kagglehub --upgrade
import os, shutil, zipfile, pandas as pd, kagglehub
from pathlib import Path


In [3]:
KAGGLE_SLUG = "vinitasilaparasetty/fitzpatrick-classification-by-ethnicity"
DEST        = Path("fitzpatrick17k")          # final clean layout
DEST.mkdir(exist_ok=True, parents=True)

# ── 2. Download (kagglehub auto-caches) ───────────────────────────────────
print("⏬  Downloading via kagglehub …")
dl_path = Path(
    kagglehub.dataset_download(KAGGLE_SLUG)   # returns cache path
)
print("KaggleHub cache folder:", dl_path)



⏬  Downloading via kagglehub …
KaggleHub cache folder: C:\Users\yrsee\.cache\kagglehub\datasets\vinitasilaparasetty\fitzpatrick-classification-by-ethnicity\versions\2


In [4]:
# ── 2. If a ZIP exists, unzip; else use the folder as-is ──────────────────
zip_candidates = list(dl_path.glob("*.zip"))
if zip_candidates:
    zip_file = zip_candidates[0]
    print("📦  Extracting", zip_file.name)
    with zipfile.ZipFile(zip_file) as zf:
        zf.extractall(DEST)
else:
    # The dataset is already extracted → copy contents to DEST
    for item in dl_path.iterdir():
        tgt = DEST / item.name
        if tgt.exists():
            continue
        print("📁  Copying", item.name, "→", tgt)
        if item.is_dir():
            shutil.copytree(item, tgt)
        else:
            shutil.copy2(item, tgt)


In [5]:
# ── 3. Standardize layout: move CSV + images/ ─────────────────────────────
# Find the main CSV
csv_path = next(DEST.rglob("*.csv"))
# Find the images folder (contains jpg / png)
img_dir  = next(p for p in DEST.rglob("*") if p.is_dir() and any(p.glob("*.jpg")))

if img_dir.resolve() != (DEST / "images").resolve():
    shutil.move(str(img_dir), DEST / "images")

print(f"✅  Dataset ready in: {DEST.resolve()}")
print("   ├─", (DEST / 'labels.csv').relative_to(DEST.parent))
print("   └─", (DEST / 'images').relative_to(DEST.parent))



✅  Dataset ready in: C:\Users\yrsee\everything\ACME-Outreach\skin-diagnostic-engine\fitzpatrick17k
   ├─ fitzpatrick17k\labels.csv
   └─ fitzpatrick17k\images


In [6]:
# ── 4. Quick sanity: view distribution ────────────────────────────────────
df = pd.read_csv(DEST / "labels.csv")
dist = df["phototype"].value_counts().sort_index()
print("\nFitzpatrick I–VI distribution:\n", dist)



Fitzpatrick I–VI distribution:
 phototype
I & II    903
III       903
IV        903
V         903
VI        903
Name: count, dtype: int64


In [7]:
# If you already have these, you can skip the installs.
# Recommended CUDA 12.1 wheels for RTX 40xx:
# !pip -q install --upgrade torch torchvision --index-url https://download.pytorch.org/whl/cu121
# !pip -q install pandas scikit-learn matplotlib Pillow tqdm

import sys, torch
print("PyTorch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))


PyTorch: 2.5.1+cu118
CUDA available: True
GPU: NVIDIA GeForce RTX 4050 Laptop GPU
CUDA available: True
GPU: NVIDIA GeForce RTX 4050 Laptop GPU


In [8]:
from pathlib import Path
import pandas as pd

# Point these at your prepared dataset
DATA_ROOT = Path("fitzpatrick17k")      # folder you created earlier
CSV_PATH  = DATA_ROOT / "labels.csv"    # should exist
IMG_ROOT  = DATA_ROOT / "images"        # should exist

assert CSV_PATH.is_file(), f"Missing CSV at {CSV_PATH}"
assert IMG_ROOT.is_dir(),  f"Missing images/ at {IMG_ROOT}"

df = pd.read_csv(CSV_PATH)
print("CSV columns:", df.columns.tolist())
print(df.head(3))

# Heuristic to find the image-path column if it isn't named 'image_path'
for col in ["image_path","image","filepath","file","filename","path"]:
    if col in df.columns:
        PATH_COL = col
        break
else:
    raise KeyError("No image-path column found. Expected one of: image_path,image,filepath,file,filename,path")

LABEL_COL = "phototype"  # from your dataset
assert LABEL_COL in df.columns, f"'{LABEL_COL}' not found in CSV"


CSV columns: ['file', 'age', 'gender', 'race', 'phototype']
        file    age  gender             race phototype
0    100.jpg  20-29  Female       East Asian       III
1   1000.jpg  20-29    Male  Latino_Hispanic        IV
2  10000.jpg  20-29  Female       East Asian       III


In [9]:
PLOT = False  # set True if you want a bar chart

dist = df[LABEL_COL].value_counts().sort_index()
print("Fitzpatrick I–VI counts:\n", dist.to_string())

if PLOT:
    import matplotlib.pyplot as plt
    dist.plot(kind="bar", title="Fitzpatrick I–VI distribution")
    plt.xlabel("Fitzpatrick type"); plt.ylabel("# images"); plt.show()


Fitzpatrick I–VI counts:
 phototype
I & II    903
III       903
IV        903
V         903
VI        903


In [10]:
# Cell 0 — reset kernel (optional) and seed
# Uncomment the reset line if you want a full kernel reset
# %reset -f

SEED = 42
import random, os
random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)
import numpy as np
np.random.seed(SEED)
import torch
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)  # for multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [11]:
# Cell 1
from pathlib import Path
import pandas as pd
import numpy as np
import unicodedata, urllib.parse, difflib
from collections import Counter, defaultdict
from PIL import Image
import torch

DATA_ROOT = Path("fitzpatrick17k")
CSV_PATH  = DATA_ROOT / "labels.csv"
IMG_ROOT  = DATA_ROOT / "images"

assert CSV_PATH.exists(), f"CSV not found at {CSV_PATH}"
assert IMG_ROOT.exists(), f"images folder not found at {IMG_ROOT}"

df = pd.read_csv(CSV_PATH)
print("Loaded df shape:", df.shape)
print("Columns:", df.columns.tolist())
df.head()


Loaded df shape: (4515, 5)
Columns: ['file', 'age', 'gender', 'race', 'phototype']


Unnamed: 0,file,age,gender,race,phototype
0,100.jpg,20-29,Female,East Asian,III
1,1000.jpg,20-29,Male,Latino_Hispanic,IV
2,10000.jpg,20-29,Female,East Asian,III
3,10001.jpg,20-29,Female,Southeast Asian,V
4,10004.jpg,20-29,Male,Southeast Asian,V


In [12]:
# Cell 2: resolve paths (CSV column 'file')
PATH_COL_RAW = "file"
assert PATH_COL_RAW in df.columns, f"Expected column '{PATH_COL_RAW}' in CSV"

# build file index under IMG_ROOT (case-insensitive)
name_map, stem_map, rel_map = {}, {}, {}
for p in IMG_ROOT.rglob("*"):
    if p.is_file():
        rel = p.relative_to(IMG_ROOT)
        name = unicodedata.normalize("NFKC", p.name).lower()
        stem = unicodedata.normalize("NFKC", p.stem).lower()
        rel_s = str(rel).replace("\\","/").lower()
        name_map[name] = rel
        stem_map.setdefault(stem, []).append(rel)
        rel_map[rel_s] = rel

def _clean_input(raw):
    if pd.isna(raw): return None
    s = str(raw).strip()
    if (s.startswith('"') and s.endswith('"')) or (s.startswith("'") and s.endswith("'")):
        s = s[1:-1].strip()
    parsed = urllib.parse.urlparse(s)
    path = parsed.path if parsed.scheme else s
    path = urllib.parse.unquote(path.split("?")[0].split("#")[0])
    return unicodedata.normalize("NFKC", path).replace("\\","/")

def resolve_path(raw):
    p = _clean_input(raw)
    if not p: return None
    base = Path(p).name.lower()
    base = unicodedata.normalize("NFKC", base)
    if base in name_map: return str(name_map[base])
    stem = Path(base).stem.lower()
    if stem in stem_map and len(stem_map[stem])>0: return str(stem_map[stem][0])
    rel_try = p.lower().lstrip("./")
    if rel_try in rel_map: return str(rel_map[rel_try])
    tail = rel_try.split("/")[-1]
    for k,v in rel_map.items():
        if k.endswith("/"+tail) or k==tail: return str(v)
    cand = difflib.get_close_matches(base, list(name_map.keys()), n=1, cutoff=0.85)
    if cand: return str(name_map[cand[0]])
    return None

df["resolved_path"] = df[PATH_COL_RAW].apply(resolve_path)
missing = df["resolved_path"].isna().sum()
print(f"Resolved: {len(df)-missing}  |  Missing: {missing}")
if missing:
    display(df.loc[df["resolved_path"].isna(), [PATH_COL_RAW]].head(10))
# drop rows without image file
df = df.dropna(subset=["resolved_path"]).reset_index(drop=True)
print("After dropna shape:", df.shape)


Resolved: 4515  |  Missing: 0
After dropna shape: (4515, 6)


In [13]:
# Cell 3 — map original Fitzpatrick labels to 3 classes
# Try common label column names
possible_label_cols = [c for c in ["fitzpatrick", "phototype", "label", "class", "_label_idx", "target"] if c in df.columns]
LABEL_COL = possible_label_cols[0] if possible_label_cols else None
if LABEL_COL is None:
    raise KeyError("No label-like column found in CSV. Columns: " + ", ".join(df.columns))
print("Using label source:", LABEL_COL)

# coerce to numeric where possible
df[LABEL_COL] = pd.to_numeric(df[LABEL_COL], errors="coerce")
# drop rows without numeric label
df = df.dropna(subset=[LABEL_COL]).reset_index(drop=True)

# Map Fitzpatrick numeric values -> 3 classes
# If values are 1..6 -> map 1,2->0 light; 3,4->1 medium; 5,6->2 dark
def fitz_to_3(x):
    x = int(x)
    if x <= 2:
        return 0
    elif x <= 4:
        return 1
    else:
        return 2

df["label3"] = df[LABEL_COL].apply(fitz_to_3)
print("3-class distribution (0=light,1=medium,2=dark):")
print(df["label3"].value_counts().sort_index())
num_classes = 3


Using label source: phototype
3-class distribution (0=light,1=medium,2=dark):
Series([], Name: count, dtype: int64)


In [14]:
# Cell 3: map phototype -> 3 classes (0 light, 1 medium, 2 dark)
def roman_to_int_safe(x):
    if pd.isna(x): return None
    s = str(x).strip().upper()
    # simple roman map for I..VI, also handles '3' etc.
    roman_map = {"I":1,"II":2,"III":3,"IV":4,"V":5,"VI":6}
    if s in roman_map: return roman_map[s]
    try:
        return int(s)
    except:
        # try first character numeric in strings like "III\n"
        for token in s.split():
            try:
                return int(token)
            except: pass
    return None

df["phototype_num"] = df["phototype"].apply(roman_to_int_safe)
n_missing = df["phototype_num"].isna().sum()
if n_missing:
    print("Rows with missing/unparsable phototype:", n_missing)
    display(df.loc[df["phototype_num"].isna(), ["phototype","resolved_path"]].head(10))
    df = df.dropna(subset=["phototype_num"]).reset_index(drop=True)

def map3(x):
    x = int(x)
    if x <= 2: return 0   # I-II -> light
    if x <= 4: return 1   # III-IV -> medium
    return 2              # V-VI -> dark

df["label3"] = df["phototype_num"].apply(map3)
print("3-class distribution:", df["label3"].value_counts().sort_index().to_dict())
num_classes = 3


3-class distribution: {}


In [15]:
# Cell 4: check conflicts (same resolved_path -> multiple label3)
dupes = df.groupby("resolved_path").agg(n_rows=("resolved_path","size"), n_labels=("label3","nunique"))
conflicts = dupes[dupes["n_labels"]>1]
print("Conflicting files count:", len(conflicts))
if len(conflicts)>0:
    display(conflicts.head(10))
    print("Example conflicts (path -> labels):")
    for p in conflicts.index[:10]:
        print(p, "->", sorted(df.loc[df["resolved_path"]==p, "label3"].unique().tolist()))

# Option (manual): if few, you can drop conflicts or choose majority label.
# Automated majority-dedupe (uncomment to enable):
# from collections import Counter
# if len(conflicts)>0:
#     keep=[]
#     for p,g in df.groupby("resolved_path"):
#         if g["label3"].nunique()==1:
#             keep.append(g.iloc[0])
#         else:
#             maj = Counter(g["label3"]).most_common(1)[0][0]
#             keep.append(g[g["label3"]==maj].iloc[0])
#     df = pd.DataFrame(keep).reset_index(drop=True)
#     print("After majority dedupe:", df.shape)


Conflicting files count: 0


In [15]:
# Cell 5: stratified train/val split without sklearn
from collections import defaultdict
np.random.seed(42)
indices_by_class = defaultdict(list)
for idx, lbl in enumerate(df["label3"].values):
    indices_by_class[int(lbl)].append(idx)

train_idx, val_idx = [], []
frac_train = 0.8
for lbl, idxs in indices_by_class.items():
    idxs = idxs.copy()
    np.random.shuffle(idxs)
    split = int(len(idxs)*frac_train)
    train_idx += idxs[:split]
    val_idx += idxs[split:]

train_idx = sorted(set(train_idx))
val_idx = sorted(set(val_idx))
print("train len:", len(train_idx), "val len:", len(val_idx))
print("train distribution:", Counter(df.loc[train_idx,"label3"]))
print("val distribution:", Counter(df.loc[val_idx,"label3"]))


train len: 0 val len: 0
train distribution: Counter()
val distribution: Counter()


In [16]:
# Cell 6: dataset, transforms, balanced sampler
import torchvision.transforms as T
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

IMG_SIZE = 224
train_tf = T.Compose([T.Resize((IMG_SIZE,IMG_SIZE)), T.RandomHorizontalFlip(),
                      T.ColorJitter(0.2,0.2,0.2,0.02), T.ToTensor(),
                      T.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])])
val_tf = T.Compose([T.Resize((IMG_SIZE,IMG_SIZE)), T.ToTensor(),
                    T.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])])

class Melanin3Dataset(Dataset):
    def __init__(self, df_subset, img_root, path_col="resolved_path", label_col="label3", tf=None):
        self.df = df_subset.reset_index(drop=True)
        self.img_root = Path(img_root)
        self.path_col = path_col
        self.label_col = label_col
        self.tf = tf
    def __len__(self): return len(self.df)
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        p = self.img_root / row[self.path_col]
        img = Image.open(p).convert("RGB")
        if self.tf: img = self.tf(img)
        lbl = int(row[self.label_col])
        return img, torch.tensor(lbl, dtype=torch.long)

train_df = df.loc[train_idx].reset_index(drop=True)
val_df   = df.loc[val_idx].reset_index(drop=True)
train_ds = Melanin3Dataset(train_df, IMG_ROOT, tf=train_tf)
val_ds   = Melanin3Dataset(val_df, IMG_ROOT, tf=val_tf)

# balanced sampler (manual)
labels = train_df["label3"].astype(int).values
classes, counts = np.unique(labels, return_counts=True)
total = labels.shape[0]
class_weights = {int(c): float(total / (len(classes) * cnt)) for c,cnt in zip(classes, counts)}
samples_weight = np.array([class_weights[int(l)] for l in labels], dtype=np.float32)
sampler = WeightedRandomSampler(weights=samples_weight, num_samples=len(samples_weight), replacement=True)

batch_size = 64
train_loader = DataLoader(train_ds, batch_size=batch_size, sampler=sampler, num_workers=4)
val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False, num_workers=4)

# quick sanity
for imgs, labels in train_loader:
    print("batch imgs", imgs.shape, "labels min/max", labels.min().item(), labels.max().item())
    break


NameError: name 'train_idx' is not defined

In [17]:
# Cell A: inspect df and the CSV-derived columns
import pandas as pd, numpy as np
from pathlib import Path
DATA_ROOT = Path("fitzpatrick17k")
CSV_PATH  = DATA_ROOT / "labels.csv"
IMG_ROOT  = DATA_ROOT / "images"

print("CSV exists:", CSV_PATH.exists())
print("IMG_ROOT exists:", IMG_ROOT.exists())
print()

# Try re-load the CSV (fresh) to ensure no stale state
try:
    df = pd.read_csv(CSV_PATH)
    print("Reloaded df shape:", df.shape)
except Exception as e:
    print("Failed to reload CSV:", repr(e))
    raise

print("Columns:", df.columns.tolist())
print("\nSample rows (first 10):")
display(df.head(10))

# show unique counts for relevant columns if present
for col in ["file","phototype","phototype_num","resolved_path","label3"]:
    if col in df.columns:
        print(f"{col} unique / nulls: unique={df[col].nunique()}, nulls={df[col].isna().sum()}")
    else:
        print(f"{col} NOT in df")


CSV exists: True
IMG_ROOT exists: True

Reloaded df shape: (4515, 5)
Columns: ['file', 'age', 'gender', 'race', 'phototype']

Sample rows (first 10):


Unnamed: 0,file,age,gender,race,phototype
0,100.jpg,20-29,Female,East Asian,III
1,1000.jpg,20-29,Male,Latino_Hispanic,IV
2,10000.jpg,20-29,Female,East Asian,III
3,10001.jpg,20-29,Female,Southeast Asian,V
4,10004.jpg,20-29,Male,Southeast Asian,V
5,10005.jpg,30-39,Female,Indian,V
6,10006.jpg,40-49,Male,Black,VI
7,10007.jpg,40-49,Female,Latino_Hispanic,IV
8,10008.jpg,20-29,Female,East Asian,III
9,10009.jpg,60-69,Male,White,I & II


file unique / nulls: unique=4515, nulls=0
phototype unique / nulls: unique=5, nulls=0
phototype_num NOT in df
resolved_path NOT in df
label3 NOT in df


In [18]:
# Cell B: list some actual image files on disk and build a name-index
from pathlib import Path
import unicodedata
IMG_ROOT = Path("fitzpatrick17k") / "images"
files = [p for p in IMG_ROOT.rglob("*") if p.is_file()]
print("Number of files under images/:", len(files))
print("Example files (first 20):")
for p in files[:20]:
    print(" ", p.relative_to(IMG_ROOT))

# create a mapping of lowercase basename -> relative path
name_map = {}
for p in files:
    name = unicodedata.normalize("NFKC", p.name).lower()
    name_map[name] = p.relative_to(IMG_ROOT)

# check whether any CSV file entries directly match basenames
if "file" in df.columns:
    df["file_str"] = df["file"].astype(str).str.strip()
    df["file_basename_lc"] = df["file_str"].apply(lambda s: str(s).split("/")[-1].split("\\")[-1].lower())
    df["exists_basename"] = df["file_basename_lc"].apply(lambda b: b in name_map)
    print("\nCSV rows where basename matches a file on disk:", df["exists_basename"].sum(), "/", len(df))
    print("Examples (file -> exists):")
    display(df.loc[df["exists_basename"].head(10).index, ["file","file_basename_lc","exists_basename"]].head(10))
else:
    print("No 'file' column in CSV to check against filesystem.")


Number of files under images/: 35447
Example files (first 20):
  1.jpg
  10.jpg
  100.jpg
  1000.jpg
  10004.jpg
  10005.jpg
  10006.jpg
  10008.jpg
  1001.jpg
  10010.jpg
  10013.jpg
  10015.jpg
  10016.jpg
  10017.jpg
  10018.jpg
  10019.jpg
  1002.jpg
  10021.jpg
  10024.jpg
  10025.jpg

CSV rows where basename matches a file on disk: 2331 / 4515
Examples (file -> exists):


Unnamed: 0,file,file_basename_lc,exists_basename
0,100.jpg,100.jpg,True
1,1000.jpg,1000.jpg,True
2,10000.jpg,10000.jpg,False
3,10001.jpg,10001.jpg,False
4,10004.jpg,10004.jpg,True
5,10005.jpg,10005.jpg,True
6,10006.jpg,10006.jpg,True
7,10007.jpg,10007.jpg,False
8,10008.jpg,10008.jpg,True
9,10009.jpg,10009.jpg,False


In [19]:
# Robust resolver + phototype parser diagnostic (run now)
from pathlib import Path
import unicodedata, re
import pandas as pd, numpy as np
from collections import Counter

DATA_ROOT = Path("fitzpatrick17k")
CSV_PATH  = DATA_ROOT / "labels.csv"
IMG_ROOT  = DATA_ROOT / "images"

# reload df fresh to avoid stale state
df = pd.read_csv(CSV_PATH)
print("CSV rows:", len(df))
print("CSV columns:", df.columns.tolist())
print()

# 1) list files under images/
files = [p for p in IMG_ROOT.rglob("*") if p.is_file()]
print("Number of files under images/:", len(files))
print("Example files (first 20):")
for p in files[:20]:
    print(" ", p.relative_to(IMG_ROOT))
print()

# 2) build basename -> relative path map (case-insensitive, unicode-normalized)
name_map = {}
for p in files:
    name = unicodedata.normalize("NFKC", p.name).lower()
    # prefer first seen; if duplicates exist, we'll keep the first (we can surface duplicates later)
    if name not in name_map:
        name_map[name] = p.relative_to(IMG_ROOT)

# 3) try to match CSV 'file' entries to actual files by basename
if "file" not in df.columns:
    raise KeyError("'file' column not found in CSV. Columns: " + ", ".join(df.columns))

def basename_only(s):
    if pd.isna(s): return ""
    s = str(s).strip()
    # take basename only in case CSV includes path segments
    s = s.split("/")[-1].split("\\")[-1]
    return unicodedata.normalize("NFKC", s).lower()

df["file_basename"] = df["file"].apply(basename_only)
df["match_basename"] = df["file_basename"].apply(lambda b: name_map.get(b, None))

n_matched = df["match_basename"].notna().sum()
print(f"Basename matches found: {n_matched} / {len(df)} ({n_matched/len(df):.1%})")

if n_matched < len(df):
    print("\nSample (first 15) unmatched CSV file basenames:")
    unmatched = df.loc[df["match_basename"].isna(), "file_basename"].head(15).tolist()
    print(unmatched)

# 4) if some matched, assign resolved_path column
df["resolved_path"] = df["match_basename"].apply(lambda r: str(r) if pd.notna(r) else None)

# 5) parse phototype strings robustly into numeric 1..6
def parse_phototype(s):
    if pd.isna(s): return None
    s = str(s).upper()
    # look for roman numerals or digits I..VI or 1..6
    # regex finds the first valid token like I, II, III, IV, V, VI, or digit 1-6
    m = re.search(r'\b(I{1,3}|IV|V|VI|[1-6])\b', s)
    if not m:
        return None
    token = m.group(1)
    roman_map = {"I":1,"II":2,"III":3,"IV":4,"V":5,"VI":6}
    if token in roman_map:
        return roman_map[token]
    try:
        return int(token)
    except:
        return None

df["phototype_num"] = df["phototype"].apply(parse_phototype)
n_ph_missing = df["phototype_num"].isna().sum()
print(f"\nPhototype parseable: {len(df)-n_ph_missing} / {len(df)}  (missing: {n_ph_missing})")
if n_ph_missing>0:
    display(df.loc[df["phototype_num"].isna(), ["phototype","file"]].head(10))

# 6) create label3 (0=light I-II, 1=medium III-IV, 2=dark V-VI)
def to_label3(x):
    if x is None: return None
    x = int(x)
    if x <= 2: return 0
    if x <= 4: return 1
    return 2

df["label3"] = df["phototype_num"].apply(lambda x: to_label3(x) if pd.notna(x) else None)

# 7) keep only rows that have both resolved_path and label3
df_clean = df.dropna(subset=["resolved_path","label3"]).reset_index(drop=True)
print(f"\ndf_clean (have image + phototype): {len(df_clean)} rows")
print("label3 distribution in df_clean:", dict(df_clean["label3"].value_counts().sort_index()))
print()

# 8) If df_clean is non-empty, show a small sample with full details
if len(df_clean) > 0:
    display(df_clean[["file","file_basename","resolved_path","phototype","phototype_num","label3"]].head(12))
else:
    # detailed diagnostics when we failed to match anything
    print("No rows with both matched image and parseable phototype.")
    print("Top 20 CSV 'file' basenames (for inspection):")
    print(df["file_basename"].head(20).tolist())
    print("\nTop 40 image basenames on disk (for inspection):")
    print(list(name_map.keys())[:40])

# 9) show duplicates on disk for basenames (optional info)
from collections import defaultdict
duplicates = defaultdict(list)
for p in files:
    b = unicodedata.normalize("NFKC", p.name).lower()
    duplicates[b].append(p.relative_to(IMG_ROOT))
dups = {k:v for k,v in duplicates.items() if len(v)>1}
print("\nNumber of basename duplicates on disk:", len(dups))
if len(dups) > 0:
    # show a couple examples
    for k,v in list(dups.items())[:8]:
        print(k, "->", v)

# write df back to namespace for downstream work if non-empty
if len(df_clean) > 0:
    df = df_clean
    print("\nAssigned df = df_clean; ready for next steps (split / dataset).")
else:
    print("\ndf not replaced. Fix matching or phototype parsing before proceeding.")


CSV rows: 4515
CSV columns: ['file', 'age', 'gender', 'race', 'phototype']

Number of files under images/: 35447
Example files (first 20):
  1.jpg
  10.jpg
  100.jpg
  1000.jpg
  10004.jpg
  10005.jpg
  10006.jpg
  10008.jpg
  1001.jpg
  10010.jpg
  10013.jpg
  10015.jpg
  10016.jpg
  10017.jpg
  10018.jpg
  10019.jpg
  1002.jpg
  10021.jpg
  10024.jpg
  10025.jpg

Basename matches found: 2331 / 4515 (51.6%)

Sample (first 15) unmatched CSV file basenames:
['10000.jpg', '10001.jpg', '10007.jpg', '10009.jpg', '10011.jpg', '10012.jpg', '10020.jpg', '10022.jpg', '10023.jpg', '10028.jpg', '10031.jpg', '10035.jpg', '10037.jpg', '10040.jpg', '10041.jpg']

Phototype parseable: 4515 / 4515  (missing: 0)

df_clean (have image + phototype): 2331 rows
label3 distribution in df_clean: {0: np.int64(501), 1: np.int64(827), 2: np.int64(1003)}



Unnamed: 0,file,file_basename,resolved_path,phototype,phototype_num,label3
0,100.jpg,100.jpg,100.jpg,III,3,1
1,1000.jpg,1000.jpg,1000.jpg,IV,4,1
2,10004.jpg,10004.jpg,10004.jpg,V,5,2
3,10005.jpg,10005.jpg,10005.jpg,V,5,2
4,10006.jpg,10006.jpg,10006.jpg,VI,6,2
5,10008.jpg,10008.jpg,10008.jpg,III,3,1
6,1001.jpg,1001.jpg,1001.jpg,V,5,2
7,10010.jpg,10010.jpg,10010.jpg,V,5,2
8,10013.jpg,10013.jpg,10013.jpg,III,3,1
9,10015.jpg,10015.jpg,10015.jpg,III,3,1



Number of basename duplicates on disk: 0

Assigned df = df_clean; ready for next steps (split / dataset).


In [20]:
# Fixed heuristic-matching cell (ensures match_method/match_info exist before display)
from pathlib import Path
import unicodedata, re
from collections import defaultdict
from difflib import SequenceMatcher
import pandas as pd, numpy as np

IMG_ROOT = Path("fitzpatrick17k") / "images"
print("IMG_ROOT:", IMG_ROOT, "exists:", IMG_ROOT.exists())

# ensure file_basename present
if "file_basename" not in df.columns:
    df["file_basename"] = df["file"].astype(str).apply(lambda s: str(s).strip().split("/")[-1].split("\\")[-1].lower())

# build disk maps
files = [p for p in IMG_ROOT.rglob("*") if p.is_file()]
disk_map = {unicodedata.normalize("NFKC", p.name).lower(): p.relative_to(IMG_ROOT) for p in files}
disk_stems_map = defaultdict(list)
for p in files:
    stem = unicodedata.normalize("NFKC", p.stem).lower()
    disk_stems_map[stem].append(p.relative_to(IMG_ROOT))

# baseline matched/unmatched
matched_mask = df["file_basename"].apply(lambda b: b in disk_map)
already_matched = df.loc[matched_mask].copy()
unmatched = df.loc[~matched_mask].copy()
print("Already matched (baseline):", len(already_matched), "Unmatched:", len(unmatched))

def extract_number_tokens(s):
    return sorted(set(re.findall(r'\d+', s)), key=lambda x: -len(x))

def best_fuzzy_candidate(target, candidates, min_ratio=0.88):
    best=None; best_ratio=0.0
    for c in candidates:
        r = SequenceMatcher(None, target, c).ratio()
        if r>best_ratio:
            best_ratio=r; best=c
    if best_ratio>=min_ratio:
        return best, best_ratio
    return None, None

auto_matches = {}
manual_suggestions = {}

for idx, row in unmatched.iterrows():
    b = row["file_basename"]
    stem = Path(b).stem
    # 1) stem exact
    if stem in disk_stems_map and len(disk_stems_map[stem])==1:
        auto_matches[idx] = (str(disk_stems_map[stem][0]), "stem_exact", None)
        continue
    # 2) numeric containment
    nums = extract_number_tokens(b)
    if nums:
        main = nums[0]
        candidates = [k for k in disk_map.keys() if main in k]
        if len(candidates)==1:
            auto_matches[idx] = (str(disk_map[candidates[0]]), "contains_number", main)
            continue
        elif 1 < len(candidates) <= 6:
            manual_suggestions[idx] = [str(disk_map[c]) for c in candidates]
            continue
    # 3) endswith
    tail = b
    candidates = [k for k in disk_map.keys() if k.endswith(tail)]
    if len(candidates)==1:
        auto_matches[idx] = (str(disk_map[candidates[0]]), "endswith", None)
        continue
    elif 1 < len(candidates) <= 6:
        manual_suggestions[idx] = [str(disk_map[c]) for c in candidates]
        continue
    # 4) fuzzy basename
    cand, ratio = best_fuzzy_candidate(b, disk_map.keys(), min_ratio=0.88)
    if cand:
        auto_matches[idx] = (str(disk_map[cand]), "fuzzy", float(ratio))
        continue
    # 5) fuzzy stem -> single file
    cand, ratio = best_fuzzy_candidate(stem, disk_stems_map.keys(), min_ratio=0.92)
    if cand and len(disk_stems_map[cand])==1:
        auto_matches[idx] = (str(disk_stems_map[cand][0]), "fuzzy_stem", float(ratio))
        continue
    manual_suggestions.setdefault(idx, [])

# Build df2 with resolved_path_auto
df2 = df.copy()
df2["resolved_path_auto"] = df2["file_basename"].apply(lambda b: str(disk_map[b]) if b in disk_map else None)

# Ensure match_method & match_info columns exist
if "match_method" not in df2.columns: df2["match_method"] = None
if "match_info"  not in df2.columns: df2["match_info"]  = None

# Fill in auto_matches
for idx, (rel, method, info) in auto_matches.items():
    df2.at[idx, "resolved_path_auto"] = rel
    df2.at[idx, "match_method"] = method
    df2.at[idx, "match_info"] = info

# Mark exact where resolved_path_auto exists and no method set
df2["match_method"] = df2["match_method"].fillna(df2["resolved_path_auto"].notna().map(lambda x: "exact" if x else None))

# Summaries
df_matched = df2[df2["resolved_path_auto"].notna()].copy().reset_index(drop=True)
df_unmatched = df2[df2["resolved_path_auto"].isna()].copy().reset_index(drop=True)

print("Auto matches found:", len(auto_matches))
print("Manual suggestion cases:", len([k for k,v in manual_suggestions.items() if v]))
print("Remaining fully unmatched:", len(df_unmatched))
print("Matched by method counts:\n", df_matched["match_method"].value_counts(dropna=False))

# Safely display only existing columns
cols = [c for c in ["file","file_basename","resolved_path_auto","match_method","match_info"] if c in df_matched.columns]
print("\nExamples of auto matches:")
display(df_matched[cols].head(12))

print("\nExamples of manual suggestion cases (first 12):")
man_items = [(idx, manual_suggestions[idx]) for idx in list(manual_suggestions.keys())[:12] if manual_suggestions[idx]]
for idx, cand_list in man_items:
    print("CSV row idx", idx, "file", df.loc[idx,"file"], " -> candidates:", cand_list[:6])

print(f"\nOriginal CSV rows: {len(df)}; matched now: {len(df_matched)} ({len(df_matched)/len(df):.1%})")

# leave df2/df_matched/df_unmatched in namespace for review


IMG_ROOT: fitzpatrick17k\images exists: True
Already matched (baseline): 2331 Unmatched: 0
Auto matches found: 0
Manual suggestion cases: 0
Remaining fully unmatched: 0
Matched by method counts:
 match_method
exact    2331
Name: count, dtype: int64

Examples of auto matches:


Unnamed: 0,file,file_basename,resolved_path_auto,match_method,match_info
0,100.jpg,100.jpg,100.jpg,exact,
1,1000.jpg,1000.jpg,1000.jpg,exact,
2,10004.jpg,10004.jpg,10004.jpg,exact,
3,10005.jpg,10005.jpg,10005.jpg,exact,
4,10006.jpg,10006.jpg,10006.jpg,exact,
5,10008.jpg,10008.jpg,10008.jpg,exact,
6,1001.jpg,1001.jpg,1001.jpg,exact,
7,10010.jpg,10010.jpg,10010.jpg,exact,
8,10013.jpg,10013.jpg,10013.jpg,exact,
9,10015.jpg,10015.jpg,10015.jpg,exact,



Examples of manual suggestion cases (first 12):

Original CSV rows: 2331; matched now: 2331 (100.0%)


In [21]:
df = df2[df2["resolved_path_auto"].notna()].copy().reset_index(drop=True)
df["resolved_path"] = df["resolved_path_auto"]


In [22]:
# Accept auto-matched rows and finalize resolved_path and label3
# (Be sure df2 exists in namespace from the previous step)
assert "df2" in globals(), "df2 not found — re-run the matching step"

# keep only matched rows
df = df2[df2["resolved_path_auto"].notna()].copy().reset_index(drop=True)
df["resolved_path"] = df["resolved_path_auto"]  # set final path
# If you already created label3 earlier, keep it; otherwise create from phototype
if "label3" not in df.columns:
    import re
    def parse_phototype(s):
        if pd.isna(s): return None
        s = str(s).upper()
        m = re.search(r'\b(I{1,3}|IV|V|VI|[1-6])\b', s)
        if not m: return None
        token = m.group(1)
        rm = {"I":1,"II":2,"III":3,"IV":4,"V":5,"VI":6}
        if token in rm: return rm[token]
        try: return int(token)
        except: return None
    df["phototype_num"] = df["phototype"].apply(parse_phototype)
    def to_label3(x):
        if pd.isna(x): return None
        x=int(x)
        if x<=2: return 0
        if x<=4: return 1
        return 2
    df["label3"] = df["phototype_num"].apply(lambda x: to_label3(x) if pd.notna(x) else None)

# final sanity
print("Final df shape:", len(df))
print("Counts per 3-class label (0=light,1=medium,2=dark):")
print(df["label3"].value_counts().sort_index())
# show sample rows
display(df[["file","resolved_path","phototype","label3"]].head(10))


Final df shape: 2331
Counts per 3-class label (0=light,1=medium,2=dark):
label3
0     501
1     827
2    1003
Name: count, dtype: int64


Unnamed: 0,file,resolved_path,phototype,label3
0,100.jpg,100.jpg,III,1
1,1000.jpg,1000.jpg,IV,1
2,10004.jpg,10004.jpg,V,2
3,10005.jpg,10005.jpg,V,2
4,10006.jpg,10006.jpg,VI,2
5,10008.jpg,10008.jpg,III,1
6,1001.jpg,1001.jpg,V,2
7,10010.jpg,10010.jpg,V,2
8,10013.jpg,10013.jpg,III,1
9,10015.jpg,10015.jpg,III,1


In [23]:
# Stratified split 80/20 (recompute from current df)
from collections import defaultdict, Counter
import numpy as np
SEED = 42
np.random.seed(SEED)

indices_by_class = defaultdict(list)
for idx, lbl in enumerate(df["label3"].astype(int).values):
    indices_by_class[int(lbl)].append(idx)

train_idx, val_idx = [], []
frac_train = 0.8
for lbl, idxs in indices_by_class.items():
    idxs = idxs.copy()
    np.random.shuffle(idxs)
    split = int(len(idxs) * frac_train)
    train_idx += idxs[:split]
    val_idx += idxs[split:]

train_idx = sorted(set(train_idx))
val_idx   = sorted(set(val_idx))
print("train len:", len(train_idx), "val len:", len(val_idx))
print("train class counts:", Counter(df.loc[train_idx,"label3"]))
print("val class counts:", Counter(df.loc[val_idx,"label3"]))


train len: 1863 val len: 468
train class counts: Counter({2: 802, 1: 661, 0: 400})
val class counts: Counter({2: 201, 1: 166, 0: 101})


In [28]:
# Fix & diagnostic: make sure we have a Dataset, then time a few batches with num_workers=0
import time
import torch
from torch.utils.data import DataLoader
from pathlib import Path
from PIL import Image
import torchvision.transforms as T
import numpy as np

# adapt these if your names differ
IMG_ROOT = Path("fitzpatrick17k") / "images"
IMG_SIZE = 224
BATCH = 8

# 1) If train_ds isn't defined, create a minimal Dataset wrapper for train_df
if 'train_ds' not in globals():
    print("train_ds not found — creating Melanin3Dataset from train_df (requires train_idx/train_df exist).")
    # sanity checks
    assert 'train_idx' in globals() and 'df' in globals(), "train_idx or df missing — run the split cells first."
    train_df = df.loc[train_idx].reset_index(drop=True)
    # small, safe transform for timing (no heavy color jitter)
    tf = T.Compose([T.Resize((IMG_SIZE, IMG_SIZE)), T.ToTensor()])
    class QuickDataset(torch.utils.data.Dataset):
        def __init__(self, df_subset, img_root, path_col='resolved_path', label_col='label3', tf=None):
            self.df = df_subset.reset_index(drop=True)
            self.img_root = Path(img_root)
            self.path_col = path_col
            self.label_col = label_col
            self.tf = tf
        def __len__(self): return len(self.df)
        def __getitem__(self, idx):
            row = self.df.iloc[idx]
            p = self.img_root / row[self.path_col]
            img = Image.open(p).convert("RGB")
            if self.tf: img = self.tf(img)
            lbl = int(row[self.label_col])
            return img, torch.tensor(lbl, dtype=torch.long)
    train_ds = QuickDataset(train_df, IMG_ROOT, tf=tf)
    print("Created train_ds length:", len(train_ds))
else:
    print("Using existing train_ds (len={})".format(len(train_ds)))

# 2) Create a single-process test loader and time a few batches
test_loader = DataLoader(train_ds, batch_size=BATCH, shuffle=False, num_workers=0, pin_memory=torch.cuda.is_available())
print("\nTiming a few batches with num_workers=0:")
t0 = time.time()
for i, (imgs, labs) in enumerate(test_loader):
    t1 = time.time()
    print(f" batch {i:2d}: imgs {tuple(imgs.shape)}, labels min/max {labs.min().item()}/{labs.max().item()}, time {(t1-t0):.3f}s")
    t0 = time.time()
    if i >= 5:
        break

# 3) If this is fast (<~0.5–1s per small batch) use num_workers=0 for interactive training:
print("\nIf these batch times are small, use num_workers=0 for notebook training to avoid long worker spawn delays.")
print("To rebuild your main train_loader quickly, run the cell below (it sets num_workers=0):")

# quick snippet to rebuild your main loaders safely
print("""
# Rebuild example (paste and run):
from torch.utils.data import DataLoader, WeightedRandomSampler
train_loader = DataLoader(train_ds, batch_size=64, sampler=sampler, num_workers=0, pin_memory=True if torch.cuda.is_available() else False)
val_loader   = DataLoader(val_ds,   batch_size=64, shuffle=False, num_workers=0, pin_memory=True if torch.cuda.is_available() else False)
""")


train_ds not found — creating Melanin3Dataset from train_df (requires train_idx/train_df exist).
Created train_ds length: 1863

Timing a few batches with num_workers=0:
 batch  0: imgs (8, 3, 224, 224), labels min/max 1/2, time 1.859s
 batch  1: imgs (8, 3, 224, 224), labels min/max 0/2, time 0.016s
 batch  2: imgs (8, 3, 224, 224), labels min/max 0/2, time 0.016s
 batch  3: imgs (8, 3, 224, 224), labels min/max 1/2, time 0.010s
 batch  4: imgs (8, 3, 224, 224), labels min/max 0/2, time 0.009s
 batch  5: imgs (8, 3, 224, 224), labels min/max 0/2, time 0.004s

If these batch times are small, use num_workers=0 for notebook training to avoid long worker spawn delays.
To rebuild your main train_loader quickly, run the cell below (it sets num_workers=0):

# Rebuild example (paste and run):
from torch.utils.data import DataLoader, WeightedRandomSampler
train_loader = DataLoader(train_ds, batch_size=64, sampler=sampler, num_workers=0, pin_memory=True if torch.cuda.is_available() else False)
v

In [40]:
# Verbose training loop (drop-in). Paste & run this cell.
import time, math, torch, numpy as np
import torch.nn as nn
from collections import Counter

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Safety checks
assert 'train_loader' in globals() and 'val_loader' in globals(), "train_loader / val_loader missing."
assert 'model' in globals(), "model missing."

# Ensure model on device and in train mode
model = model.to(device)
for p in model.parameters(): 
    p.requires_grad = True
model.train()

# Optimizer + scheduler (recreate to be sure we have fresh state)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-2)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
# Use weighted loss if you computed class_weights earlier
if 'class_weights' in globals():
    loss_weight = torch.tensor([class_weights.get(i,1.0) for i in range(3)], dtype=torch.float).to(device)
    criterion = nn.CrossEntropyLoss(weight=loss_weight)
else:
    criterion = nn.CrossEntropyLoss()

EPOCHS = 6
LOG_EVERY = 10   # print every N batches
best_val_acc = 0.0
save_path = "best_resnet18_3class_verbose.pth"

for epoch in range(1, EPOCHS+1):
    epoch_start = time.time()
    model.train()
    running_loss = 0.0
    running_correct = 0
    running_total = 0
    batch_times = []
    print(f"\n=== Epoch {epoch}/{EPOCHS} ===")
    for b_idx, (imgs, labels) in enumerate(train_loader, start=1):
        t0 = time.time()
        imgs = imgs.to(device); labels = labels.to(device)
        optimizer.zero_grad()
        out = model(imgs)
        loss = criterion(out, labels)
        loss.backward()
        # gradient norm (useful)
        total_grad_norm = 0.0
        for p in model.parameters():
            if p.grad is not None:
                total_grad_norm += float(p.grad.detach().norm().item()**2)
        total_grad_norm = math.sqrt(total_grad_norm) if total_grad_norm>0 else 0.0

        optimizer.step()

        # stats
        preds = out.argmax(1)
        batch_correct = int((preds == labels).sum().item())
        running_correct += batch_correct
        running_total += labels.size(0)
        running_loss += float(loss.item()) * labels.size(0)

        bt = time.time()-t0
        batch_times.append(bt)

        if b_idx % LOG_EVERY == 0 or b_idx == 1:
            avg_loss = running_loss / running_total
            avg_acc = running_correct / running_total
            lr = optimizer.param_groups[0]['lr']
            print(f" batch {b_idx:4d} | batch_time {bt:.3f}s | avg_loss {avg_loss:.4f} | avg_acc {avg_acc:.4f} | lr {lr:.6g} | grad_norm {total_grad_norm:.4f}")

    epoch_time = time.time() - epoch_start
    train_loss = running_loss / running_total
    train_acc  = running_correct / running_total
    print(f"Epoch {epoch} train_loss {train_loss:.4f} train_acc {train_acc:.4f} time {epoch_time:.1f}s mean_batch {np.mean(batch_times):.3f}s")

    # Validation
    model.eval()
    val_loss = 0.0; val_total=0; val_correct=0
    y_true=[]; y_pred=[]
    with torch.no_grad():
        for imgs, labels in val_loader:
            imgs = imgs.to(device); labels = labels.to(device)
            out = model(imgs)
            loss = criterion(out, labels)
            preds = out.argmax(1)
            val_loss += float(loss.item()) * imgs.size(0)
            val_total += imgs.size(0)
            val_correct += int((preds==labels).sum().item())
            y_true.extend(labels.cpu().numpy().tolist()); y_pred.extend(preds.cpu().numpy().tolist())

    val_loss = val_loss / max(1, val_total)
    val_acc  = val_correct / max(1, val_total)
    print(f"Epoch {epoch} VALID  val_loss {val_loss:.4f} val_acc {val_acc:.4f}")

    # confusion matrix quick
    num_c = 3
    cm = np.zeros((num_c, num_c), dtype=int)
    for t,p in zip(y_true, y_pred):
        cm[int(t), int(p)] += 1
    print(" Confusion matrix:\n", cm)
    per_class_recall = np.diag(cm) / np.clip(cm.sum(axis=1), 1, None)
    print(" Per-class recall:", per_class_recall)

    # checkpoint
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), save_path)
        print(" Saved new best model ->", save_path)

    scheduler.step()

print("\nTraining finished. Best val acc:", best_val_acc)


Using device: cuda

=== Epoch 1/6 ===
 batch    1 | batch_time 0.790s | avg_loss 1.3569 | avg_acc 0.2656 | lr 0.0003 | grad_norm 9.6298
 batch   10 | batch_time 0.594s | avg_loss 1.0436 | avg_acc 0.4328 | lr 0.0003 | grad_norm 5.7499
 batch   20 | batch_time 0.599s | avg_loss 0.9592 | avg_acc 0.4789 | lr 0.0003 | grad_norm 4.3118
 batch   30 | batch_time 0.107s | avg_loss 0.8959 | avg_acc 0.5228 | lr 0.0003 | grad_norm 6.3320
Epoch 1 train_loss 0.8959 train_acc 0.5228 time 18.8s mean_batch 0.556s
Epoch 1 VALID  val_loss 1.6539 val_acc 0.2735
 Confusion matrix:
 [[ 75   0  26]
 [110   0  56]
 [148   0  53]]
 Per-class recall: [0.74257426 0.         0.26368159]
 Saved new best model -> best_resnet18_3class_verbose.pth

=== Epoch 2/6 ===
 batch    1 | batch_time 0.489s | avg_loss 0.6051 | avg_acc 0.7344 | lr 0.0003 | grad_norm 6.0874
 batch   10 | batch_time 0.582s | avg_loss 0.6293 | avg_acc 0.7172 | lr 0.0003 | grad_norm 5.0936
 batch   20 | batch_time 0.575s | avg_loss 0.5647 | avg_acc

In [41]:
# Cell: confusion and per-class metrics (if y_true, y_pred exist)
import numpy as np
from collections import Counter

assert 'y_true' in globals() and 'y_pred' in globals(), "Run validation to produce y_true,y_pred first."

num_c = 3
cm = np.zeros((num_c, num_c), dtype=int)
for t,p in zip(y_true, y_pred):
    cm[int(t), int(p)] += 1

print("Confusion matrix:\n", cm)
recall = np.diag(cm) / np.clip(cm.sum(axis=1), 1, None)
precision = np.diag(cm) / np.clip(cm.sum(axis=0), 1, None)
f1 = 2 * precision * recall / np.clip(precision + recall, 1e-8, None)
print("Per-class counts:", cm.sum(axis=1).tolist())
print("Recall:", recall.tolist())
print("Precision:", precision.tolist())
print("F1:", f1.tolist())


Confusion matrix:
 [[  2   0  99]
 [  0   0 166]
 [  1   0 200]]
Per-class counts: [101, 166, 201]
Recall: [0.019801980198019802, 0.0, 0.9950248756218906]
Precision: [0.6666666666666666, 0.0, 0.43010752688172044]
F1: [0.038461538461538464, 0.0, 0.6006006006006006]


In [42]:
# Cell: compute simple color features and train a small linear classifier
import torch, torchvision.transforms as T
from PIL import Image
import numpy as np, math
from torch import nn
from torch.utils.data import Dataset, DataLoader

# Dataset that returns small handcrafted features
class ColorFeatDataset(Dataset):
    def __init__(self, df_subset, img_root, path_col='resolved_path', label_col='label3', img_size=128):
        self.df = df_subset.reset_index(drop=True)
        self.root = img_root
        self.path_col = path_col
        self.label_col = label_col
        self.tf = T.Compose([T.Resize((img_size,img_size))])
    def __len__(self): return len(self.df)
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        p = self.root / row[self.path_col]
        img = Image.open(p).convert("RGB")
        img = self.tf(img)
        arr = np.array(img).astype(np.float32)/255.0  # H,W,3
        # mean RGB
        mean_rgb = arr.mean(axis=(0,1))  # 3
        # HSV saturation mean
        from colorsys import rgb_to_hsv
        h_avg = []; s_avg = []
        # compute mean saturation by sampling a grid (faster than full per-pixel loop)
        small = arr.reshape(-1,3)[::max(1, arr.size//5000)]
        sats = [rgb_to_hsv(*tuple(px))[1] for px in small]
        sat_mean = float(np.mean(sats))
        # mean Y from YCbCr (brightness)
        img_ycbcr = Image.fromarray((arr*255).astype(np.uint8)).convert("YCbCr")
        Y, Cb, Cr = np.array(img_ycbcr).astype(np.float32).mean(axis=(0,1))
        feats = np.concatenate([mean_rgb, [sat_mean, Y/255.0]])
        label = int(row[self.label_col])
        return torch.tensor(feats, dtype=torch.float32), torch.tensor(label, dtype=torch.long)

# build datasets using current train_df/val_df from your notebook
assert 'train_df' in globals() and 'val_df' in globals()
feat_train = ColorFeatDataset(train_df, IMG_ROOT)
feat_val   = ColorFeatDataset(val_df,   IMG_ROOT)
train_loader_f = DataLoader(feat_train, batch_size=64, shuffle=True, num_workers=0)
val_loader_f   = DataLoader(feat_val,   batch_size=256, shuffle=False, num_workers=0)

# a tiny linear classifier (input dim = 5)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_f = nn.Sequential(nn.Linear(5, 32), nn.ReLU(), nn.Linear(32, 3)).to(device)
opt = torch.optim.AdamW(model_f.parameters(), lr=1e-3)
crit = nn.CrossEntropyLoss()

# train a few epochs
for epoch in range(10):
    model_f.train()
    tot_loss=0; tot=0; corr=0
    for X, y in train_loader_f:
        X=X.to(device); y=y.to(device)
        opt.zero_grad()
        out = model_f(X)
        loss = crit(out,y)
        loss.backward(); opt.step()
        tot_loss += loss.item()*X.size(0)
        corr += (out.argmax(1)==y).sum().item(); tot+=X.size(0)
    train_acc = corr/tot
    # val
    model_f.eval()
    corr=0; tot=0
    with torch.no_grad():
        for X,y in val_loader_f:
            X=X.to(device); y=y.to(device)
            out = model_f(X)
            corr += (out.argmax(1)==y).sum().item(); tot+=X.size(0)
    val_acc = corr/tot
    print(f"Epoch {epoch+1} linear feat model -> train_acc {train_acc:.3f} val_acc {val_acc:.3f}")


Epoch 1 linear feat model -> train_acc 0.389 val_acc 0.440
Epoch 2 linear feat model -> train_acc 0.425 val_acc 0.434
Epoch 3 linear feat model -> train_acc 0.428 val_acc 0.429
Epoch 4 linear feat model -> train_acc 0.430 val_acc 0.429
Epoch 5 linear feat model -> train_acc 0.430 val_acc 0.429
Epoch 6 linear feat model -> train_acc 0.430 val_acc 0.429
Epoch 7 linear feat model -> train_acc 0.430 val_acc 0.429
Epoch 8 linear feat model -> train_acc 0.429 val_acc 0.429
Epoch 9 linear feat model -> train_acc 0.430 val_acc 0.429
Epoch 10 linear feat model -> train_acc 0.430 val_acc 0.429


In [43]:
# Cell: compute simple HSV skin mask and per-image masked mean Y, then test linear classifier
from PIL import Image
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

def simple_skin_mask_rgb(arr_rgb):
    # arr_rgb in 0..1
    import colorsys
    hws = arr_rgb.reshape(-1,3)
    mask = []
    for r,g,b in hws:
        h,s,v = colorsys.rgb_to_hsv(r,g,b)
        # heuristic thresholds (tunable); keep moderately saturated warm colors
        if s > 0.1 and v > 0.15 and 0.0 <= h <= 0.7:
            mask.append(1)
        else:
            mask.append(0)
    m = np.array(mask, dtype=np.uint8).reshape(arr_rgb.shape[0], arr_rgb.shape[1])
    return m

class MaskedFeatDataset(Dataset):
    def __init__(self, df_subset, img_root, img_size=128):
        self.df = df_subset.reset_index(drop=True)
        self.root = img_root
        self.img_size = img_size
    def __len__(self): return len(self.df)
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        p = self.root / row['resolved_path']
        img = Image.open(p).convert('RGB').resize((self.img_size, self.img_size))
        arr = np.array(img).astype(np.float32)/255.0
        mask = simple_skin_mask_rgb(arr)
        if mask.sum() == 0:
            # fallback: whole image
            mask = np.ones((self.img_size, self.img_size), dtype=np.uint8)
        # compute masked mean RGB and mean brightness (Y)
        masked = arr * mask[:,:,None]
        mean_rgb = masked.sum(axis=(0,1)) / (mask.sum()+1e-6)
        img_ycbcr = Image.fromarray((arr*255).astype(np.uint8)).convert("YCbCr")
        Y = np.array(img_ycbcr)[:,:,0]
        mean_Y_masked = (Y * mask).sum() / (mask.sum()+1e-6) / 255.0
        feats = np.concatenate([mean_rgb, [mean_Y_masked]])
        label = int(row['label3'])
        return torch.tensor(feats, dtype=torch.float32), torch.tensor(label, dtype=torch.long)

# build and test the same linear classifier routine as before but using MaskedFeatDataset
masked_train = MaskedFeatDataset(train_df, IMG_ROOT)
masked_val   = MaskedFeatDataset(val_df, IMG_ROOT)
train_loader_m = DataLoader(masked_train, batch_size=64, shuffle=True, num_workers=0)
val_loader_m   = DataLoader(masked_val,   batch_size=256, shuffle=False, num_workers=0)

# tiny model
model_m = nn.Sequential(nn.Linear(4, 32), nn.ReLU(), nn.Linear(32,3)).to(device)
opt_m = torch.optim.AdamW(model_m.parameters(), lr=1e-3)
crit = nn.CrossEntropyLoss()
for epoch in range(8):
    model_m.train()
    tot=0; corr=0
    for X,y in train_loader_m:
        X=X.to(device); y=y.to(device)
        opt_m.zero_grad()
        out = model_m(X)
        loss = crit(out,y)
        loss.backward(); opt_m.step()
        corr += (out.argmax(1)==y).sum().item(); tot+=X.size(0)
    model_m.eval()
    corr_v=0; tot_v=0
    with torch.no_grad():
        for X,y in val_loader_m:
            X=X.to(device); y=y.to(device)
            corr_v += (model_m(X).argmax(1)==y).sum().item(); tot_v+=X.size(0)
    print("Epoch", epoch+1, "train_acc", corr/tot, "val_acc", corr_v/tot_v)


Epoch 1 train_acc 0.43048845947396674 val_acc 0.42948717948717946
Epoch 2 train_acc 0.43048845947396674 val_acc 0.42948717948717946
Epoch 3 train_acc 0.43048845947396674 val_acc 0.42948717948717946
Epoch 4 train_acc 0.43048845947396674 val_acc 0.42948717948717946
Epoch 5 train_acc 0.43048845947396674 val_acc 0.42948717948717946
Epoch 6 train_acc 0.43048845947396674 val_acc 0.42948717948717946
Epoch 7 train_acc 0.43048845947396674 val_acc 0.42948717948717946
Epoch 8 train_acc 0.43048845947396674 val_acc 0.42948717948717946
