In [None]:
# 1) Install kagglehub if you haven't:
#    pip install kagglehub --upgrade
import os, shutil, zipfile, pandas as pd, kagglehub
from pathlib import Path


In [17]:
KAGGLE_SLUG = "vinitasilaparasetty/fitzpatrick-classification-by-ethnicity"
DEST        = Path("fitzpatrick17k")          # final clean layout
DEST.mkdir(exist_ok=True, parents=True)

# ── 2. Download (kagglehub auto-caches) ───────────────────────────────────
print("⏬  Downloading via kagglehub …")
dl_path = Path(
    kagglehub.dataset_download(KAGGLE_SLUG)   # returns cache path
)
print("KaggleHub cache folder:", dl_path)



⏬  Downloading via kagglehub …
KaggleHub cache folder: C:\Users\yrsee\.cache\kagglehub\datasets\vinitasilaparasetty\fitzpatrick-classification-by-ethnicity\versions\2


In [18]:
# ── 2. If a ZIP exists, unzip; else use the folder as-is ──────────────────
zip_candidates = list(dl_path.glob("*.zip"))
if zip_candidates:
    zip_file = zip_candidates[0]
    print("📦  Extracting", zip_file.name)
    with zipfile.ZipFile(zip_file) as zf:
        zf.extractall(DEST)
else:
    # The dataset is already extracted → copy contents to DEST
    for item in dl_path.iterdir():
        tgt = DEST / item.name
        if tgt.exists():
            continue
        print("📁  Copying", item.name, "→", tgt)
        if item.is_dir():
            shutil.copytree(item, tgt)
        else:
            shutil.copy2(item, tgt)


📁  Copying fairface → fitzpatrick17k\fairface
📁  Copying fitz_undersampled_test_final.csv → fitzpatrick17k\fitz_undersampled_test_final.csv
📁  Copying fitz_undersampled_train_final.csv → fitzpatrick17k\fitz_undersampled_train_final.csv


In [19]:
# ── 3. Standardize layout: move CSV + images/ ─────────────────────────────
# Find the main CSV
csv_path = next(DEST.rglob("*.csv"))
# Find the images folder (contains jpg / png)
img_dir  = next(p for p in DEST.rglob("*") if p.is_dir() and any(p.glob("*.jpg")))

csv_path.rename(DEST / "labels.csv")
if img_dir.resolve() != (DEST / "images").resolve():
    shutil.move(str(img_dir), DEST / "images")

print(f"✅  Dataset ready in: {DEST.resolve()}")
print("   ├─", (DEST / 'labels.csv').relative_to(DEST.parent))
print("   └─", (DEST / 'images').relative_to(DEST.parent))



✅  Dataset ready in: C:\Users\yrsee\everything\ACME-Outreach\skin-diagnostic-engine\fitzpatrick17k
   ├─ fitzpatrick17k\labels.csv
   └─ fitzpatrick17k\images


In [22]:
# ── 4. Quick sanity: view distribution ────────────────────────────────────
df = pd.read_csv(DEST / "labels.csv")
dist = df["phototype"].value_counts().sort_index()
print("\nFitzpatrick I–VI distribution:\n", dist)



Fitzpatrick I–VI distribution:
 phototype
I & II    903
III       903
IV        903
V         903
VI        903
Name: count, dtype: int64
