# 01 — Data Exploration & Cleaning
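This notebook loads the Fitzpatrick17k metadata, optionally downloads the images, cleans the dataset (label validation, image validation, human-only filtering, deduplication), analyzes the class distribution, and saves stratified train/val/test splits.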

In [None]:
# Uncomment to install dependencies in Colab (%pip installs into the running kernel's environment)
# %pip install -q imagehash wandb google-cloud-storage google-cloud-aiplatform

In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Add project root to path (for Colab: mount drive or clone repo first)
# sys.path.insert(0, '/content/skin_tone_classifier')

from src.data.prepare import (
    load_metadata,
    validate_fitzpatrick_labels,
    encode_labels,
    validate_images,
    filter_human_images,
    deduplicate_images,
    compute_class_distribution,
    generate_cleaning_report,
    stratified_split,
    download_images,
)

In [None]:
# Configuration
# Reproducibility: seed passed to the stratified split below.
RANDOM_SEED = 42

# Inputs (paths are relative to the notebook's working directory).
CSV_PATH = "data/fitzpatrick17k.csv"
IMAGE_DIR = "data/images"

# Destination directory for the cleaned CSV and the train/val/test CSVs.
OUTPUT_DIR = "data/cleaned"

In [None]:
# Load metadata
# `df` is the working frame that the later cleaning cells filter step by step.
df = load_metadata(CSV_PATH)

# Quick size/schema sanity check before any cleaning.
column_names = list(df.columns)
print(f"Total rows: {len(df)}")
print(f"Columns: {column_names}")

# Last expression of the cell: shown as a preview of the first rows.
df.head()

In [None]:
# Explore raw Fitzpatrick distribution
print("\nRaw Fitzpatrick distribution:")

# Count rows per raw label once; the same series feeds the printout and the chart.
raw_counts = df["fitzpatrick"].value_counts().sort_index()
print(raw_counts)

fig, ax = plt.subplots(figsize=(8, 5))
raw_counts.plot(kind="bar", ax=ax, color="steelblue")
ax.set(
    title="Raw Fitzpatrick Skin Type Distribution",
    xlabel="Fitzpatrick Type",
    ylabel="Count",
)
plt.tight_layout()
plt.show()

In [None]:
# Download images (uncomment and run if you need to download from URLs)
# downloaded = download_images(df, IMAGE_DIR)
# print(f"Downloaded {downloaded} images")

In [None]:
# Step 1: Validate Fitzpatrick labels
# `original_count` is the pre-cleaning row count that the final summary cell
# reports against. NOTE(review): re-running this cell without reloading `df`
# would overwrite it with an already-filtered count.
original_count = len(df)
df = validate_fitzpatrick_labels(df)
kept = len(df)
print(f"After label validation: {kept} ({original_count - kept} dropped)")

In [None]:
# Step 2: Validate images
# Snapshot the row count so the number of rows removed by this step can be reported.
before = len(df)
df = validate_images(IMAGE_DIR, df)
kept = len(df)
print(f"After image validation: {kept} ({before - kept} dropped)")

In [None]:
# Step 3: Filter to human-only images
# Snapshot the row count so the number of rows removed by this step can be reported.
before = len(df)
df = filter_human_images(IMAGE_DIR, df)
kept = len(df)
print(f"After human filter: {kept} ({before - kept} dropped)")

In [None]:
# Step 4: Deduplicate
# Snapshot the row count so the number of rows removed by this step can be reported.
before = len(df)
df = deduplicate_images(IMAGE_DIR, df)
kept = len(df)
print(f"After deduplication: {kept} ({before - kept} dropped)")

In [None]:
# Step 5: Encode labels (6-class: Fitzpatrick 1-6 → labels 0-5)
df = encode_labels(df)
print("\nFitzpatrick label distribution:")
print(df["skin_tone_label"].value_counts().sort_index())

# Per-class counts of the cleaned data, ordered by Fitzpatrick type.
cleaned_counts = df["fitzpatrick"].value_counts().sort_index()

# Build tick labels from the classes that are actually plotted. A fixed list of
# six labels breaks when cleaning removes an entire class: the bar plot then has
# fewer ticks than labels and matplotlib raises a ValueError (or the remaining
# bars would be mislabeled). Unknown values fall back to their string form.
fitz_names = {1: "Fitz I", 2: "Fitz II", 3: "Fitz III", 4: "Fitz IV", 5: "Fitz V", 6: "Fitz VI"}
tick_labels = [fitz_names.get(value, str(value)) for value in cleaned_counts.index]

fig, ax = plt.subplots(figsize=(10, 5))
cleaned_counts.plot(kind="bar", ax=ax, color="steelblue")
ax.set_title("Cleaned: 6-Class Fitzpatrick Distribution")
ax.set_xlabel("Fitzpatrick Type")
ax.set_ylabel("Count")
ax.set_xticklabels(tick_labels, rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Class distribution report
# `dist` is read below as a mapping: class value -> {"count": ..., "percentage": ...}.
dist = compute_class_distribution(df, "fitzpatrick")
print("\nClass Distribution Report:")
fitz_names = {1: "Fitz I", 2: "Fitz II", 3: "Fitz III", 4: "Fitz IV", 5: "Fitz V", 6: "Fitz VI"}
for cls in sorted(dist.keys()):
    info = dist[cls]
    print(f"  {fitz_names.get(cls, cls)}: {info['count']} images ({info['percentage']:.1f}%)")

# Imbalance ratio = largest class count / smallest class count.
class_counts = [info["count"] for info in dist.values()]
if not class_counts:
    # Same exception type as the bare max()/min() call, but with an actionable message.
    raise ValueError("No classes left after cleaning; cannot compute the imbalance ratio.")
smallest_class = min(class_counts)
# A zero-count class would otherwise divide by zero; report an infinite ratio instead.
# Kept as a float so the `:.2f` formatting here and in the summary cell still works.
imbalance_ratio = max(class_counts) / smallest_class if smallest_class > 0 else float("inf")
print(f"\nImbalance ratio: {imbalance_ratio:.2f}x")

In [None]:
# Stratified split
# Presumably the train/val/test fractions, in the same order as the unpacking
# below — confirm against stratified_split.
split_ratios = (0.7, 0.15, 0.15)
train_df, val_df, test_df = stratified_split(df, "skin_tone_label", split_ratios, seed=RANDOM_SEED)

print(f"\nSplit sizes: train={len(train_df)}, val={len(val_df)}, test={len(test_df)}")

# Show the per-class counts of each split so the stratification can be checked by eye.
splits_by_name = {"Train": train_df, "Val": val_df, "Test": test_df}
for split_name, split_df in splits_by_name.items():
    print(f"\n{split_name} distribution:")
    print(split_df["fitzpatrick"].value_counts().sort_index())

In [None]:
# Save cleaned data
# Create the output directory (and any missing parents) if it does not exist yet.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# Write the full cleaned frame and each split as CSV, without the index column.
outputs = (
    ("fitzpatrick17k_cleaned.csv", df),
    ("train.csv", train_df),
    ("val.csv", val_df),
    ("test.csv", test_df),
)
for filename, frame in outputs:
    frame.to_csv(f"{OUTPUT_DIR}/{filename}", index=False)

print(f"\nSaved cleaned data to {OUTPUT_DIR}/")

In [None]:
# Summary
# Recap of the cleaning run, using values computed by the earlier cells:
# `original_count`, `df`, `imbalance_ratio` and the three split frames.
rule = "=" * 60
summary_lines = [
    "\n" + rule,
    "DATA CLEANING SUMMARY",
    rule,
    f"Original images:     {original_count}",
    f"After cleaning:      {len(df)}",
    f"Total dropped:       {original_count - len(df)}",
    f"Imbalance ratio:     {imbalance_ratio:.2f}x",
    f"Train/Val/Test:      {len(train_df)}/{len(val_df)}/{len(test_df)}",
    rule,
]
print("\n".join(summary_lines))