# 03 — Evaluation & Fairness Analysis
Evaluate EfficientNetV2-S, ResNet50, and AutoML baseline across all 6 Fitzpatrick types.
Compute per-class metrics and fairness gap.

In [None]:
import os, subprocess, sys

# Clone repo (skip if already cloned)
if not os.path.exists("/content/NST_Class"):
    subprocess.run(["git", "clone", "https://github.com/AayushBaniya2006/NST_Class.git"], cwd="/content")
os.chdir("/content/NST_Class")

!pip install -q -r requirements.txt

sys.path.insert(0, '/content/NST_Class')

import torch
import pandas as pd
import numpy as np
from pathlib import Path
from torch.utils.data import DataLoader

from src.data.dataset import FitzpatrickDataset
from src.data.transforms import get_eval_transforms
from src.models.classifier import SkinToneClassifier
from src.evaluation.metrics import compute_all_metrics
from src.evaluation.fairness import compute_fairness_gap, compare_model_fairness
from src.evaluation.confusion import plot_confusion_matrix, plot_fairness_comparison

# Reached only if the clone, install and imports above all succeeded; the
# message reminds the reader that notebooks 01 and 02 are prerequisites.
print("Setup complete! Make sure you ran notebooks 01 and 02 first.")

In [None]:
# Configuration
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Relative locations of the cleaned split CSVs and the image files.
DATA_DIR = "data/cleaned"
IMAGE_DIR = "data/images"

# Fitzpatrick types as string labels "1".."6", and their Roman-numeral display
# names in the same order.
CLASS_NAMES = [str(fitz_type) for fitz_type in range(1, 7)]
DISPLAY_NAMES = [f"Fitz {numeral}" for numeral in ("I", "II", "III", "IV", "V", "VI")]

In [None]:
# Load test data
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")
print(f"Test set: {len(test_df)} images")
# Image count per Fitzpatrick label, ordered by label.
print(test_df["fitzpatrick"].value_counts().sort_index())

# NOTE(review): 224 is presumably the input image size — confirm in src.data.transforms.
transform = get_eval_transforms(224)
test_dataset = FitzpatrickDataset(test_df, IMAGE_DIR, transform=transform)
# shuffle=False keeps the sample order fixed, so predictions from different
# models line up row-for-row with the same labels.
# Workers are capped at the visible CPU count: asking for more than the
# machine has makes DataLoader emit a warning and gains nothing.
test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=min(4, os.cpu_count() or 1),
)

In [None]:
# Helper: run inference
@torch.no_grad()
def get_predictions(model, loader, device):
    """Run ``model`` over every batch of ``loader`` and collect the results.

    The model is switched to eval mode and left in that mode. Gradients are
    disabled for the whole call by the ``torch.no_grad`` decorator.

    Args:
        model: Callable module mapping an image batch to per-class scores
            (one row per sample, one column per class). It must already
            live on ``device``; this function does not move it.
        loader: Iterable yielding ``(images, labels)`` tensor batches.
        device: Device the image batches are moved to before the forward pass.

    Returns:
        A tuple ``(labels, preds, probs)`` of NumPy arrays in loader order:
        the true labels, the argmax class index per sample, and the softmax
        probabilities per sample. For an empty loader all three are empty
        arrays.
    """
    model.eval()
    label_batches = []
    pred_batches = []
    prob_batches = []

    for images, labels in loader:
        outputs = model(images.to(device))
        probs = torch.softmax(outputs, dim=1)

        pred_batches.append(outputs.argmax(dim=1).cpu().numpy())
        # .cpu() is a no-op for CPU labels and keeps this working when a
        # loader hands back labels that already sit on an accelerator.
        label_batches.append(labels.cpu().numpy())
        prob_batches.append(probs.cpu().numpy())

    # np.concatenate rejects an empty list, so keep the empty-loader result
    # explicit.
    if not label_batches:
        return np.array([]), np.array([]), np.array([])

    return (
        np.concatenate(label_batches),
        np.concatenate(pred_batches),
        np.concatenate(prob_batches),
    )

In [None]:
# Evaluate EfficientNetV2-S
# Build the 6-class classifier, then load its weights from the saved checkpoint.
# NOTE(review): pretrained=False presumably skips fetching pretrained weights,
# since the checkpoint supplies them — confirm in src.models.classifier.
model_eff = SkinToneClassifier("efficientnet_v2_s", num_classes=6, pretrained=False)
# map_location=DEVICE remaps the stored tensors onto the current device, so the
# checkpoint also loads on a CPU-only runtime.
model_eff.load_state_dict(torch.load("checkpoints/efficientnet_v2_s_final.pt", map_location=DEVICE, weights_only=True))
model_eff = model_eff.to(DEVICE)

# y_true is also read by the ResNet50 evaluation cell further down.
y_true, y_pred_eff, y_proba_eff = get_predictions(model_eff, test_loader, DEVICE)
metrics_eff = compute_all_metrics(y_true, y_pred_eff, y_proba_eff, CLASS_NAMES)

# Headline metrics, then one precision/recall/F1 line per Fitzpatrick type.
print("EfficientNetV2-S Results:")
print(f"  Accuracy: {metrics_eff['accuracy']:.4f}")
print(f"  Macro F1: {metrics_eff['macro_f1']:.4f}")
print(f"  ROC-AUC:  {metrics_eff['roc_auc']:.4f}")
# per_class is keyed by the string labels in CLASS_NAMES; DISPLAY_NAMES gives
# the matching human-readable name.
for cls, display in zip(CLASS_NAMES, DISPLAY_NAMES):
    m = metrics_eff["per_class"][cls]
    print(f"  {display}: P={m['precision']:.3f} R={m['recall']:.3f} F1={m['f1']:.3f}")

In [None]:
# Confusion matrix: EfficientNetV2
# Create the output directory for saved figures; a no-op if it already exists.
Path("results").mkdir(exist_ok=True)
# Plot the confusion matrix from the EfficientNetV2-S metrics with the
# Roman-numeral class names, saving the figure under results/.
plot_confusion_matrix(
    metrics_eff["confusion_matrix"],
    DISPLAY_NAMES,
    title="EfficientNetV2-S Confusion Matrix",
    save_path="results/cm_efficientnet.png",
)

In [None]:
# Evaluate ResNet50
# Build the 6-class classifier, then load its weights from the saved checkpoint.
model_res = SkinToneClassifier("resnet50", num_classes=6, pretrained=False)
# map_location=DEVICE remaps the stored tensors onto the current device, so the
# checkpoint also loads on a CPU-only runtime.
model_res.load_state_dict(torch.load("checkpoints/resnet50_final.pt", map_location=DEVICE, weights_only=True))
model_res = model_res.to(DEVICE)

# Keep the labels returned by this pass instead of discarding them: the loader
# is unshuffled, so they equal the labels from any other pass, and the cell no
# longer depends on another cell having defined y_true.
y_true, y_pred_res, y_proba_res = get_predictions(model_res, test_loader, DEVICE)
metrics_res = compute_all_metrics(y_true, y_pred_res, y_proba_res, CLASS_NAMES)

# Same report layout as the EfficientNetV2-S cell, including ROC-AUC, so the
# two models can be compared line by line.
print("\nResNet50 Results:")
print(f"  Accuracy: {metrics_res['accuracy']:.4f}")
print(f"  Macro F1: {metrics_res['macro_f1']:.4f}")
print(f"  ROC-AUC:  {metrics_res['roc_auc']:.4f}")
for cls, display in zip(CLASS_NAMES, DISPLAY_NAMES):
    m = metrics_res["per_class"][cls]
    print(f"  {display}: P={m['precision']:.3f} R={m['recall']:.3f} F1={m['f1']:.3f}")

In [None]:
# Confusion matrix: ResNet50
# Ensure the output directory exists so this cell can be run on its own;
# a no-op when the directory is already there.
Path("results").mkdir(exist_ok=True)
# Plot the confusion matrix from the ResNet50 metrics with the Roman-numeral
# class names, saving the figure under results/.
plot_confusion_matrix(
    metrics_res["confusion_matrix"],
    DISPLAY_NAMES,
    title="ResNet50 Confusion Matrix",
    save_path="results/cm_resnet50.png",
)

In [None]:
# AutoML results placeholder
# One entry per Fitzpatrick label, each with its own recall/precision/F1 slot.
# All values are zero until the real AutoML evaluation output from notebook 04
# is entered here.
automl_per_class = {
    "1": dict(recall=0.0, precision=0.0, f1=0.0),
    "2": dict(recall=0.0, precision=0.0, f1=0.0),
    "3": dict(recall=0.0, precision=0.0, f1=0.0),
    "4": dict(recall=0.0, precision=0.0, f1=0.0),
    "5": dict(recall=0.0, precision=0.0, f1=0.0),
    "6": dict(recall=0.0, precision=0.0, f1=0.0),
}
# TODO: Fill in from AutoML evaluation output

In [None]:
# Fairness gap analysis
# Section banner for the printed report.
print("\n" + "="*60)
print("FAIRNESS ANALYSIS")
print("="*60)

# Reduce each model's per-class metrics to one fairness-gap record. The record
# is read below through the keys gap, best_class, best_value, worst_class,
# worst_value and significant.
# NOTE(review): which per-class metric the gap is measured on, and the rule
# behind "significant", live in src.evaluation.fairness — confirm there.
fairness_eff = compute_fairness_gap(metrics_eff["per_class"])
fairness_res = compute_fairness_gap(metrics_res["per_class"])

# EfficientNetV2-S: the gap plus the best and worst class, shown as percentages.
print(f"\nEfficientNetV2-S Fairness Gap: {fairness_eff['gap']:.2%}")
print(f"  Best:  {fairness_eff['best_class']} ({fairness_eff['best_value']:.2%})")
print(f"  Worst: {fairness_eff['worst_class']} ({fairness_eff['worst_value']:.2%})")
print(f"  Significant: {fairness_eff['significant']}")

# ResNet50: same layout as above.
print(f"\nResNet50 Fairness Gap: {fairness_res['gap']:.2%}")
print(f"  Best:  {fairness_res['best_class']} ({fairness_res['best_value']:.2%})")
print(f"  Worst: {fairness_res['worst_class']} ({fairness_res['worst_value']:.2%})")
print(f"  Significant: {fairness_res['significant']}")

In [None]:
# Cross-model fairness comparison
# Map each model's display name to its per-class metrics dict.
model_metrics = {
    "EfficientNetV2-S": metrics_eff["per_class"],
    "ResNet50": metrics_res["per_class"],
    # "AutoML": automl_per_class,  # Uncomment when AutoML results are ready
}

fairness_results = compare_model_fairness(model_metrics)

# Ensure the output directory exists so this cell can be run on its own;
# a no-op when the directory is already there.
Path("results").mkdir(exist_ok=True)
plot_fairness_comparison(
    fairness_results,
    save_path="results/fairness_comparison.png",
)

In [None]:
# Summary table
# Banner: blank line, rule, title, rule — one print call, one item per line.
print("\n" + "="*60, "MODEL COMPARISON SUMMARY", "="*60, sep="\n")
# Column header followed by a dashed separator.
print(f"{'Model':<20} {'Accuracy':<10} {'Macro F1':<10} {'Fairness Gap':<15} {'Significant?'}")
print("-"*65)
# One row per model: walk the names, metric dicts and fairness records in lockstep.
for name, m, fg in zip(
    ("EfficientNetV2-S", "ResNet50"),
    (metrics_eff, metrics_res),
    (fairness_eff, fairness_res),
):
    print(f"{name:<20} {m['accuracy']:<10.4f} {m['macro_f1']:<10.4f} {fg['gap']:<15.2%} {fg['significant']}")