# Phase 0-5: Clustering

UMAP-based visualization and manual cluster selection on precomputed CLIP features.

In [None]:
import os
import pickle
import shutil
from pathlib import Path

import numpy as np
import umap
import matplotlib.pyplot as plt
from matplotlib.path import Path as MplPath
from matplotlib.patches import Polygon
from PIL import Image

print(f"Imports OK. CWD: {Path().resolve()}")

In [None]:
# PROJECT_ROOT is tied to one machine's directory layout; edit it before
# running this notebook anywhere else.
PROJECT_ROOT = "/Users/tyreecruse/Desktop/CS230/Project/Data/Original"
# Alternative: work relative to the current directory instead.
# PROJECT_ROOT = os.getcwd()

# Single source of truth for input/output locations and UMAP hyperparameters.
CONFIG = dict(
    # Input
    dataset_path=os.path.join(PROJECT_ROOT, "master_dataset_pool"),
    features_path=os.path.join(PROJECT_ROOT, "features.pkl"),
    # Outputs
    results_dir=os.path.join(PROJECT_ROOT, "clustering_results"),
    clustered_dir=os.path.join(PROJECT_ROOT, "clustered_images"),
    # UMAP parameters
    n_neighbors=15,
    min_dist=0.1,
    n_components=2,
    random_state=42,
)

# Echo the path settings so a wrong PROJECT_ROOT is visible immediately.
print("Config", "------", sep="\n")
for key in ("dataset_path", "features_path", "results_dir", "clustered_dir"):
    print(f"{key:>13}: {CONFIG[key]}")

dataset_path = Path(CONFIG["dataset_path"])
images_dir = dataset_path / "images"

# Sanity check: report whether the dataset is present and how many entries
# its images/ directory contains (0 when that directory is absent).
if dataset_path.exists():
    n_imgs = sum(1 for _ in images_dir.glob("*")) if images_dir.exists() else 0
    print(f"\nDataset found at {dataset_path}")
    print(f"  images: {n_imgs:,}")
else:
    print(f"\n[warning] dataset not found at {dataset_path}")

In [None]:
def load_features(filepath):
    """Load features.pkl from Phase 0-4 and return (features, paths, meta).

    Args:
        filepath: Location of the pickle file. The pickled object is read
            with ``.get``/``.items``, i.e. it is expected to be a dict with a
            "features" entry and, optionally, a "paths" entry.

    Returns:
        A tuple ``(features, paths, meta)`` where ``features`` is a 2D NumPy
        array, ``paths`` is a list of strings (empty when the file stores no
        paths), and ``meta`` is a dict of every other key in the file.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        ValueError: If the feature matrix is not 2D, or if paths are present
            but their count differs from the number of feature rows.
    """
    filepath = Path(filepath)
    if not filepath.exists():
        raise FileNotFoundError(f"Features file not found: {filepath}")

    # NOTE(security): unpickling can execute arbitrary code. Only load feature
    # files that were produced by a trusted run of this pipeline.
    with open(filepath, "rb") as f:
        data = pickle.load(f)

    features = np.asarray(data.get("features"))

    # Treat a missing or explicitly-None "paths" entry as "no paths stored"
    # instead of failing while iterating over None.
    raw_paths = data.get("paths")
    paths = [] if raw_paths is None else [str(p) for p in raw_paths]

    meta = {k: v for k, v in data.items() if k not in ("features", "paths")}

    if features.ndim != 2:
        raise ValueError(f"Expected 2D feature matrix, got shape {features.shape}")

    # Labels are later used to index into paths row-by-row, so a mismatch here
    # would otherwise surface much later as an IndexError or wrong images.
    if paths and len(paths) != features.shape[0]:
        raise ValueError(
            f"Number of paths ({len(paths)}) does not match "
            f"number of feature rows ({features.shape[0]})"
        )

    print("\nLoaded features")
    print("----------------")
    print(f"  file      : {filepath}")
    print(f"  n_samples : {features.shape[0]:,}")
    print(f"  dim       : {features.shape[1]:,}")

    return features, paths, meta


def fix_image_paths(paths, dataset_path):
    """Re-root every stored path under ``<dataset_path>/images``.

    Only the final path component (the filename) of each entry is kept; the
    directory part recorded in the features file is discarded.

    Args:
        paths: Iterable of stored image paths.
        dataset_path: Root of the dataset on the current machine.

    Returns:
        List of path strings pointing into the dataset's images/ directory,
        in the same order as ``paths``.
    """
    target_dir = Path(dataset_path) / "images"
    return [str(target_dir / os.path.basename(str(entry))) for entry in paths]

In [None]:
def compute_umap_embedding(features, config):
    """Fit UMAP on ``features`` and return the embedding with its reducer.

    Args:
        features: Feature matrix handed to ``umap.UMAP.fit_transform``.
        config: Mapping that may provide "n_neighbors", "min_dist",
            "n_components" and "random_state"; absent keys fall back to
            15, 0.1, 2 and 42 respectively.

    Returns:
        Tuple ``(embedding, reducer)``: the transformed coordinates and the
        fitted ``umap.UMAP`` instance.
    """
    fallbacks = {
        "n_neighbors": 15,
        "min_dist": 0.1,
        "n_components": 2,
        "random_state": 42,
    }
    umap_kwargs = {name: config.get(name, default) for name, default in fallbacks.items()}

    reducer = umap.UMAP(**umap_kwargs)
    embedding = reducer.fit_transform(features)

    shape = embedding.shape
    print("\nUMAP embedding")
    print("--------------")
    print(f"  shape : {shape[0]:,} x {shape[1]}")

    return embedding, reducer


def plot_umap_embedding(embedding, title="UMAP embedding"):
    """Draw the first two embedding columns as a scatter plot.

    Args:
        embedding: Array whose first two columns are plotted; ``None`` or an
            empty sequence short-circuits without plotting.
        title: Text placed above the axes.

    Returns:
        ``(fig, ax)`` for the created figure, or ``(None, None)`` when there
        was nothing to plot.
    """
    nothing_to_draw = embedding is None or len(embedding) == 0
    if nothing_to_draw:
        print("No embedding to plot.")
        return None, None

    fig, ax = plt.subplots(figsize=(6, 4))

    xs, ys = embedding[:, 0], embedding[:, 1]
    ax.scatter(xs, ys, s=2, alpha=0.5)

    ax.set(xlabel="UMAP 1", ylabel="UMAP 2", title=title)
    ax.grid(True, alpha=0.3)

    fig.tight_layout()
    plt.show()

    return fig, ax

In [None]:
def manual_cluster_selection(embedding):
    """Label embedding points by polygons the user clicks on a scatter plot.

    The user is prompted on stdin before each polygon. Every accepted polygon
    (three or more clicked vertices) receives the next cluster id, and the
    points inside it are assigned that id; a later polygon overwrites the
    label of any point it shares with an earlier one. Points left outside
    every polygon are collected into one extra cluster at the end.

    Args:
        embedding: Array-like of 2D coordinates, one row per point.

    Returns:
        Tuple ``(labels, polygons)``: an integer label per point, and a dict
        mapping cluster id to its list of clicked vertices (``None`` for the
        catch-all cluster of leftover points).
    """
    points = np.asarray(embedding)

    # -1 marks "not yet assigned to any polygon".
    labels = np.full(points.shape[0], -1, dtype=int)
    polygons = {}

    # Scatter plot the user clicks on.
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.scatter(points[:, 0], points[:, 1], s=2, alpha=0.5)
    ax.set_xlabel("UMAP 1")
    ax.set_ylabel("UMAP 2")
    ax.set_title("Manual cluster selection\n(define polygons with mouse, Enter to finish)")
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show(block=False)

    affirmative = ("y", "yes")
    cluster_idx = 0
    while True:
        prompt = f"Define polygon for cluster {cluster_idx}? [y/n]: "
        if input(prompt).strip().lower() not in affirmative:
            break

        print("  Click polygon vertices in the figure; press Enter when finished.")
        vertices = plt.ginput(n=-1, timeout=0)

        if len(vertices) >= 3:
            polygons[cluster_idx] = vertices
            inside = MplPath(vertices).contains_points(points)
            labels[inside] = cluster_idx
            print(f"  Assigned {inside.sum()} points to cluster {cluster_idx}.")
            cluster_idx += 1
        else:
            # Too few vertices to form a polygon: ask again for the same id.
            print("  Need at least 3 vertices; skipping cluster.")

    # Whatever is still unlabelled becomes one final catch-all cluster.
    leftover = labels == -1
    if leftover.any():
        labels[leftover] = cluster_idx
        polygons[cluster_idx] = None
        print(f"  {np.sum(labels == cluster_idx):,} points assigned to 'unassigned' cluster {cluster_idx}.")

    return labels, polygons


def plot_clustered_embedding(embedding, labels, polygons=None, title="Clustered UMAP embedding"):
    """Plot UMAP embedding colored by cluster labels (ints).

    Args:
        embedding: Array-like of 2D coordinates, one row per point.
        labels: Integer cluster label per point, aligned with ``embedding``.
        polygons: Optional dict mapping cluster label to a sequence of polygon
            vertices; entries that are ``None`` or empty are not drawn.
        title: Text placed above the axes.

    Returns:
        ``(fig, ax)`` for the created figure.
    """
    embedding = np.asarray(embedding)
    labels = np.asarray(labels)

    unique = np.unique(labels)

    # plt.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap accepts
    # the same (name, lut) arguments. Request at least one colour so that an
    # empty label array does not ask for a zero-entry colormap.
    cmap = plt.get_cmap("tab10", max(len(unique), 1))

    # One colour per label actually present, so that scatter points and the
    # polygon outline of the same cluster always share a colour.
    color_by_label = {int(k): cmap(i) for i, k in enumerate(unique)}

    fig, ax = plt.subplots(figsize=(6, 4))
    for k in unique:
        mask = labels == k
        ax.scatter(
            embedding[mask, 0],
            embedding[mask, 1],
            s=4,
            alpha=0.6,
            label=f"cluster {int(k)}",
            color=color_by_label[int(k)],
        )

    # Optional polygon overlays
    if polygons:
        for k, verts in polygons.items():
            # Explicit emptiness test: `not verts` is ambiguous for arrays.
            if verts is None or len(verts) == 0:
                continue
            # A polygon that captured no points has no scatter colour; fall
            # back to indexing the colormap by its label.
            edge = color_by_label.get(int(k), cmap(int(k) % cmap.N))
            poly = Polygon(verts, fill=False, edgecolor=edge, linewidth=2, linestyle="--")
            ax.add_patch(poly)

    ax.set_xlabel("UMAP 1")
    ax.set_ylabel("UMAP 2")
    ax.set_title(title)
    if len(unique):
        # Skip the legend when nothing was plotted (avoids an empty-legend warning).
        ax.legend(loc="best", markerscale=4)
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    return fig, ax

In [None]:
def summarize_clusters(labels):
    """Print how many points carry each distinct cluster label.

    Args:
        labels: Array-like of cluster labels, one per point.
    """
    cluster_ids, sizes = np.unique(np.asarray(labels), return_counts=True)

    print("\nCluster summary")
    print("----------------")
    for cluster_id, size in zip(cluster_ids, sizes):
        print(f"  cluster {int(cluster_id):2d}: {int(size):,} points")


def display_cluster_samples(image_paths, labels, cluster_label, n_samples=9):
    """Show a small grid of images from a given cluster.

    Up to ``n_samples`` members of the cluster are drawn at random (without
    replacement) and laid out in a roughly square grid. Images that fail to
    load are shown as an empty cell titled "load error".

    Args:
        image_paths: Sequence of image paths, indexed like ``labels``.
        labels: Array-like of cluster labels, one per image.
        cluster_label: Label of the cluster to sample from.
        n_samples: Maximum number of images to show; must be positive for
            anything to be displayed.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    labels = np.asarray(labels)
    indices = np.where(labels == cluster_label)[0]

    if len(indices) == 0:
        print(f"No images found for cluster {cluster_label}.")
        return

    # A non-positive request would otherwise crash below (zero grid columns
    # or a negative sample size), so bail out explicitly.
    if n_samples <= 0:
        print("n_samples must be positive; nothing to display.")
        return

    n_samples = min(n_samples, len(indices))
    chosen = np.random.choice(indices, size=n_samples, replace=False)

    # Near-square grid: cols = ceil(sqrt(n)), rows = just enough to fit n.
    cols = int(np.ceil(np.sqrt(n_samples)))
    rows = int(np.ceil(n_samples / cols))

    fig, axes = plt.subplots(rows, cols, figsize=(3 * cols, 3 * rows))
    # plt.subplots returns a bare Axes for a 1x1 grid; normalise to a flat array.
    axes = np.array(axes).reshape(-1)

    for ax, idx in zip(axes, chosen):
        path = image_paths[idx]
        try:
            # The context manager releases the file handle as soon as the
            # pixel data has been converted.
            with Image.open(path) as raw:
                img = raw.convert("RGB")
        except Exception:
            # Best-effort display: one unreadable image must not abort the grid.
            ax.set_title("load error", fontsize=8)
            ax.axis("off")
            continue
        ax.imshow(img)
        ax.set_title(Path(path).name, fontsize=8)
        ax.axis("off")

    # Hide any unused axes
    for ax in axes[len(chosen):]:
        ax.axis("off")

    plt.tight_layout()
    plt.show()

In [None]:
def save_cluster_lists(image_paths, labels, output_dir):
    """Dump the members of each cluster to ``cluster_XX.txt``, one path per line.

    Args:
        image_paths: Sequence of image paths, indexed like ``labels``.
        labels: Array-like of cluster labels, one per image.
        output_dir: Directory that receives the text files; created
            (including parents) when it does not exist.
    """
    out_root = Path(output_dir)
    out_root.mkdir(parents=True, exist_ok=True)

    label_arr = np.asarray(labels)

    for cluster_id in np.unique(label_arr):
        members = np.nonzero(label_arr == cluster_id)[0]
        target = out_root / f"cluster_{int(cluster_id):02d}.txt"

        with open(target, "w") as handle:
            handle.writelines(f"{image_paths[i]!s}\n" for i in members)

        print(f"  wrote {len(members):4d} paths -> {target.name}")


def organize_clusters_into_folders(results_dir, dataset_path, output_root):
    """Copy images into per-cluster folders using saved cluster_*.txt files.

    For each ``cluster_*.txt`` in ``results_dir`` a folder of the same stem is
    created under ``output_root`` and every listed image is copied into it.
    Absolute paths in the list are used as-is; relative ones are resolved by
    filename against ``<dataset_path>/images``. Source files that do not
    exist and destinations that already exist are skipped, and both counts
    are reported next to the number of copied images.

    Args:
        results_dir: Directory containing the cluster_*.txt lists.
        dataset_path: Dataset root; its images/ subdirectory is the lookup
            location for relative paths.
        output_root: Directory under which the cluster folders are created.

    Returns:
        None.
    """
    results_dir = Path(results_dir)
    dataset_path = Path(dataset_path)
    output_root = Path(output_root)

    images_dir = dataset_path / "images"
    txt_files = sorted(results_dir.glob("cluster_*.txt"))

    if not txt_files:
        print("No cluster_*.txt files found; skipping folder organization.")
        return

    print("\nOrganizing images into cluster folders")
    print("--------------------------------------")
    for txt in txt_files:
        cluster_name = txt.stem  # e.g., 'cluster_00'
        cluster_dir = output_root / cluster_name
        cluster_dir.mkdir(parents=True, exist_ok=True)

        with open(txt, "r") as f:
            paths = [line.strip() for line in f if line.strip()]

        n_copied = 0
        n_missing = 0
        n_existing = 0
        for p in paths:
            src_name = Path(p).name
            src = Path(p) if os.path.isabs(p) else images_dir / src_name
            if not src.exists():
                # Count instead of silently dropping, so a wrong dataset path
                # shows up in the summary line.
                n_missing += 1
                continue

            dst = cluster_dir / src_name
            if dst.exists():
                # Never overwrite: a previous run, or another image with the
                # same filename, already occupies this destination.
                n_existing += 1
                continue

            shutil.copy2(src, dst)
            n_copied += 1

        # Mention skipped files only when there were any.
        notes = []
        if n_missing:
            notes.append(f"{n_missing:,} missing")
        if n_existing:
            notes.append(f"{n_existing:,} already present")
        suffix = f" ({', '.join(notes)})" if notes else ""

        print(f"  {cluster_name}: {n_copied:,} images{suffix}")

In [None]:
def main():
    """Run UMAP, manual cluster selection, and cluster organization.

    Reads all locations and UMAP parameters from the module-level ``CONFIG``.
    The steps are: load features, re-root the stored image paths, compute and
    plot the UMAP embedding, collect polygons interactively, show a sample
    grid for the first cluster, write the per-cluster path lists, and copy
    the images into per-cluster folders.

    Returns:
        On success, a dict with keys "features_path", "embedding", "labels",
        "polygons" and "meta". On failure, a dict holding only "error" (the
        exception message); the exception is not re-raised.
    """
    import traceback

    results = {}

    dataset_path = Path(CONFIG["dataset_path"])
    features_path = Path(CONFIG["features_path"])
    results_dir = Path(CONFIG["results_dir"])
    clustered_dir = Path(CONFIG["clustered_dir"])

    print("\n" + "=" * 60)
    print("CLUSTERING PIPELINE")
    print("=" * 60)

    try:
        # Load features
        features, paths, meta = load_features(features_path)

        # Map stored paths to current dataset
        fixed_paths = fix_image_paths(paths, dataset_path)

        # UMAP (the fitted reducer is not needed afterwards)
        embedding, _reducer = compute_umap_embedding(features, CONFIG)
        plot_umap_embedding(embedding, title="UMAP embedding of features")

        # Manual selection
        labels, polygons = manual_cluster_selection(embedding)
        summarize_clusters(labels)
        plot_clustered_embedding(embedding, labels, polygons)

        # Quick visual check for first cluster
        unique = np.unique(labels)
        if len(unique):
            display_cluster_samples(fixed_paths, labels, int(unique[0]), n_samples=9)

        # Save clustering outputs
        results_dir.mkdir(parents=True, exist_ok=True)
        save_cluster_lists(fixed_paths, labels, results_dir)
        organize_clusters_into_folders(results_dir, dataset_path, clustered_dir)

        results = {
            "features_path": str(features_path),
            "embedding": embedding,
            "labels": labels,
            "polygons": polygons,
            "meta": meta,
        }

        print("\n" + "=" * 60)
        print("Clustering complete.")
        print("=" * 60)
        print(f"Results dir   : {results_dir}")
        print(f"Clustered dir : {clustered_dir}")

    except Exception as exc:
        # Top-level boundary: keep the notebook cell alive, but print the full
        # traceback so the failing step can be located, not just its message.
        print(f"\nClustering failed: {exc}")
        traceback.print_exc()
        results["error"] = str(exc)

    return results


# Entry point: run the pipeline and store the dict returned by main() in the
# module-level name `clustering_results`.
if __name__ == "__main__":
    clustering_results = main()

In [None]:
# Closing checklist printed for the person running the notebook.
print("\n" + "=" * 60)
print("NEXT STEPS")
print("=" * 60)
print("\n1. Review the clustered UMAP plot to understand structure.")
print("2. Rerun manual_cluster_selection() if cluster boundaries need adjustment.")
print("3. Inspect organized cluster folders under:")
print(f"   {CONFIG['clustered_dir']}")
print("4. Optionally sample images from each cluster for qualitative checks.")
# Single-quoted literal so the inner double quotes no longer end the string
# early (the original line was a SyntaxError).
print('5. Use the most "normal" cluster as the base for YOLO fine-tuning.')