# t-SNE visualization for CLIP features

This notebook:
- Extracts CLIP image features and labels using the repo's `lpclip/feat_extractor.py`.
- Runs t-SNE on a chosen split and saves a scatter plot (`tsne_<split>.png`).

Adjust `repo_root`, `data_root`, `dataset_cfg`, `trainer_cfg`, and `split` as needed.


In [6]:
import os
import sys
import subprocess
from pathlib import Path

# --- User settings ---
# The two machine-specific roots can be overridden without editing this cell by
# exporting MFCLIP_REPO_ROOT / MFCLIP_DATA_ROOT before starting the kernel; the
# literals below are only the fallback defaults.
repo_root = Path(os.environ.get("MFCLIP_REPO_ROOT", "/Users/hamzaiqbal/grad/comp_vision/project_mf-clip/MFCLIP_acv"))
data_root = Path(os.environ.get("MFCLIP_DATA_ROOT", "/content/data"))  # change if running locally (e.g., Path("/path/to/data"))
trainer_cfg = repo_root / "configs/trainers/CoOp/rn50.yaml"
dataset_cfg = repo_root / "configs/datasets/oxford_pets.yaml"
output_dir = repo_root / "clip_feat"
split = "test"  # "train", "val", or "test"
# ---------------------

# Optional installs (only if missing).
# Only a genuine import failure should trigger an install; any other exception
# raised while importing is a real problem and must surface instead of being
# papered over by a pip run.
try:
    import numpy as np
    from sklearn.manifold import TSNE
    import seaborn as sns
    import matplotlib.pyplot as plt
except ImportError:
    # Installs into the interpreter that runs this kernel (sys.executable).
    subprocess.run([sys.executable, "-m", "pip", "install", "numpy", "scikit-learn", "seaborn", "matplotlib"], check=True)
    import numpy as np
    from sklearn.manifold import TSNE
    import seaborn as sns
    import matplotlib.pyplot as plt

print("Repo root:", repo_root)
print("Data root:", data_root)
print("Trainer cfg:", trainer_cfg)
print("Dataset cfg:", dataset_cfg)
print("Split:", split)

# Fail fast on configuration mistakes before any expensive work starts.
assert split in ("train", "val", "test"), f"unknown split: {split!r}"
assert repo_root.exists(), f"repo_root not found: {repo_root}"
assert trainer_cfg.exists(), f"trainer cfg not found: {trainer_cfg}"
assert dataset_cfg.exists(), f"dataset cfg not found: {dataset_cfg}"

# Only a warning: whether the extraction script can populate a missing data
# directory is not visible from this notebook.
if not data_root.exists():
    print(f"WARNING: data_root does not exist: {data_root}")

output_dir.mkdir(parents=True, exist_ok=True)



Repo root: /Users/hamzaiqbal/grad/comp_vision/project_mf-clip/MFCLIP_acv
Data root: /content/data
Trainer cfg: /Users/hamzaiqbal/grad/comp_vision/project_mf-clip/MFCLIP_acv/configs/trainers/CoOp/rn50.yaml
Dataset cfg: /Users/hamzaiqbal/grad/comp_vision/project_mf-clip/MFCLIP_acv/configs/datasets/oxford_pets.yaml
Split: test


In [7]:
import shlex

# Run feature extraction for the chosen split using the repo script.
# The script is launched with the same interpreter as this kernel and with the
# repo root as working directory.
cmd = [
    sys.executable,
    str(repo_root / "lpclip/feat_extractor.py"),
    "--root", str(data_root),
    "--output-dir", str(output_dir),
    "--config-file", str(trainer_cfg),
    "--dataset-config-file", str(dataset_cfg),
    "--split", split,
]
# Build a single string: passing the command as a second print() argument would
# insert print's separator space right after the newline.
print("Running:\n" + " ".join(shlex.quote(p) for p in cmd))
res = subprocess.run(cmd, cwd=str(repo_root), capture_output=True, text=True)
print(res.stdout)
if res.returncode != 0:
    # Send the child's stderr to our stderr so the message below is accurate
    # and the notebook renders it as error output.
    print(res.stderr, file=sys.stderr)
    raise RuntimeError("Feature extraction failed; see stderr above.")

# Expected output file path
# NOTE(review): the "OxfordPets" directory name is hard-coded and presumably has
# to match the name the extractor derives from dataset_cfg — confirm when
# switching to another dataset config.
npz_path = output_dir / "OxfordPets" / f"{split}.npz"
print("Feature file:", npz_path)
assert npz_path.exists(), f"Did not find features at {npz_path}"



Running:
 /usr/local/bin/python3 /Users/hamzaiqbal/grad/comp_vision/project_mf-clip/MFCLIP_acv/lpclip/feat_extractor.py --root /content/data --output-dir /Users/hamzaiqbal/grad/comp_vision/project_mf-clip/MFCLIP_acv/clip_feat --config-file /Users/hamzaiqbal/grad/comp_vision/project_mf-clip/MFCLIP_acv/configs/trainers/CoOp/rn50.yaml --dataset-config-file /Users/hamzaiqbal/grad/comp_vision/project_mf-clip/MFCLIP_acv/configs/datasets/oxford_pets.yaml --split test

Traceback (most recent call last):
  File "/Users/hamzaiqbal/grad/comp_vision/project_mf-clip/MFCLIP_acv/lpclip/feat_extractor.py", line 3, in <module>
    import torch
ModuleNotFoundError: No module named 'torch'



RuntimeError: Feature extraction failed; see stderr above.

In [8]:
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# Load features and labels.
# np.load on an .npz returns a lazy archive object that keeps the file open;
# read both arrays inside a `with` block so the handle is closed right away.
# (.astype returns new arrays, so X and y stay valid after the archive closes.)
with np.load(npz_path) as npz:
    X = npz["feature_list"].astype("float32")
    y = npz["label_list"].astype("int32")
print("Features:", X.shape, "Labels:", y.shape)

if len(X) != len(y):
    raise ValueError(f"Feature/label count mismatch: {len(X)} features vs {len(y)} labels")
if len(X) < 2:
    raise ValueError(f"t-SNE needs at least 2 samples, got {len(X)}")

# Optional subsample for readability (fixed seed keeps the figure reproducible)
max_points = 4000
if len(X) > max_points:
    rng = np.random.default_rng(0)
    idx = rng.choice(len(X), size=max_points, replace=False)
    X, y = X[idx], y[idx]
    print(f"Subsampled to {len(X)} points for plotting")

# t-SNE
# Heuristic: roughly 1% of the sample count, kept within [5, 30]. scikit-learn
# requires perplexity < n_samples, so additionally cap it at n_samples - 1;
# this only changes the value for inputs with fewer than 6 points.
perplexity = min(30, max(5, len(X) // 100), len(X) - 1)
print("Using perplexity:", perplexity)
tsne = TSNE(n_components=2, init="pca", learning_rate="auto", perplexity=perplexity, random_state=0)
Z = tsne.fit_transform(X)

# Plot
unique_labels = np.unique(y)
n_classes = len(unique_labels)
# "tab20" only has 20 distinct colours and seaborn cycles when asked for more,
# which would give several classes the same colour. Fall back to evenly spaced
# "husl" hues when there are more than 20 classes.
palette_name = "tab20" if n_classes <= 20 else "husl"
palette = sns.color_palette(palette_name, n_colors=n_classes)

fig, ax = plt.subplots(figsize=(8, 6))
for color, cls in zip(palette, unique_labels):
    m = (y == cls)  # boolean mask selecting the points of this class
    ax.scatter(Z[m, 0], Z[m, 1], s=5, color=color, label=str(cls), alpha=0.7)
ax.set_title(f"t-SNE of CLIP features ({split} split)")
ax.axis("off")
fig.tight_layout()

# Save and show (save first so the file is written regardless of the backend)
out_png = repo_root / f"tsne_{split}.png"
fig.savefig(out_png, dpi=200)
print("Saved:", out_png)
plt.show()



NameError: name 'npz_path' is not defined

In [None]:
# (Optional) Extract additional splits
# Set split = "train" or "val" in the first cell and re-run extraction + plot.
# Or do it programmatically here:

# Everything except the split name is identical for each run, so build the
# shared part of the command once.
base_cmd = [
    sys.executable,
    str(repo_root / "lpclip/feat_extractor.py"),
    "--root", str(data_root),
    "--output-dir", str(output_dir),
    "--config-file", str(trainer_cfg),
    "--dataset-config-file", str(dataset_cfg),
]

for sp in ["train", "val"]:
    cmd2 = base_cmd + ["--split", sp]
    print("\nExtracting:", sp)
    res2 = subprocess.run(cmd2, cwd=str(repo_root), capture_output=True, text=True)
    # Show the script's own output, as the single-split extraction cell does;
    # it is captured and would otherwise be silently dropped.
    print(res2.stdout)
    if res2.returncode != 0:
        print(res2.stderr, file=sys.stderr)
        raise RuntimeError(f"Extraction failed for split {sp}")
print("Done.")

