# Artificial Vision & Feature Separability — 01 · Colors: Baselines, Prototype & Exemplar Models

**Goal.** Build color-category classifiers and compare **logistic regression (baseline)**, **prototype (centroid)**, and **exemplar** models.  
**Outputs.** Decision plots, confusion matrices, and a short takeaway summary.  
**Data.** Use either a CSV of color samples with labels (recommended) or generate a synthetic color set.

In [None]:
# --- Reproducibility & Environment ---
import os, random
import numpy as np

# Single seed shared by every random number generator used in this notebook,
# so that re-running the cells reproduces the same numbers.
SEED = 42
for seed_fn in (random.seed, np.random.seed):
    seed_fn(SEED)

# Make sure the working folders exist before any cell reads or writes files.
for folder in ("results", "data"):
    os.makedirs(folder, exist_ok=True)

print("Seed set to", SEED)

In [None]:
# --- Imports ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.neighbors import KNeighborsClassifier

## 1. Data
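You can **(A)** place a CSV at `data/colors.csv` with columns `R,G,B,label` (channel values 0–255), **or (B)** generate a toy dataset below. A CSV in another color space (e.g. `L,a,b,label`) also works, but the feature-column names in Section 2 must then be changed to match, because the code below selects the columns `R`, `G`, `B`.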

**Expected CSV schema (example):**
```
R,G,B,label
255,0,0,red
0,255,0,green
0,0,255,blue
...
```

In [None]:
# Option A: Load your CSV (uncomment when you have a file)
# csv_path = Path("data/colors.csv")
# assert csv_path.exists(), "Place your color dataset at data/colors.csv"
# df = pd.read_csv(csv_path)

# Option B: Generate toy RGB color dataset (3 classes around canonical colors)
def make_toy_colors(n_per_class=200, noise=30, seed=SEED):
    """Sample a synthetic RGB data set clustered around three canonical colors.

    For each of "red", "green" and "blue", ``n_per_class`` points are drawn
    from a normal distribution with standard deviation ``noise`` centred on
    that color, then clipped to the [0, 255] range.

    Returns a DataFrame with float32 columns ``R``, ``G``, ``B`` and a
    ``label`` column; rows are grouped by class in the order listed above.
    """
    prototypes = {
        "red": (220, 40, 40),
        "green": (40, 220, 40),
        "blue": (40, 40, 220),
    }
    generator = np.random.default_rng(seed)

    # One draw per class, in dict order, so the random stream is consumed in
    # a fixed sequence for a given seed.
    clouds = [
        generator.normal(np.array(rgb), noise, size=(n_per_class, 3)).clip(0, 255)
        for rgb in prototypes.values()
    ]
    names = [name for name in prototypes for _ in range(n_per_class)]

    points = np.vstack(clouds).astype(np.float32)
    columns = {
        "R": points[:, 0],
        "G": points[:, 1],
        "B": points[:, 2],
        "label": np.array(names),
    }
    return pd.DataFrame(columns)

# Build the toy dataset with the default settings (Option B).
df = make_toy_colors()
# Preview the first rows; as the last expression of the cell this is what gets displayed.
df.head()

## 2. Train / Test Split & Scaling
We split the data 70/30 (stratified by label), then standardize the features using the mean and variance estimated on the training split only.

In [None]:
# Features are the three color channels; targets are the category names.
X = df[["R", "G", "B"]].values
y = df["label"].values

# Hold out 30% of the samples for testing, keeping class proportions equal
# in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=SEED,
    stratify=y,
)

# Estimate mean/variance on the training split only, then apply the same
# transformation to both splits.
scaler = StandardScaler().fit(X_train)
Xz_train = scaler.transform(X_train)
Xz_test = scaler.transform(X_test)

# Fixed, sorted class order reused by every confusion matrix and plot below.
labels = sorted(np.unique(y))
labels

## 3. Baseline: Multinomial Logistic Regression

In [None]:
# `multi_class` is intentionally not passed: the argument is deprecated since
# scikit-learn 1.5 and scheduled for removal. With the default lbfgs solver a
# multinomial (softmax) model is already what gets fitted when there are more
# than two classes, so the trained model is the same.
logreg = LogisticRegression(max_iter=500, random_state=SEED)
logreg.fit(Xz_train, y_train)
pred_lr = logreg.predict(Xz_test)

acc_lr = accuracy_score(y_test, pred_lr)
# Pass `labels` so rows/columns follow the same class order as the plot ticks.
cm_lr = confusion_matrix(y_test, pred_lr, labels=labels)
print(f"Logistic Regression — Test Accuracy: {acc_lr:.3f}")
print(classification_report(y_test, pred_lr))

In [None]:
# Heat-map of the logistic-regression confusion matrix
# (y axis: true class, x axis: predicted class).
fig, ax = plt.subplots()
heat = ax.imshow(cm_lr, aspect="auto")
ax.set_title("Confusion Matrix — Logistic Regression")
ax.set_xlabel("Pred")
ax.set_ylabel("True")

# One tick per class, in the same order used to build the matrix.
tick_pos = range(len(labels))
ax.set_xticks(tick_pos)
ax.set_xticklabels(labels, rotation=45)
ax.set_yticks(tick_pos)
ax.set_yticklabels(labels)

fig.colorbar(heat, ax=ax)
fig.tight_layout()
fig.savefig("results/01_confusion_logreg.png", dpi=150)
plt.show()

## 4. Prototype Classifier (Centroid in Feature Space)
Compute per-class centroids in **standardized** space and classify by nearest centroid (Euclidean).

In [None]:
# One prototype per class: the mean of that class's standardized training points.
centroids = {
    lab: Xz_train[y_train == lab].mean(axis=0)
    for lab in labels
}


def proto_predict(Xz):
    """Label each row of ``Xz`` with the class whose centroid is closest.

    Distances are Euclidean and measured against the module-level
    ``centroids`` dict; on an exact tie the class that appears first in
    that dict wins. Returns a NumPy array with one label per input row.
    """
    def nearest_label(point):
        return min(centroids, key=lambda lab: np.linalg.norm(point - centroids[lab]))

    return np.array([nearest_label(point) for point in Xz])

# Score the nearest-centroid classifier on the held-out split.
pred_proto = proto_predict(Xz_test)
cm_proto = confusion_matrix(y_test, pred_proto, labels=labels)
acc_proto = accuracy_score(y_test, pred_proto)
print(f"Prototype — Test Accuracy: {acc_proto:.3f}")
print(classification_report(y_test, pred_proto))

# Heat-map of the confusion matrix (y axis: true class, x axis: predicted class).
fig, ax = plt.subplots()
heat = ax.imshow(cm_proto, aspect="auto")
ax.set_title("Confusion Matrix — Prototype")
ax.set_xlabel("Pred")
ax.set_ylabel("True")

tick_pos = range(len(labels))
ax.set_xticks(tick_pos)
ax.set_xticklabels(labels, rotation=45)
ax.set_yticks(tick_pos)
ax.set_yticklabels(labels)

fig.colorbar(heat, ax=ax)
fig.tight_layout()
fig.savefig("results/01_confusion_prototype.png", dpi=150)
plt.show()

## 5. Exemplar Models
Two common options:
- **k-NN (uniform weights)**: class vote among nearest neighbors.  
- **GCM-style similarity**: similarity decays with distance; class score is sum of similarities of exemplars.

In [None]:
# 5a) k-NN exemplar model: every stored training point is an exemplar and the
# five nearest ones vote on the class.
knn = KNeighborsClassifier(n_neighbors=5).fit(Xz_train, y_train)
pred_knn = knn.predict(Xz_test)

cm_knn = confusion_matrix(y_test, pred_knn, labels=labels)
acc_knn = accuracy_score(y_test, pred_knn)
print(f"k-NN (k=5) — Test Accuracy: {acc_knn:.3f}")

# Heat-map of the confusion matrix (y axis: true class, x axis: predicted class).
fig, ax = plt.subplots()
heat = ax.imshow(cm_knn, aspect="auto")
ax.set_title("Confusion — kNN (k=5)")
ax.set_xlabel("Pred")
ax.set_ylabel("True")

tick_pos = range(len(labels))
ax.set_xticks(tick_pos)
ax.set_xticklabels(labels, rotation=45)
ax.set_yticks(tick_pos)
ax.set_yticklabels(labels)

fig.colorbar(heat, ax=ax)
fig.tight_layout()
fig.savefig("results/01_confusion_knn.png", dpi=150)
plt.show()

In [None]:
# 5b) GCM-style exemplar: similarity = exp(-c * ||x - exemplar||)
def gcm_predict(Xz, Xz_train, y_train, labels, c=2.0):
    """Classify each row of ``Xz`` by summed similarity to stored exemplars.

    Parameters
    ----------
    Xz : array-like, shape (n_samples, n_features)
        Points to classify.
    Xz_train : array-like, shape (n_train, n_features)
        Stored exemplars.
    y_train : array-like, shape (n_train,)
        Class label of each exemplar.
    labels : sequence
        Candidate classes; on an exact score tie the earliest one wins.
    c : float, default 2.0
        Sensitivity: larger values make similarity fall off faster with
        Euclidean distance.

    Returns
    -------
    numpy.ndarray
        One predicted label per row of ``Xz``.

    Raises
    ------
    KeyError
        If an exemplar carries a label that is not listed in ``labels``.
    """
    Xz = np.asarray(Xz)
    Xz_train = np.asarray(Xz_train)
    y_train = np.asarray(y_train)
    labels = list(labels)

    unknown = set(y_train.tolist()) - set(labels)
    if unknown:
        raise KeyError(f"exemplar labels missing from `labels`: {sorted(unknown)}")

    # Membership masks do not depend on the query point, so build them once.
    class_masks = [y_train == lab for lab in labels]

    preds = []
    for row in Xz:
        dist = np.linalg.norm(Xz_train - row, axis=1)  # Euclidean; could swap in Mahalanobis
        if dist.size:
            # Shifting by the smallest distance rescales every similarity by
            # the same positive factor, so the winning class is unchanged, but
            # the nearest exemplar always gets similarity 1. Without this, a
            # far-away query (or a large c) underflows all similarities to 0
            # and the first label would win by default.
            dist = dist - dist.min()
        sim = np.exp(-c * dist)  # exponential decay
        scores = [sim[mask].sum() for mask in class_masks]
        preds.append(labels[int(np.argmax(scores))])
    return np.array(preds)

# Score the GCM exemplar model on the held-out split.
pred_gcm = gcm_predict(Xz_test, Xz_train, y_train, labels, c=2.0)
cm_gcm = confusion_matrix(y_test, pred_gcm, labels=labels)
acc_gcm = accuracy_score(y_test, pred_gcm)
print(f"GCM (c=2.0) — Test Accuracy: {acc_gcm:.3f}")

# Heat-map of the confusion matrix (y axis: true class, x axis: predicted class).
fig, ax = plt.subplots()
heat = ax.imshow(cm_gcm, aspect="auto")
ax.set_title("Confusion — GCM (c=2.0)")
ax.set_xlabel("Pred")
ax.set_ylabel("True")

tick_pos = range(len(labels))
ax.set_xticks(tick_pos)
ax.set_xticklabels(labels, rotation=45)
ax.set_yticks(tick_pos)
ax.set_yticklabels(labels)

fig.colorbar(heat, ax=ax)
fig.tight_layout()
fig.savefig("results/01_confusion_gcm.png", dpi=150)
plt.show()

## 6. Simple 2D Decision Illustration (optional)
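Project to 2D with PCA for visualization and plot decision regions for **logistic** and **k-NN** classifiers that are refit on the 2D projection (so the regions illustrate, but are not identical to, the 3D models above).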

In [None]:
from sklearn.decomposition import PCA
# Reduce the standardized features to two principal components for plotting.
p2 = PCA(n_components=2, random_state=SEED)
Z_train_2d = p2.fit_transform(Xz_train)
Z_test_2d  = p2.transform(Xz_test)

# Fit on 2D to draw boundaries.
# `multi_class` is intentionally not passed: it is deprecated since
# scikit-learn 1.5 and scheduled for removal; the default lbfgs solver already
# fits a multinomial model for more than two classes.
log2d = LogisticRegression(max_iter=500, random_state=SEED).fit(Z_train_2d, y_train)
knn2d = KNeighborsClassifier(n_neighbors=5).fit(Z_train_2d, y_train)

# Regular 200x200 grid covering the projected training points plus a margin of 1.
xmin, ymin = Z_train_2d.min(axis=0) - 1
xmax, ymax = Z_train_2d.max(axis=0) + 1
xx, yy = np.meshgrid(np.linspace(xmin, xmax, 200), np.linspace(ymin, ymax, 200))
grid = np.c_[xx.ravel(), yy.ravel()]

# Predicted class at every grid node, reshaped back to the grid layout.
# NOTE: `pred_knn` rebinds the test-set k-NN predictions from section 5a;
# re-run that cell before reusing them.
pred_log = log2d.predict(grid).reshape(xx.shape)
pred_knn = knn2d.predict(grid).reshape(xx.shape)

def label_to_int(arr, labels):
    """Map an array of class labels to integer codes.

    Each label is replaced by its position in ``labels``; the result is an
    integer array with the same shape as ``arr`` (empty input gives an empty
    integer array).

    Raises
    ------
    KeyError
        If ``arr`` contains a value that is not in ``labels``.
    """
    mapping = {lab: i for i, lab in enumerate(labels)}
    arr = np.asarray(arr)
    # Explicit lookup instead of np.vectorize(mapping.get): that form fails on
    # empty input and silently yields None for labels it does not know.
    codes = [mapping[lab] for lab in arr.ravel().tolist()]
    return np.array(codes, dtype=int).reshape(arr.shape)

# Integer class code of every training point, used to color the scatter.
point_codes = label_to_int(y_train, labels)

# (grid predictions, figure title, output file) for each classifier.
panels = [
    (pred_log, "Decision Boundary — Logistic (2D PCA)", "results/01_boundary_logistic_2d.png"),
    (pred_knn, "Decision Boundary — k-NN (2D PCA)", "results/01_boundary_knn_2d.png"),
]

for grid_pred, title, out_path in panels:
    plt.figure()
    # Shaded regions show the predicted class; training points are drawn on top.
    plt.contourf(xx, yy, label_to_int(grid_pred, labels), alpha=0.3)
    plt.scatter(
        Z_train_2d[:, 0],
        Z_train_2d[:, 1],
        c=point_codes,
        s=10,
        edgecolor='k',
        linewidth=0.2,
    )
    plt.title(title)
    plt.tight_layout()
    plt.savefig(out_path, dpi=150)
    plt.show()

## 7. Takeaways
- Logistic regression sets a strong linear baseline in standardized RGB space.
- Prototype classifiers are simple and surprisingly competitive when classes are compact.
- Exemplar approaches (k-NN, GCM) capture fine-grained category structure.
- These tools frame the move from color categories to higher-dimensional vision tasks.