# Artificial Vision & Feature Separability — 03 · Exemplar Models & GCM

**Goal.** Implement and analyze **exemplar-based categorization**, focusing on the **Generalized Context Model (GCM)** and comparisons to **prototype** and **k-NN** across color datasets.
**Outputs.** Log-loss/accuracy tables, confusion matrices, decision boundaries (2D PCA), and a sensitivity sweep over **similarity decay** and **distance metrics**.

In [None]:
# --- Reproducibility & Environment ---
import os, random
import numpy as np

# One seed drives both the stdlib and NumPy global generators so reruns match.
SEED = 42
for seed_fn in (random.seed, np.random.seed):
    seed_fn(SEED)

# Folders read from / written to by later cells; safe to call repeatedly.
for folder in ("results", "data"):
    os.makedirs(folder, exist_ok=True)

print("Seed set to", SEED)

In [None]:
# --- Imports ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, log_loss, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA

## 1. Data
Load `data/colors.csv` (RGB or Lab + label) or generate a balanced toy set.

In [None]:
# CSV loader (preferred); a synthetic toy set is generated when the file is absent.
csv_path = Path("data/colors.csv")
if csv_path.exists():
    df = pd.read_csv(csv_path)
    # Accept RGB or Lab channel names case-insensitively. A set is used because
    # "b" is shared by RGB (blue) and Lab (b*). The first three matching
    # columns, in file order, become the features.
    channel_names = {"r", "g", "b", "l", "a"}
    feat_cols = [c for c in df.columns if str(c).lower() in channel_names][:3]
    # Explicit raises (not assert) so validation survives `python -O`.
    if len(feat_cols) != 3:
        raise ValueError(
            f"Expect 3 feature columns (RGB or Lab); found {feat_cols} in {csv_path}."
        )
    if "label" not in df.columns:
        raise ValueError(f"{csv_path} must contain a 'label' column.")
else:
    # Balanced toy colors
    def make_toy_colors(n_per=250, noise=30, seed=SEED):
        """Generate a balanced synthetic red/green/blue dataset.

        Args:
            n_per: Number of samples drawn per class.
            noise: Standard deviation of the Gaussian jitter around each
                class center, applied per channel.
            seed: Seed for the local NumPy generator.

        Returns:
            DataFrame with float32 columns "R", "G", "B" (clipped to
            [0, 255]) and a string "label" column.
        """
        rng = np.random.default_rng(seed)
        centers = {
            "red":   np.array([220, 40, 40]),
            "green": np.array([40, 220, 40]),
            "blue":  np.array([40, 40, 220]),
        }
        X_list, y_list = [], []
        for lab, center in centers.items():
            # Clip so jittered samples stay inside the valid 8-bit channel range.
            X_list.append(rng.normal(center, noise, size=(n_per, 3)).clip(0, 255))
            y_list.extend([lab] * n_per)
        X = np.vstack(X_list).astype(np.float32)
        y = np.array(y_list)
        return pd.DataFrame({"R": X[:, 0], "G": X[:, 1], "B": X[:, 2], "label": y})

    df = make_toy_colors()
    feat_cols = ["R", "G", "B"]

# Pull labels and features out of the frame as plain arrays.
y = df["label"].values
X = df[feat_cols].values
labels = sorted(np.unique(y))

# Stratified 70/30 hold-out so each class keeps its share in both splits.
split = train_test_split(X, y, test_size=0.3, random_state=SEED, stratify=y)
X_train, X_test, y_train, y_test = split

# Standardize with statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)
Xz_train, Xz_test = scaler.transform(X_train), scaler.transform(X_test)

print("Train/Test:", X_train.shape, X_test.shape, "Labels:", labels)

## 2. Prototype & k-NN Baselines

In [None]:
# Prototype (nearest class centroid in standardized space)
centroids = {}
for lab in labels:
    centroids[lab] = Xz_train[y_train == lab].mean(axis=0)


def proto_predict(Xz):
    """Assign each row of ``Xz`` the label of its closest class centroid.

    Distances are Euclidean and measured against the module-level
    ``centroids`` mapping; ties go to the centroid listed first.
    """
    def closest_label(row):
        return min(centroids, key=lambda lab: np.linalg.norm(row - centroids[lab]))

    return np.array([closest_label(row) for row in Xz])

# Score the centroid classifier on the held-out split.
pred_proto = proto_predict(Xz_test)
cm_proto = confusion_matrix(y_test, pred_proto, labels=labels)
acc_proto = accuracy_score(y_test, pred_proto)
print(f"Prototype — Acc: {acc_proto:.3f}")

In [None]:
# k-NN (exemplar with uniform votes)
knn = KNeighborsClassifier(n_neighbors=5).fit(Xz_train, y_train)

# Hard labels drive accuracy; vote fractions drive log-loss.
proba_knn = knn.predict_proba(Xz_test)
pred_knn = knn.predict(Xz_test)
ll_knn = log_loss(y_test, proba_knn, labels=labels)
acc_knn = accuracy_score(y_test, pred_knn)

print(f"k-NN (k=5) — Acc: {acc_knn:.3f} | Log-loss: {ll_knn:.3f}")

## 3. Generalized Context Model (GCM)
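Similarity-based exemplar model: every stored training exemplar votes for its own class, weighted by its similarity to the query.  
Similarity \( s(x, x_i) = \exp(-c \cdot d(x, x_i)) \) with distance \( d \) as Minkowski-\(p\) (\(p=1\) Manhattan, \(p=2\) Euclidean).  
Class probability \( P(k \mid x) = \sum_{i:\,y_i=k} s(x, x_i) \,/\, \sum_{j} s(x, x_j) \).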

In [None]:
def minkowski_distance(x, Y, p=2):
    """Return the Minkowski-p distance from ``x`` to every row of ``Y``.

    Args:
        x: Query point, shape (d,).
        Y: Reference points, shape (n, d).
        p: Order of the norm; 1 is Manhattan, 2 is Euclidean, and
            ``np.inf`` gives the Chebyshev (largest-coordinate) distance.

    Returns:
        Array of shape (n,) holding one distance per row of ``Y``.

    Raises:
        ValueError: If ``p`` is not a positive number.
    """
    # `not p > 0` also rejects NaN, which a plain `p <= 0` would let through.
    if not p > 0:
        raise ValueError(f"Minkowski order p must be positive, got {p!r}")
    diff = np.abs(Y - x)
    if np.isinf(p):
        return diff.max(axis=1)
    return (diff ** p).sum(axis=1) ** (1.0 / p)


def gcm_proba(Xz, Xz_train, y_train, labels, c=2.0, p=2):
    """Class probabilities under the Generalized Context Model (GCM).

    Each training exemplar contributes similarity ``exp(-c * d)`` to its own
    class, where ``d`` is the Minkowski-``p`` distance to the query; the
    per-class sums are then normalized to sum to one.

    Args:
        Xz: Query points, shape (m, d).
        Xz_train: Stored exemplars, shape (n, d).
        y_train: Exemplar labels, length n; every value must be in ``labels``.
        labels: Ordered class labels; column ``k`` of the result is ``labels[k]``.
        c: Similarity decay rate.
        p: Minkowski order passed to ``minkowski_distance``.

    Returns:
        Array of shape (m, len(labels)). Rows sum to 1; they are all zero
        only when there are no exemplars.

    Raises:
        KeyError: If ``y_train`` contains a label missing from ``labels``.
    """
    Xz = np.asarray(Xz)
    Xz_train = np.asarray(Xz_train)
    n_classes = len(labels)
    lab2idx = {lab: i for i, lab in enumerate(labels)}
    # Encode exemplar labels once so each query needs only a single bincount.
    y_idx = np.fromiter(
        (lab2idx[lab] for lab in y_train), dtype=np.intp, count=len(y_train)
    )

    proba = np.zeros((Xz.shape[0], n_classes), dtype=float)
    if y_idx.size == 0:
        return proba

    for i, x in enumerate(Xz):
        log_sim = -c * minkowski_distance(x, Xz_train, p=p)
        # Shift so the largest similarity is exp(0) = 1. The shift cancels in
        # the normalization but keeps far-away queries from underflowing to an
        # all-zero (invalid) probability row.
        log_sim = log_sim - log_sim.max()
        class_sim = np.bincount(y_idx, weights=np.exp(log_sim), minlength=n_classes)
        total = class_sim.sum()
        if total > 0:
            proba[i] = class_sim / total
    return proba

### 3.1 Hyperparameter Sweep (c, p) via Cross-Validation
We pick \(c\in\{0.5,1,2,4,8\}\) and \(p\in\{1,2\}\) using 5-fold Stratified CV on the training set, minimizing **log-loss**.

In [None]:
from sklearn.model_selection import StratifiedKFold

# Candidate decay rates and Minkowski orders for the grid search.
C_GRID = [0.5, 1.0, 2.0, 4.0, 8.0]
P_GRID = [1, 2]
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# Track the (c, p) pair with the lowest mean validation log-loss; the strict
# "<" keeps the earliest grid point when two settings tie.
best = {"logloss": np.inf, "c": None, "p": None}
for cval in C_GRID:
    for pval in P_GRID:
        fold_losses = [
            log_loss(
                y_train[va_idx],
                gcm_proba(
                    Xz_train[va_idx], Xz_train[tr_idx], y_train[tr_idx],
                    labels, c=cval, p=pval,
                ),
                labels=labels,
            )
            for tr_idx, va_idx in cv.split(Xz_train, y_train)
        ]
        mean_ll = float(np.mean(fold_losses))
        if mean_ll < best["logloss"]:
            best["logloss"], best["c"], best["p"] = mean_ll, cval, pval

print("Best GCM params:", best)

### 3.2 Evaluate GCM (best params) on Test

In [None]:
# Apply the CV-selected (c, p) with the full training split as exemplars.
best_c, best_p = best["c"], best["p"]
proba_gcm = gcm_proba(Xz_test, Xz_train, y_train, labels, c=best_c, p=best_p)

# Map each row's highest-probability column back to its label.
winners = np.argmax(proba_gcm, axis=1)
pred_gcm = np.array([labels[k] for k in winners])

cm_gcm = confusion_matrix(y_test, pred_gcm, labels=labels)
ll_gcm = log_loss(y_test, proba_gcm, labels=labels)
acc_gcm = accuracy_score(y_test, pred_gcm)
print(f"GCM — Acc: {acc_gcm:.3f} | Log-loss: {ll_gcm:.3f} (c={best['c']}, p={best['p']})")

## 4. Logistic Regression Reference (probabilistic baseline)

In [None]:
# Logistic regression on the standardized features as a probabilistic reference.
# The explicit multi_class="multinomial" argument is omitted: scikit-learn
# deprecated it in 1.5, and the default solver already fits a multinomial model
# for multiclass targets.
# NOTE(review): with a two-class CSV on older scikit-learn the default is a
# plain binary logistic fit instead — confirm if binary data is expected.
logreg = LogisticRegression(max_iter=1000, random_state=SEED)
logreg.fit(Xz_train, y_train)
proba_lr = logreg.predict_proba(Xz_test)
pred_lr = logreg.predict(Xz_test)
acc_lr = accuracy_score(y_test, pred_lr)
ll_lr = log_loss(y_test, proba_lr, labels=labels)
cm_lr = confusion_matrix(y_test, pred_lr, labels=labels)
print(f"LogReg — Acc: {acc_lr:.3f} | Log-loss: {ll_lr:.3f}")

## 5. Results Overview
We save confusion matrices and a small table for accuracy/log-loss comparison.

In [None]:
import csv

# One row per model: (name, accuracy, log-loss). The prototype classifier only
# returns hard labels, so its log-loss slot is NaN.
header = ["model", "accuracy", "logloss"]
summary = [
    ("prototype", accuracy_score(y_test, pred_proto), np.nan),
    ("knn_k5", acc_knn, ll_knn),
    ("gcm", acc_gcm, ll_gcm),
    ("logreg", acc_lr, ll_lr),
]
with open("results/03_summary.csv", "w", newline="") as f:
    csv.writer(f).writerows([header, *summary])

# Confusion matrices keyed by the name used in the plot title and file name.
confusions = {
    "prototype": cm_proto,
    "knn": confusion_matrix(y_test, pred_knn, labels=labels),
    "gcm": cm_gcm,
    "logreg": cm_lr,
}
tick_pos = range(len(labels))
for name, cm in confusions.items():
    plt.figure()
    plt.imshow(cm, aspect="auto")
    plt.title(f"Confusion — {name}")
    plt.xlabel("Pred")
    plt.ylabel("True")
    plt.xticks(tick_pos, labels, rotation=45)
    plt.yticks(tick_pos, labels)
    plt.colorbar()
    plt.tight_layout()
    plt.savefig(f"results/03_confusion_{name}.png", dpi=150)
    plt.show()

print("Wrote results/03_summary.csv and confusion matrices.")

## 6. Decision Boundaries (2D PCA)

In [None]:
# Learn a 2-component projection on the training split, then reuse it unchanged
# for the test split.
p2 = PCA(n_components=2, random_state=SEED)
Z_train, Z_test = p2.fit_transform(Xz_train), p2.transform(Xz_test)

def label_to_int(arr, labels):
    """Map an array of class labels to their integer positions in ``labels``.

    Args:
        arr: Array-like of labels, any shape (may be empty).
        labels: Ordered sequence of known labels; a label's index in this
            sequence is its integer code.

    Returns:
        Integer array with the same shape as ``arr``.

    Raises:
        KeyError: If ``arr`` contains a value that is not in ``labels``.
    """
    mapping = {lab: i for i, lab in enumerate(labels)}
    arr = np.asarray(arr)
    # Explicit lookup instead of np.vectorize(mapping.get): it works on empty
    # input and fails loudly on unknown labels rather than emitting None.
    codes = np.fromiter(
        (mapping[v] for v in arr.ravel().tolist()), dtype=int, count=arr.size
    )
    return codes.reshape(arr.shape)

# For decision surfaces, fit simple surrogates on 2D space:
# (multi_class is omitted because scikit-learn deprecated it in 1.5; the
# default solver already fits a multinomial model for multiclass targets.)
log2d = LogisticRegression(max_iter=1000, random_state=SEED).fit(Z_train, y_train)
knn2d = KNeighborsClassifier(n_neighbors=5).fit(Z_train, y_train)

# Evaluation grid: the training points' bounding box padded by 1 on each side,
# sampled at 200 x 200.
xmin, ymin = Z_train.min(axis=0) - 1
xmax, ymax = Z_train.max(axis=0) + 1
xx, yy = np.meshgrid(np.linspace(xmin, xmax, 200), np.linspace(ymin, ymax, 200))
grid = np.c_[xx.ravel(), yy.ravel()]

for model, name in [(log2d, "logistic"), (knn2d, "knn")]:
    # Predicted label at every grid node, reshaped back to the mesh layout.
    pred = model.predict(grid).reshape(xx.shape)
    plt.figure()
    # Labels are converted to integer codes because contourf/scatter need numbers.
    plt.contourf(xx, yy, label_to_int(pred, labels), alpha=0.3)
    plt.scatter(Z_train[:,0], Z_train[:,1], c=label_to_int(y_train, labels), s=10, edgecolor='k', linewidth=0.2)
    plt.title(f"Decision — {name} (2D PCA)")
    plt.tight_layout(); plt.savefig(f"results/03_boundary_{name}_2d.png", dpi=150); plt.show()

## 7. Sensitivity: Vary GCM Decay (c) at Fixed p
We plot accuracy/log-loss on test as we vary \(c\) around the best value (holding \(p\) fixed).

In [None]:
# Probe decay values at 1/4x, 1/2x, 1x, 2x and 4x the CV-selected c.
c_vals = [best["c"] * factor for factor in (0.25, 0.5, 1, 2, 4)]
c_vals = [float(c) for c in c_vals if c > 0]

# Test-set accuracy and log-loss at each decay value, with p held at its best.
accs, lls = [], []
for cval in c_vals:
    probs_c = gcm_proba(Xz_test, Xz_train, y_train, labels, c=cval, p=best["p"])
    preds_c = np.array([labels[k] for k in probs_c.argmax(axis=1)])
    accs.append(accuracy_score(y_test, preds_c))
    lls.append(log_loss(y_test, probs_c, labels=labels))

# One figure per metric: (values, y-axis label, title, output path).
curves = [
    (accs, "Accuracy", "GCM Sensitivity — Accuracy", "results/03_gcm_sensitivity_acc.png"),
    (lls, "Log-loss", "GCM Sensitivity — Log-loss", "results/03_gcm_sensitivity_ll.png"),
]
for values, ylabel, title, out_path in curves:
    plt.figure()
    plt.plot(c_vals, values, marker="o")
    plt.xlabel("c (similarity decay)")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.tight_layout()
    plt.savefig(out_path, dpi=150)
    plt.show()

## 8. Takeaways
- GCM’s **decay (c)** and **distance metric (p)** materially affect performance; tuning by CV on log-loss is recommended.
- Prototype is strong when classes are compact; k-NN captures fine local detail but can overfit unless k is itself chosen by cross-validation (here it is fixed at k=5).
- Logistic sets a probabilistic linear baseline; exemplar methods often produce better **log-loss** when boundaries are complex.