# Embedding Quality Metrics

In [None]:
import sys


sys.path.append("../prod/utils")

In [None]:
# Imports & utils
from io_docs import load_embeddings
from embeddings import get_embedding_model
from logging_config import logger

import numpy as np
from sklearn.metrics import pairwise_distances, silhouette_score
from sklearn.neighbors import NearestNeighbors
from scipy.stats import skew
np.random.seed(42)

def l2_normalize(X, eps=1e-12):
    """Scale every row of ``X`` to (approximately) unit Euclidean length.

    Args:
        X: 2-D array whose rows are the vectors to normalize.
        eps: Small constant added to each row norm before dividing, so an
            all-zero row maps to zeros instead of triggering a division by zero.

    Returns:
        Array of the same shape as ``X`` with each row divided by its norm.
    """
    row_norms = np.linalg.norm(X, axis=1, keepdims=True)
    return X / (row_norms + eps)

In [None]:
# Constants
# Path handed to `load_embeddings` in the data-loading cell
# (presumably a saved FAISS index directory — confirm).
FAISS_PATH = "../prod/faiss_index"
# Embedding model handle from the project helper.
# NOTE(review): the meaning of the argument `1` is defined in
# `embeddings.get_embedding_model` and is not visible here — confirm.
embeddings = get_embedding_model(1)

In [None]:
# Data Loading
# `load_embeddings` is a project helper; its result is unpacked into the
# documents, their embedding vectors, and per-document metadata.
documents, all_embeddings, metadatas = load_embeddings(FAISS_PATH, embeddings)

# Stack the vectors into a 2-D array: n rows (vectors) by d columns (dimensions).
X = np.array(all_embeddings)
n, d = X.shape

# Optional metadata (delete if not available)
section_labels = None # np.array(metadatas).astype(object)               # e.g., np.load("section_labels.npy").astype(object)
doc_order = np.arange(n)            # 0..n-1 in reading order; shuffle to match X if needed

# Unit-normalize the rows so that dot products equal cosine similarities.
X = l2_normalize(X)
# Notebook cell output: shape of X and of the labels (None when labels are absent).
X.shape, (section_labels.shape if section_labels is not None else None)


## 1. Pairwise Cosine Similarity Distribution
**Formula:**  
cosine_sim(x, y) = (x · y) / (||x|| * ||y||)

**Why:** Checks if embeddings are meaningfully spread out.  
**Rule of thumb:**  
- Mean ~0.0–0.2 → good  
- Mean >0.4 → likely collapse

## 2. Cosine to Mean Vector (Anisotropy)
**Formula:**  
μ = mean(x₁, x₂, …, xₙ)  
cos_to_mean(i) = cosine_sim(xᵢ, μ)

**Why:** Detects whether most embeddings point in the same direction.  
**Rule of thumb:**  
- Mean ≤0.10–0.15 → fine  
- ≥0.25 → strong anisotropy

## 3. First Principal Component Variance Ratio
**Formula:**  
variance_ratio₁ = λ₁ / (λ₁ + λ₂ + … + λ_d)  
(λ₁ = variance along first PC)

**Why:** High values = one dominant direction in space.  
**Rule of thumb:**  
- <0.25 → good  
- 0.25–0.5 → moderate anisotropy  
- \>0.5 → strong anisotropy

In [None]:
from sklearn.decomposition import PCA

# --- 1. Pairwise cosine similarity ---
# Work on a random sample of at most 2000 rows to avoid the O(n^2) cost of
# comparing every pair in the full matrix.
m = min(2000, n)  # sample size
idx = np.random.choice(n, size=m, replace=False)
Xs = X[idx]

cos_dists = pairwise_distances(Xs, metric="cosine")
cos_sims = 1 - cos_dists
# Strict upper triangle: each unordered pair exactly once, no self-pairs.
mask = np.triu(np.ones_like(cos_sims, dtype=bool), k=1)
pair_cos = cos_sims[mask]

# --- 2. Isotropy (cosine to mean direction) ---
# X is already row-normalized, so the dot product with the unit-length mean
# direction is the cosine similarity to that direction.
mu = X.mean(axis=0, keepdims=True)
mu = l2_normalize(mu)
cos_to_mean = (X @ mu.T).ravel()

# --- 3. PCA anisotropy ---
# PCA cannot extract more components than min(n_samples, n_features); cap the
# requested 10 so small datasets or low-dimensional embeddings do not raise.
n_pcs = min(10, n, d)
pca = PCA(n_components=n_pcs, svd_solver="randomized", random_state=42).fit(X)
first_pc_ratio = pca.explained_variance_ratio_[0]

print(f"Pairwise cosine: mean={pair_cos.mean():.3f}, std={pair_cos.std():.3f}, q95={np.quantile(pair_cos,0.95):.3f}")
print(f"Cosine to mean vector: mean={cos_to_mean.mean():.3f}, std={cos_to_mean.std():.3f}")
print(f"First PC variance ratio: {first_pc_ratio:.3f}")

## 4. Hubness
**Definition:** Count how often each vector appears in others’ top-k neighbors.  
Check **skewness** and **Gini coefficient**.

**Why:** High hubness means a few vectors appear in many neighbor lists, distorting search.  
**Rule of thumb:**  
- Skew ≤1.0, Gini ≤0.3 → healthy  
- Skew >2.0 or Gini >0.4 → hubness problem

In [None]:
k = 10
# Ask for k+1 neighbours because each point is normally returned as its own
# nearest neighbour and has to be removed afterwards.
nn = NearestNeighbors(n_neighbors=k+1, metric="cosine").fit(X)
_, inds = nn.kneighbors(X, n_neighbors=k+1)

# Drop self by index rather than by position: with duplicate or tied vectors
# the point itself is not guaranteed to be in column 0. Rows where self is
# absent altogether (pushed out by ties) lose their farthest neighbour instead,
# so every row keeps exactly k neighbours.
self_mask = inds == np.arange(n)[:, None]
self_mask[~self_mask.any(axis=1), -1] = True
inds = inds[~self_mask].reshape(n, k)

# Count how often each point is a neighbor of others (k-occurrence).
counts = np.bincount(inds.ravel(), minlength=n)

def gini(x):
    """Return the Gini coefficient of the non-negative values in ``x``.

    Uses the sorted-cumulative-sum form
    ``G = (n + 1 - 2 * sum(cumsum(x)) / sum(x)) / n``, which is 0 for a
    perfectly uniform input and ``(n - 1) / n`` when a single element holds
    the entire mass.

    Args:
        x: Array-like of non-negative numbers (e.g. k-occurrence counts).

    Returns:
        The Gini coefficient as a float; 0.0 for empty or all-zero input.
    """
    values = np.sort(np.asarray(x, dtype=np.float64).ravel())
    total = values.sum()
    if values.size == 0 or total == 0:
        return 0.0
    size = values.size
    # Sum of the normalized cumulative shares (the Lorenz-curve ordinates).
    lorenz_sum = np.cumsum(values).sum() / total
    # The whole numerator is divided by n, not just the Lorenz term.
    return (size + 1 - 2 * lorenz_sum) / size

# Summarize the k-occurrence distribution: its mean, maximum, skewness and Gini coefficient.
print(f"k={k} mean-occurrence={counts.mean():.2f}, max={counts.max()}, skew={skew(counts):.2f}, gini={gini(counts):.2f}")


## 5. Adjacency@k (Sequential Coherence)
**Definition:** For document chunks, the fraction of chunks whose previous or next chunk (in reading order) appears among their top-k neighbors.

**Why:** Adjacent chunks in text should be semantically close.  
**Rule of thumb:**  
- ≥0.6 → strong coherence  
- 0.3–0.6 → moderate  
- <0.3 → poor

In [None]:
# Adjacent coherence
def adjacency_at_k(inds, order):
    """Fraction of chunks whose reading-order neighbour is among their k-NN.

    Args:
        inds: Per-row neighbour indices; ``inds[i]`` holds the row indices of
            the nearest neighbours of row ``i`` (self excluded).
        order: Permutation of ``0..n-1`` where ``order[i]`` is the
            reading-order position of row ``i``.

    Returns:
        Share of rows for which the previous or the next chunk in reading
        order appears in ``inds[i]``; 0.0 when ``order`` is empty.
    """
    order = np.asarray(order)
    total = len(order)
    if total == 0:
        return 0.0
    # Inverse permutation: pos[r] is the row index of the chunk at reading
    # position r, which maps "previous/next position" back to a row index.
    pos = np.empty_like(order)
    pos[order] = np.arange(total)
    hits = 0
    for i, r in enumerate(order):
        neighbors = set(inds[i])
        # The first and last chunks only have one adjacent chunk.
        prev_hit = r > 0 and pos[r - 1] in neighbors
        next_hit = r + 1 < total and pos[r + 1] in neighbors
        hits += int(prev_hit or next_hit)
    return hits / total

# Evaluate sequential coherence on the k-NN lists computed for the hubness check.
adj10 = adjacency_at_k(inds, doc_order)
print(f"Adjacency@{k}: {adj10:.3f}")


## 6. Section Purity@k (Label-Based Coherence)
**Definition:** For each chunk, fraction of k-NN that share the same section/topic label.

**Why:** Checks topical grouping in embeddings.  
**Rule of thumb:**  
- ≥0.7 → strong separation  
- 0.5–0.7 → moderate  
- <0.5 → weak

In [None]:
# Section purity (skip if no labels)
if section_labels is None:
    print("No section_labels provided — skipping.")
else:
    labels = np.asarray(section_labels)
    # One purity value per chunk: the share of its neighbours carrying its own label.
    purities = [np.mean(labels[inds[i]] == labels[i]) for i in range(n)]
    print(f"Section Purity@{k}: {np.mean(purities):.3f}")

## 7. Silhouette Score (if labels available)
**Formula:**  
a(i) = avg distance to same-cluster points  
b(i) = smallest avg distance to a different cluster  
silhouette(i) = (b(i) − a(i)) / max(a(i), b(i))

**Why:** Measures separation quality given labels.  
**Rule of thumb:**  
- ≥0.5 → strong  
- 0.3–0.5 → moderate  
- <0.3 → weak (0.2–0.3 still common for text)

## 8. Retrieval Metrics (RAG-Focused)
**Recall@k:** relevant_found_in_top_k / total_relevant  
**Precision@k:** relevant_found_in_top_k / k  
**MRR:** mean(1 / rank_of_first_relevant)  
**nDCG@k:** discounted gain of ranked results vs ideal order

**Why:** Directly measures retrieval quality for your use case.  
**Rule of thumb:**  
- Recall@10 ≥0.7 → good  
- MRR ≥0.5 → the first relevant result appears very early (around rank 1–2 on average)  
- nDCG@10 ≥0.6 → good ranking

In [None]:
# Retrieval evaluation
# Inputs:
# Q: (num_queries, d) normalized embeddings of queries
# gt: list[set[int]] relevant chunk indices per query

def topk_cosine_search(Q, X, k=10):
    """Return the top-k most similar rows of ``X`` for every query in ``Q``.

    Args:
        Q: (q, d) array of query vectors.
        X: (n, d) array of corpus vectors.
        k: Number of results per query; clamped to ``n`` when the corpus is
            smaller than ``k``.

    Returns:
        Tuple ``(indices, scores)``, both of shape ``(q, min(k, n))``, sorted
        by descending similarity within each row. Both have zero columns when
        ``k <= 0`` or the corpus is empty.
    """
    # X, Q assumed L2-normalized; cosine sim = dot product
    sims = Q @ X.T   # (q, n)
    # argpartition needs kth < n, so never ask for more results than exist.
    k = min(k, sims.shape[1])
    if k <= 0:
        return np.empty((sims.shape[0], 0), dtype=np.intp), sims[:, :0]
    # Partial selection picks the k best per row in O(n) without a full sort.
    idx = np.argpartition(-sims, kth=k-1, axis=1)[:, :k]
    # sort top-k per row
    row_scores = np.take_along_axis(sims, idx, axis=1)
    order = np.argsort(-row_scores, axis=1)
    return np.take_along_axis(idx, order, axis=1), np.take_along_axis(row_scores, order, axis=1)

def retrieval_metrics_at_k(topk, gt_sets, k=10):
    """Compute mean Recall@k, Precision@k, MRR and nDCG@k over all queries.

    Args:
        topk: 2-D array of ranked result indices, one row per query.
        gt_sets: One set of relevant indices per query, aligned with ``topk``.
        k: Cut-off rank; only the first ``k`` columns of ``topk`` are scored.

    Returns:
        Dict with keys ``recall_at_k``, ``precision_at_k``, ``mrr`` and
        ``ndcg_at_k``, each averaged over the queries. All values are 0.0 when
        there are no queries or ``k <= 0``.
    """
    q = len(gt_sets)
    if q == 0 or k <= 0:
        # Nothing to score; avoid dividing by zero below.
        return dict(recall_at_k=0.0, precision_at_k=0.0, mrr=0.0, ndcg_at_k=0.0)

    # Rank discounts 1/log2(rank + 1) for ranks 1..k, shared by every query.
    discounts = [1.0 / np.log2(j + 2) for j in range(k)]
    recall = precision = mrr = ndcg = 0.0
    for i, relevant in enumerate(gt_sets):
        hits = [int(r in relevant) for r in topk[i, :k]]
        n_hits = sum(hits)
        # P@k, R@k (max(1, ...) keeps an empty ground-truth set from dividing by zero)
        precision += n_hits / k
        recall += n_hits / max(1, len(relevant))
        # MRR: reciprocal of the 1-based rank of the first relevant result
        rank = next((j + 1 for j, h in enumerate(hits) if h), None)
        if rank is not None:
            mrr += 1.0 / rank
        # nDCG@k: binary-gain DCG normalized by the DCG of an ideal ranking
        dcg = sum(h * w for h, w in zip(hits, discounts))
        ideal = sum(discounts[:min(k, len(relevant))])
        ndcg += dcg / ideal if ideal > 0 else 0.0
    return dict(
        recall_at_k=recall / q,
        precision_at_k=precision / q,
        mrr=mrr / q,
        ndcg_at_k=ndcg / q,
    )


# --- Demo with synthetic data (replace with real Q and gt) ---
# Q = np.load("query_embeddings.npy"); Q = l2_normalize(Q)
# gt = [set([12, 98]), set([3, 17, 21]), ...]
# Ten random Gaussian query vectors of dimension d, unit-normalized.
Q = l2_normalize(np.random.normal(size=(10, d)).astype(np.float32))
# For each query, 1–3 distinct random chunk indices act as fake ground truth,
# so the resulting numbers are a smoke test, not a quality measurement.
gt = [set(np.random.choice(n, size=np.random.randint(1,4), replace=False)) for _ in range(len(Q))]
topk, scores = topk_cosine_search(Q, X, k=10)
metrics = retrieval_metrics_at_k(topk, gt, k=10)
# Notebook cell output: the metrics dict.
metrics