# Similarity Metrics Deep Dive

This notebook explores different similarity metrics and when to use them.

## Topics:
1. Cosine similarity
2. Dot product
3. Euclidean distance
4. Normalization effects
5. Choosing the right metric

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import matplotlib.pyplot as plt

# Load the sentence-embedding model once; later cells reuse it through model.encode(...).
model = SentenceTransformer('all-MiniLM-L6-v2')

## 1. Understanding the Metrics

In [None]:
# Three toy 2D vectors for the plots and metric comparisons:
# b is a scaled copy of a, c points in another direction.
a = np.array([1, 2])
b = np.array([2, 4])
c = np.array([2, 1])

# Emit the whole summary as a single multi-line message
print("\n".join([
    "Vectors:",
    f"a = {a}",
    f"b = {b} (same direction as a, 2x magnitude)",
    f"c = {c} (different direction)",
]))

In [None]:
# Draw a, b and c as arrows starting at the origin on one square figure
plt.figure(figsize=(8, 8))
origin = np.array([0, 0])

# One arrow per vector; the legend entry shows the vector's name and components
for tag, vec, shade in (('a', a, 'r'), ('b', b, 'g'), ('c', c, 'b')):
    plt.quiver(
        *origin, *vec,
        angles='xy', scale_units='xy', scale=1,
        color=shade, label=f'{tag}={vec}',
    )

# Same visible window on both axes
for set_limits in (plt.xlim, plt.ylim):
    set_limits(-1, 5)

plt.grid(True)
plt.legend()
plt.title('Vector Visualization')
plt.show()

In [None]:
# Calculate all metrics
def calculate_metrics(v1, v2):
    """Compare two vectors with dot product, cosine similarity and Euclidean distance.

    Args:
        v1: First vector (1-D array-like of numbers).
        v2: Second vector, same length as ``v1``.

    Returns:
        dict with the keys ``'dot_product'``, ``'cosine_similarity'`` and
        ``'euclidean_distance'``. If either vector has zero length the cosine
        is mathematically undefined; 0.0 is returned instead of NaN, which is
        the convention scikit-learn's ``cosine_similarity`` follows.
    """
    # Accept plain lists/tuples as well as arrays (needed for v1 - v2 below)
    v1 = np.asarray(v1)
    v2 = np.asarray(v2)

    # Dot product: sensitive to both direction and magnitude
    dot = np.dot(v1, v2)

    # Cosine similarity: the dot product rescaled by both lengths.
    # Guard the division so a zero vector does not produce NaN + RuntimeWarning.
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    cosine = dot / norm_product if norm_product > 0 else 0.0

    # Euclidean distance: straight-line gap between the two points
    euclidean = np.linalg.norm(v1 - v2)

    return {
        'dot_product': dot,
        'cosine_similarity': cosine,
        'euclidean_distance': euclidean
    }

# Same report for an aligned pair (a, b) and a non-aligned pair (a, c)
for heading, first, second in (
    ("a vs b (same direction):", a, b),
    ("\na vs c (different direction):", a, c),
):
    print(heading)
    for metric_name, value in calculate_metrics(first, second).items():
        print(f"  {metric_name}: {value:.4f}")

## 2. Effect of Normalization

In [None]:
# Normalize vectors (L2 normalization)
def normalize(v):
    """Scale a vector to unit L2 length.

    Args:
        v: Array-like of numbers.

    Returns:
        ``v`` divided by its Euclidean norm. An all-zero vector has no
        direction, so it is returned as zeros instead of NaN.
    """
    v = np.asarray(v)
    length = np.linalg.norm(v)
    # Dividing by 1 leaves a zero vector unchanged and avoids 0/0 -> NaN
    return v / (length if length > 0 else 1.0)

# Unit-length counterparts of the three toy vectors
a_norm, b_norm, c_norm = (normalize(vec) for vec in (a, b, c))

# Show each result together with its length as a sanity check
print("Normalized vectors:")
for tag, unit in (("a_norm", a_norm), ("b_norm", b_norm), ("c_norm", c_norm)):
    print(f"{tag} = {unit}, norm = {np.linalg.norm(unit):.4f}")

In [None]:
# With unit-length inputs the cosine denominator is 1, so dot product and cosine should coincide
print("a_norm vs b_norm (were same direction):")
metrics = calculate_metrics(a_norm, b_norm)
print("\n".join(f"  {name}: {score:.4f}" for name, score in metrics.items()))

# Compare with a tolerance rather than exact float equality
scores_match = np.isclose(metrics['dot_product'], metrics['cosine_similarity'])
print(f"\nDot product == Cosine? {scores_match}")

## 3. Real Text Examples

In [None]:
# Two paraphrases, one unrelated sentence, and a longer variant of the first
texts = [
    "How to install Python",
    "Python installation guide",
    "Best restaurants in NYC",
    "How to install Python on Windows step by step",  # Longer version
]

# Encode the same sentences twice: once as returned, once requesting unit length
embeddings = model.encode(texts, normalize_embeddings=False)
embeddings_norm = model.encode(texts, normalize_embeddings=True)

# NOTE(review): if this checkpoint already ends with a normalization layer,
# every norm printed below will be ~1.0 even with normalize_embeddings=False -- confirm.
print("Embedding norms (not normalized):")
for text, vector in zip(texts, embeddings):
    print(f"  {text[:40]}: {np.linalg.norm(vector):.4f}")

In [None]:
# Pairwise cosine scores computed from the raw embeddings
print("Cosine Similarity Matrix:")
cos_sim = cosine_similarity(embeddings)
print(np.round(cos_sim, 3))

# Pairwise dot products of the unit-length embeddings (plain matrix product)
print("\nDot Product Matrix (normalized embeddings):")
dot_sim = embeddings_norm @ embeddings_norm.T
print(np.round(dot_sim, 3))

# Element-wise comparison with floating-point tolerance
matrices_agree = np.allclose(cos_sim, dot_sim)
print("\nAre they equal?")
print(matrices_agree)

## 4. When to Use Which Metric

In [None]:
# Euclidean distance for clustering
from sklearn.cluster import KMeans

# Two obvious topics (Python, Italian food), three short texts each
cluster_texts = [
    "Python programming",
    "Python tutorial",
    "Learn Python",
    "Italian food",
    "Pizza restaurant",
    "Best pasta",
]

cluster_embeddings = model.encode(cluster_texts)

# K-means uses Euclidean distance internally.
# n_init is pinned explicitly: its default differs between scikit-learn releases
# (10 vs 'auto') and leaving it unset raises a FutureWarning on some versions,
# so the demo would otherwise not behave the same everywhere.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
labels = kmeans.fit_predict(cluster_embeddings)

print("Clustering results (Euclidean-based):")
for text, label in zip(cluster_texts, labels):
    print(f"  Cluster {label}: {text}")

In [None]:
# Cheat sheet of the three metrics; kept as one literal so the table layout prints verbatim
metric_summary = """
METRIC COMPARISON:

| Metric              | Range      | When to use                    |
|---------------------|------------|--------------------------------|
| Cosine Similarity   | [-1, 1]    | Semantic similarity, search    |
| Dot Product         | (-inf,inf) | Normalized vectors (faster)    |
| Euclidean Distance  | [0, inf)   | Clustering, when magnitude     |
|                     |            | matters                        |

For most NLP tasks: Use COSINE SIMILARITY (or dot product with normalized vectors)
"""
print(metric_summary)

## 5. Performance Considerations

In [None]:
import time

# Synthetic corpus and query, sized like a small real index.
# A seeded generator keeps the benchmark data identical on every run.
n = 10000
dim = 384
n_searches = 100
rng = np.random.default_rng(42)
corpus = rng.standard_normal((n, dim))
query = rng.standard_normal(dim)

# Normalize once, outside the timed region: this is the cost that
# pre-normalizing a corpus lets every later search skip.
corpus_norm = corpus / np.linalg.norm(corpus, axis=1, keepdims=True)
query_norm = query / np.linalg.norm(query)

# Time comparison.
# perf_counter() is the clock intended for measuring short durations;
# time.time() is wall-clock time and can jump or have coarse resolution.
start = time.perf_counter()
for _ in range(n_searches):
    sims = cosine_similarity([query], corpus)[0]
cosine_time = time.perf_counter() - start

start = time.perf_counter()
for _ in range(n_searches):
    sims = np.dot(corpus_norm, query_norm)
dot_time = time.perf_counter() - start

print(f"Time for {n_searches} searches over {n} vectors:")
print(f"  Cosine similarity: {cosine_time:.3f}s")
print(f"  Dot product (norm): {dot_time:.3f}s")
print(f"  Speedup: {cosine_time/dot_time:.1f}x")

## Summary

- **Cosine similarity** is best for semantic similarity (direction-based)
- **Normalize embeddings** to use the faster dot product (on unit-length vectors it equals cosine similarity)
- **Euclidean distance** is useful for clustering
- **Pre-normalize** corpus embeddings for efficient search

### Next:
Try the tasks in the `../tasks/` folder!