# Solution: The Wrong Distance

This is the answer key for `drill_12_wrong_similarity.ipynb`.

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
from sklearn.neighbors import NearestNeighbors

# Seed NumPy's legacy global RNG so any NumPy-based randomness is repeatable
# across Restart & Run All.
np.random.seed(42)

In [None]:
# Sample support tickets
tickets = [
    "I can't log in to my account",
    "Password reset not working please help",
    "Unable to access my account after password change",
    "Login credentials not accepted",
    "Where is my order it's been two weeks",
    "Package tracking shows delivered but I never received it",
    "Shipping is taking way too long",
    "My delivery is late when will it arrive",
    "I want a refund for this product",
    "How do I return this item and get my money back",
    "Request for refund - product not as described",
    "Cancel my order and refund please",
]

# One topic label per ticket, in the same order as `tickets`.
topics = ['password'] * 4 + ['shipping'] * 4 + ['refund'] * 4
assert len(topics) == len(tickets), "every ticket needs exactly one topic label"

# norm=None keeps the raw TF-IDF weights. TfidfVectorizer's default
# (norm='l2') rescales every row to unit length, and for unit vectors
# ||a - b||^2 == 2 * (1 - cos(a, b)), so Euclidean and cosine neighbors
# would be ranked identically and the Euclidean baseline could not be
# worse than the cosine fix. Cosine results are unaffected because cosine
# ignores vector length.
# NOTE(review): if the drill notebook builds its embeddings the same way,
# mirror this setting there - TODO confirm.
vectorizer = TfidfVectorizer(stop_words='english', norm=None)
embeddings = vectorizer.fit_transform(tickets).toarray()

In [None]:
# BUGGY: Euclidean distance
# Baseline index kept for comparison with the cosine fix: returns the 3
# nearest rows of `embeddings` under Euclidean distance.
nn_euclidean = NearestNeighbors(n_neighbors=3, metric='euclidean')
nn_euclidean.fit(embeddings)

## SOLUTION: Use Cosine Similarity

In [None]:
# Fix: Use cosine similarity
# Same 3-neighbor index as the Euclidean baseline, but ranked by cosine
# distance (1 - cosine similarity), which ignores vector length.
nn_cosine = NearestNeighbors(metric='cosine', n_neighbors=3).fit(embeddings)

def search_cosine(query):
    """Print the tickets closest to ``query`` under cosine distance.

    The query is vectorized with the notebook-level ``vectorizer`` and looked
    up in the fitted ``nn_cosine`` index. Each hit is printed as
    ``[similarity] [topic] ticket text``, where similarity is
    ``1 - cosine distance``. Nothing is returned.
    """
    query_vec = vectorizer.transform([query]).toarray()
    neighbor_dists, neighbor_ids = nn_cosine.kneighbors(query_vec)

    print(f"Query: '{query}'")
    print("Top matches (Cosine):")
    # Row 0 holds the results for the single query that was passed in.
    for ticket_id, cos_dist in zip(neighbor_ids[0], neighbor_dists[0]):
        print(f"  [{1 - cos_dist:.3f}] [{topics[ticket_id]}] {tickets[ticket_id]}")
    print()

print("=== Fixed Results (Cosine Similarity) ===")
print()
# Run one representative query per topic through the cosine index.
for demo_query in ("password problem", "shipping delay", "want refund"):
    search_cosine(demo_query)

In [None]:
# Compare retrieval quality
def evaluate_retrieval(nn_model, metric_name):
    """Measure how often a ticket's nearest neighbors share its topic.

    Each ticket's own embedding is used as a query against ``nn_model``. The
    ticket itself is removed from the hits, and the remaining neighbors are
    scored against the label in ``topics``. A one-line summary is printed.

    Args:
        nn_model: Fitted neighbor index built on ``embeddings`` that exposes
            ``kneighbors(X, return_distance=False)``.
        metric_name: Label used in the printed summary line.

    Returns:
        Fraction of scored neighbor slots that hold a same-topic ticket, or
        0.0 when no neighbor slots were scored.
    """
    correct = 0
    total = 0

    for i in range(len(tickets)):
        hits = nn_model.kneighbors(embeddings[i:i+1], return_distance=False)[0]
        # Remove the query ticket by index, not by position: with duplicate
        # or zero-distance tickets the query is not guaranteed to be the
        # first hit, and scoring it would count a ticket as its own
        # same-topic neighbor. Cap at k-1 so every ticket contributes the
        # same number of slots even if the query is missing from the hits.
        neighbors = [idx for idx in hits if idx != i][:max(len(hits) - 1, 0)]
        for idx in neighbors:
            total += 1
            if topics[idx] == topics[i]:
                correct += 1

    # Nothing scored (no tickets, or an index with n_neighbors=1): report 0
    # instead of raising ZeroDivisionError.
    accuracy = correct / total if total else 0.0
    print(f"{metric_name}: {accuracy:.1%} neighbors from same topic")
    return accuracy

print("=== Retrieval Quality ===")
# Score the Euclidean baseline first, then the cosine fix, on the same tickets.
acc_euclidean, acc_cosine = (
    evaluate_retrieval(model, label)
    for model, label in ((nn_euclidean, "Euclidean"), (nn_cosine, "Cosine"))
)

In [None]:
# Self-check
# NOTE(review): both thresholds may hinge on how neighbor ties are broken.
# If the embeddings are unit length (TfidfVectorizer's default norm='l2'),
# Euclidean and cosine rank neighbors identically, and a ticket that shares
# no vocabulary with any other ticket has all-equal cosine distances.
# TODO confirm both assertions hold on a fresh Restart & Run All.
assert acc_euclidean < acc_cosine, "Cosine should outperform Euclidean for text"
assert 0.7 < acc_cosine, "Cosine should get most neighbors correct"

print("✓ Metric fixed!")
for label, value in (("Euclidean accuracy", acc_euclidean), ("Cosine accuracy", acc_cosine)):
    print(f"✓ {label}: {value:.1%}")
gain = acc_cosine - acc_euclidean
print(f"✓ Improvement: {gain:+.1%}")

## Sample Postmortem

### What happened:
- Support ticket similarity system was matching unrelated tickets, failing to identify duplicates.

### Root cause:
- Euclidean distance fails for text embeddings because:
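  1. **Sparse vectors:** Most dimensions are 0 and most ticket pairs share few or no terms, so the Euclidean distance between them is dominated by each vector's own length rather than by shared content.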
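  2. **Magnitude sensitivity:** On unnormalized vectors, longer documents have larger norms regardless of meaning, which inflates their distances. (On unit-length vectors, $\|a-b\|^2 = 2(1-\cos\theta)$, so Euclidean and cosine rank neighbors identically; the failure only appears when vectors are not L2-normalized.)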
  3. **High dimensionality:** In high-D space, Euclidean distances converge (curse of dimensionality).

### How to prevent:
- **Use cosine similarity for text.** It measures angle (direction) not magnitude.
- **Visualize similarity matrices** to verify the metric captures expected structure.
- **Test with known-similar pairs** before deployment.