# Task 1: Build a Semantic Search System - SOLUTION

## Scenario
You have a collection of support ticket descriptions. Build a semantic search system that:
1. Encodes all documents into embeddings
2. Finds the most similar documents for a given query
3. Returns results with similarity scores

## Setup

In [None]:
import json
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load documents
# The encoding is stated explicitly: without it, open() falls back to the
# platform locale encoding, which can mis-decode non-ASCII ticket text on
# systems whose locale is not UTF-8.
with open('../fixtures/input/documents.json', encoding='utf-8') as f:
    documents = json.load(f)

print(f"Loaded {len(documents)} documents")
print("\nSample document:")
print(documents[0])

In [None]:
# Load model
# `model` is a notebook-level global: the later cells call model.encode() on it.
# The Task 1 test in this notebook expects 384-dimensional embeddings, so the
# dimension printed here is a quick sanity check.
# NOTE(review): presumably the weights are downloaded/cached on first use —
# confirm for offline environments.
model = SentenceTransformer('all-MiniLM-L6-v2')
print(f"Model loaded. Embedding dimension: {model.get_sentence_embedding_dimension()}")

---
## Task 1: Create Embeddings - SOLUTION

In [None]:
# SOLUTION

# Step 1: pull the raw text field out of every document record.
texts = [record['text'] for record in documents]

# Step 2: embed all texts in a single call. Asking for normalized output
# means a plain dot product can stand in for cosine similarity later on.
embeddings = model.encode(texts, show_progress_bar=True, normalize_embeddings=True)

# Report sizes and spot-check that the first vector really is unit length.
print(f"Texts: {len(texts)}")
print(f"Embeddings shape: {embeddings.shape}")
print(f"Sample embedding norm: {np.linalg.norm(embeddings[0]):.4f} (should be 1.0)")

In [None]:
# TEST
# Checks for Task 1. The first two asserts verify that the solution cell
# defined the expected notebook-level variables before anything touches them.
assert 'texts' in dir(), "Variable 'texts' not found"
assert 'embeddings' in dir(), "Variable 'embeddings' not found"
# Sizes are pinned to the values this test expects: 20 texts, 384 dimensions.
assert len(texts) == 20, f"Expected 20 texts, got {len(texts)}"
assert embeddings.shape == (20, 384), f"Expected shape (20, 384), got {embeddings.shape}"

# One L2 norm per row (axis=1); allclose tolerates floating-point rounding.
norms = np.linalg.norm(embeddings, axis=1)
assert np.allclose(norms, 1.0), "Embeddings should be normalized (norm=1)"

print("Task 1 PASSED!")

---
## Task 2: Implement Semantic Search - SOLUTION

In [None]:
# SOLUTION

def search(query: str, top_k: int = 3) -> list:
    """
    Find the documents most similar to a query.

    Reads the notebook-level ``model``, ``embeddings`` and ``documents``;
    row ``i`` of ``embeddings`` is used as the vector for ``documents[i]``.

    Args:
        query: Search query string
        top_k: Maximum number of results to return. If it exceeds the number
            of documents, all documents are returned. Values <= 0 return an
            empty list.

    Returns:
        List of dicts ordered by descending score:
        [{'id': ..., 'text': ..., 'score': ...}, ...]
    """
    # Guard first: because -0 == 0, the slice [-top_k:] below would select
    # *every* document for top_k=0 (and an arbitrary tail for negative values).
    if top_k <= 0:
        return []

    # Encode the query with the same normalization as the documents.
    query_embedding = model.encode(query, normalize_embeddings=True)

    # Dot product of the document vectors with the query vector.
    similarities = np.dot(embeddings, query_embedding)

    # argsort is ascending: take the last top_k indices, then reverse them so
    # the best match comes first.
    top_indices = np.argsort(similarities)[-top_k:][::-1]

    return [
        {
            'id': documents[idx]['id'],
            'text': documents[idx]['text'],
            # float() turns the numpy scalar into a plain Python float.
            'score': float(similarities[idx]),
        }
        for idx in top_indices
    ]

# Test search
# Runs one example query with the default top_k and prints each hit as
# "<score> | <first 60 characters of the text>...".
results = search("How to install Python on my computer?")
print("Search results:")
for r in results:
    print(f"  {r['score']:.4f} | {r['text'][:60]}...")

In [None]:
# TEST
# Checks for Task 2: result count, result shape, score range, ordering, and
# that the best hit is topically relevant to the query.
results = search("How to install Python on my computer?", top_k=3)

assert len(results) == 3, f"Expected 3 results, got {len(results)}"
assert all('id' in r and 'text' in r and 'score' in r for r in results), "Missing keys"
assert all(0 <= r['score'] <= 1 for r in results), "Scores should be between 0 and 1"
assert results[0]['score'] >= results[1]['score'] >= results[2]['score'], "Should be sorted"
# Every other assert in this cell explains its failure; this one now does too.
assert 'python' in results[0]['text'].lower() or 'install' in results[0]['text'].lower(), (
    f"Top result should mention 'python' or 'install', got: {results[0]['text']!r}"
)

print("Task 2 PASSED!")

---
## Task 3: Find Near-Duplicates - SOLUTION

In [None]:
# SOLUTION

def find_duplicates(threshold: float = 0.85):
    """
    Report every pair of documents whose similarity reaches the threshold.

    Reads the notebook-level ``embeddings`` and ``documents``; row ``i`` of
    ``embeddings`` is used as the vector for ``documents[i]``.

    Args:
        threshold: Minimum similarity (inclusive) for a pair to be reported

    Returns:
        List of dicts, highest similarity first:
        [{'doc1_id': ..., 'doc2_id': ..., 'similarity': ...}, ...]
    """
    # All-pairs dot products in one matrix product.
    pairwise = np.dot(embeddings, embeddings.T)
    n_docs = len(documents)

    # Visit only pairs with col > row: this skips the diagonal
    # (self-comparison) and reports each unordered pair exactly once.
    matches = [
        {
            'doc1_id': documents[row]['id'],
            'doc2_id': documents[col]['id'],
            'similarity': float(pairwise[row, col]),
        }
        for row in range(n_docs)
        for col in range(row + 1, n_docs)
        if pairwise[row, col] >= threshold
    ]

    # Stable sort, so equally similar pairs keep their discovery order.
    return sorted(matches, key=lambda pair: pair['similarity'], reverse=True)

# Test
duplicates = find_duplicates(threshold=0.85)
print(f"Found {len(duplicates)} duplicate pairs")

# Id -> document table built once. Iterating in reverse means that, should an
# id occur twice, the first occurrence wins (same as a first-match scan).
_doc_by_id = {entry['id']: entry for entry in reversed(documents)}

# Show at most the five strongest pairs with a preview of both texts.
for d in duplicates[:5]:
    doc1 = _doc_by_id[d['doc1_id']]
    doc2 = _doc_by_id[d['doc2_id']]
    print(f"\n{d['similarity']:.4f}:")
    print(f"  1: {doc1['text'][:60]}...")
    print(f"  2: {doc2['text'][:60]}...")

In [None]:
# TEST
# Checks for Task 3: return type, at least one hit, result shape, and that
# every reported pair actually meets the threshold that was requested.
duplicates = find_duplicates(threshold=0.85)

assert isinstance(duplicates, list), "Should return a list"
assert len(duplicates) > 0, "Should find at least one duplicate pair"
# These two asserts previously failed with a bare AssertionError.
assert all('doc1_id' in d and 'doc2_id' in d and 'similarity' in d for d in duplicates), (
    "Each result needs 'doc1_id', 'doc2_id' and 'similarity' keys"
)
assert all(d['similarity'] >= 0.85 for d in duplicates), (
    "Every reported pair should have similarity >= 0.85"
)

print("Task 3 PASSED!")

---
## Task 4: Cluster Documents - SOLUTION

In [None]:
# SOLUTION

from sklearn.cluster import KMeans

def cluster_documents(n_clusters: int = 5):
    """
    Group documents into K-means clusters of their embeddings.

    Reads the notebook-level ``embeddings`` and ``documents``; row ``i`` of
    ``embeddings`` is used as the vector for ``documents[i]``.

    Args:
        n_clusters: Number of clusters

    Returns:
        Dict mapping cluster_id (plain int) to the list of document ids
        assigned to that cluster
    """
    # Fixed random_state so repeated runs produce the same assignment.
    assignments = KMeans(
        n_clusters=n_clusters,
        n_init=10,
        random_state=42,
    ).fit_predict(embeddings)

    grouped = {}
    for position, cluster_label in enumerate(assignments):
        # int() converts the numpy integer label into a plain Python int key.
        grouped.setdefault(int(cluster_label), []).append(documents[position]['id'])

    return grouped

# Test
clusters = cluster_documents(n_clusters=5)
print("Clusters:")

# Id -> document table built once. Iterating in reverse means that, should an
# id occur twice, the first occurrence wins (same as a first-match scan).
_doc_by_id = {entry['id']: entry for entry in reversed(documents)}

# One section per cluster, listing each member's category and a text preview.
for cluster_id, doc_ids in clusters.items():
    print(f"\nCluster {cluster_id} ({len(doc_ids)} docs):")
    for doc_id in doc_ids:
        doc = _doc_by_id[doc_id]
        print(f"  [{doc['category']}] {doc['text'][:50]}...")

In [None]:
# TEST
# Checks for Task 4: return type, number of clusters, and total membership.
clusters = cluster_documents(n_clusters=5)

assert isinstance(clusters, dict), "Should return a dict"
assert len(clusters) == 5, f"Expected 5 clusters, got {len(clusters)}"

# Flatten the per-cluster id lists; the total is pinned to the 20 documents
# this test expects.
all_ids = [doc_id for ids in clusters.values() for doc_id in ids]
assert len(all_ids) == 20, f"Expected 20 documents in clusters, got {len(all_ids)}"

print("Task 4 PASSED!")

---
## Bonus: Visualize Embeddings - SOLUTION

In [None]:
# BONUS SOLUTION

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Reduce to 2D
tsne = TSNE(n_components=2, random_state=42, perplexity=5)
embeddings_2d = tsne.fit_transform(embeddings)

# Get categories for coloring
categories = [doc['category'] for doc in documents]
# De-duplicate in first-appearance order. list(set(...)) was used before, but
# set iteration order for strings can change from one interpreter run to the
# next, which shuffled the category colours and the legend order between runs.
unique_categories = list(dict.fromkeys(categories))
colors = plt.cm.tab10(np.linspace(0, 1, len(unique_categories)))
category_colors = {cat: colors[i] for i, cat in enumerate(unique_categories)}

# Plot
plt.figure(figsize=(12, 8))

# One scatter call per category so each gets its own colour and legend entry.
for cat in unique_categories:
    # Explicit boolean array selecting the rows that belong to this category.
    mask = np.array([c == cat for c in categories], dtype=bool)
    plt.scatter(
        embeddings_2d[mask, 0],
        embeddings_2d[mask, 1],
        c=[category_colors[cat]],
        label=cat,
        alpha=0.7,
        s=100
    )

plt.legend()
plt.title('Document Embeddings (t-SNE)')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.tight_layout()
plt.show()

---
## Summary

**Key techniques used:**

1. **Embedding creation:**
   - Use `normalize_embeddings=True` so that a plain dot product equals cosine similarity
   - Batch encode for efficiency

2. **Semantic search:**
   - Encode query with same normalization
   - Use dot product (= cosine for normalized vectors)
   - Use `np.argsort` for top-k

3. **Duplicate detection:**
   - Compute pairwise similarity matrix
   - Only check the upper triangle to skip self-comparisons and avoid reporting each pair twice

4. **Clustering:**
   - K-means works on embedding space
   - Documents with similar topics cluster together

**Common pitfalls:**
- Forgetting to normalize embeddings
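- Using `cosine_similarity` instead of a dot product (it re-normalizes vectors that are already unit length, so it is redundant and slower)
- Not converting numpy types (e.g. `np.float32`, `np.int64`) to native Python types before JSON serialization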