# Introduction to Text Embeddings

This notebook covers the fundamentals of text embeddings.

## Topics:
1. What are embeddings? (using Sentence Transformers)
2. Similarity metrics
3. Semantic search basics
4. Comparing different models

In [None]:
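# Install dependencies (uncomment if needed)
# scikit-learn is required for the cosine_similarity example later in this notebook;
# %pip (rather than !pip) installs into the environment of the running kernel.
# %pip install sentence-transformers torch numpy scikit-learn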

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load a pre-trained model. The checkpoint name lives in one constant so the
# model that is loaded and the name that is reported cannot drift apart.
MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)
print(f"Model loaded: {MODEL_NAME}")
print(f"Embedding dimension: {model.get_sentence_embedding_dimension()}")

## 1. What are Embeddings?

Embeddings are dense numerical vectors that represent text in a way that captures semantic meaning.

In [None]:
# Encode one sentence into its embedding vector
text = "How do I install Python?"
embedding = model.encode(text)

# Collect the report lines first, then emit them in order
report_lines = (
    f"Text: {text}",
    f"Embedding shape: {embedding.shape}",
    f"First 10 values: {embedding[:10]}",
    f"Embedding norm: {np.linalg.norm(embedding):.4f}",
)
print("\n".join(report_lines))

In [None]:
# Encode multiple sentences at once (more efficient)
sentences = [
    "How do I install Python?",
    "Python installation guide",
    "Best restaurants in New York"
]

embeddings = model.encode(sentences)
print(f"Shape: {embeddings.shape}")  # (3, 384)

## 2. Similarity Metrics

### Cosine Similarity
Measures the cosine of the angle between two vectors, so it compares direction and ignores magnitude. Range: [-1, 1]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute pairwise similarities: entry [i, j] compares sentence i with sentence j
similarities = cosine_similarity(embeddings)


def _preview(text, limit):
    """Return `text` cut to `limit` characters, adding '...' only if it was cut."""
    return text if len(text) <= limit else f"{text[:limit]}..."


print("Cosine Similarity Matrix:")
for i, sent in enumerate(sentences):
    print(f"\n{_preview(sent, 40)}")
    for j, other in enumerate(sentences):
        print(f"  vs '{_preview(other, 30)}': {similarities[i, j]:.4f}")

In [None]:
# Manual cosine similarity calculation
def cosine_sim(a, b):
    """Cosine similarity between two vectors.

    Args:
        a: First vector (1-D array-like of numbers).
        b: Second vector, same length as `a`.

    Returns:
        dot(a, b) / (||a|| * ||b||), or 0.0 when either vector has zero norm.
        Returning 0.0 avoids the 0/0 -> NaN result (and runtime warning) and
        matches what sklearn's cosine_similarity reports for zero vectors.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        # A zero vector has no direction, so the ratio is undefined.
        return 0.0
    return np.dot(a, b) / denom

# Sentence 0 is the reference. Compare it with sentence 1 (also about Python)
# and then with sentence 2 (about restaurants).
comparisons = (
    (1, "Python sentences"),
    (2, "Python and restaurant"),
)
for other_idx, label in comparisons:
    sim = cosine_sim(embeddings[0], embeddings[other_idx])
    print(f"Similarity between {label}: {sim:.4f}")

### Dot Product vs Cosine

When vectors are normalized to unit length ($\lVert v \rVert_2 = 1$), the dot product equals the cosine similarity.

In [None]:
# Ask the model for normalized embeddings this time
embeddings_normalized = model.encode(sentences, normalize_embeddings=True)

# Dot product of the normalized pair vs. cosine similarity of the
# un-normalized pair: the two numbers should agree
first_unit, second_unit = embeddings_normalized[:2]
dot_product = np.dot(first_unit, second_unit)
cosine = cosine_sim(embeddings[0], embeddings[1])

print(
    f"Dot product (normalized): {dot_product:.4f}",
    f"Cosine similarity: {cosine:.4f}",
    f"Same? {np.isclose(dot_product, cosine)}",
    sep="\n",
)

## 3. Semantic Search

In [None]:
# Small mixed-topic document collection to search over
corpus = [
    "How to install Python on Windows",
    "Python programming tutorial for beginners",
    "Best Italian restaurants in Manhattan",
    "Machine learning with Python",
    "WiFi connection problems and solutions",
    "Python virtual environments explained",
    "Top pizza places in NYC",
    "Setting up Python development environment",
]

# Embed every document once up front, requesting normalized vectors
corpus_embeddings = model.encode(
    corpus,
    normalize_embeddings=True,
)

In [None]:
def search(query, corpus, corpus_embeddings, top_k=3):
    """Print the `top_k` corpus documents most similar to `query`.

    Args:
        query: Query text; it is encoded with the notebook-level `model`.
        corpus: Sequence of document strings, aligned row-for-row with
            `corpus_embeddings`.
        corpus_embeddings: Matrix of document embeddings, one row per document.
            Scores are plain dot products, so they equal cosine similarity only
            when these rows were encoded with `normalize_embeddings=True`
            (as the query is).
        top_k: Maximum number of results to print. Values <= 0 print no
            results; values larger than the corpus print every document.

    Returns:
        None. The query and its matches are printed, best match first.
    """
    # Encode query
    query_embedding = model.encode(query, normalize_embeddings=True)

    # Compute similarities (dot product since normalized)
    similarities = np.dot(corpus_embeddings, query_embedding)

    # Rank best-first, then cut from the front. Slicing with [:top_k] avoids
    # the [-top_k:] pitfall where top_k=0 becomes [-0:] and selects everything.
    ranked = np.argsort(similarities)[::-1]
    top_indices = ranked[:max(top_k, 0)]

    print(f"Query: {query}\n")
    for idx in top_indices:
        print(f"  Score: {similarities[idx]:.4f} | {corpus[idx]}")

# First query: ask about setting up Python
search(
    query="How do I set up Python?",
    corpus=corpus,
    corpus_embeddings=corpus_embeddings,
)

In [None]:
# Second query: a food-related request
search(
    query="good food places",
    corpus=corpus,
    corpus_embeddings=corpus_embeddings,
)

In [None]:
# Third query: a networking problem
search(
    query="internet not working",
    corpus=corpus,
    corpus_embeddings=corpus_embeddings,
)

## 4. Comparing Different Models

In [None]:
# Compare embedding dimensions and speed
import time

models_to_compare = [
    'all-MiniLM-L6-v2',      # 384 dims, fast
    # 'all-mpnet-base-v2',   # 768 dims, slower but better
]

test_sentences = ["This is a test sentence."] * 100
# Derive the count from the data so the report stays correct if the
# benchmark size above is changed.
num_sentences = len(test_sentences)

for model_name in models_to_compare:
    # Model loading happens before the clock starts; only encode() is timed.
    m = SentenceTransformer(model_name)

    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring durations; time.time() is wall-clock time and can jump if
    # the system clock is adjusted.
    start = time.perf_counter()
    emb = m.encode(test_sentences)
    elapsed = time.perf_counter() - start

    print(f"{model_name}:")
    print(f"  Dimensions: {emb.shape[1]}")
    print(f"  Time for {num_sentences} sentences: {elapsed:.3f}s")
    print(f"  Sentences/sec: {num_sentences/elapsed:.1f}")

## Summary

- **Embeddings** convert text to vectors that capture meaning
- **Cosine similarity** measures semantic similarity
- **Normalized vectors** let a plain dot product stand in for cosine similarity, which is cheaper to compute
- **Semantic search** finds similar documents by meaning, not keywords

### Next:
Try the tasks in `../tasks/` folder!