# Semantic Search Demo

This notebook demonstrates how to build a simple semantic search engine using `sentence-transformers` and `chromadb`.

In [35]:
import json

import chromadb
import pandas as pd
from IPython.display import clear_output
from sentence_transformers import SentenceTransformer

## 1. Define Sample Documents
We'll load a small corpus of text from `documents.json` to search against, and give each document an ID and a metadata record.

In [36]:
# Read the corpus explicitly as UTF-8 so the result does not depend on the
# platform's default text encoding.
with open('documents.json', 'r', encoding='utf-8') as f:
    documents = json.load(f)

# Fail fast with a clear message: later cells embed `documents` and call
# str methods on every entry, so anything other than a list of strings would
# only fail further down with a less obvious error.
if not isinstance(documents, list) or not all(isinstance(doc, str) for doc in documents):
    raise ValueError("documents.json must contain a JSON array of strings.")

# One string ID ("0", "1", ...) and one metadata record per document,
# aligned by position with `documents`.
ids = [str(i) for i in range(len(documents))]
metadatas = [{'source': 'blog'} for _ in documents]

## 2. Initialize Vector Database and Embedding Model
We use `chromadb` to store vectors and `sentence-transformers` to generate them.

In [37]:
# Initialize ChromaDB client
client = chromadb.Client()

# Create a collection, dropping any previous one with the same name first so
# that re-running this cell always starts from an empty collection.
collection_name = "blog_posts"
try:
    client.delete_collection(name=collection_name)
except Exception:
    # Best effort: presumably the collection simply does not exist yet
    # (e.g. first run). `Exception` rather than a bare `except` so that
    # KeyboardInterrupt / SystemExit are not swallowed.
    pass
# "hnsw:space": "cosine" makes query distances cosine distances, which the
# search code below converts to a similarity with `1 - distance`.
collection = client.create_collection(name=collection_name, metadata={"hnsw:space": "cosine"})

# Load the embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

## 3. Index Documents
Generate embeddings for our documents and add them to the ChromaDB collection.

In [38]:
# Embed every document in one call; `.tolist()` converts the encoder output
# into plain nested Python lists before handing it to the collection.
embeddings = model.encode(documents).tolist()

# `upsert` instead of `add` keeps this cell idempotent: re-running it on its
# own overwrites the entries with the same IDs instead of colliding with them.
collection.upsert(
    documents=documents,
    embeddings=embeddings,
    metadatas=metadatas,
    ids=ids
)

print(f"Indexed {len(documents)} documents.")

Indexed 10 documents.


## 4. Search Interface
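Enter a query to find the most relevant blog posts. Each query is run through both semantic search and a simple keyword-matching baseline so the results can be compared; type `exit` to quit.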

In [39]:
def search_blogs(query, n_results=3, min_similarity=0.3):
    """Run a semantic search against the collection and print the best hits.

    The query is embedded with the module-level `model`, the module-level
    `collection` is asked for the `n_results` nearest documents, and every hit
    whose cosine similarity is at least `min_similarity` is printed.

    Args:
        query: Free-text search string.
        n_results: Maximum number of hits to request from the collection.
        min_similarity: Lowest cosine similarity (1 - distance) a hit may have
            and still be shown.

    Returns:
        None. Results are written to stdout.
    """
    query_embedding = model.encode([query]).tolist()

    results = collection.query(
        query_embeddings=query_embedding,
        n_results=n_results
    )

    print(f"\nQuery: {query}\n")
    print("Top Results:")
    print("-" * 30)

    # The result holds one list per query embedding; we sent exactly one.
    docs = results['documents'][0]
    distances = results['distances'][0]

    found_any = False
    # Iterate over what was actually returned rather than range(n_results):
    # the collection may hold (and return) fewer documents than requested.
    for rank, (doc, distance) in enumerate(zip(docs, distances), start=1):
        similarity = 1 - distance  # Chroma returns cosine distance (1 - similarity)

        if similarity < min_similarity:
            # Results are sorted by distance, so once one hit falls below the
            # threshold all subsequent ones will too.
            break

        print(f"Result {rank} (Similarity: {similarity:.4f}):")
        print(f"\"{doc}\"")
        print("-" * 30)
        found_any = True

    if not found_any:
        print(f"No results found with similarity >= {min_similarity:.0%}.")

def _tokenize(text):
    """Lower-case `text` and split it into word tokens, dropping punctuation."""
    # Keep letters, digits and in-word apostrophes; everything else separates words.
    cleaned = "".join(ch if (ch.isalnum() or ch == "'") else " " for ch in text.lower())
    stripped = (token.strip("'") for token in cleaned.split())
    return [token for token in stripped if token]


def search_keyword(query, n_results=3):
    """Print the documents that share the most whole words with `query`.

    A deliberately simple baseline: the query is lower-cased, stripped of
    punctuation and stop words, and each entry of the module-level `documents`
    list is scored by how many distinct query words it contains as whole
    words. Whole-word matching avoids counting a query word that merely
    appears inside a longer word (e.g. "art" inside "party").

    Args:
        query: Free-text search string.
        n_results: Maximum number of documents to print.

    Returns:
        None. Results are written to stdout.
    """
    stop_words = {"a", "an", "the", "in", "on", "of", "and", "is", "to", "with", "for", "it", "that", "this", "by", "at"}
    # dict.fromkeys de-duplicates while keeping the query's word order, so a
    # repeated query word cannot inflate a document's score.
    query_words = list(dict.fromkeys(w for w in _tokenize(query) if w not in stop_words))

    print(f"\nKeyword Search Results for: '{query}'")
    print("-" * 30)

    if not query_words:
        print("Query contains only stop words.")
        return

    scores = []
    for doc in documents:
        doc_tokens = set(_tokenize(doc))
        scores.append(sum(1 for word in query_words if word in doc_tokens))

    # Also covers an empty corpus, where there is nothing to rank.
    if max(scores, default=0) == 0:
        print("No keyword matches found.")
        return

    # Sort by score descending; the sort is stable, so ties keep corpus order.
    sorted_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)

    for rank, idx in enumerate(sorted_indices[:max(n_results, 0)], start=1):
        if scores[idx] == 0:
            # Scores are sorted, so every remaining document is a non-match too.
            break
        print(f"Result {rank} (Matches: {scores[idx]}):")
        print(f"\"{documents[idx]}\"")
        print("-" * 30)

def compare_search(query):
    """Print semantic and keyword search results for `query`, one after the other.

    Writes a banner naming the query, then a headed section for each search
    function (both called with `n_results=3`), then a trailing blank gap.
    Everything goes to stdout; nothing is returned.
    """
    banner = "=" * 60
    print(banner, f"Comparing results for: {query}", banner, sep="\n")

    sections = (
        ("\n--- SEMANTIC SEARCH ---", search_blogs),
        ("\n--- KEYWORD SEARCH ---", search_keyword),
    )
    for heading, run_search in sections:
        print(heading)
        run_search(query, n_results=3)

    print("\n")

# Interactive loop: run both searches for every query until the user types 'exit'.
# NOTE: input() blocks, so this cell waits for the user when the notebook is run top to bottom.
while True:
    try:
        # Strip surrounding whitespace so " exit " still quits and blank input is detectable.
        user_query = input("Enter search query (or 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the prompt was interrupted: leave the loop quietly.
        break
    if user_query.lower() == 'exit':
        break
    if not user_query:
        # Nothing to search for; prompt again instead of running an empty query.
        continue
    # Replace the previous query's output rather than appending below it.
    clear_output(wait=True)
    compare_search(user_query)

Comparing results for: making bread

--- SEMANTIC SEARCH ---

Query: making bread

Top Results:
------------------------------
Result 1 (Similarity: 0.4237):
"Baking is a method of preparing food that uses dry heat, typically in an oven, but can also be done in hot ashes, or on hot stones."
------------------------------

--- KEYWORD SEARCH ---

Keyword Search Results for: 'making bread'
------------------------------
No keyword matches found.


