In [2]:
# %pip install -q sentence-transformers  # uncomment on first run; %pip installs into the running kernel's environment

In [1]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Reached only if both imports above succeeded.
print("Libraries imported successfully!1")


  from .autonotebook import tqdm as notebook_tqdm


Libraries imported successfully!1


In [2]:
# Load a small, fast sentence-embedding model.
# The recorded output shows the weights being downloaded over HTTP, so the
# first run needs network access.
print("Loading embedding model...")

model = SentenceTransformer("all-MiniLM-L6-v2")

print("Model loaded!")
print(f"Model produces {model.get_sentence_embedding_dimension()} dimensional embeddings")


Loading embedding model...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Model loaded!
Model produces 384 dimensional embeddings


Generate Embeddings

In [3]:
# Simple example: embed a single sentence and inspect the resulting vector.
# NOTE(review): "met" looks like a typo for "mat"; the recorded output was
# produced from this exact string, so it is left untouched here.

text = "The cat sat on the met"

# Generate embedding (the recorded output shows a 1-D NumPy array of length 384)
embedding = model.encode(text)

print(f"Original text: {text}")
print(f"Embedding shape: {embedding.shape}")
print(f"Embedding type: {type(embedding)}")
print(f"\nFirst 10 values: {embedding[:10]}")

Original text: The cat sat on the met
Embedding shape: (384,)
Embedding type: <class 'numpy.ndarray'>

First 10 values: [ 0.07994753 -0.00066557  0.01172574  0.08463684 -0.10899848  0.02930526
  0.01828373  0.00536317 -0.03327655 -0.01049048]


Similarity: The Heart of RAG

Cosine Similarity Explained
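Cosine similarity measures how similar two vectors are by comparing the angle between them while ignoring their lengths: $\cos(\theta) = \frac{A \cdot B}{\|A\| \, \|B\|}$. A score of 1 means the vectors point in the same direction, 0 means they are orthogonal (unrelated), and -1 means they point in opposite directions.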




In [5]:
def cosine_similarity(vec1, vec2):
    """
    Calculate the cosine similarity between two vectors.

    Args:
        vec1: First vector (1-D array-like of numbers)
        vec2: Second vector, same length as vec1

    Returns:
        A score between -1 and 1 (higher = more similar). If either vector
        has zero length (norm 0) the angle is undefined and 0.0 is returned
        instead of NaN, so the result can always be sorted and formatted.
    """
    dot_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)

    # Guard the division: a zero vector would otherwise give 0/0 -> NaN
    # together with a NumPy RuntimeWarning.
    if norm_product == 0:
        return 0.0

    return dot_product / norm_product

print(" Similarity function ready!")

 Similarity function ready!


Testing Similarity

In [7]:
# Test sentences: the first one is the reference, the rest drift further
# away from it in meaning.
sentences = [
    "The cat sat on the mat",
    "A feline rested on the rug",        # similar meaning, different words
    "Dogs are loyal animals",            # different topic
    "Python is a programming language",  # completely unrelated
]

# Embed every sentence in a single call.
embeddings = model.encode(sentences)

# Score each sentence (the reference included) against the first one.
print("Comparing to: 'The cat sat on the mat'\n")
for sentence, sentence_embedding in zip(sentences, embeddings):
    score = cosine_similarity(embeddings[0], sentence_embedding)
    print(f"Similarity to '{sentence}'")
    print(f"Score: {score:.3f}\n")

Comparing to: 'The cat sat on the mat'

Similarity to 'The cat sat on the mat'
Score: 1.000

Similarity to 'A feline rested on the rug'
Score: 0.564

Similarity to 'Dogs are loyal animals'
Score: 0.165

Similarity to 'Python is a programming language'
Score: 0.031



Building a Simple Semantic Search

Create a Document Collection

In [8]:
# Sample knowledge base: short facts about programming, machine learning and pets.
# NOTE(review): the entries are kept verbatim (typos such as "grat" included)
# because these exact strings are the data that gets embedded.
documents = [
    "Python is high-level programming language known for simplicity",
    "Machine learning enables computers to learn from data",
    "Neural networks are inspired by biological brains",
    "Dogs are loyal and friendly pets that need exercise",
    "Cats are independent animals that make grat combinations",
    "JavaScript is used for web development and runs in browsers",
    "Deep learning uses multi_layered neural networks",
    "Puppies require training and socialization from an early age",
]

print(f"Knowlege base: {len(documents)} documents")

Knowlege base: 8 documents


Embed All Documents

In [9]:
# Generate embeddings for all documents.
# The whole list is encoded in one call; the recorded output reports
# 8 embeddings of 384 dimensions each (one per document).

print("Generating embeddings for all documents....")
doc_embeddings = model.encode(documents)

# Report how many vectors were produced and the length of the first one.
print(f" Created {len(doc_embeddings)} embeddings")
print(f"Each embeddings has {doc_embeddings[0].shape[0]} dimensions")


Generating embeddings for all documents....
 Created 8 embeddings
Each embeddings has 384 dimensions


Search Function

In [10]:
def search(query, documents, doc_embeddings, top_k=3, embedder=None):
    """
    Search for the documents most similar to the query.

    The query is embedded, scored against every document embedding with
    cosine similarity, and the best-scoring documents are returned.

    Args:
        query: Search text (string)
        documents: List of document texts
        doc_embeddings: Pre-computed document embeddings, one per document
            and in the same order as `documents`
        top_k: Maximum number of results to return
        embedder: Object with an `encode(text)` method used to embed the
            query. Defaults to the notebook-level `model`; it should be the
            model that produced `doc_embeddings` so the vectors are comparable.

    Returns:
        List of (document, similarity_score) tuples, highest score first.
        Empty when there are no documents or `top_k` is not positive.

    Raises:
        ValueError: If `documents` and `doc_embeddings` differ in length.
    """
    # A length mismatch usually means the documents changed after the
    # embeddings were computed; pairing them up would give wrong results.
    if len(documents) != len(doc_embeddings):
        raise ValueError(
            f"Got {len(documents)} documents but {len(doc_embeddings)} embeddings; "
            "re-run the embedding step after changing the documents"
        )

    # A negative top_k would otherwise slice results off the *end* of the list.
    if top_k <= 0 or len(documents) == 0:
        return []

    # Fall back to the notebook-level model only when no embedder is supplied.
    if embedder is None:
        embedder = model

    # Embed the query
    query_embedding = np.asarray(embedder.encode(query))
    doc_matrix = np.asarray(doc_embeddings)

    # Cosine similarity against all documents at once: the same formula as
    # cosine_similarity(), applied row by row.
    dot_products = doc_matrix @ query_embedding
    norm_products = np.linalg.norm(doc_matrix, axis=1) * np.linalg.norm(query_embedding)

    # Zero-length vectors keep a score of 0.0 instead of producing NaN.
    scores = np.zeros(len(documents), dtype=float)
    np.divide(dot_products, norm_products, out=scores, where=norm_products > 0)

    # Sort by similarity (highest first); the stable sort keeps tied
    # documents in their original order.
    ranked = np.argsort(-scores, kind="stable")[:top_k]

    # Return top k results
    return [(documents[i], float(scores[i])) for i in ranked]


print("Search function ready!")

Search function ready!
