### Embedding models map text (sentences or whole documents) into a dense vector space, where semantically similar texts have vectors that lie close together.
```
Use cases:
Text similarity
Document retrieval
Semantic search
Clustering
```

In [None]:
%pip install sentence-transformers

In [1]:
from sentence_transformers import SentenceTransformer

# Fetch the pre-trained 'all-MiniLM-L6-v2' checkpoint; the same `model`
# object is reused by the cells further down.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Turn one example sentence into its embedding vector.
text = "LangChain is a framework for building AI applications with LLMs."
embedding = model.encode(text)

# Report the vector's shape plus a peek at its leading five components.
print(
    f"Text Embedding Shape: {embedding.shape}",
    f"First 5 Embedding Values: {embedding[:5]}",
    sep="\n",
)


  from .autonotebook import tqdm as notebook_tqdm
2025-01-06 12:56:30.136141: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-06 12:56:30.335508: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736146590.410153   67281 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736146590.428997   67281 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-06 12:56:30.578067: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorF

Text Embedding Shape: (384,)
First 5 Embedding Values: [-0.0285371  -0.06309644  0.02467403 -0.06235595  0.02788599]


##  Test Embeddings with Text Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Two example sentences to compare.
text1 = "LangChain is great for building AI applications."
text2 = "OpenAI's GPT-3 is a powerful language model."

# Encode each sentence separately with the model loaded earlier.
embedding1, embedding2 = model.encode(text1), model.encode(text2)

# cosine_similarity works on 2-D inputs, so each vector is wrapped in a list;
# the outcome is a 1x1 matrix whose single entry is the score.
similarity = cosine_similarity([embedding1], [embedding2])
print(f"Cosine Similarity: {similarity[0, 0]}")


## Practical Task: Build a Document Similarity Function

In [None]:
def get_similarity(text1, text2, model):
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        text1: First text, passed to ``model.encode``.
        text2: Second text, passed to ``model.encode``.
        model: Any object exposing an ``encode(text)`` method that returns
            the embedding vector for ``text``.

    Returns:
        The single entry of the 1x1 matrix produced by ``cosine_similarity``
        for the two embeddings.
    """
    # Encode in argument order: text1 first, then text2.
    vec_a, vec_b = (model.encode(passage) for passage in (text1, text2))
    # Each vector is wrapped in a list so it is treated as a one-row matrix.
    score_matrix = cosine_similarity([vec_a], [vec_b])
    return score_matrix[0][0]

# Try the helper on a pair of short example documents.
doc1 = "LangChain provides tools for AI development."
doc2 = "AI tools like LangChain simplify complex tasks."
doc_similarity = get_similarity(doc1, doc2, model)
print(f"Document Similarity: {doc_similarity}")


## Vector Databases with ChromaDB

In [None]:
%pip install chromadb

In [None]:
import chromadb
from chromadb.config import Settings

# Initialize a ChromaDB client with default settings
client = chromadb.Client(Settings())

# Fetch the "docs" collection, creating it on first use. Unlike
# create_collection, this does not fail when the cell is re-run in a kernel
# where the collection already exists.
collection = client.get_or_create_collection(name="docs")

print("ChromaDB setup complete!")


In [None]:
# Three short example documents, each paired with an id at the same position.
documents = [
    "LangChain is a framework for AI applications.",
    "GPT-3 is a powerful language model from OpenAI.",
    "ChromaDB is a vector database for storing embeddings.",
]
ids = ["doc1", "doc2", "doc3"]

# Only the raw text and ids are handed over; no embeddings are supplied in
# this call.
collection.add(ids=ids, documents=documents)
print("Documents stored in ChromaDB!")



In [None]:


# Ask the collection for the two best matches to a free-text question.
query_text = "What is LangChain?"
results = collection.query(n_results=2, query_texts=[query_text])

# Result fields are indexed per query; a single query was sent, so the
# matches sit at index 0.
matched_ids = results["ids"][0]
matched_texts = results["documents"][0]
for doc_id, doc_text in zip(matched_ids, matched_texts):
    print(f"Document ID: {doc_id}, Text: {doc_text}")


## Test Similarity with Embeddings

In [None]:
# Generate the embedding for the query text
query_embedding = model.encode("AI frameworks like LangChain")

# Store explicit embeddings for the documents. These ids were already added
# to the collection earlier in the notebook, and `add` does not overwrite
# existing ids, so `upsert` is used to make sure the vectors computed here
# are the ones actually stored.
# All documents are encoded in one batch; .tolist() hands Chroma plain
# Python lists of floats.
document_embeddings = model.encode(documents).tolist()
collection.upsert(
    ids=ids,
    documents=documents,
    embeddings=document_embeddings,
)

# Query the collection with the precomputed query embedding
results = collection.query(
    query_embeddings=[query_embedding.tolist()],
    n_results=2,
)
for doc_id, doc_text in zip(results["ids"][0], results["documents"][0]):
    print(f"Document ID: {doc_id}, Text: {doc_text}")
