In [None]:
%pip install transformers sentence-transformers numpy scikit-learn


# What Are Embeddings?
Definition: Embeddings are vector representations of text that capture semantic meaning.

Why use them?

- To measure similarity between texts.
- To enable tasks like search, clustering, and classification.

In [None]:
from sentence_transformers import SentenceTransformer

# Sentences to be encoded as vectors
sentences = [
    "Artificial Intelligence is transforming industries.",
    "Machine learning is a subset of AI.",
]

# Pre-trained sentence-embedding model, looked up by name
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

# Encode all sentences in a single call
embeddings = model.encode(sentences)

# Show the raw embeddings, then the shape of the first one
first_embedding = embeddings[0]
print("Embeddings:", embeddings, sep="\n")
print("Shape of each embedding:", first_embedding.shape)


In [None]:
import chromadb
from chromadb.config import Settings

# Initialize a ChromaDB client that persists to local disk.
# Chroma 0.4+ rejects the legacy Settings(chroma_db_impl="duckdb+parquet", ...)
# configuration; PersistentClient(path=...) is its replacement.
client = chromadb.PersistentClient(path="db/chromadb_data")

# Create or get a collection
collection = client.get_or_create_collection(name="documents")

# Toy 3-dimensional vectors standing in for real embeddings
doc_ids = ["doc1", "doc2", "doc3"]
embeddings = [
    [0.1, 0.2, 0.3],
    [0.4, 0.5, 0.6],
    [0.7, 0.8, 0.9]
]
metadata = [{"id": doc_id} for doc_id in doc_ids]
documents = ["Document 1 text", "Document 2 text", "Document 3 text"]

# `ids` is a required argument of Collection.add: every record needs a
# unique string id, supplied in the same order as the other lists.
collection.add(
    ids=doc_ids,
    embeddings=embeddings,
    metadatas=metadata,
    documents=documents
)


In [None]:
import pinecone

import os

# Initialize the Pinecone client.
# pinecone.init() and the module-level index helpers were removed in v3 of the
# Python client; all operations now go through a Pinecone instance.
# The key is read from the PINECONE_API_KEY environment variable so it does not
# have to be pasted into the notebook; the placeholder is kept as a fallback.
pc = pinecone.Pinecone(api_key=os.environ.get("PINECONE_API_KEY", "YOUR_API_KEY"))

# Create a Pinecone index (only if it does not exist yet)
index_name = "example-index"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=3,  # must match the length of the vectors upserted below
        metric="cosine",
        # The deployment environment is now passed per index via a spec
        spec=pinecone.PodSpec(environment="us-west1-gcp"),
    )

index = pc.Index(index_name)

# Add embeddings to the index
embeddings = [
    {"id": "1", "values": [0.1, 0.2, 0.3]},
    {"id": "2", "values": [0.4, 0.5, 0.6]},
    {"id": "3", "values": [0.7, 0.8, 0.9]}
]
index.upsert(vectors=embeddings)


# Query for similar vectors
query_vector = [0.15, 0.25, 0.35]
results = index.query(vector=query_vector, top_k=2, include_metadata=True)
print(results)



# MongoDB

In [None]:
from pymongo import MongoClient
import numpy as np

import time

VECTOR_INDEX_NAME = "vector_index"
VECTOR_DIM = 128  # length of every stored vector and of the query vector

# 1. Connect to MongoDB
client = MongoClient("mongodb+srv://<username>:<password>@<cluster>.mongodb.net/")
db = client['vector_search_db']
collection = db['vectors']

# 2. Insert documents with vectors
# Inserting first guarantees the collection exists before the search index is
# created. .tolist() stores native Python floats rather than numpy scalars.
documents = [
    {"name": "Document 1", "vector": np.random.rand(VECTOR_DIM).tolist()},
    {"name": "Document 2", "vector": np.random.rand(VECTOR_DIM).tolist()},
    {"name": "Document 3", "vector": np.random.rand(VECTOR_DIM).tolist()}
]
collection.insert_many(documents)

# 3. Create an Atlas Vector Search index on the 'vector' field
# Vector indexes are search indexes (create_search_index), not regular
# create_index() indexes. Skip creation if the index already exists so the
# cell can be re-run.
if not list(collection.list_search_indexes(VECTOR_INDEX_NAME)):
    collection.create_search_index({
        "name": VECTOR_INDEX_NAME,
        "type": "vectorSearch",
        "definition": {
            "fields": [
                {
                    "type": "vector",
                    "path": "vector",
                    "numDimensions": VECTOR_DIM,
                    "similarity": "cosine"  # or 'euclidean' / 'dotProduct'
                }
            ]
        }
    })

# The index is built asynchronously; wait (bounded) until it is queryable.
deadline = time.monotonic() + 300
while not any(
    idx.get("queryable") for idx in collection.list_search_indexes(VECTOR_INDEX_NAME)
):
    if time.monotonic() > deadline:
        raise TimeoutError(f"Search index {VECTOR_INDEX_NAME!r} did not become queryable")
    time.sleep(5)

# 4. Perform a vector search
query_vector = np.random.rand(VECTOR_DIM).tolist()
result = collection.aggregate([
    {
        "$vectorSearch": {
            "index": VECTOR_INDEX_NAME,
            "path": "vector",
            "queryVector": query_vector,
            "numCandidates": 100,  # candidates considered before ranking
            "limit": 2  # Number of nearest neighbors to retrieve
        }
    },
    {
        # The similarity score is only returned when projected explicitly
        "$project": {
            "_id": 0,
            "name": 1,
            "score": {"$meta": "vectorSearchScore"}
        }
    }
])

# 5. Display results
for doc in result:
    print(f"Document: {doc['name']}, Similarity Score: {doc['score']}")


In [None]:
from pymongo import MongoClient
import numpy as np

import time

PRODUCT_INDEX_NAME = "product_vector_index"
EMBEDDING_DIM = 128  # Dimensionality of the embedding vectors
TOP_K = 3  # number of recommendations to show

# Connect to MongoDB
client = MongoClient("mongodb+srv://<username>:<password>@<cluster>.mongodb.net/")
db = client['ecommerce']
collection = db['products']

# Insert product data with embeddings (simulated embeddings here)
# Inserting first guarantees the collection exists before the search index is
# created. .tolist() stores native Python floats rather than numpy scalars.
products = [
    {"product_id": 1, "name": "Red T-Shirt", "embedding": np.random.rand(EMBEDDING_DIM).tolist()},
    {"product_id": 2, "name": "Blue T-Shirt", "embedding": np.random.rand(EMBEDDING_DIM).tolist()},
    {"product_id": 3, "name": "Green Hoodie", "embedding": np.random.rand(EMBEDDING_DIM).tolist()},
    {"product_id": 4, "name": "Black Jeans", "embedding": np.random.rand(EMBEDDING_DIM).tolist()}
]
collection.insert_many(products)

# Create an Atlas Vector Search index for product embeddings
# Vector indexes are search indexes (create_search_index), not regular
# create_index() indexes. Skip creation if the index already exists so the
# cell can be re-run.
if not list(collection.list_search_indexes(PRODUCT_INDEX_NAME)):
    collection.create_search_index({
        "name": PRODUCT_INDEX_NAME,
        "type": "vectorSearch",
        "definition": {
            "fields": [
                {
                    "type": "vector",
                    "path": "embedding",
                    "numDimensions": EMBEDDING_DIM,
                    "similarity": "cosine"  # Metric: cosine similarity
                }
            ]
        }
    })

# The index is built asynchronously; wait (bounded) until it is queryable.
deadline = time.monotonic() + 300
while not any(
    idx.get("queryable") for idx in collection.list_search_indexes(PRODUCT_INDEX_NAME)
):
    if time.monotonic() > deadline:
        raise TimeoutError(f"Search index {PRODUCT_INDEX_NAME!r} did not become queryable")
    time.sleep(5)

# Simulate a query: User is viewing "Red T-Shirt"
query_product = collection.find_one({"product_id": 1})
query_vector = query_product["embedding"]

# Perform vector search to find similar products
recommendations = collection.aggregate([
    {
        "$vectorSearch": {
            "index": PRODUCT_INDEX_NAME,
            "path": "embedding",
            "queryVector": query_vector,
            "numCandidates": 100,  # candidates considered before ranking
            # One extra hit: the viewed product matches itself and is dropped below
            "limit": TOP_K + 1
        }
    },
    # Never recommend the product the user is already viewing
    {"$match": {"product_id": {"$ne": query_product["product_id"]}}},
    {"$limit": TOP_K},
    {
        # The similarity score is only returned when projected explicitly
        "$project": {
            "_id": 0,
            "name": 1,
            "score": {"$meta": "vectorSearchScore"}
        }
    }
])

# Display recommendations
print("Recommended Products:")
for rec in recommendations:
    print(f"Product Name: {rec['name']}, Similarity Score: {rec['score']:.4f}")
