**Setup and Installation**

In [16]:
!pip install langchain langchain_openai faiss-cpu chromadb qdrant-client pinecone pymilvus weaviate-client langchain-community

import os
import time
import uuid
import logging
import numpy as np
from contextlib import contextmanager

# Required imports
from langchain.vectorstores import FAISS, Chroma, Qdrant, Pinecone, Milvus, Weaviate
from langchain_openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

# OpenAI API key: read it from the environment instead of hardcoding it.
# Assigning a placeholder here would overwrite a real key that is already
# exported, and committed notebooks must never contain credentials.
if not os.environ.get("OPENAI_API_KEY"):
    print(
        "OPENAI_API_KEY is not set. Export it before starting Jupyter "
        "to use OpenAI embeddings."
    )

# Timing context manager for performance measurements
@contextmanager
def timing(label):
    """Print how long the enclosed ``with`` block took to run.

    Args:
        label: Text printed in front of the elapsed time.

    The elapsed time is printed from ``finally``, so it is reported even when
    the enclosed block raises; the exception then propagates unchanged.
    """
    # perf_counter() is monotonic and high resolution. time.time() follows the
    # wall clock, which can jump (NTP sync, manual changes) mid-measurement.
    started = time.perf_counter()
    try:
        yield
    finally:
        elapsed = time.perf_counter() - started
        print(f"{label}: {elapsed:.3f} seconds")

# Configure logging: every record goes both to a log file and to the cell output.
# force=True replaces handlers that are already attached to the root logger.
# Without it basicConfig() silently does nothing when a handler exists (some
# notebook hosts and libraries install one), and the INFO metrics are lost.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("vector_store_metrics.log"),
        logging.StreamHandler()
    ],
    force=True,
)

# Monitoring wrapper for operations
@contextmanager
def monitor_operation(operation_name, metadata=None):
    """Monitor the performance of vector store operations.

    Times the enclosed ``with`` block and writes one INFO record to the root
    logger describing the outcome.

    Args:
        operation_name: Value stored under the ``"operation"`` key of the record.
        metadata: Optional dict stored under ``"metadata"``; a falsy value is
            logged as ``{}``.

    Raises:
        Whatever the enclosed block raises. The exception is recorded in the
        log entry (``"success": False`` plus ``"error"``) and re-raised unchanged.
    """
    # Monotonic clock: unaffected by wall-clock adjustments during the operation.
    started = time.perf_counter()
    error = None
    try:
        yield
    except BaseException as exc:
        # BaseException (not Exception) so that KeyboardInterrupt / SystemExit
        # are not reported as a successful operation.
        error = exc
        raise
    finally:
        log_entry = {
            "operation": operation_name,
            "duration_seconds": time.perf_counter() - started,
            "success": error is None,
            "metadata": metadata or {},
        }
        if error is not None:
            log_entry["error"] = str(error)

        # Lazy %-formatting: the message is only built if the record is emitted.
        logging.info("Vector Store Metric: %s", log_entry)

# Marker printed once the setup cell has run to its end.
print("Setup complete!")

Setup complete!


**Part 1: Scalability Configurations**

In [None]:
# =================================================================
# Part 1: Scalability Configurations
# =================================================================
print("\n=== Scalability Configurations ===")

# Build 100 synthetic documents (numbered 1..100) used by the later demos.
# Each one carries its number as a string "id" plus fixed source/date metadata.
documents = list(
    Document(
        page_content="Sample document {} with information about various topics.".format(doc_number),
        metadata=dict(id=str(doc_number), source="synthetic", date="2023-01-01"),
    )
    for doc_number in range(1, 101)
)

print("Created", len(documents), "sample documents")

# Choose an embedding model
try:
    embeddings = OpenAIEmbeddings()
    # Constructing the client does not contact the API, so an invalid or
    # placeholder key would only fail later, inside the cells that embed
    # documents. One small probe request makes the fallback below effective.
    embeddings.embed_query("connectivity check")
    print("Using OpenAI embeddings")
except Exception as e:
    # Fallback to local embeddings (deliberately broad: any OpenAI-side failure,
    # such as a missing key, auth error or network problem, switches to the local model)
    print(f"Error with OpenAI embeddings: {e}")
    print("Falling back to local Hugging Face embeddings")
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# 1.1 Vertical Scaling Configuration (Qdrant Example)
# Only the configuration text is displayed; no Qdrant instance is created.
print("\n--- Vertical Scaling Configuration (Qdrant) ---")

try:
    from qdrant_client import QdrantClient
    from qdrant_client.models import VectorParams, Distance, OptimizersConfigDiff
except ImportError:
    print("Qdrant client not installed, skipping example")
else:
    # Caption, configuration snippet, then a summary of what the settings
    # achieve, emitted one item per line.
    print(
        "# Configuration for a vertically scaled Qdrant instance",
        """
    # Connect to a powerful single instance
    client = QdrantClient(
        url="http://localhost:6333",
        timeout=120  # Increased timeout for larger operations
    )

    # Create a collection with optimized parameters
    client.recreate_collection(
        collection_name="documents",
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
        optimizers_config=OptimizersConfigDiff(
            default_segment_number=2,  # Fewer segments for more powerful machines
            indexing_threshold=50000,  # Higher threshold for batch processing
        ),
        shard_number=1  # Single shard for vertical scaling
    )
    """,
        "\nThis configuration optimizes for a single powerful machine by:",
        "- Using fewer segments (better for machines with more RAM)",
        "- Setting a higher indexing threshold (processes more vectors in batch)",
        "- Using a single shard (no distribution)",
        sep="\n",
    )

# 1.2 Horizontal Scaling Configuration (Milvus Example)
# Only the configuration text is displayed; no Milvus connection is opened.
print("\n--- Horizontal Scaling Configuration (Milvus) ---")

try:
    from pymilvus import connections, utility, Collection, FieldSchema, CollectionSchema, DataType
except ImportError:
    print("Milvus client not installed, skipping example")
else:
    # Caption, configuration snippet, then a summary of what the settings
    # achieve, emitted one item per line.
    print(
        "# Configuration for a horizontally scaled Milvus deployment",
        """
    # Connect to a Milvus cluster
    connections.connect(
        alias="default",
        host="localhost",
        port="19530"
    )

    # Create a collection with sharding configuration
    fields = [
        FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
        FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=1536),
        FieldSchema(name="metadata", dtype=DataType.JSON)
    ]
    schema = CollectionSchema(fields=fields, description="Document vectors")

    # Create collection with multiple shards for horizontal scaling
    collection = Collection(
        name="documents",
        schema=schema,
        shards_num=3,  # Distribute across 3 shards
        properties={"collection.ttl.seconds": 0}  # No time-to-live limit
    )
    """,
        "\nThis configuration enables horizontal scaling by:",
        "- Distributing vectors across 3 shards",
        "- Each shard can be hosted on a separate server",
        "- Allowing for parallel query processing",
        sep="\n",
    )

# 1.3 Cloud-Native Deployment (Pinecone Example)
# Only the configuration text is displayed; no Pinecone index is created.
print("\n--- Cloud-Native Deployment (Pinecone) ---")

try:
    import pinecone
except ImportError:
    print("Pinecone client not installed, skipping example")
else:
    # Caption, configuration snippet, then a summary of what the settings
    # provide, emitted one item per line.
    print(
        "# Example of deploying with Pinecone (a cloud-native vector store)",
        """
    # Initialize Pinecone with API key
    pinecone.init(
        api_key="your-api-key",
        environment="gcp-starter"  # Choose the appropriate environment
    )

    # Create a distributed index with optimal configuration
    if "production-index" not in pinecone.list_indexes():
        pinecone.create_index(
            name="production-index",
            dimension=1536,
            metric="cosine",
            pods=3,  # Number of compute units
            pod_type="p2",  # Performance tier
            replicas=2  # For high availability and throughput
        )

    # Connect the vector store
    index = pinecone.Index("production-index")
    vectorstore = Pinecone(index, embeddings, "text")
    """,
        "\nThis cloud configuration provides:",
        "- Distributed architecture with 3 pods",
        "- High availability with 2 replicas",
        "- Managed infrastructure (no server provisioning)",
        "- Automatic scaling and maintenance",
        sep="\n",
    )

**Part 2: Data Consistency and Updates**

In [None]:
# =================================================================
# Part 2: Data Consistency and Updates
# =================================================================
# Part heading followed by the heading of section 2.1 (explicit IDs).
print(
    "\n=== Data Consistency and Updates ===",
    "\n--- Document Updates with Explicit IDs ---",
    sep="\n",
)

# Small FAISS index over the first ten sample documents, for demonstration.
vectorstore = FAISS.from_documents(documents=documents[:10], embedding=embeddings)

# Define a function to generate consistent IDs
def get_document_id(document):
    """Return the deterministic vector-store ID for ``document``.

    The ID is the prefix ``doc_`` followed by the document's ``metadata["id"]``;
    a missing ``"id"`` key raises ``KeyError``. A real system would key on a
    unique property such as a URL or database ID.
    """
    source_id = document.metadata["id"]
    return "doc_{}".format(source_id)

# Show (as text only) how to add documents under explicit IDs and how to
# replace them later: delete the old versions, then add the new ones.
print(
    "# Adding documents with explicit IDs",
    '''
# Add documents with explicit IDs
document_ids = [get_document_id(doc) for doc in documents]
vectorstore = Chroma.from_documents(
    documents,
    embeddings,
    ids=document_ids,
    persist_directory="./production_db"
)

# Later, when a document changes:
updated_documents = [updated_doc1, updated_doc2]
updated_ids = [get_document_id(doc) for doc in updated_documents]

# Delete old versions
vectorstore.delete(updated_ids)

# Add updated versions
vectorstore.add_documents(updated_documents, ids=updated_ids)
''',
    sep="\n",
)

# 2.2 Synchronization Strategies
print("\n--- Synchronization Strategies ---")

# Event-driven updates (displayed as text only).
# The handler follows the delete-then-add pattern for updates, stores the
# source ID under the same "id" metadata key that get_document_id() reads,
# and rejects unknown operation types instead of ignoring them silently.
print("# Event-driven update handler")
print('''
def document_change_handler(document_id, new_content, operation_type):
    """Handle updates to documents in real-time."""
    if operation_type not in ("CREATE", "UPDATE", "DELETE"):
        raise ValueError(f"Unknown operation type: {operation_type!r}")

    vector_id = f"doc_{document_id}"

    if operation_type in ("UPDATE", "DELETE"):
        # Remove the stored version first. Re-adding an existing ID without
        # deleting it leaves a stale duplicate or raises, depending on the store.
        vectorstore.delete([vector_id])

    if operation_type in ("CREATE", "UPDATE"):
        # Create embedding and update vector store
        doc_object = Document(
            page_content=new_content,
            metadata={"id": document_id}  # same key get_document_id() reads
        )
        vectorstore.add_documents([doc_object], ids=[vector_id])
''')

# Batch synchronization (displayed as text only).
# The snippet reads stored IDs through Chroma's get(), uses sets for the
# membership tests so the diff stays linear, and avoids shadowing builtin id().
print("\n# Batch synchronization process")
print('''
def scheduled_synchronization():
    """Synchronize vector store with source system periodically."""
    # Get all document IDs from source system
    source_doc_ids = source_system.get_all_document_ids()
    known_source_ids = set(source_doc_ids)

    # Get all document IDs from vector store.
    # Chroma: get() returns a dict whose "ids" entry lists the stored IDs;
    # other vector stores expose their IDs differently.
    vector_ids = set(vectorstore.get(include=[])["ids"])

    # Find documents to add (in source but not in vector store)
    docs_to_add = [doc_id for doc_id in source_doc_ids
                  if f"doc_{doc_id}" not in vector_ids]

    # Find documents to remove (in vector store but not in source)
    # Assumes source IDs are strings, since they are compared to the ID suffix.
    ids_to_remove = [vector_id for vector_id in vector_ids
                    if vector_id.startswith("doc_")
                    and vector_id[4:] not in known_source_ids]

    # Process additions
    if docs_to_add:
        new_docs = source_system.get_documents(docs_to_add)
        vectorstore.add_documents(new_docs,
                               ids=[f"doc_{doc_id}" for doc_id in docs_to_add])

    # Process removals
    if ids_to_remove:
        vectorstore.delete(ids_to_remove)
''')

**Part 3: High Availability and Disaster Recovery**

In [None]:
# =================================================================
# Part 3: High Availability and Disaster Recovery
# =================================================================
# Part heading followed by the heading of section 3.1 (replication).
print(
    "\n=== High Availability and Disaster Recovery ===",
    "\n--- Replication Configuration (Weaviate) ---",
    sep="\n",
)

try:
    import weaviate
except ImportError:
    print("Weaviate client not installed, skipping example")
else:
    # Caption and configuration snippet; the text is only displayed and no
    # Weaviate cluster is contacted.
    print(
        "# Configuring replication in Weaviate",
        """
    # Connect to Weaviate cluster
    client = weaviate.Client(
        url="https://your-cluster-url.weaviate.network",
        auth_client_secret=weaviate.auth.AuthApiKey(api_key="your-api-key")
    )

    # Create a class (collection) with replication configuration
    class_obj = {
        "class": "Document",
        "vectorizer": "none",  # We'll provide our own vectors
        "vectorIndexConfig": {
            "distance": "cosine",
            "ef": 256,
            "efConstruction": 512,
            "maxConnections": 64
        },
        "replicationConfig": {
            "factor": 2  # Each vector exists on 2 nodes
        }
    }

    # Create the class with replication
    client.schema.create_class(class_obj)
    """,
        sep="\n",
    )

# 3.2 Backup and Restore
print("\n--- Backup and Restore ---")

# FAISS backup example (displayed as text only).
# The snippet creates the backup directory before writing, shows where
# faiss_index comes from, and notes that a raw index file holds vectors only.
print("# Backup and restore with FAISS")
print('''
import os
import time

import faiss

# The raw FAISS index held by the LangChain vector store
faiss_index = vectorstore.index

# Create a timestamped backup (the target directory must exist before writing)
os.makedirs("backups", exist_ok=True)
backup_path = f"backups/faiss_index_{int(time.time())}.bin"

# Write the index to disk
# Note: this file contains the vectors only. Document texts and metadata kept
# by the LangChain wrapper are not included; vectorstore.save_local(folder)
# saves both.
faiss.write_index(faiss_index, backup_path)

# Later, to restore:
restored_index = faiss.read_index(backup_path)
''')

# Cloud backup example (pseudocode, displayed as text only).
# The snippet uses Pinecone's Index.fetch() and brings json and the S3
# client into scope before using them.
print("\n# Backing up a cloud-based index (Pinecone example)")
print('''
import json

import boto3

s3 = boto3.client("s3")

def backup_pinecone_index(index_name, bucket_name, prefix):
    """Back up a Pinecone index to S3."""
    # Get the index
    index = pinecone.Index(index_name)

    # Define a batch size for retrieval
    batch_size = 1000

    # Get total vector count
    stats = index.describe_index_stats()
    total_vectors = stats['total_vector_count']

    # Process in batches
    for i in range(0, total_vectors, batch_size):
        # Fetch vectors (implementation depends on index structure)
        vectors_batch = index.fetch(ids=[...])  # IDs for this batch

        # Save to S3
        # If the client returns model objects rather than plain dicts,
        # convert them to dicts before serializing.
        batch_file = f"{prefix}/batch_{i}.json"
        s3.put_object(
            Bucket=bucket_name,
            Key=batch_file,
            Body=json.dumps(vectors_batch)
        )
''')


**Part 4: Security Considerations**

In [None]:
# =================================================================
# Part 4: Security Considerations
# =================================================================
# All snippets in this part are displayed as text only; no connection is opened.

# 4.1 Data Encryption: part heading, section heading, caption and the
# encryption-in-transit snippet.
print(
    "\n=== Security Considerations ===",
    "\n--- Data Encryption ---",
    "# Configuring encryption in transit (Qdrant example)",
    """
# Connect using TLS for encryption in transit
client = QdrantClient(
    url="https://your-server:6333",  # HTTPS endpoint
    timeout=60,
    api_key="your-api-key"  # Authentication
)
""",
    sep="\n",
)

# 4.2 Access Controls: section heading, caption and the authentication snippet.
print(
    "\n--- Access Controls ---",
    "# Authentication with Milvus",
    """
# Connect with authentication
connections.connect(
    alias="default",
    host="localhost",
    port="19530",
    user="your-username",  # Authentication
    password="your-password"
)
""",
    sep="\n",
)


**Part 5: Monitoring and Maintenance**

In [None]:
# =================================================================
# Part 5: Monitoring and Maintenance
# =================================================================
print("\n=== Monitoring and Maintenance ===")

# 5.1 Performance Monitoring
print("\n--- Performance Monitoring ---")

# Monitoring wrapper implementation (displayed as text only).
# The wrapper records every exception type, including KeyboardInterrupt and
# SystemExit, so an interrupted operation is never logged as a success, and
# it measures the duration with a monotonic clock.
print("# Monitoring wrapper for vector store operations")
print('''
@contextmanager
def monitor_operation(operation_name, metadata=None):
    """Monitor the performance of vector store operations."""
    # perf_counter() is monotonic; time.time() can jump with the wall clock
    start_time = time.perf_counter()
    error = None
    try:
        yield
    except BaseException as e:
        # BaseException so KeyboardInterrupt/SystemExit are not logged as success
        error = e
        raise
    finally:
        duration = time.perf_counter() - start_time
        log_entry = {
            "operation": operation_name,
            "duration_seconds": duration,
            "success": error is None,
            "metadata": metadata or {}
        }
        if error is not None:
            log_entry["error"] = str(error)

        logging.info("Vector Store Metric: %s", log_entry)

# Usage example
with monitor_operation("similarity_search", {"query": "renewable energy", "k": 5}):
    results = vectorstore.similarity_search("renewable energy", k=5)
''')

# 5.2 Health Checks
print("\n--- Health Checks ---")

# Basic health check implementation (displayed as text only).
# The slow-query threshold is a parameter (default 1.0 second, the value that
# was previously hard-coded) and the query is timed with a monotonic clock.
print("# Health check for vector store")
print('''
def vector_store_health_check(vectorstore, slow_threshold_seconds=1.0):
    """Perform basic health check on vector store."""
    try:
        # Try a simple query
        start_time = time.perf_counter()
        vectorstore.similarity_search("test query", k=1)
        query_time = time.perf_counter() - start_time

        # Check if query time is within acceptable range
        if query_time > slow_threshold_seconds:
            return {
                "status": "warning",
                "message": f"Slow query response: {query_time:.2f} seconds",
                "query_time": query_time
            }

        return {
            "status": "healthy",
            "message": "Vector store is responding normally",
            "query_time": query_time
        }

    except Exception as e:
        return {
            "status": "unhealthy",
            "message": f"Vector store error: {str(e)}",
            "error": str(e)
        }
''')

print("\nNotebook execution complete!")