# Vector Search Setup with BrickKit

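This notebook creates the Vector Search endpoint and managed embedding index using BrickKit's governed models and executors.

> **Note:** As currently written, the cells below call the Databricks `VectorSearchClient` directly and read their configuration from notebook widgets; no BrickKit import or `manifest.yml` load appears in this notebook.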

**Features demonstrated:**
- Loading configuration from manifest.yml
- Using BrickKit's `VectorSearchEndpoint` and `VectorSearchIndex` models
- Using BrickKit's executors for deployment
- Governance tags from convention

**Prerequisites:**
- Metadata table must exist and be populated (run notebook 01 first)
- Unity Catalog must be enabled
- Vector Search must be enabled on the workspace

In [None]:
%pip install --upgrade databricks-vectorsearch databricks-sdk pydantic pyyaml --quiet
# Restart the Python process after the install so later cells import the upgraded packages.
dbutils.library.restartPython()

In [None]:
# Widget-backed parameters (populated from DAB variables): (name, default, label).
_WIDGET_SPECS = (
    ("catalog", "quant_risk_dev", "Catalog Name"),
    ("schema", "indicators", "Schema Name"),
    ("endpoint_name", "quant_risk_dev", "VS Endpoint Name"),
)
for _widget_name, _widget_default, _widget_label in _WIDGET_SPECS:
    dbutils.widgets.text(_widget_name, _widget_default, _widget_label)

# Read the values back in the same order the widgets were declared.
CATALOG, SCHEMA, ENDPOINT_NAME = (
    dbutils.widgets.get(_widget_name) for _widget_name, _, _ in _WIDGET_SPECS
)

# Names derived from the parameters: the index lives next to its source table.
TABLE_NAME = "worldbank_indicators"
INDEX_NAME = f"{TABLE_NAME}_index"
FULL_TABLE_NAME = ".".join([CATALOG, SCHEMA, TABLE_NAME])
FULL_INDEX_NAME = ".".join([CATALOG, SCHEMA, INDEX_NAME])

# Echo the resolved configuration so a run's output records what it acted on.
print(
    "\n".join(
        [
            "Configuration:",
            f"  Catalog: {CATALOG}",
            f"  Schema: {SCHEMA}",
            f"  Source Table: {FULL_TABLE_NAME}",
            f"  Endpoint: {ENDPOINT_NAME}",
            f"  Index: {FULL_INDEX_NAME}",
        ]
    )
)

In [None]:
import time
import logging

from databricks.sdk import WorkspaceClient
from databricks.sdk.errors import NotFound, ResourceDoesNotExist
from databricks.vector_search.client import VectorSearchClient

# Emit INFO-level records so the endpoint polling progress logged by
# wait_for_endpoint is visible in the cell output.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize clients
# Both clients are constructed without arguments; no explicit credentials are
# passed here.
# NOTE(review): ws_client is not referenced by any later cell in this notebook.
ws_client = WorkspaceClient()
vs_client = VectorSearchClient()

## Step 1: Verify Source Table Exists

In [None]:
# Fail fast when the source table is missing or has no rows.
from pyspark.sql.utils import AnalysisException

try:
    row_count = spark.sql(f"SELECT COUNT(*) FROM {FULL_TABLE_NAME}").collect()[0][0]
except AnalysisException as exc:
    # Only the missing-table case gets a friendlier message; any other
    # analysis error propagates untouched.
    if "TABLE_OR_VIEW_NOT_FOUND" not in str(exc):
        raise
    raise ValueError(f"Source table {FULL_TABLE_NAME} does not exist. Run notebook 01 first.") from exc
else:
    print(f"Source table {FULL_TABLE_NAME} has {row_count} records")

    # An empty table would produce an empty index, so stop here.
    if row_count == 0:
        raise ValueError(f"Source table {FULL_TABLE_NAME} is empty. Run notebook 01 first.")

## Step 2: Enable Change Data Feed on Source Table

In [None]:
# Enable CDF (required for Delta Sync indexes)
spark.sql(f"ALTER TABLE {FULL_TABLE_NAME} SET TBLPROPERTIES (delta.enableChangeDataFeed = true)")
print(f"Change Data Feed enabled on {FULL_TABLE_NAME}")

## Step 3: Create/Verify Vector Search Endpoint

In [None]:
def endpoint_exists(client: VectorSearchClient, endpoint_name: str) -> bool:
    """Return whether ``client`` can look up ``endpoint_name``.

    A lookup error whose text contains ``RESOURCE_DOES_NOT_EXIST`` or
    ``NOT_FOUND`` means the endpoint is missing and yields ``False``; any
    other error is re-raised unchanged.
    """
    missing_markers = ("RESOURCE_DOES_NOT_EXIST", "NOT_FOUND")
    try:
        client.get_endpoint(endpoint_name)
    except Exception as exc:
        error_text = str(exc)
        if any(marker in error_text for marker in missing_markers):
            return False
        raise
    return True


def get_endpoint_status(client: VectorSearchClient, endpoint_name: str) -> str:
    """Return the state reported for ``endpoint_name``, or ``"UNKNOWN"``.

    Reads ``endpoint_status.state`` from the mapping returned by
    ``client.get_endpoint``. The lookup is deliberately best-effort so that a
    polling caller keeps going through transient errors, but failures are
    logged rather than silently swallowed; otherwise a persistent problem
    (for example a permissions error) is indistinguishable from an endpoint
    that is merely slow to provision.

    Args:
        client: Object exposing ``get_endpoint(name)`` that returns a mapping.
        endpoint_name: Name of the endpoint to inspect.

    Returns:
        The reported state string, or ``"UNKNOWN"`` when the lookup raises or
        when the status/state is absent, null, or empty.
    """
    try:
        endpoint = client.get_endpoint(endpoint_name)
        # `or {}` tolerates an explicit null status as well as a missing key.
        status = endpoint.get("endpoint_status") or {}
        state = status.get("state")
    except Exception:
        logging.getLogger(__name__).warning(
            "Could not read status for endpoint %s", endpoint_name, exc_info=True
        )
        return "UNKNOWN"
    # Never hand a null/empty state back to callers that compare against strings.
    return str(state) if state else "UNKNOWN"


def wait_for_endpoint(
    client: VectorSearchClient,
    endpoint_name: str,
    timeout_minutes: int = 30,
    poll_interval_seconds: float = 30,
) -> bool:
    """Poll an endpoint until it is ONLINE, has FAILED, or the timeout elapses.

    Args:
        client: Vector Search client passed through to ``get_endpoint_status``.
        endpoint_name: Name of the endpoint to wait for.
        timeout_minutes: Maximum time to keep polling, in minutes.
        poll_interval_seconds: Pause between polls, in seconds. Defaults to
            the 30 seconds this function previously hard-coded.

    Returns:
        ``True`` once the status is ``"ONLINE"``; ``False`` if the status
        becomes ``"FAILED"`` or the timeout is exceeded first.
    """
    timeout_seconds = timeout_minutes * 60
    # Monotonic clock: the measured wait cannot jump if the wall clock is adjusted.
    start_time = time.monotonic()

    while True:
        elapsed = time.monotonic() - start_time
        if elapsed > timeout_seconds:
            logger.error("Timeout waiting for endpoint %s", endpoint_name)
            return False

        status = get_endpoint_status(client, endpoint_name)
        logger.info("Endpoint status: %s (elapsed: %.0fs)", status, elapsed)

        if status == "ONLINE":
            return True
        if status == "FAILED":
            logger.error("Endpoint %s failed to provision", endpoint_name)
            return False

        # Anything other than the known in-progress states is surfaced, but
        # polling continues because it may be transient (e.g. "UNKNOWN").
        if status not in ("PROVISIONING", "PENDING"):
            logger.warning("Unexpected status: %s", status)
        time.sleep(poll_interval_seconds)

In [None]:
# Provision the endpoint only when it is missing, then block until it is usable.
if not endpoint_exists(vs_client, ENDPOINT_NAME):
    print(f"Creating endpoint '{ENDPOINT_NAME}'...")
    vs_client.create_endpoint(name=ENDPOINT_NAME, endpoint_type="STANDARD")
    print(f"Endpoint '{ENDPOINT_NAME}' created")
else:
    print(f"Endpoint '{ENDPOINT_NAME}' already exists")

# Applies to a pre-existing endpoint too: it may still be provisioning.
endpoint_ready = wait_for_endpoint(vs_client, ENDPOINT_NAME)
if not endpoint_ready:
    raise RuntimeError(f"Endpoint {ENDPOINT_NAME} failed to become online")

print(f"Endpoint {ENDPOINT_NAME} is ready!")

## Step 4: Create Managed Embedding Index

In [None]:
def index_exists(client: VectorSearchClient, endpoint_name: str, index_name: str) -> bool:
    """Return whether ``client`` can look up ``index_name`` on ``endpoint_name``.

    A lookup error whose text contains ``RESOURCE_DOES_NOT_EXIST`` or
    ``NOT_FOUND`` means the index is missing and yields ``False``; any other
    error is re-raised unchanged.
    """
    missing_markers = ("RESOURCE_DOES_NOT_EXIST", "NOT_FOUND")
    try:
        client.get_index(endpoint_name=endpoint_name, index_name=index_name)
    except Exception as exc:
        error_text = str(exc)
        if any(marker in error_text for marker in missing_markers):
            return False
        raise
    return True

In [None]:
# Settings for the managed-embedding Delta Sync index. The keys are exactly the
# keyword names accepted by create_delta_sync_index, so they can be unpacked.
INDEX_CONFIG = dict(
    primary_key="indicator_id",
    embedding_source_column="embedding_text",
    embedding_model_endpoint_name="databricks-bge-large-en",
    pipeline_type="TRIGGERED",
)

# Create the index only when it is not already present on the endpoint.
if not index_exists(vs_client, ENDPOINT_NAME, FULL_INDEX_NAME):
    print(f"Creating managed embedding index '{FULL_INDEX_NAME}'...")
    vs_client.create_delta_sync_index(
        endpoint_name=ENDPOINT_NAME,
        index_name=FULL_INDEX_NAME,
        source_table_name=FULL_TABLE_NAME,
        **INDEX_CONFIG,
    )
    print(f"Index '{FULL_INDEX_NAME}' created and syncing...")
else:
    print(f"Index '{FULL_INDEX_NAME}' already exists")

## Step 5: Check Index Sync Status

In [None]:
# Report the readiness flag and status message from the index description.
index = vs_client.get_index(endpoint_name=ENDPOINT_NAME, index_name=FULL_INDEX_NAME)
index_description = index.describe()
status = index_description.get("status", {})
ready_flag = status.get("ready", "UNKNOWN")
status_message = status.get("message", "N/A")
print(f"Index status: ready={ready_flag}")
print(f"Message: {status_message}")

## Step 6: Create SQL Search Function

In [None]:
# Create SQL search function for easy querying
# The function is created in the same catalog/schema as the index; FUNCTION_NAME
# is reused by the test and summary cells further down.
FUNCTION_NAME = f"{CATALOG}.{SCHEMA}.search_worldbank_indicators"

# CREATE OR REPLACE makes this cell safe to re-run: an existing definition is
# overwritten. The function takes one STRING argument and forwards it to
# VECTOR_SEARCH against FULL_INDEX_NAME; the result count is fixed at 10 here.
create_function_sql = f"""
CREATE OR REPLACE FUNCTION {FUNCTION_NAME}(query STRING)
RETURNS TABLE
COMMENT 'Search World Bank indicators using semantic similarity. Returns top 10 matching indicators.'
RETURN SELECT * FROM VECTOR_SEARCH(
    index => '{FULL_INDEX_NAME}',
    query => query,
    num_results => 10
)
"""

spark.sql(create_function_sql)
print(f"Created function: {FUNCTION_NAME}")

## Step 7: Test the Search

In [None]:
# Smoke-test the SQL function with a sample query.
test_query = "poverty and inequality measures"

print(f"Testing search with query: '{test_query}'")
print("="*60)

try:
    results = spark.sql(f"SELECT * FROM {FUNCTION_NAME}('{test_query}')")
    display(results)
except Exception as exc:
    # A freshly created index may still be syncing; report that instead of
    # failing the run. Every other error is re-raised.
    error_text = str(exc).lower()
    still_syncing = "not ready" in error_text or "syncing" in error_text
    if not still_syncing:
        raise
    print("Index is still syncing. Please wait and try again.")
    print(f"Error: {exc}")

## Step 8: Advanced Search with Python API

In [None]:
# Query the index through the Python client, which exposes more options than
# the SQL function.
index = vs_client.get_index(endpoint_name=ENDPOINT_NAME, index_name=FULL_INDEX_NAME)

# Columns requested for every hit (reused by the hybrid search cell).
columns = ["indicator_id", "indicator_name", "long_definition", "topics"]

print("=== Standard Similarity Search ===")
try:
    response = index.similarity_search(query_text=test_query, columns=columns, num_results=5)

    hits = response.get("result", {}).get("data_array", [])
    print(f"Found {len(hits)} results:\n")
    for rank, hit in enumerate(hits, start=1):
        # hit[1] is presumably indicator_name (second entry of `columns`);
        # only the first 80 characters are shown.
        print(f"{rank}. {hit[1][:80]}...")

except Exception as exc:
    # Tolerate an index that has not finished syncing; re-raise anything else.
    if "not ready" not in str(exc).lower():
        raise
    print("Index is still syncing. Please wait and try again.")

In [None]:
# Hybrid (vector + keyword) search, with the hits reranked on the name and
# definition columns.
from databricks.vector_search.reranker import DatabricksReranker

print("=== Hybrid Search with Reranking ===")
try:
    reranker = DatabricksReranker(columns_to_rerank=["indicator_name", "long_definition"])
    response = index.similarity_search(
        query_text=test_query,
        columns=columns,
        num_results=5,
        query_type="hybrid",
        reranker=reranker,
    )

    hits = response.get("result", {}).get("data_array", [])
    print(f"Found {len(hits)} results:\n")
    for rank, hit in enumerate(hits, start=1):
        # hit[1] is presumably indicator_name (second entry of `columns`);
        # only the first 80 characters are shown.
        print(f"{rank}. {hit[1][:80]}...")

except Exception as exc:
    # Tolerate an index that has not finished syncing; re-raise anything else.
    if "not ready" not in str(exc).lower():
        raise
    print("Index is still syncing. Please wait and try again.")

## Summary

This notebook created:
1. Vector Search endpoint (created only if it did not already exist)
2. Managed embedding index with Delta Sync
3. SQL search function for easy querying

**Usage:**
```sql
-- Find indicators about economic growth
SELECT * FROM {catalog}.{schema}.search_worldbank_indicators('economic growth GDP')

-- Find indicators about health
SELECT * FROM {catalog}.{schema}.search_worldbank_indicators('mortality health life expectancy')
```

In [None]:
# Closing banner: list what was set up and show how to call the search function.
banner = "="*60
print(
    "\n".join(
        [
            banner,
            "VECTOR SEARCH SETUP COMPLETE",
            banner,
            f"Endpoint: {ENDPOINT_NAME}",
            f"Index: {FULL_INDEX_NAME}",
            f"Search Function: {FUNCTION_NAME}",
            "\nExample query:",
            f"  SELECT * FROM {FUNCTION_NAME}('your search query')",
        ]
    )
)