In [0]:
# Install the Databricks Vector Search client into this notebook's Python environment.
%pip install databricks-vectorsearch
# Restart the Python process so the newly installed package is importable.
# Keep this cell first: the restart discards any Python variables defined earlier.
dbutils.library.restartPython()

In [0]:
# --- Per-user catalog ---------------------------------------------------
# The catalog name is "catalog_" followed by every digit found in the
# workspace API URL.
workspace_url = (
    dbutils.notebook.entry_point
    .getDbutils()
    .notebook()
    .getContext()
    .apiUrl()
    .get()
)
digits = ''.join(ch for ch in workspace_url if ch.isdigit())
catalog = f"catalog_{digits}"

# --- Workshop schema ----------------------------------------------------
schema = "agent_workshop"
full_schema_name = f"{catalog}.{schema}"

# --- Tables -------------------------------------------------------------
# Delta table that stores the unstructured guidance documents.
guidance_table = f"{full_schema_name}.guidance_unstructure"

# --- Vector search ------------------------------------------------------
# Fully qualified name of the vector search index.
vs_index_name = f"{full_schema_name}.guidance_gold_index"

# Endpoint that hosts the vector index.
vs_endpoint_name = "agent-workshop-vs-endpoint"

# Source table whose unstructured text is embedded and indexed.
vs_input_table = guidance_table

## Ingest Unstructured Guidance Data

This step reads text files containing guidance on portfolio, fraud, and market volatility from the `00-input_data` directory. Each file is loaded as a row with its content and file path, and a unique `file_id` is added for indexing or lookup purposes.


In [0]:
# -- 05 - Store unstructured guidance data on portfolio, fraud, and market volatility

import os
from pyspark.sql import Row
from pyspark.sql.functions import monotonically_increasing_id

# Directory holding the guidance text files, resolved relative to the
# current working directory.
input_dir = '00-input_data'

# Collect one Row per file: its path and its full text content.
unstructured = []
for root, dirs, files in os.walk(input_dir):
    # os.walk does not guarantee an order; sorting (in place for dirs, so the
    # traversal itself is ordered) makes the row order reproducible across runs.
    dirs.sort()
    for entry in sorted(files):
        file_name = os.path.join(root, entry)
        # Decode explicitly as UTF-8 instead of relying on the platform default.
        with open(file_name, 'r', encoding='utf-8') as f:
            unstructured.append(Row(file_name=file_name, file_content=f.read()))

# os.walk silently yields nothing for a missing or empty directory. Fail here
# with a clear message rather than letting spark.createDataFrame fail on an
# empty list with a schema-inference error.
if not unstructured:
    raise FileNotFoundError(
        f"No input files found under '{input_dir}' (cwd: {os.getcwd()})"
    )

# Create DataFrame and add file_id.
# Note: monotonically_increasing_id yields unique, increasing ids that are not
# guaranteed to be consecutive.
df = spark.createDataFrame(unstructured).withColumn("file_id", monotonically_increasing_id())

# Store the data into Unity Catalog
df.write.format("delta").mode("overwrite").option("mergeSchema", 'true').saveAsTable(guidance_table)
display(df)


# Generate Vector Search Index
Create a vector search endpoint if it doesn’t already exist (you can check under **Compute** → **Vector Search**).

Next, create a **[Databricks Vector Search](https://docs.databricks.com/aws/en/generative-ai/vector-search)** index on top of the unstructured guidance data (e.g. portfolio, fraud, market volatility). This index allows us to do fast similarity searches.



In [0]:
from databricks.vector_search.client import VectorSearchClient

# The following line automatically generates a PAT Token for authentication
client = VectorSearchClient()

# Create the endpoint only when it is missing. Checking for existence up front
# (instead of a bare `except:` around create_endpoint) lets real failures such
# as permission, quota or network errors surface instead of being reported as
# "endpoint already exists".
existing_endpoints = {
    ep.get("name") for ep in client.list_endpoints().get("endpoints", [])
}
if vs_endpoint_name in existing_endpoints:
    print('endpoint already exists')
else:
    client.create_endpoint(
        name=vs_endpoint_name,
        endpoint_type="STANDARD"
    )

# Enable change data feed for the Delta table
spark.sql(f"ALTER TABLE {vs_input_table} SET TBLPROPERTIES (delta.enableChangeDataFeed = true)")

# Create the vector search index inside the endpoint, or reuse it when it
# already exists so that this cell can be re-run safely.
existing_indexes = {
    ix.get("name")
    for ix in client.list_indexes(vs_endpoint_name).get("vector_indexes", [])
}
if vs_index_name in existing_indexes:
    # The pipeline is TRIGGERED, so an existing index only picks up new source
    # data when a sync is requested explicitly.
    index = client.get_index(endpoint_name=vs_endpoint_name, index_name=vs_index_name)
    index.sync()
else:
    # create_delta_sync_index for databricks managed embeddings
    index = client.create_delta_sync_index(
      endpoint_name=vs_endpoint_name,
      source_table_name=vs_input_table,
      index_name=vs_index_name,
      pipeline_type="TRIGGERED",
      primary_key="file_name",
      embedding_source_column="file_content",
      # TODO: replace this placeholder with the name of a Databricks-hosted
      # embedding model serving endpoint before running this cell.
      embedding_model_endpoint_name="<databricks embedding model endpoint>"  #Databricks hosted embedding models
    )