In [0]:
%pip install --upgrade --force-reinstall databricks-vectorsearch
dbutils.library.restartPython()

Collecting databricks-vectorsearch
  Downloading databricks_vectorsearch-0.63-py3-none-any.whl.metadata (2.8 kB)
Collecting deprecation>=2 (from databricks-vectorsearch)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)
Collecting mlflow-skinny<4,>=2.11.3 (from databricks-vectorsearch)
  Downloading mlflow_skinny-3.8.0-py3-none-any.whl.metadata (31 kB)
Collecting protobuf<7,>=3.12.0 (from databricks-vectorsearch)
  Downloading protobuf-6.33.2-cp39-abi3-manylinux2014_aarch64.whl.metadata (593 bytes)
Collecting requests>=2 (from databricks-vectorsearch)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting packaging (from deprecation>=2->databricks-vectorsearch)
  Downloading packaging-25.0-py3-none-any.whl.metadata (3.3 kB)
Collecting cachetools<7,>=5.0.0 (from mlflow-skinny<4,>=2.11.3->databricks-vectorsearch)
  Downloading cachetools-6.2.4-py3-none-any.whl.metadata (5.6 kB)
Collecting click<9,>=7.0 (from mlflow-skinny<4,>=2.11.3->databricks-v

## Load Vector Index

In [0]:
from databricks.vector_search.client import VectorSearchClient

# Single client handle reused by the endpoint and index lookups in the cells
# below. NOTE(review): disable_notice=True presumably silences the SDK's
# informational notice on construction -- confirm against the client docs.
vsc = VectorSearchClient(
    disable_notice=True,
)

In [0]:
# Endpoint that hosts the vector index. The spelling "widlchat" is the
# endpoint's actual name as returned by the service, so do not "correct" it.
vector_search_endpoint_name = "widlchat"

# Last expression of the cell: displays the endpoint metadata returned by the
# client for this endpoint name.
vsc.get_endpoint(name=vector_search_endpoint_name)

{'name': 'widlchat',
 'creator': 'ammarbagharib@gmail.com',
 'creation_timestamp': 1765100413443,
 'last_updated_timestamp': 1765100413443,
 'endpoint_type': 'STANDARD',
 'last_updated_user': 'ammarbagharib@gmail.com',
 'id': '7bdb1ad1-2826-409a-a18b-517bc84e8a6e',
 'endpoint_status': {'state': 'ONLINE'},
 'num_indexes': 1,
 'throughput_info': {'requested_concurrency': 2.0,
  'current_concurrency': 2.0,
  'current_concurrency_utilization_percentage': 5.0,
  'change_request_state': 'CHANGE_SUCCESS',
  'requested_num_replicas': 1,
  'current_num_replicas': 1}}

In [0]:
# Fully qualified (catalog.schema.name) identifier of the vector index.
vs_index_fullname = "workspace.gold.idx_convos_classified"

# Fetch a handle to the index on the endpoint selected above.
index = vsc.get_index(
    endpoint_name=vector_search_endpoint_name,
    index_name=vs_index_fullname,
)

# Last expression of the cell: displays the index description (the same
# information as the index's overview tab).
index.describe()

{'name': 'workspace.gold.idx_convos_classified',
 'endpoint_name': 'widlchat',
 'primary_key': 'conversation_id',
 'index_type': 'DELTA_SYNC',
 'delta_sync_index_spec': {'source_table': 'workspace.gold.index_creation',
  'embedding_source_columns': [{'name': 'combined_text',
    'embedding_model_endpoint_name': 'databricks-gte-large-en'}],
  'pipeline_type': 'TRIGGERED',
  'pipeline_id': 'f76d1067-b191-4461-b47a-3667c53b9831'},
 'status': {'detailed_state': 'ONLINE_TRIGGERED_UPDATE',
  'message': 'Index creation succeeded. Check latest status: https://dbc-dbab8434-5a1d.cloud.databricks.com/explore/data/workspace/gold/idx_convos_classified',
  'indexed_row_count': 660,
  'triggered_update_status': {'last_processed_commit_version': 3,
   'last_processed_commit_timestamp': '2025-12-22T16:03:31Z',
   'triggered_update_progress': {'latest_version_currently_processing': 4,
    'num_synced_rows': 450,
    'total_rows_to_sync': 5076,
    'sync_progress_completion': 0.0887,
    'estimated_compl

## Semantic classification function

In [0]:
from pyspark.sql import functions as F
from pyspark.sql import types as T

In [0]:
# Load the summarized conversations and draw a fixed-size random sample to
# classify.
SAMPLE_SIZE = 5000  # number of conversations sent through vector search
# Seed for the random ordering. Without it every evaluation of the (lazy)
# temp view draws a different sample, so the persisted results could not be
# reproduced. NOTE(review): a seeded rand() is repeatable only while the
# input partitioning stays the same -- confirm if the source table is
# rewritten between runs.
SAMPLE_SEED = 42

df_gold_summarized = spark.table("gold.conversations_summarized")

df_gold_summarized_sampled = (
    df_gold_summarized
    .orderBy(F.rand(seed=SAMPLE_SEED))
    .limit(SAMPLE_SIZE)
)

# Expose the sample to Spark SQL under the name used by the classification
# query.
df_gold_summarized_sampled.createOrReplaceTempView("sampled_src")

In [0]:
# Minimum vector-search score for the nearest neighbour to count as a
# semantic match. Rows below it (or with no neighbour at all) get an empty
# category/method and a 0.0 score.
SEMANTIC_MIN_SCORE = 0.75

# For every sampled conversation, look up its single nearest neighbour in the
# vector index (num_results => 1) and keep that neighbour's category only when
# the score clears SEMANTIC_MIN_SCORE. The LEFT JOIN keeps conversations that
# get no neighbour; their search_score is NULL, so the CASE expressions fall
# through to the ELSE branch.
#
# The index name and threshold are interpolated from notebook constants (not
# user input): vs_index_fullname is the same index inspected in the
# "Load Vector Index" section, so the index described and the index queried
# cannot drift apart.
df_semantic_classified = spark.sql(f"""
SELECT
  src.conversation_id,
  src.ISID,
  src.country,
  src.combined_text,
  src.char_count,
  src.convo_summary,

  -- semantic hit → category, else empty
  CASE
    WHEN vs.search_score >= {SEMANTIC_MIN_SCORE} THEN vs.top_category
    ELSE ''
  END AS top_category,

  -- semantic hit → similarity score, else 0.0
  CASE
    WHEN vs.search_score >= {SEMANTIC_MIN_SCORE} THEN vs.search_score
    ELSE 0.0
  END AS similarity_score,

  -- semantic hit → 'semantic', else empty
  CASE
    WHEN vs.search_score >= {SEMANTIC_MIN_SCORE} THEN 'semantic'
    ELSE ''
  END AS classification_method

FROM sampled_src src

LEFT JOIN LATERAL (
  SELECT
    result.top_category,
    result.search_score
  FROM vector_search(
    index => '{vs_index_fullname}',
    query_text => src.combined_text,
    num_results => 1
  ) AS result
) vs
""")

# Replace the previous results wholesale. overwriteSchema lets the overwrite
# also replace the table's schema when the SELECT list changes, instead of
# requiring a manual DROP TABLE before re-running.
(
    df_semantic_classified.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("gold.conversations_semantic_classified")
)




In [0]:
# Sanity check on the persisted results: number of conversations per
# classification method, most frequent first.
classified_tbl = spark.table("gold.conversations_semantic_classified")
method_counts = (
    classified_tbl
    .groupBy("classification_method")
    .count()
    .orderBy(F.col("count").desc())
)
method_counts.show()


+---------------------+-----+
|classification_method|count|
+---------------------+-----+
|                     | 4976|
|             semantic|   24|
+---------------------+-----+

