## Index Creation (run only once)

In [0]:
%pip install --upgrade --force-reinstall databricks-vectorsearch
# Restart the Python process after the install so later cells import the
# newly installed databricks-vectorsearch package. Variables defined before
# this point are presumably lost on restart, so keep this cell first.
dbutils.library.restartPython()

In [0]:
%skip
# One-time setup cell (marked %skip so it is not part of regular runs).
# Loads the summarized conversations table and renders it for inspection.
df_gold_summarized = spark.table("gold.conversations_summarized")
display(df_gold_summarized)

In [0]:
%skip
# One-time setup cell: preview ten rows of the summarized conversations.
# NOTE(review): limit(10) has no explicit ordering, so the rows shown here are
# presumably not guaranteed to be the same ten on every evaluation — confirm.
display(df_gold_summarized.limit(10))

In [0]:
%skip
top_categories = [
    "technology_and_computing",
    "arts_and_entertainment",
    "science_and_history",
    "arts_and_entertainment",
    "government_and_politics",
    "science_and_history",
    "sports_and_recreation",
    "travel_and_tourism",
    "science_and_history",
    "science_and_history"
]

from pyspark.sql import functions as F, Window

w = Window.orderBy(F.monotonically_increasing_id())

df_top10 = (
    df_gold_summarized
    .limit(10)
    .withColumn("row_id", F.row_number().over(w) - 1)
)

df_top10 = df_top10.withColumn(
    "top_category",
    F.element_at(
        F.array(*[F.lit(x) for x in top_categories]),
        F.col("row_id") + 1
    )
).drop("row_id")

display(df_top10)

In [0]:
%skip
# One-time setup cell: persist the labeled sample as table gold.index_creation,
# replacing any existing contents (mode "overwrite").
# NOTE(review): the table name here has no catalog prefix, while the MERGE in
# the update section targets workspace.gold.index_creation — presumably the
# current catalog is `workspace`; confirm.
df_top10.write.mode("overwrite").saveAsTable("gold.index_creation")

## Index Update

In [0]:
from pyspark.sql import functions as F

# Rows of the classified conversations table whose classification_method
# is exactly "llm".
df_llm = spark.table("gold.conversations_classified").where(
    F.col("classification_method") == F.lit("llm")
)


In [0]:
# Project the LLM-classified rows down to the columns sent to the index table.
df_for_index = df_llm.select(
    [
        "ISID",
        "conversation_id",
        "combined_text",
        "char_count",
        "country",
        "convo_summary",
        "top_category",
    ]
)

In [0]:
# Expose the projected rows to SQL under the temporary view name "updates".
df_for_index.createOrReplaceTempView("updates")

# Upsert into workspace.gold.index_creation keyed on conversation_id:
# rows whose conversation_id already exists in the target are updated,
# all other source rows are inserted.
# NOTE(review): `UPDATE SET *` / `INSERT *` presumably require the source
# columns to line up with the target table's columns — confirm the schemas match.
# NOTE(review): if "updates" can contain several rows with the same
# conversation_id, the MERGE may be rejected as ambiguous — verify uniqueness.
spark.sql("""
MERGE INTO workspace.gold.index_creation AS target
USING updates AS source
ON target.conversation_id = source.conversation_id
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *
""")


In [0]:
from databricks.vector_search.client import VectorSearchClient

# Client for the Databricks Vector Search service; disable_notice=True is
# passed to the constructor (presumably to silence the SDK's startup notice).
vsc = VectorSearchClient(disable_notice=True)

# Look up the existing index by its fully qualified name and request a sync.
# NOTE(review): presumably this index is backed by workspace.gold.index_creation
# and the sync picks up the rows merged earlier in this notebook — confirm
# against the index definition.
index = vsc.get_index(index_name="workspace.gold.idx_convos_classified")
index.sync()