In [0]:
# Guide -- https://docs.databricks.com/aws/en/generative-ai/create-query-vector-search

# Install the Vector Search SDK; it provides the `databricks.vector_search` package imported in a later cell.
# NOTE(review): the version is unpinned, so different runs may pick up different releases -- consider pinning.
%pip install databricks-vectorsearch
# Restart the Python process so the freshly installed package is picked up by this notebook.
dbutils.library.restartPython()

In [0]:
%sql
-- One-time setup (kept commented out): create the catalog and schema if they do not exist yet.
--CREATE CATALOG IF NOT EXISTS prod

--CREATE SCHEMA IF NOT EXISTS prod.storyspark;

-- Make prod.storyspark the current catalog and schema.
USE CATALOG prod;
USE SCHEMA storyspark;

-- Maintenance helpers (kept commented out): drop the inventory table / list the tables in the schema.
--DROP TABLE IF EXISTS prod.storyspark.book_inventory;

--SHOW TABLES IN prod.storyspark;

-- One-time DDL (kept commented out): defines the book_inventory Delta table and turns on
-- Change Data Feed for it.
-- NOTE(review): Change Data Feed is presumably enabled because the Delta Sync vector index
-- built on this table needs it -- confirm against the Databricks Vector Search guide.
/*
CREATE TABLE IF NOT EXISTS book_inventory(
  entry_id STRING,
  book_id STRING,
  owner_id STRING,
  title STRING,
  author STRING,
  last_read DATE,
  relevant_text STRING,
  last_updated_at TIMESTAMP
)
USING DELTA;

ALTER TABLE prod.storyspark.book_inventory
  SET TBLPROPERTIES (delta.enableChangeDataFeed = true)
*/


In [0]:
# We can reuse this endpoint so no need to create it over and over again

from databricks.vector_search.client import VectorSearchClient

# Instantiate the Vector Search client without passing explicit credentials.
# Per the Databricks guide, inside a notebook this automatically generates a PAT token for authentication.
client = VectorSearchClient()

# --- Vector Search configuration ---

# Where the data lives and which index is built from it.
source_table_name = "prod.storyspark.book_inventory"
index_name = "prod.storyspark.book_inventory_index"

# Endpoint that serves the index.
endpoint_name = "storyspark_book_inventory_endpoint"

# Row identity and the text column that gets embedded.
primary_key = "entry_id"
embedding_source_column = "relevant_text"

# Model serving endpoint used to compute the embeddings.
embedding_model_endpoint_name = "databricks-gte-large-en"

# We only use a single endpoint, so there is no need to delete and recreate it each time; just reuse the existing one.
# client.create_endpoint(\
#     name=endpoint_name,\
#     endpoint_type="STANDARD"\
# )

# Create the Delta Sync index in TRIGGERED mode; the scheduled sync_book_inventory job is then used to run each sync.
# client.list_indexes(endpoint_name)
# client.delete_index(index_name=index_name)
# client.create_delta_sync_index(\
#     endpoint_name=endpoint_name,\
#     source_table_name=source_table_name,\
#     index_name=index_name,\
#     pipeline_type="TRIGGERED",\
#     primary_key=primary_key,\
#     embedding_source_column=embedding_source_column,\
#     embedding_model_endpoint_name=embedding_model_endpoint_name\
# )

In [0]:
from pyspark.sql import Row
import json

# Owners whose libraries are seeded below; referenced by every entry instead of
# repeating the e-mail literals.
owner1 = "amcheng@umich.edu"
owner2 = "andy.ming.kong.cheng@gmail.com"

# Seed inventory: one dict per (owner, book) pair.
#   relevant_text -- comma-separated keywords; this is the column that gets embedded
#   book_id       -- ISBN-style identifier of the book
#   owner_id      -- e-mail address of the owner
# Key order is kept stable because it determines the column order of the DataFrame
# built from these dicts.
book_data = [
    {
        "relevant_text": "princess, giant, beanstalk, fairy tales, bedtime stories, porridge, friendship, friends",
        "book_id": "978-0763680077",
        "owner_id": owner1,
        "title": "The Princess and the Giant",
        "author": "Caryl Hart/Sarah Warburton",
    },
    {
        "relevant_text": "elephant, pig, piggy, gerald, friendship, friends, waiting",
        "book_id": "978-1423199571",
        "owner_id": owner1,
        "title": "Waiting Is Not Easy!",
        "author": "Mo Willems",
    },
    {
        "relevant_text": "elephant, pig, piggy, gerald, friendship, friends, bird",
        "book_id": "978-1423106869",
        "owner_id": owner1,
        "title": "There Is a Bird On Your Head!",
        "author": "Mo Willems",
    },
    {
        "relevant_text": "elephant, pig, piggy, gerald, friendship, friends, ice cream, heat, sharing",
        "book_id": "978-1423143437",
        "owner_id": owner1,
        "title": "Should I Share My Ice Cream?",
        "author": "Mo Willems",
    },
    {
        "relevant_text": "elephant, pig, piggy, gerald, friendship, friends, book, 4th wall",
        "book_id": "978-1423133087",
        "owner_id": owner1,
        "title": "We Are in a Book!",
        "author": "Mo Willems",
    },
    {
        "relevant_text": "elephant, pig, piggy, gerald, friendship, friends, new friend, ball, dinosaur",
        "book_id": "978-1423174912",
        "owner_id": owner1,
        "title": "A Big Guy Took My Ball!",
        "author": "Mo Willems",
    },
    {
        "relevant_text": "elephant, pig, piggy, gerald, friendship, friends, new friend, jealousy, scared, worried",
        "book_id": "978-1423179580",
        "owner_id": owner1,
        "title": "My New Friend Is So Fun!",
        "author": "Mo Willems",
    },
    {
        "relevant_text": "elephant, pig, piggy, gerald, friendship, friends, piano, hippo, sister, funny, cast",
        "book_id": "978-1423133094",
        "owner_id": owner1,
        "title": "I Broke My Trunk!",
        "author": "Mo Willems",
    },
    {
        "relevant_text": "elephant, pig, piggy, gerald, friendship, friends, party, costume, fancy, pool party",
        "book_id": "978-1423106876",
        "owner_id": owner1,
        "title": "I Am Invited to a Party!",
        "author": "Mo Willems",
    },
    {
        "relevant_text": "elephant, pig, piggy, gerald, friendship, friends, best friend, worried, doctor, sneezing, cold, allergies",
        "book_id": "978-1423114116",
        "owner_id": owner1,
        "title": "Pigs Make Me Sneeze!",
        "author": "Mo Willems",
    },
    {
        "relevant_text": "elephant, pig, piggy, gerald, friendship, friends, lunch, going",
        "book_id": "978-1423119906",
        "owner_id": owner1,
        "title": "I Am Going!",
        "author": "Mo Willems",
    },
    {
        "relevant_text": "elephant, pig, piggy, gerald, friendship, friends, sad, cheering up, robot, clown, cowboy",
        "book_id": "978-1423102977",
        "owner_id": owner1,
        "title": "My Friend is Sad",
        "author": "Mo Willems",
    },
    {
        # NOTE(review): the leading keywords ("elephant, pig, piggy, gerald") look copy-pasted
        # from the Mo Willems entries above -- confirm they are intended for this title.
        "relevant_text": "elephant, pig, piggy, gerald, friendship, friends, train, toys, food, hill, children",
        "book_id": "978-0448405209",
        "owner_id": owner1,
        "title": "The Little Engine That Could",
        "author": "Watty Piper/George Hauman/Doris Hauman",
    },
    {
        "relevant_text": "princess, giant, beanstalk, fairy tales, bedtime stories, porridge, friendship, friends",
        "book_id": "978-0763680077",
        "owner_id": owner2,
        "title": "The Princess and the Giant",
        "author": "Caryl Hart/Sarah Warburton",
    },
]

# Create the primary key by concatenating owner and book ids.
# Single quotes inside the f-string keep this valid on Python < 3.12, where reusing
# the enclosing quote character inside a replacement field is a SyntaxError.
for book in book_data:
    book["entry_id"] = f"{book['owner_id']}_{book['book_id']}"

# entry_id is used as the primary key, so fail fast if two entries collide.
assert len({book["entry_id"] for book in book_data}) == len(book_data), "duplicate entry_id in book_data"

# Turn every book dict into a Spark Row (dict keys become the column names),
# build a DataFrame from them and show it for a quick visual check.
rows = [Row(**entry) for entry in book_data]
df = spark.createDataFrame(rows)
display(df)

# Replace the current contents of the inventory table with the seed data.
inventory_writer = df.write.mode("overwrite")
inventory_writer.saveAsTable("prod.storyspark.book_inventory")

In [0]:
# Read the inventory table back and render it to verify what was written.
book_inventory_table = spark.read.table("prod.storyspark.book_inventory")
display(book_inventory_table)