In [0]:
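# NOTE: hard-coded prototype kept for reference only; the widget-driven cell further down supersedes it.
# # Define fully qualified table identifiers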
# catalog_name = "dataroom_eval_bird"
# schema_name = "california_schools"
# table_name = "frpm"

# # Retrieve detailed table description (includes columns, data types, comments, etc.)
# df_description = spark.sql(
#     f"DESCRIBE EXTENDED {catalog_name}.{schema_name}.{table_name}"
# )
# # Display the metadata
# display(df_description)

In [0]:
import json
import re

import pandas as pd

In [0]:
# Databricks notebook widgets for parameterization.
# catalog_name / schema_name / table_name identify the SOURCE table whose
# column metadata is read below.
dbutils.widgets.text("catalog_name", "dataroom_eval_bird")
dbutils.widgets.text("schema_name", "california_schools")
dbutils.widgets.text("table_name", "frpm")

catalog_name = dbutils.widgets.get("catalog_name")
schema_name = dbutils.widgets.get("schema_name")
table_name = dbutils.widgets.get("table_name")


def _quote_identifier(identifier):
    """Return one SQL identifier part wrapped in backticks.

    Embedded backticks are doubled so the value cannot terminate the quoted
    identifier. A value that is already backtick-quoted is returned unchanged
    so previously working widget inputs keep working.
    """
    if len(identifier) >= 2 and identifier.startswith("`") and identifier.endswith("`"):
        return identifier
    return "`" + identifier.replace("`", "``") + "`"


# Widget values are free text, so quote every part instead of interpolating
# them raw into the statement (handles names with dashes/spaces and prevents
# the value from altering the SQL).
source_table_fqn = ".".join(
    _quote_identifier(part) for part in (catalog_name, schema_name, table_name)
)

# Retrieve the detailed table description (columns, data types, comments, plus
# table-level metadata rows) and show it for inspection.
df_description = spark.sql(f"DESCRIBE EXTENDED {source_table_fqn}")
display(df_description)

In [0]:
# Reduce the DESCRIBE EXTENDED output to the rows that describe table columns.
# Rows are dropped when data_type is null, when col_name is empty or starts
# with '#', or when col_name matches one of the labels listed below.
# For debugging, preview the raw output with: display(df_description.limit(20))
from pyspark.sql.functions import col

# Row labels to exclude by exact name; append further labels here to skip more rows.
# NOTE(review): a real table column that happens to share one of these names
# (e.g. 'Type', 'Location', 'Owner') would be dropped as well - confirm this is acceptable.
metadata_row_labels = (
    '# Delta Statistics Columns', 'Column Names', 'Column Selection Method',
    'Created Time', 'Last Access', 'Created By', 'Statistics', 'Type',
    'Location', 'Provider', 'Owner', 'Is_managed_location',
    'Predictive Optimization', 'Table Properties',
)

has_data_type = col('data_type').isNotNull()
is_hash_prefixed = col('col_name').startswith('#')
has_non_empty_name = col('col_name') != ''
is_metadata_label = col('col_name').isin(*metadata_row_labels)

df_description_clean = df_description.filter(
    has_data_type & ~is_hash_prefixed & has_non_empty_name & ~is_metadata_label
)

display(df_description_clean)

In [0]:
# Collect the column metadata on the driver as a list of
# {'col_name', 'data_type', 'comment'} records, ready for JSON serialization.
column_metadata_pdf = df_description_clean.select('col_name', 'data_type', 'comment').toPandas()
columns = column_metadata_pdf.to_dict('records')

In [0]:
# Assemble the LLM prompt: a fixed intro, the column metadata serialized as
# JSON, and the task instructions (with one worked example) asking the model to
# add an 'updated_comment' field to every entry and return JSON only.
columns_json = json.dumps(columns)
prompt_intro = "You will receive a list of columns in JSON: "
prompt_task = "\n Use all these information to improve the 'comment' of a single entry, e.g., for {\"col_name\": \"CDSCode\", \"data_type\": \"string\", \"comment\": \"CDSCode\"}, improve the field {\"comment\": \"CDSCode\"} to be {\"comment\": \"A CDS (County-District-School) code is a unique 14-digit identification number assigned to each school within California by the California Department of Education (CDE) for tracking and administrative purposes related to funding, assessments, and grants. \"}. Return the JSON object with exactly the same format as before, just add another field 'updated_comment' for each entry's improved comment generated by you. Only return the JSON object in your answer. Dont add any explanations and thoughts."
prompt = prompt_intro + columns_json + prompt_task

In [0]:
# Echo the assembled prompt so it can be inspected before it is sent to the model.
prompt

In [0]:
# Send the prompt to the model through the SQL ai_query function. The prompt is
# bound as a positional parameter (?) rather than spliced into the SQL text.
llm_query = "SELECT ai_query('databricks-llama-4-maverick', ? ) as result"
llm_rows = spark.sql(llm_query, [prompt]).collect()
# The query yields a single row; its 'result' column holds the raw model reply.
llm_result = llm_rows[0]['result']

In [0]:
# Show the raw model reply (possibly still wrapped in a Markdown code fence).
display(llm_result)

In [0]:
# Extract the JSON payload from the model reply.
# Models often wrap JSON in a Markdown fence (```json ... ```), sometimes with a
# differently-cased language tag or with prose before/after the fence. Take the
# text between the first opening fence and the last closing fence; this also
# discards any surrounding prose that would otherwise break json.loads.
fence_match = re.search(
    r"```(?:json)?\s*(.*)```", llm_result, flags=re.DOTALL | re.IGNORECASE
)
if fence_match:
    llm_result_str = fence_match.group(1).strip()
else:
    # No complete fenced block: fall back to removing any stray fence markers.
    llm_result_str = llm_result.replace('```json', '').replace('```', '').strip()

In [0]:
# Print the cleaned reply so the JSON text can be checked before parsing.
print(llm_result_str)

In [0]:
# Parse the cleaned model reply into Python objects (json is imported in the
# notebook's import cell, so it is not re-imported here).
try:
    col2comment = json.loads(llm_result_str)
except json.JSONDecodeError as exc:
    # JSONDecodeError is a ValueError subclass; re-raise with context so the
    # failure clearly points at the model reply rather than at this notebook.
    raise ValueError(f"LLM response is not valid JSON: {exc}") from exc

# The downstream DataFrame construction needs a non-empty list of per-column
# objects; fail early with a readable message if the model returned anything else.
if (
    not isinstance(col2comment, list)
    or not col2comment
    or not all(isinstance(entry, dict) for entry in col2comment)
):
    raise ValueError(
        "Expected the LLM to return a non-empty JSON array of column objects, "
        f"got {type(col2comment).__name__}: {llm_result_str[:200]!r}"
    )

In [0]:
# Build the output DataFrame with an explicit all-string schema instead of
# relying on schema inference over the parsed dicts. Inference cannot determine
# a type when a field is null in every entry (e.g. a table whose columns have no
# existing comments), and it would make the saved table's schema depend on
# whatever the model returned. Fields outside comment_fields are not carried over.
comment_fields = ("col_name", "data_type", "comment", "updated_comment")
comment_rows = [
    tuple(
        # Keep nulls as nulls; coerce anything else to text to match the schema.
        None if entry.get(field) is None else str(entry.get(field))
        for field in comment_fields
    )
    for entry in col2comment
]
df_table_with_comment = spark.createDataFrame(
    comment_rows,
    schema=", ".join(f"{field} string" for field in comment_fields),
)

In [0]:
# Destination for the AI-generated comments, parameterized through a widget.
# The widget default is the previously hard-coded catalog.schema, so running
# the notebook without overriding it writes to the same table as before.
dbutils.widgets.text(
    "destination_schema",
    "yyang.hackathon_2025q3_project_geniepromptautocompletion_genieeng_feindustryhls",
)
destination_schema = dbutils.widgets.get("destination_schema")

# One output table per source table: <destination_schema>.<table_name>_ai_comment
destination_table_name = f"{destination_schema}.{table_name}_ai_comment"

# Overwrite mode replaces the contents from any earlier run for this table.
(
    df_table_with_comment.write
    .mode("overwrite")
    .saveAsTable(destination_table_name)
)